shithub: openh264

Download patch

ref: 70e5e62f3dbd19f0e9300fa7bba670c7ee93dcd2
parent: bd509b2245d0c78e118cc67ee56c3fd5d5d3629b
author: Ethan Hugg <[email protected]>
date: Sun Dec 8 23:51:09 EST 2013

Initial Commit

--- /dev/null
+++ b/codec/WelsThreadLib/api/WelsThreadLib.h
@@ -1,0 +1,151 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	WelsThreadLib.h
+ *
+ * \brief	Interfaces introduced in thread programming
+ *
+ * \date	11/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef   _WELS_THREAD_API_H_
+#define   _WELS_THREAD_API_H_
+
+#include "typedefs.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#if defined(WIN32)
+
+#include <windows.h>
+
+// Win32 mapping of the portable thread/mutex/event types.
+typedef    HANDLE                    WELS_THREAD_HANDLE;
+typedef    LPTHREAD_START_ROUTINE    LPWELS_THREAD_ROUTINE;
+
+typedef    CRITICAL_SECTION          WELS_MUTEX;
+typedef    HANDLE                    WELS_EVENT;
+
+// Return type and return statement to use when defining a thread entry routine.
+// The macro argument is parenthesized so that an expression argument (e.g. `a + b`)
+// is cast as a whole; the trailing ';' is kept for existing call sites.
+#define    WELS_THREAD_ROUTINE_TYPE         DWORD  WINAPI
+#define    WELS_THREAD_ROUTINE_RETURN(rc)   return (DWORD)(rc);
+
+#else	// NON-WINDOWS
+
+#if defined(__GNUC__) // LINUX, MACOS etc
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <signal.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/time.h>
+
+#include <sys/stat.h>
+#include <fcntl.h>
+
+// POSIX mapping of the portable thread/mutex/event types.
+typedef   pthread_t    WELS_THREAD_HANDLE;
+typedef  void* (*LPWELS_THREAD_ROUTINE)  ( void * );
+
+typedef   pthread_mutex_t           WELS_MUTEX;
+typedef   sem_t                     WELS_EVENT;
+
+// Return type and return statement to use when defining a thread entry routine.
+// The macro argument is parenthesized so that an expression argument (e.g. `a + b`)
+// is cast as a whole; the trailing ';' is kept for existing call sites.
+#define   WELS_THREAD_ROUTINE_TYPE         void *
+#define   WELS_THREAD_ROUTINE_RETURN(rc)   return (void*)(rc);
+
+#endif//__GNUC__
+
+#endif//WIN32
+
+// Status code returned by the Wels* threading calls (see the WELS_THREAD_ERROR_* macros).
+typedef    int32_t        WELS_THREAD_ERROR_CODE;
+// Attribute argument of WelsThreadCreate().
+// NOTE(review): neither implementation in WelsThreadLib.cpp reads this argument.
+typedef    int32_t        WELS_THREAD_ATTR;
+
+// Result record filled in by WelsQueryLogicalProcessInfo().
+typedef  struct _WelsLogicalProcessorInfo
+{
+	int32_t    ProcessorCount;	// processor count reported by the platform query
+} WelsLogicalProcessInfo;
+
+// Status values used with WELS_THREAD_ERROR_CODE.
+// NOTE(review): the failure values are spelled as uint32_t while WELS_THREAD_ERROR_CODE
+// is int32_t, so they are converted when returned or compared -- confirm this is intended.
+// WELS_THREAD_ERROR_WAIT_OBJECT_0 is the base value to which the POSIX
+// WelsMultipleEventsWaitSingleBlocking() adds the index of the signaled event.
+#define    WELS_THREAD_ERROR_OK					0
+#define    WELS_THREAD_ERROR_GENERIAL			((uint32_t)(-1))
+#define    WELS_THREAD_ERROR_WAIT_OBJECT_0		0
+#define	   WELS_THREAD_ERROR_WAIT_TIMEOUT		((uint32_t)0x00000102L)  
+#define	   WELS_THREAD_ERROR_WAIT_FAILED		WELS_THREAD_ERROR_GENERIAL
+
+// Suspend the calling thread for dwMilliseconds milliseconds.
+void WelsSleep( uint32_t dwMilliseconds );
+// Mutex life cycle: Init -> Lock/Unlock -> Destroy.
+WELS_THREAD_ERROR_CODE    WelsMutexInit( WELS_MUTEX   * mutex );
+WELS_THREAD_ERROR_CODE    WelsMutexLock( WELS_MUTEX   * mutex );
+WELS_THREAD_ERROR_CODE    WelsMutexUnlock( WELS_MUTEX * mutex );
+WELS_THREAD_ERROR_CODE    WelsMutexDestroy( WELS_MUTEX * mutex );
+
+// Named events (sem_open/sem_close based); only available in the __GNUC__ build.
+#ifdef __GNUC__
+WELS_THREAD_ERROR_CODE    WelsEventOpen( WELS_EVENT **p_event, str_t *event_name );
+WELS_THREAD_ERROR_CODE    WelsEventClose( WELS_EVENT *event, str_t *event_name );
+#endif//__GNUC__
+// Unnamed events: Init -> Signal/Reset/Wait -> Destroy.
+WELS_THREAD_ERROR_CODE    WelsEventInit( WELS_EVENT *event );
+WELS_THREAD_ERROR_CODE    WelsEventDestroy( WELS_EVENT * event );
+WELS_THREAD_ERROR_CODE    WelsEventSignal( WELS_EVENT * event );
+WELS_THREAD_ERROR_CODE    WelsEventReset( WELS_EVENT * event );
+WELS_THREAD_ERROR_CODE    WelsEventWait( WELS_EVENT * event );
+WELS_THREAD_ERROR_CODE    WelsEventWaitWithTimeOut( WELS_EVENT * event, uint32_t dwMilliseconds );
+// Multi-event waits. The list element type differs per platform: an array of
+// events (HANDLEs) on WIN32, an array of pointers to events elsewhere.
+#ifdef WIN32
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking( uint32_t nCount, WELS_EVENT *event_list, uint32_t dwMilliseconds );
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking( uint32_t nCount, WELS_EVENT *event_list );
+#else
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking( uint32_t nCount, WELS_EVENT **event_list, uint32_t dwMilliseconds );
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking( uint32_t nCount, WELS_EVENT **event_list );
+#endif//WIN32
+
+// Start a thread running routine(arg); the handle is stored in *thread on success.
+WELS_THREAD_ERROR_CODE    WelsThreadCreate( WELS_THREAD_HANDLE * thread,  LPWELS_THREAD_ROUTINE  routine, 
+										   void * arg, WELS_THREAD_ATTR attr);
+
+// Enable deferred cancellation for the calling thread (no-op on WIN32).
+WELS_THREAD_ERROR_CODE	  WelsSetThreadCancelable();
+
+// Block until the given thread has finished.
+WELS_THREAD_ERROR_CODE    WelsThreadJoin( WELS_THREAD_HANDLE  thread );
+
+// Request cancellation of the given thread (no-op on WIN32).
+WELS_THREAD_ERROR_CODE    WelsThreadCancel( WELS_THREAD_HANDLE  thread );
+
+// Release the thread handle (closes the HANDLE on WIN32, no-op for pthreads).
+WELS_THREAD_ERROR_CODE    WelsThreadDestroy( WELS_THREAD_HANDLE *thread );
+
+// Handle of the calling thread.
+WELS_THREAD_HANDLE        WelsThreadSelf();
+
+// Store the processor count in pInfo->ProcessorCount.
+WELS_THREAD_ERROR_CODE    WelsQueryLogicalProcessInfo(WelsLogicalProcessInfo * pInfo);
+
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+++ b/codec/WelsThreadLib/src/WelsThreadLib.cpp
@@ -1,0 +1,567 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	WelsThreadLib.cpp
+ *
+ * \brief	Interfaces introduced in thread programming
+ *
+ * \date	11/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+
+#include "WelsThreadLib.h"
+#include <stdio.h>
+
+#ifdef  WIN32
+
+/*!
+ * \brief	Suspend the calling thread (Win32 implementation).
+ * \param	dwMilliseconds	interval, handed unchanged to Sleep()
+ */
+void WelsSleep( uint32_t dwMilliseconds )
+{
+	const DWORD kdwInterval = dwMilliseconds;
+
+	Sleep( kdwInterval );
+}
+
+/*!
+ * \brief	Initialize a mutex (Win32: a CRITICAL_SECTION).
+ * \return	always WELS_THREAD_ERROR_OK
+ */
+WELS_THREAD_ERROR_CODE    WelsMutexInit( WELS_MUTEX   * mutex )
+{
+	LPCRITICAL_SECTION pSection = mutex;
+
+	InitializeCriticalSection( pSection );
+	return WELS_THREAD_ERROR_OK;
+}
+
+/*!
+ * \brief	Acquire the mutex, blocking in EnterCriticalSection() until it is available.
+ * \return	always WELS_THREAD_ERROR_OK
+ */
+WELS_THREAD_ERROR_CODE    WelsMutexLock( WELS_MUTEX   * mutex )
+{
+	LPCRITICAL_SECTION pSection = mutex;
+
+	EnterCriticalSection( pSection );
+	return WELS_THREAD_ERROR_OK;
+}
+
+/*!
+ * \brief	Release a mutex previously acquired with WelsMutexLock().
+ * \return	always WELS_THREAD_ERROR_OK
+ */
+WELS_THREAD_ERROR_CODE    WelsMutexUnlock( WELS_MUTEX * mutex )
+{
+	LPCRITICAL_SECTION pSection = mutex;
+
+	LeaveCriticalSection( pSection );
+	return WELS_THREAD_ERROR_OK;
+}
+
+/*!
+ * \brief	Destroy a mutex created with WelsMutexInit().
+ * \return	always WELS_THREAD_ERROR_OK
+ */
+WELS_THREAD_ERROR_CODE    WelsMutexDestroy( WELS_MUTEX * mutex )
+{
+	LPCRITICAL_SECTION pSection = mutex;
+
+	DeleteCriticalSection( pSection );
+	return WELS_THREAD_ERROR_OK;
+}
+
+/*!
+ * \brief	Create an unnamed, auto-reset, initially non-signaled Win32 event.
+ * \param	event	receives the new handle; left untouched when creation fails
+ * \return	WELS_THREAD_ERROR_OK, or WELS_THREAD_ERROR_GENERIAL if CreateEvent() fails
+ */
+WELS_THREAD_ERROR_CODE    WelsEventInit( WELS_EVENT  *  event )
+{
+	const WELS_EVENT khCreated = CreateEvent( NULL, FALSE, FALSE, NULL );
+
+	if ( NULL != khCreated )
+	{
+		*event = khCreated;
+		return WELS_THREAD_ERROR_OK;
+	}
+	return WELS_THREAD_ERROR_GENERIAL;
+}
+
+/*!
+ * \brief	Put the event into the signaled state.
+ * \return	WELS_THREAD_ERROR_OK, or WELS_THREAD_ERROR_GENERIAL if SetEvent() fails
+ */
+WELS_THREAD_ERROR_CODE    WelsEventSignal( WELS_EVENT * event )
+{
+	const BOOL kbSignaled = SetEvent( *event );
+
+	if ( !kbSignaled )
+		return WELS_THREAD_ERROR_GENERIAL;
+	return WELS_THREAD_ERROR_OK;
+}
+
+/*!
+ * \brief	Put the event back into the non-signaled state.
+ * \return	WELS_THREAD_ERROR_OK, or WELS_THREAD_ERROR_GENERIAL if ResetEvent() fails
+ */
+WELS_THREAD_ERROR_CODE    WelsEventReset( WELS_EVENT * event )
+{
+	const BOOL kbCleared = ResetEvent( *event );
+
+	if ( !kbCleared )
+	{
+		return WELS_THREAD_ERROR_GENERIAL;
+	}
+	return WELS_THREAD_ERROR_OK;
+}
+
+/*!
+ * \brief	Block until the event is signaled.
+ * \return	the raw WaitForSingleObject() result
+ */
+WELS_THREAD_ERROR_CODE    WelsEventWait( WELS_EVENT * event )
+{
+	const WELS_EVENT khEvent = *event;
+
+	return WaitForSingleObject( khEvent, INFINITE );
+}
+
+/*!
+ * \brief	Wait for the event, giving up after dwMilliseconds.
+ * \return	the raw WaitForSingleObject() result
+ */
+WELS_THREAD_ERROR_CODE    WelsEventWaitWithTimeOut( WELS_EVENT * event, uint32_t dwMilliseconds )
+{
+	const WELS_EVENT khEvent = *event;
+
+	return WaitForSingleObject( khEvent, dwMilliseconds );
+}
+
+/*!
+ * \brief	Wait until any one of the listed events is signaled (bWaitAll = FALSE).
+ * \param	nCount			number of handles in event_list
+ * \param	event_list		array of event handles
+ * \param	dwMilliseconds	time-out handed to WaitForMultipleObjects()
+ * \return	the raw WaitForMultipleObjects() result
+ */
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking(	uint32_t nCount,
+																WELS_EVENT *event_list,
+																uint32_t dwMilliseconds )
+{
+	const BOOL kbWaitAll = FALSE;
+
+	return WaitForMultipleObjects( nCount, event_list, kbWaitAll, dwMilliseconds );
+}
+
+/*!
+ * \brief	Wait, without time-out, until every listed event is signaled (bWaitAll = TRUE).
+ * \return	the raw WaitForMultipleObjects() result
+ */
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking( uint32_t nCount, WELS_EVENT *event_list )
+{
+	const BOOL     kbWaitAll   = TRUE;
+	const uint32_t kuiNoTimeout = (uint32_t)-1;	// same value as INFINITE
+
+	return WaitForMultipleObjects( nCount, event_list, kbWaitAll, kuiNoTimeout );
+}
+
+/*!
+ * \brief	Close an event created with WelsEventInit() and clear the caller's handle.
+ * \param	event	pointer to the handle; *event is set to NULL in every case
+ * \return	WELS_THREAD_ERROR_OK on success, WELS_THREAD_ERROR_GENERIAL when
+ *			event is NULL or CloseHandle() reports a failure
+ */
+WELS_THREAD_ERROR_CODE    WelsEventDestroy( WELS_EVENT * event )
+{
+	BOOL bClosed = FALSE;
+
+	if ( event == NULL )
+		return WELS_THREAD_ERROR_GENERIAL;
+
+	// report a failed close instead of silently claiming success
+	bClosed = CloseHandle( *event );
+
+	*event = NULL;
+	return bClosed ? WELS_THREAD_ERROR_OK : WELS_THREAD_ERROR_GENERIAL;
+}
+
+
+/*!
+ * \brief	Start a new thread executing routine(arg) (Win32 implementation).
+ * \param	thread	receives the thread handle; left untouched on failure
+ * \param	routine	thread entry point
+ * \param	arg		argument handed to routine
+ * \param	attr	not used by this implementation
+ * \return	WELS_THREAD_ERROR_OK, or WELS_THREAD_ERROR_GENERIAL if CreateThread() fails
+ */
+WELS_THREAD_ERROR_CODE    WelsThreadCreate( WELS_THREAD_HANDLE * thread,  LPWELS_THREAD_ROUTINE  routine, 
+										   void * arg, WELS_THREAD_ATTR attr)
+{
+	// default security attributes and stack size, started immediately, thread id not requested
+	const WELS_THREAD_HANDLE khThread = CreateThread( NULL, 0, routine, arg, 0, NULL );
+
+	if ( NULL != khThread )
+	{
+		*thread = khThread;
+		return WELS_THREAD_ERROR_OK;
+	}
+	return WELS_THREAD_ERROR_GENERIAL;
+}
+
+/*!
+ * \brief	Win32 placeholder for the pthread cancel-state setup; does nothing.
+ * \return	always WELS_THREAD_ERROR_OK
+ */
+WELS_THREAD_ERROR_CODE	  WelsSetThreadCancelable()
+{
+	const WELS_THREAD_ERROR_CODE kiNothingToDo = WELS_THREAD_ERROR_OK;
+
+	return kiNothingToDo;
+}
+
+/*!
+ * \brief	Block until the given thread has terminated.
+ * \param	thread	handle returned by WelsThreadCreate()
+ * \return	WELS_THREAD_ERROR_OK once the thread handle is signaled,
+ *			WELS_THREAD_ERROR_GENERIAL if the wait itself fails (e.g. invalid handle)
+ */
+WELS_THREAD_ERROR_CODE    WelsThreadJoin( WELS_THREAD_HANDLE  thread )
+{
+	// the wait result used to be dropped, so a failed join looked like a successful one
+	if ( WaitForSingleObject( thread, INFINITE ) != WAIT_OBJECT_0 )
+		return WELS_THREAD_ERROR_GENERIAL;
+
+	return WELS_THREAD_ERROR_OK;
+}
+
+/*!
+ * \brief	Win32 placeholder for thread cancellation; the thread is not touched.
+ * \return	always WELS_THREAD_ERROR_OK
+ */
+WELS_THREAD_ERROR_CODE    WelsThreadCancel( WELS_THREAD_HANDLE  thread )
+{
+	const WELS_THREAD_ERROR_CODE kiNothingToDo = WELS_THREAD_ERROR_OK;
+
+	return kiNothingToDo;
+}
+
+
+/*!
+ * \brief	Close a thread handle and clear the caller's copy.
+ * \param	thread	pointer to the handle; a NULL pointer is accepted and ignored
+ * \return	always WELS_THREAD_ERROR_OK
+ */
+WELS_THREAD_ERROR_CODE    WelsThreadDestroy( WELS_THREAD_HANDLE *thread )
+{
+	if ( NULL == thread )
+		return WELS_THREAD_ERROR_OK;
+
+	CloseHandle( *thread );
+	*thread = NULL;
+	return WELS_THREAD_ERROR_OK;
+}
+
+/*!
+ * \brief	Handle of the calling thread, as returned by GetCurrentThread().
+ */
+WELS_THREAD_HANDLE        WelsThreadSelf()
+{
+	const WELS_THREAD_HANDLE khSelf = GetCurrentThread();
+
+	return khSelf;
+}
+
+/*!
+ * \brief	Report the processor count taken from GetSystemInfo().
+ * \param	pInfo	receives the count in ProcessorCount
+ * \return	always WELS_THREAD_ERROR_OK
+ */
+WELS_THREAD_ERROR_CODE    WelsQueryLogicalProcessInfo(WelsLogicalProcessInfo * pInfo)
+{
+	SYSTEM_INFO sSystemInfo;
+
+	GetSystemInfo( &sSystemInfo );
+	pInfo->ProcessorCount = sSystemInfo.dwNumberOfProcessors;
+
+	return WELS_THREAD_ERROR_OK;
+}
+
+#elif   defined(__GNUC__)
+
+#ifdef MACOS
+#include <CoreServices/CoreServices.h>
+//#include <Gestalt.h>
+#endif//MACOS
+
+/*!
+ * \brief	Run a shell command and capture the beginning of its standard output.
+ *
+ * The command is executed through popen(), i.e. by the shell; pass trusted,
+ * fixed command strings only.
+ * This replaces a hand-rolled fork()/pipe() sequence that could return into the
+ * caller from the child process, mishandled read() errors and never reaped the child.
+ *
+ * \param	pCmd	command line to execute
+ * \param	pRes	output buffer; always NUL-terminated when 0 is returned
+ * \param	iSize	size of pRes in bytes; at most iSize - 1 bytes of output are stored
+ * \return	0 on success, -1 on invalid arguments or when the command could not be
+ *			started or reaped
+ */
+static int32_t  SystemCall(const str_t * pCmd, str_t * pRes, int32_t iSize)
+{
+    FILE * pPipe = NULL;
+    size_t uiMaxLen = 0;
+    size_t uiTotal = 0;
+
+    if( pCmd == NULL || pRes == NULL || iSize <= 0 ){
+        return -1;
+    }
+    memset(pRes, 0, iSize);
+    uiMaxLen = (size_t)(iSize - 1);	// keep the last byte as terminator
+
+    pPipe = popen(pCmd, "r");
+    if( pPipe == NULL ){
+        return -1;
+    }
+
+    while( uiTotal < uiMaxLen ){
+        const size_t kuiGot = fread(pRes + uiTotal, 1, uiMaxLen - uiTotal, pPipe);
+        if( kuiGot == 0 ) break;	// end of output or read error
+        uiTotal += kuiGot;
+    }
+
+    // pclose() waits for the child, so no zombie process is left behind
+    if( pclose(pPipe) == -1 ){
+        return -1;
+    }
+    return 0;
+}
+
+/*!
+ * \brief	Suspend the calling thread for the given number of milliseconds.
+ *
+ * nanosleep() is used instead of usleep(): the former `dwMilliseconds * 1000`
+ * wrapped around in 32 bits for long intervals, and usleep() is allowed to
+ * reject arguments of one second or more.
+ *
+ * \param	dwMilliseconds	interval in milliseconds
+ */
+void WelsSleep( uint32_t dwMilliseconds )
+{
+	struct timespec sRequest;
+	struct timespec sRemain;
+
+	sRequest.tv_sec  = dwMilliseconds / 1000;
+	sRequest.tv_nsec = (long)( dwMilliseconds % 1000 ) * 1000000L;
+
+	// resume with the remaining time when a signal interrupts the sleep
+	while ( nanosleep( &sRequest, &sRemain ) == -1 && errno == EINTR )
+		sRequest = sRemain;
+}
+
+/*!
+ * \brief	Start a new thread executing routine(arg) (pthread implementation).
+ *
+ * The attribute object requests system contention scope and the SCHED_FIFO policy.
+ * It is now destroyed on every path after a successful pthread_attr_init();
+ * the early error returns used to leak it.
+ *
+ * \param	thread	receives the new thread id
+ * \param	routine	thread entry point
+ * \param	arg		argument handed to routine
+ * \param	attr	not used by this implementation
+ * \return	0 on success, otherwise the error number of the failing pthread call
+ */
+WELS_THREAD_ERROR_CODE    WelsThreadCreate( WELS_THREAD_HANDLE * thread,  LPWELS_THREAD_ROUTINE  routine, 
+										   void * arg, WELS_THREAD_ATTR attr)
+{
+	pthread_attr_t at;
+	WELS_THREAD_ERROR_CODE err = pthread_attr_init(&at);
+
+	if ( err )
+		return err;	// nothing was initialized, nothing to destroy
+
+	err = pthread_attr_setscope(&at, PTHREAD_SCOPE_SYSTEM);
+	if ( 0 == err )
+		err = pthread_attr_setschedpolicy(&at, SCHED_FIFO);
+	if ( 0 == err )
+		err = pthread_create( thread, &at, routine, arg );
+
+	pthread_attr_destroy(&at);
+
+	return err;
+}
+
+/*!
+ * \brief	Allow the calling thread to be cancelled, using deferred cancellation.
+ * \return	0 on success, otherwise the error of the first pthread call that failed
+ */
+WELS_THREAD_ERROR_CODE	  WelsSetThreadCancelable()
+{
+	const WELS_THREAD_ERROR_CODE kiStateRet = pthread_setcancelstate( PTHREAD_CANCEL_ENABLE, NULL );
+
+	if ( 0 != kiStateRet )
+		return kiStateRet;
+
+	return pthread_setcanceltype( PTHREAD_CANCEL_DEFERRED, NULL );
+}
+
+/*!
+ * \brief	Block until the given thread has terminated; its exit value is discarded.
+ * \return	the pthread_join() result
+ */
+WELS_THREAD_ERROR_CODE    WelsThreadJoin( WELS_THREAD_HANDLE  thread )
+{
+	const WELS_THREAD_ERROR_CODE kiRet = pthread_join( thread, NULL );
+
+	return kiRet;
+}
+
+/*!
+ * \brief	Send a cancellation request to the given thread.
+ * \return	the pthread_cancel() result
+ */
+WELS_THREAD_ERROR_CODE    WelsThreadCancel( WELS_THREAD_HANDLE  thread )
+{
+	const WELS_THREAD_ERROR_CODE kiRet = pthread_cancel( thread );
+
+	return kiRet;
+}
+
+/*!
+ * \brief	Counterpart of the Win32 handle release; does nothing for pthreads.
+ * \return	always WELS_THREAD_ERROR_OK
+ */
+WELS_THREAD_ERROR_CODE    WelsThreadDestroy( WELS_THREAD_HANDLE *thread )
+{
+	const WELS_THREAD_ERROR_CODE kiNothingToDo = WELS_THREAD_ERROR_OK;
+
+	return kiNothingToDo;
+}
+
+/*!
+ * \brief	Id of the calling thread, as returned by pthread_self().
+ */
+WELS_THREAD_HANDLE        WelsThreadSelf()
+{
+	const WELS_THREAD_HANDLE kSelf = pthread_self();
+
+	return kSelf;
+}
+
+/*!
+ * \brief	Initialize a mutex with default attributes.
+ * \return	the pthread_mutex_init() result
+ */
+WELS_THREAD_ERROR_CODE    WelsMutexInit( WELS_MUTEX   * mutex )
+{
+	const WELS_THREAD_ERROR_CODE kiRet = pthread_mutex_init( mutex, NULL );
+
+	return kiRet;
+}
+
+/*!
+ * \brief	Acquire the mutex, blocking until it is available.
+ * \return	the pthread_mutex_lock() result
+ */
+WELS_THREAD_ERROR_CODE    WelsMutexLock( WELS_MUTEX   * mutex )
+{
+	const WELS_THREAD_ERROR_CODE kiRet = pthread_mutex_lock( mutex );
+
+	return kiRet;
+}
+
+/*!
+ * \brief	Release a mutex previously acquired with WelsMutexLock().
+ * \return	the pthread_mutex_unlock() result
+ */
+WELS_THREAD_ERROR_CODE    WelsMutexUnlock( WELS_MUTEX * mutex )
+{
+	const WELS_THREAD_ERROR_CODE kiRet = pthread_mutex_unlock( mutex );
+
+	return kiRet;
+}
+
+/*!
+ * \brief	Destroy a mutex created with WelsMutexInit().
+ * \return	the pthread_mutex_destroy() result
+ */
+WELS_THREAD_ERROR_CODE    WelsMutexDestroy( WELS_MUTEX * mutex )
+{
+	const WELS_THREAD_ERROR_CODE kiRet = pthread_mutex_destroy( mutex );
+
+	return kiRet;
+}
+
+// NOTE: unnamed semaphores may not work well under POSIX threading models for non-root users
+
+/*!
+ * \brief	Initialize an unnamed, process-private semaphore with a count of zero.
+ * \return	the sem_init() result
+ */
+WELS_THREAD_ERROR_CODE    WelsEventInit( WELS_EVENT *event )
+{
+	const int32_t  kiProcessShared = 0;	// not shared between processes
+	const uint32_t kuiInitialCount = 0;	// starts non-signaled
+
+	return sem_init( event, kiProcessShared, kuiInitialCount );
+}
+
+/*!
+ * \brief	Destroy an event created with WelsEventInit() (pairs with sem_init).
+ * \return	the sem_destroy() result
+ */
+WELS_THREAD_ERROR_CODE   WelsEventDestroy( WELS_EVENT * event )
+{
+	const WELS_THREAD_ERROR_CODE kiRet = sem_destroy( event );
+
+	return kiRet;
+}
+
+/*!
+ * \brief	Open (creating it if necessary) a named semaphore with mode 0600 and count 0.
+ * \param	p_event		receives the semaphore pointer, or NULL when opening fails
+ * \param	event_name	semaphore name handed to sem_open()
+ * \return	WELS_THREAD_ERROR_OK on success; WELS_THREAD_ERROR_GENERIAL for NULL
+ *			arguments or when sem_open() fails (the name is then unlinked)
+ */
+WELS_THREAD_ERROR_CODE    WelsEventOpen( WELS_EVENT **p_event, str_t *event_name )
+{
+	sem_t * pOpened = NULL;
+
+	if ( NULL == p_event || NULL == event_name )
+		return WELS_THREAD_ERROR_GENERIAL;
+
+	pOpened = sem_open( event_name, O_CREAT, (S_IRUSR | S_IWUSR)/*0600*/, 0 );
+	if ( pOpened != (sem_t *)SEM_FAILED )
+	{
+		*p_event = pOpened;
+		return WELS_THREAD_ERROR_OK;
+	}
+
+	sem_unlink( event_name );
+	*p_event = NULL;
+	return WELS_THREAD_ERROR_GENERIAL;
+}
+/*!
+ * \brief	Close a named semaphore (pairs with sem_open) and optionally unlink its name.
+ * \param	event		semaphore obtained from WelsEventOpen()
+ * \param	event_name	name to unlink; skipped when NULL
+ * \return	the sem_close() result (the sem_unlink() result is not reported)
+ */
+WELS_THREAD_ERROR_CODE    WelsEventClose( WELS_EVENT *event, str_t *event_name )
+{
+	const WELS_THREAD_ERROR_CODE kiCloseRet = sem_close( event );
+
+	if ( NULL != event_name )
+	{
+		sem_unlink( event_name );
+	}
+	return kiCloseRet;
+}
+
+/*!
+ * \brief	Signal the event by incrementing the semaphore.
+ * \return	the sem_post() result
+ */
+WELS_THREAD_ERROR_CODE   WelsEventSignal( WELS_EVENT * event )
+{
+	const WELS_THREAD_ERROR_CODE kiPostRet = sem_post( event );
+
+	return kiPostRet;
+}
+/*!
+ * \brief	Put the event back into the non-signaled state.
+ *
+ * POSIX semaphores have no reset call, so pending signals are consumed with
+ * sem_trywait() until the count reaches zero. The previous sem_close() +
+ * sem_init() sequence mixed the named and unnamed semaphore APIs on one object.
+ *
+ * \param	event	semaphore to drain
+ * \return	WELS_THREAD_ERROR_OK once the count is zero,
+ *			WELS_THREAD_ERROR_GENERIAL on any error other than EAGAIN/EINTR
+ */
+WELS_THREAD_ERROR_CODE    WelsEventReset( WELS_EVENT * event )
+{
+	for ( ;; )
+	{
+		if ( sem_trywait( event ) == 0 )
+			continue;	// consumed one pending signal, look for more
+		if ( errno == EAGAIN )
+			return WELS_THREAD_ERROR_OK;	// count is zero: event is reset
+		if ( errno != EINTR )
+			return WELS_THREAD_ERROR_GENERIAL;
+	}
+}
+
+/*!
+ * \brief	Block until the event is signaled.
+ * \return	the sem_wait() result
+ */
+WELS_THREAD_ERROR_CODE   WelsEventWait( WELS_EVENT * event )
+{
+	const WELS_THREAD_ERROR_CODE kiWaitRet = sem_wait( event );
+
+	return kiWaitRet;
+}
+
+/*!
+ * \brief	Wait for the event, giving up after dwMilliseconds.
+ *
+ * Two defects of the previous version are fixed here:
+ *  - the infinite/finite test was inverted, so finite time-outs blocked forever
+ *    and an infinite time-out went through the timed path;
+ *  - the absolute deadline could carry tv_nsec >= 1e9, which sem_timedwait()
+ *    rejects with EINVAL.
+ *
+ * \param	event			semaphore to wait on
+ * \param	dwMilliseconds	time-out in milliseconds; (uint32_t)-1 waits without limit
+ * \return	0 when the event was obtained, otherwise the -1 result of the
+ *			underlying sem_* call (errno tells time-out from failure)
+ */
+WELS_THREAD_ERROR_CODE    WelsEventWaitWithTimeOut( WELS_EVENT * event, uint32_t dwMilliseconds )
+{
+	if ( dwMilliseconds == (uint32_t)-1 )
+	{
+		return sem_wait(event);	// no time-out requested
+	}
+	else
+	{
+#if defined(MACOS)
+		// no sem_timedwait() here: try once, sleep for the whole interval, try once more
+		int32_t err = sem_trywait(event);
+		if ( WELS_THREAD_ERROR_OK != err )
+		{
+			WelsSleep( dwMilliseconds );
+			err = sem_trywait(event);
+		}
+		return err;
+#else
+		struct timespec ts;
+		struct timeval tv;
+
+		gettimeofday(&tv,0);
+
+		ts.tv_sec = tv.tv_sec + dwMilliseconds /1000;
+		ts.tv_nsec = tv.tv_usec*1000 + (dwMilliseconds % 1000) * 1000000;
+		if ( ts.tv_nsec >= 1000000000L )	// at most one carry is possible
+		{
+			ts.tv_sec  += 1;
+			ts.tv_nsec -= 1000000000L;
+		}
+
+		return sem_timedwait(event, &ts);
+#endif//MACOS
+	}
+}
+
+/*!
+ * \brief	Poll a list of events until one of them can be taken (bWaitAll = FALSE).
+ *
+ * Each round visits the events in list order; every event is tried twice with a
+ * short sleep in between, then the next event is tried. After a full round
+ * without success the thread yields briefly and starts over.
+ *
+ * NOTE(review): dwMilliseconds is not read, so the call never times out --
+ * confirm whether callers rely on that.
+ *
+ * \param	nCount			number of entries in event_list
+ * \param	event_list		array of semaphore pointers
+ * \param	dwMilliseconds	unused by this implementation
+ * \return	WELS_THREAD_ERROR_WAIT_OBJECT_0 + index of the event that was taken,
+ *			or WELS_THREAD_ERROR_WAIT_FAILED when nCount is 0
+ */
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking(	uint32_t nCount,
+																WELS_EVENT **event_list,
+																uint32_t dwMilliseconds )
+{
+	const uint32_t kuiAccessTime = 2;	// microseconds between the two tries on one event
+	uint32_t uiEvent = 0;
+
+	if ( 0 == nCount )
+		return WELS_THREAD_ERROR_WAIT_FAILED;
+
+	for ( ;; )
+	{
+		for ( uiEvent = 0; uiEvent < nCount; ++ uiEvent )
+		{
+			// first try
+			if ( WELS_THREAD_ERROR_OK == sem_trywait( event_list[uiEvent] ) )
+				return WELS_THREAD_ERROR_WAIT_OBJECT_0 + uiEvent;
+
+			usleep( kuiAccessTime );
+
+			// second and last try before moving on to the next event
+			if ( WELS_THREAD_ERROR_OK == sem_trywait( event_list[uiEvent] ) )
+				return WELS_THREAD_ERROR_WAIT_OBJECT_0 + uiEvent;
+		}
+		usleep( 1 );	// switch to working threads
+	}
+}
+
+/*!
+ * \brief	Block until every listed event has been signaled once (bWaitAll = TRUE).
+ *
+ * The events are taken in list order with a blocking sem_wait() each, so no
+ * bookkeeping of already-seen events is needed. Compared with the previous
+ * bitmask version this removes the undefined `1 << 31` shift, lifts the limit
+ * of 32 events, retries waits interrupted by a signal, and returns an error
+ * instead of spinning forever when sem_wait() keeps failing.
+ *
+ * \param	nCount		number of entries in event_list
+ * \param	event_list	array of semaphore pointers
+ * \return	WELS_THREAD_ERROR_OK when all events were taken,
+ *			WELS_THREAD_ERROR_WAIT_FAILED for an empty/NULL list or a failing wait
+ */
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking( uint32_t nCount, WELS_EVENT **event_list )
+{
+	uint32_t nIdx = 0;
+
+	if ( nCount == 0 || event_list == NULL )
+		return WELS_THREAD_ERROR_WAIT_FAILED;
+
+	for ( nIdx = 0; nIdx < nCount; ++ nIdx )
+	{
+		int32_t err = 0;
+
+		do {
+			err = sem_wait( event_list[nIdx] );
+		} while ( err != 0 && errno == EINTR );	// interrupted: wait again
+
+		if ( err != 0 )
+			return WELS_THREAD_ERROR_WAIT_FAILED;
+	}
+
+	return WELS_THREAD_ERROR_OK;
+}
+
+/*!
+ * \brief	Report the number of processors.
+ *
+ * LINUX builds count the "processor" lines of /proc/cpuinfo through a shell
+ * command; all other builds of this branch ask Gestalt(). Whenever the query
+ * fails or yields a value below 1, a count of 1 is reported.
+ *
+ * \param	pInfo	receives the count in ProcessorCount
+ * \return	WELS_THREAD_ERROR_OK, or WELS_THREAD_ERROR_GENERIAL when pInfo is NULL
+ */
+WELS_THREAD_ERROR_CODE    WelsQueryLogicalProcessInfo(WelsLogicalProcessInfo * pInfo)
+{
+	if ( pInfo == NULL )
+		return WELS_THREAD_ERROR_GENERIAL;
+
+#ifdef LINUX
+
+#define   CMD_RES_SIZE    2048
+    str_t pBuf[CMD_RES_SIZE];
+
+    pInfo->ProcessorCount = 0;
+    if( SystemCall("cat /proc/cpuinfo | grep \"processor\" | wc -l", pBuf, CMD_RES_SIZE) == 0 ){
+        pInfo->ProcessorCount = atoi(pBuf);
+    }
+
+    // covers a failed command, unparsable output and negative values
+    if( pInfo->ProcessorCount < 1 ){
+        pInfo->ProcessorCount = 1;
+    }
+
+	return WELS_THREAD_ERROR_OK;
+#undef   CMD_RES_SIZE
+
+#else
+
+	SInt32 cpunumber = 0;
+
+	// Gestalt() returns 0 (noErr) on success; do not use the output otherwise
+	if ( Gestalt(gestaltCountOfCPUs,&cpunumber) != 0 || cpunumber < 1 )
+		cpunumber = 1;
+
+	pInfo->ProcessorCount	= cpunumber;
+
+	return WELS_THREAD_ERROR_OK;
+
+#endif//LINUX
+}
+
+#endif
+
+
+
--- /dev/null
+++ b/codec/api/svc/codec_api.h
@@ -1,0 +1,126 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef WELS_VIDEO_CODEC_SVC_API_H__
+#define WELS_VIDEO_CODEC_SVC_API_H__
+
+#include "codec_app_def.h"
+#include "codec_def.h"
+
+/*
+ * Abstract encoder interface: every member is pure virtual.
+ * Instances are obtained with CreateSVCEncoder() and released with
+ * DestroySVCEncoder(), both declared at the end of this header.
+ * NOTE(review): "Unintialize" is the spelling used by the interface itself.
+ */
+class ISVCEncoder
+{
+public:
+	/*
+	 * Two initialization flavours that differ in parameter type and default init type.
+	 * return: CM_RETURN: 0 - success; otherwise - failed;
+	 */
+	virtual int Initialize(SVCEncodingParam* pParam, const INIT_TYPE kiInitType = INIT_TYPE_PARAMETER_BASED) = 0;
+	virtual int Initialize(void* pParam, const INIT_TYPE kiInitType = INIT_TYPE_CONFIG_BASED) = 0;	
+	  
+	virtual int Unintialize() = 0;
+	
+	/*
+	 * Encode from a raw byte buffer or from a list of source pictures.
+	 * return: EVideoFrameType [IDR: videoFrameTypeIDR; P: videoFrameTypeP; ERROR: videoFrameTypeInvalid]
+	 */
+	virtual int EncodeFrame(const unsigned char* kpSrc, SFrameBSInfo* pBsInfo) = 0;	
+	virtual int EncodeFrame(const SSourcePicture  ** kppSrcPicList, int nSrcPicNum, SFrameBSInfo * pBsInfo) = 0;
+	
+	/*
+	 * return: 0 - success; otherwise - failed;
+	 */
+	virtual int PauseFrame(const unsigned char* kpSrc, SFrameBSInfo* pBsInfo) = 0;	
+	
+	/*
+	 * return: 0 - success; otherwise - failed;
+	 */
+	virtual int ForceIntraFrame(bool bIDR) = 0;		
+	
+	/************************************************************************
+	 * InDataFormat, IDRInterval, SVC Encode Param, Frame Rate, Bitrate,..
+	 ************************************************************************/
+	/*
+	 * Set or query one option; pOption is an untyped pointer whose layout
+	 * presumably depends on eOptionId -- confirm against the implementation.
+	 * return: CM_RETURN: 0 - success; otherwise - failed;
+	 */
+	virtual int SetOption(ENCODER_OPTION eOptionId, void* pOption) = 0;
+	virtual int GetOption(ENCODER_OPTION eOptionId, void* pOption) = 0;
+};
+
+/*
+ * Abstract decoder interface: every member is pure virtual.
+ * Instances are obtained with CreateDecoder() and released with
+ * DestroyDecoder(), both declared at the end of this header.
+ * NOTE(review): "Unintialize" is the spelling used by the interface itself.
+ */
+class ISVCDecoder
+{
+public:
+	virtual long Initialize(void* pParam, const INIT_TYPE iInitType) = 0;
+	virtual long Unintialize() = 0;
+
+	/*
+	 * Decode iSrcLen bytes starting at pSrc; output pointers, strides and picture
+	 * size are handed back through ppDst, pStride, iWidth and iHeight.
+	 */
+	virtual DECODING_STATE DecodeFrame(	const unsigned char* pSrc,
+		                                const int iSrcLen,	
+                                        unsigned char** ppDst,
+		                                int* pStride,
+		                                int& iWidth,
+		                                int& iHeight	) = 0;
+
+	/*
+	 *  src must be 4 byte aligned,   recommend 16 byte aligned.    the available src size must be multiple of 4.
+	 */
+	virtual DECODING_STATE DecodeFrame(	const unsigned char* pSrc,
+											const int iSrcLen,	
+											void ** ppDst,
+											SBufferInfo* pDstInfo) = 0;
+
+	/*
+	 *  src must be 4 byte aligned,   recommend 16 byte aligned.    the available src size must be multiple of 4.
+	 */
+	virtual DECODING_STATE DecodeFrameEx( const unsigned char * pSrc,
+		                                  const int iSrcLen,
+		                                  unsigned char * pDst,
+										  int iDstStride,
+		                                  int & iDstLen,
+		                                  int & iWidth,
+		                                  int & iHeight,
+		                                  int & iColorFormat) = 0;
+
+	/*************************************************************************
+	 * OutDataFormat
+	 *************************************************************************/
+	/*
+	 * Set or query one option; pOption is an untyped pointer whose layout
+	 * presumably depends on eOptionId -- confirm against the implementation.
+	 */
+	virtual long SetOption(DECODER_OPTION eOptionId, void* pOption) = 0;
+	virtual long GetOption(DECODER_OPTION eOptionId, void* pOption) = 0;
+};
+
+
+/*
+ * C-linkage factory functions for the two interfaces declared in this header.
+ * Each Create* call hands the object back through its pointer-to-pointer
+ * argument and has a matching Destroy* call.
+ * NOTE(review): the meaning of the int/long return values is not visible here --
+ * presumably 0 means success; confirm against the implementation.
+ */
+extern "C" 
+{
+int  CreateSVCEncoder(ISVCEncoder** ppEncoder);
+void DestroySVCEncoder(ISVCEncoder* pEncoder);
+
+long CreateDecoder(ISVCDecoder** ppDecoder);
+void DestroyDecoder(ISVCDecoder* pDecoder);
+}
+
+#endif//WELS_VIDEO_CODEC_SVC_API_H__
--- /dev/null
+++ b/codec/api/svc/codec_app_def.h
@@ -1,0 +1,292 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef WELS_VIDEO_CODEC_APPLICATION_DEFINITION_H__
+#define WELS_VIDEO_CODEC_APPLICATION_DEFINITION_H__
+
+////////////////Data and/or structures introduced in Cisco OpenH264 application////////////////
+
+/* Constants */
+#define MAX_TEMPORAL_LAYER_NUM		5
+#define MAX_SPATIAL_LAYER_NUM		4
+#define MAX_QUALITY_LAYER_NUM		4
+
+#define MAX_LAYER_NUM_OF_FRAME		128
+#define MAX_NAL_UNITS_IN_LAYER		128	// predetermined here, adjust it later if need
+
+#define MAX_RTP_PAYLOAD_LEN		1000
+#define AVERAGE_RTP_PAYLOAD_LEN		800
+
+
+#define SAVED_NALUNIT_NUM_TMP		( (MAX_SPATIAL_LAYER_NUM*MAX_QUALITY_LAYER_NUM) + 1 + MAX_SPATIAL_LAYER_NUM ) //SPS/PPS + SEI/SSEI + PADDING_NAL
+#define MAX_SLICES_NUM_TMP			( ( MAX_NAL_UNITS_IN_LAYER - SAVED_NALUNIT_NUM_TMP ) / 3 )
+
+typedef enum
+{
+	/* Decoder state codes. Errors derived from bitstream parsing: */
+	dsErrorFree			= 0x00,	/* Bitstream error-free */
+	dsFramePending		= 0x01,	/* Need more throughput (presumably more input data -- confirm) to generate an output frame */
+	dsRefLost			= 0x02,	/* Layer lost at a reference frame with temporal id 0 */
+	dsBitstreamError	= 0x04,	/* Bitstream error (maybe a broken internal frame) noticed by the decoder */
+	dsDepLayerLost		= 0x08,	/* A dependent layer has been lost */
+	dsNoParamSets		= 0x10, /* No parameter set NALs involved */
+	
+	/* Errors derived from logic level */
+	dsInvalidArgument	= 0x1000,	/* Invalid argument specified */
+	dsInitialOptExpected= 0x2000,	/* Initializing operation is expected */
+	dsOutOfMemory		= 0x4000,	/* Out of memory due to new request */
+		/* ANY OTHERS? */
+	dsDstBufNeedExpand	= 0x8000	/* Actual picture size exceeds the size of the dst buffer fed to the decoder, so the buffer needs to be expanded */
+	
+}DECODING_STATE;
+
+/* Option types introduced in SVC encoder application */
+typedef enum
+{
+	ENCODER_OPTION_DATAFORMAT = 0,
+	ENCODER_OPTION_IDR_INTERVAL,
+	ENCODER_OPTION_SVC_ENCODE_PARAM,
+	ENCODER_OPTION_FRAME_RATE,
+	ENCODER_OPTION_iBitRate,
+	ENCODER_OPTION_INTER_SPATIAL_PRED,
+	ENCODER_OPTION_RC_MODE,
+	ENCODER_PADDING_PADDING,
+
+	ENCODER_LTR_RECOVERY_REQUEST,
+	ENCODER_LTR_MARKING_FEEDBACK,
+	ENCOCER_LTR_MARKING_PERIOD,	// (sic) "ENCOCER": the misspelling is part of the enumerator name
+	ENCODER_OPTION_LTR,
+			
+	ENCODER_OPTION_ENABLE_SSEI,               // NOTE(review): original comment read "true--disable ssei; false--enable ssei", which contradicts the ENABLE name -- confirm polarity
+	ENCODER_OPTION_ENABLE_PREFIX_NAL_ADDING,   // enable prefix: true--enable prefix; false--disable prefix
+	ENCODER_OPTION_ENABLE_SPS_PPS_ID_ADDITION, // NOTE(review): original comment read "true--disable pSps/pPps id; false--enable pSps/pPps id addition", which contradicts the ENABLE name -- confirm polarity
+
+	ENCODER_OPTION_CURRENT_PATH
+} ENCODER_OPTION;
+
+/* Option types introduced in SVC decoder application */
+typedef enum
+{
+	DECODER_OPTION_DATAFORMAT = 0,	/* Set color space of decoding output frame */
+	DECODER_OPTION_TRUNCATED_MODE,	/* Used in decoding bitstream of non integrated frame, only truncated working mode is supported by tune, so skip it */
+	DECODER_OPTION_END_OF_STREAM,	/* Indicate bitstream of the final frame to be decoded */
+	DECODER_OPTION_VCL_NAL,        // feedback to the application layer: whether or not the current AU has a VCL NAL
+	DECODER_OPTION_TEMPORAL_ID,      // feedback to the application layer: temporal id
+	DECODER_OPTION_MODE,             // indicates the decoding mode
+	DECODER_OPTION_OUTPUT_PROPERTY,
+	DECODER_OPTION_FRAME_NUM,	// feedback: current decoded frame number
+	DECODER_OPTION_IDR_PIC_ID,	// feedback: which IDR period the current frame belongs to
+	DECODER_OPTION_LTR_MARKING_FLAG,	// feedback: whether the current frame marks an LTR
+	DECODER_OPTION_LTR_MARKED_FRAME_NUM,	// feedback: frame num marked by the current frame
+	DECODER_OPTION_DEVICE_INFO,
+
+} DECODER_OPTION;
+typedef enum //feedback that whether or not have VCL NAL in current AU
+{
+	FEEDBACK_NON_VCL_NAL = 0,
+	FEEDBACK_VCL_NAL,
+	FEEDBACK_UNKNOWN_NAL	
+} FEEDBACK_VCL_NAL_IN_AU;
+typedef enum //feedback the iTemporalId in current AU if have VCL NAL
+{
+	FEEDBACK_TEMPORAL_ID_0 = 0,
+	FEEDBACK_TEMPORAL_ID_1,
+	FEEDBACK_TEMPORAL_ID_2,
+	FEEDBACK_TEMPORAL_ID_3,
+	FEEDBACK_TEMPORAL_ID_4,
+	FEEDBACK_UNKNOWN_TEMPORAL_ID	
+} FEEDBACK_TEMPORAL_ID;
+
+/* Type of layer being encoded */
+typedef enum
+{
+	NON_VIDEO_CODING_LAYER = 0,
+	    VIDEO_CODING_LAYER = 1
+} LAYER_TYPE;
+
+/* SVC Encoder/Decoder Initializing Parameter Types */
+typedef enum
+{
+	INIT_TYPE_PARAMETER_BASED = 0,	// For SVC DEMO Application
+	INIT_TYPE_CONFIG_BASED,			// For SVC CONSOLE Application
+}INIT_TYPE;
+
+//enumerate the type of video bitstream which is provided to decoder
+typedef enum
+{
+	VIDEO_BITSTREAM_AVC               = 0,	
+	VIDEO_BITSTREAM_SVC               = 1,
+	VIDEO_BITSTREAM_DEFAULT           = VIDEO_BITSTREAM_SVC,
+}VIDEO_BITSTREAM_TYPE;
+
+typedef enum
+{
+	NO_RECOVERY_REQUSET  = 0,
+	LTR_RECOVERY_REQUEST = 1,
+	IDR_RECOVERY_REQUEST = 2,
+	NO_LTR_MARKING_FEEDBACK =3,
+	LTR_MARKING_SUCCESS = 4,
+	LTR_MARKING_FAILED = 5,
+}KEY_FRAME_REQUEST_TYPE;
+
+typedef struct
+{
+	unsigned int uiFeedbackType; //IDR request or LTR recovery request
+	unsigned int uiIDRPicId; // distinguish request from different IDR
+	int		  iLastCorrectFrameNum;
+	int		  iCurrentFrameNum; //specify current decoder frame_num.
+}SLTRRecoverRequest;
+
+typedef struct
+{
+	unsigned int  uiFeedbackType; //mark failed or successful
+	unsigned int  uiIDRPicId; // distinguish request from different IDR
+	int			  iLTRFrameNum; //specify current decoder frame_num
+}SLTRMarkingFeedback;
+#pragma pack(1)
+
+typedef struct 
+{
+	
+	//# 0 SM_SINGLE_SLICE			| SliceNum==1
+	//# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
+	//# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in SSliceArgument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
+	//# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	|  Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
+	//# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
+	unsigned int uiSliceMode; //by default, uiSliceMode will be 0
+	struct {
+		unsigned int		uiSliceMbNum[MAX_SLICES_NUM_TMP];  //here we use a tmp fixed value since MAX_SLICES_NUM is not defined here and its definition may be changed; 
+		unsigned int		uiSliceNum;
+		unsigned int		uiSliceSizeConstraint;
+	} sSliceArgument;//not all the elements in this argument will be used, how it will be used depends on uiSliceMode; see below	
+} SSliceConfig;
+
+typedef struct {
+	int	iVideoWidth;		// video size in cx specified for a layer
+	int	iVideoHeight;		// video size in cy specified for a layer
+	float	fFrameRate;		// frame rate specified for a layer
+	int	iQualityLayerNum;	// layer number at quality level
+	int	iSpatialBitrate;	// target bitrate for a spatial layer
+	int	iCgsSnrRefined;	// 0: SNR layers all MGS; 1: SNR layers all CGS
+	int	iInterSpatialLayerPredFlag;	// 0: disabled [independent spatial layer coding]; 1: enabled [base spatial layer dependency coding]
+
+	int	iQualityBitrate[MAX_QUALITY_LAYER_NUM];	// target bitrate for a quality layer
+	
+	SSliceConfig sSliceCfg;
+} SSpatialLayerConfig;
+
+/* SVC Encoding Parameters */
+typedef struct {
+	int		iPicWidth;			// width of picture in samples
+	int		iPicHeight;			// height of picture in samples
+	int		iTargetBitrate;		// target bitrate desired
+	int		iTemporalLayerNum;	// layer number at temporal level
+	int		iSpatialLayerNum;	// layer number at spatial level
+
+	float	fFrameRate;			// input maximal frame rate
+	
+	int		iInputCsp;			// color space of input sequence
+	int		iKeyPicCodingMode;// mode of key picture coding
+	int		iIntraPeriod;		// period of Intra frame
+	bool    bEnableSpsPpsIdAddition;
+	bool    bPrefixNalAddingCtrl;
+	bool   	bEnableDenoise;	    // denoise control
+	bool    bEnableBackgroundDetection; 	// background detection control //VAA_BACKGROUND_DETECTION //BGD cmd
+	bool    bEnableAdaptiveQuant; // adaptive quantization control
+	bool	bEnableCropPic;	// enable cropping source picture.  8/25/2010
+								// FALSE: Streaming Video Sharing; TRUE: Video Conferencing Meeting;
+	bool     bEnableLongTermReference; // NOTE(review): original comment read "0: on, 1: off", which looks inverted for a bool enable flag -- confirm polarity
+	int     iLtrMarkPeriod;
+
+	int iRCMode;                 // RC mode
+	int	iTemporalBitrate[MAX_TEMPORAL_LAYER_NUM];	// target bitrate specified for a temporal level
+	int iPaddingFlag;            // 0: disable padding; 1: padding
+
+	SSpatialLayerConfig sSpatialLayers[MAX_SPATIAL_LAYER_NUM];
+	
+} SVCEncodingParam, *PSVCEncodingParam;
+
+//Define a new struct to show the property of video bitstream.
+typedef struct {
+	unsigned int          size; //size of the struct
+	VIDEO_BITSTREAM_TYPE  eVideoBsType;
+} SVideoProperty;
+
+/* SVC Decoding Parameters, reserved here and potentially applicable in the future */
+typedef struct TagSVCDecodingParam{
+	char		*pFileNameRestructed;	// File name of the reconstructed frame, used for PSNR-calculation-based debugging
+	
+	int				iOutputColorFormat;	// color space format to be output, EVideoFormatType specified in codec_def.h
+	unsigned int	uiCpuLoad;		// CPU load
+	unsigned char	uiTargetDqLayer;	// Setting target dq layer id
+
+	unsigned char	uiEcActiveFlag;		// Whether the error concealment feature is active in the decoder
+
+	SVideoProperty   sVideoProperty;
+} SDecodingParam, *PDecodingParam;
+
+/* Bitstream information of a layer being encoded */
+typedef struct {
+	unsigned char uiTemporalId;
+	unsigned char uiSpatialId;
+	unsigned char uiQualityId;
+
+	unsigned char uiPriorityId; //ignore it currently
+
+	unsigned char uiLayerType;
+
+	int	iNalCount;					// Count number of NAL coded already
+	int	iNalLengthInByte[MAX_NAL_UNITS_IN_LAYER];	// Length of NAL size in byte from 0 to iNalCount-1
+	unsigned char*	pBsBuf;		// Buffer of bitstream contained
+} SLayerBSInfo, *PLayerBSInfo;
+
+
+typedef struct {
+	int		iTemporalId;	// Temporal ID
+	unsigned char	uiFrameType;
+
+	int		iLayerNum;
+	SLayerBSInfo	sLayerInfo[MAX_LAYER_NUM_OF_FRAME];
+
+} SFrameBSInfo, *PFrameBSInfo;
+
+typedef struct Source_Picture_s {	
+	int		    iColorFormat;	// color space type
+	int  		iStride[4];		// stride for each plane pData
+	unsigned char  *pData[4];		// plane pData
+	int  		iPicWidth;				// luma picture width in x coordinate
+	int 		iPicHeight;				// luma picture height in y coordinate
+} SSourcePicture;
+
+
+#pragma pack()
+#endif//WELS_VIDEO_CODEC_APPLICATION_DEFINITION_H__
--- /dev/null
+++ b/codec/api/svc/codec_def.h
@@ -1,0 +1,252 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef WELS_VIDEO_CODEC_DEFINITION_H__
+#define WELS_VIDEO_CODEC_DEFINITION_H__
+
+#if defined(WIN32)
+#pragma once
+#endif//WIN32
+
+typedef enum
+{
+	/*rgb color formats*/
+	videoFormatRGB        = 1,
+	videoFormatRGBA       = 2, 
+	videoFormatRGB555     = 3,
+	videoFormatRGB565     = 4,
+	videoFormatBGR        = 5,
+	videoFormatBGRA       = 6,
+	videoFormatABGR       = 7,
+	videoFormatARGB       = 8,
+
+	/*yuv color formats*/
+	videoFormatYUY2       = 20,
+	videoFormatYVYU       = 21,
+	videoFormatUYVY       = 22,
+	videoFormatI420       = 23,                        //same as IYUV
+	videoFormatYV12       = 24,
+	videoFormatInternal   = 25,                        // Only Used for SVC decoder testbed
+	
+	videoFormatNV12		  = 26,						// new format for output by DXVA decoding
+	
+	videoFormatVFlip      = 0x80000000
+}EVideoFormatType;
+
+typedef enum
+{
+	videoFrameTypeInvalid,		/* Encoder not ready or parameters are invalidate */
+	videoFrameTypeIDR,		/* This type is only available for H264 if this frame is key frame, then return this type */
+	videoFrameTypeI,		/* I frame type */
+	videoFrameTypeP,		/* P frame type */
+	videoFrameTypeSkip,		/* Skip the frame based encoder kernel */
+	videoFrameTypeIPMixed,		/* Frame type introduced I and P slices are mixing */
+}EVideoFrameType;
+
+typedef enum
+{
+	cmResultSuccess,
+	cmInitParaError,                  /*Parameters are invalid */
+	cmMachPerfIsBad,                  /*The performance of the machine is not enough to support
+									    the H264 codec; in this case, suggest that the user use h263
+										or set the fps lower, e.g. 5fps or less*/
+	cmUnkonwReason,                   /*(sic) the misspelling is part of the enumerator name*/
+	cmMallocMemeError,                /*Memory allocation error; (sic) the misspelling is part of the enumerator name*/
+	cmInitExpected,			  /*Initial action is expected*/
+}CM_RETURN;
+
+
+/* nal unit type */
+enum ENalUnitType
+{
+    NAL_UNKNOWN = 0,
+	NAL_SLICE   = 1,
+	NAL_SLICE_DPA   = 2,
+	NAL_SLICE_DPB   = 3,
+	NAL_SLICE_DPC   = 4,
+	NAL_SLICE_IDR   = 5,    /* ref_idc != 0 */
+	NAL_SEI         = 6,    /* ref_idc == 0 */
+	NAL_SPS         = 7,
+	NAL_PPS         = 8
+	/* ref_idc == 0 for 6,9,10,11,12 */
+};
+/* NRI: eNalRefIdc */
+enum ENalPriority
+{
+    NAL_PRIORITY_DISPOSABLE = 0,
+	NAL_PRIORITY_LOW        = 1,
+	NAL_PRIORITY_HIGH       = 2,
+	NAL_PRIORITY_HIGHEST    = 3,
+};
+
+#define IS_PARAMETER_SET_NAL(eNalRefIdc, eNalType) /* nonzero for an SPS or PPS NAL with highest priority; eNalType is evaluated twice */ \
+( ((eNalRefIdc) == NAL_PRIORITY_HIGHEST) && ((eNalType) == NAL_SPS || (eNalType) == NAL_PPS) )
+/* IS_IDR_NAL: nonzero for an IDR slice NAL (NAL_SLICE_IDR) with highest priority (NAL_PRIORITY_HIGHEST) */
+#define IS_IDR_NAL(eNalRefIdc, eNalType) \
+( ((eNalRefIdc) == NAL_PRIORITY_HIGHEST) && ((eNalType) == NAL_SLICE_IDR) )
+
+#define FRAME_NUM_PARAM_SET		(-1)
+#define FRAME_NUM_IDR			0
+
+#pragma pack(1)
+
+/* Error Tools definition */
+typedef unsigned short ERR_TOOL;
+enum{
+	ET_NONE = 0x00,					// NONE Error Tools
+	ET_IP_SCALE = 0x01,				// IP Scalable
+	ET_FMO = 0x02,					// Flexible Macroblock Ordering
+	ET_IR_R1 = 0x04,				// Intra Refresh in predefined 2% MB
+	ET_IR_R2 = 0x08,				// Intra Refresh in predefined 5% MB
+	ET_IR_R3 = 0x10,				// Intra Refresh in predefined 10% MB
+	ET_FEC_HALF = 0x20,				// Forward Error Correction in 50% redundancy mode
+	ET_FEC_FULL	= 0x40,				// Forward Error Correction in 100% redundancy mode
+	ET_RFS = 0x80,					// Reference Frame Selection
+};
+
+/* information of coded Slice(=NAL)(s) */
+typedef struct SliceInformation
+{
+	unsigned char*	pBufferOfSlices;		// base buffer of coded slice(s)
+	int				iCodedSliceCount;	// number of coded slices
+	unsigned int*	pLengthOfSlices;		// array of slices length accordingly by number of slice
+	int				iFecType;			// FEC type[0, 50%FEC, 100%FEC]
+	unsigned char	uiSliceIdx;		// index of slice in frame [FMO: 0,..,uiSliceCount-1; No FMO: 0] 
+	unsigned char	uiSliceCount;		// count number of slice in frame [FMO: 2-8; No FMO: 1]
+	char			iFrameIndex;		// index of frame[-1, .., idr_interval-1]
+	unsigned char	uiNalRefIdc;		// NRI, priority level of slice(NAL)
+	unsigned char	uiNalType;			// NAL type
+	unsigned char	uiContainingFinalNal;	// whether final NAL is involved in buffer of coded slices, flag used in Pause feature in T27
+} SliceInfo, *PSliceInfo;
+
+
+
+#define CIF_WIDTH		352
+#define CIF_HEIGHT		288
+#define QVGA_WIDTH		320
+#define QVGA_HEIGHT		240
+#define QCIF_WIDTH		176
+#define QCIF_HEIGHT		144
+#define SQCIF_WIDTH		128
+#define SQCIF_HEIGHT	96
+
+/* thresholds of the initial, maximal and minimal rate */
+typedef struct {
+	int	iWidth;			// frame width
+	int	iHeight;			// frame height
+	int	iThresholdOfInitRate;	// threshold of initial rate
+	int	iThresholdOfMaxRate;	// threshold of maximal rate
+	int	iThresholdOfMinRate;	// threshold of minimal rate
+	int iMinThresholdFrameRate;		//min frame rate min
+	int	iSkipFrameRate;	//skip to frame rate min
+	int iSkipFrameStep;	//how many frames to skip
+}SRateThresholds, *PRateThresholds;
+
+/*new interface*/
+typedef struct WelsDeviceInfo
+{
+	int  bSupport;          /* a logic flag provided by decoder which indicates whether GPU decoder can work based on the following device info. */
+	char Vendor[128];   // vendor name
+	char Device[128];    // device name
+	char Driver[128];     // driver version
+	char DriverDate[128]; //  driver release date 
+} Device_Info;
+
+typedef enum TagBufferProperty
+{
+	BUFFER_HOST	   = 0,   // host memory
+	BUFFER_DEVICE  = 1,	  // device memory including surface and shared handle
+						  // for DXVA: shared handle
+						  // for VDA : iosurface
+						
+	//SURFACE_DEVICE ,	 // surface
+	//SHARED_HANDLE      // shared handle
+}EBufferProperty;
+
+typedef enum TagDecodeMode
+{
+	AUTO_MODE = 0,   // decided by decoder itself, dynamic mode switch, delayed switch
+	SW_MODE = 1,		// decoded by CPU, instant switch
+	GPU_MODE = 2,	// decoded by GPU, instant switch 
+	SWITCH_MODE =3	// switch to the other mode, forced mode switch, delayed switch
+}EDecodeMode;
+
+typedef struct TagSysMemBuffer
+{	
+	int	iWidth;			//width of decoded pic for display
+	int iHeight;			//height of decoded pic for display
+	int iFormat; 		// type is "EVideoFormatType"
+	int iStride[2];		//stride of 2 component	
+}SSysMEMBuffer;
+
+typedef struct TagVideoMemBuffer
+{
+	int iSurfaceWidth;   // used for surface create
+	int iSurfaceHeight;
+	int D3Dformat;  //type is "D3DFORMAT"
+  	int D3DPool; // type is "D3DPOOL";
+	int iLeftTopX;
+	int iLeftTopY;
+	int iRightBottomX;
+	int iRightBottomY;
+}SVideoMemBuffer;
+
+typedef struct TagBufferInfo
+{
+	EBufferProperty eBufferProperty;	//0: host memory; 1: device memory;
+	int iBufferStatus;  // 0: one frame data is not ready; 1: one frame data is ready
+	EDecodeMode eWorkMode;				//indicate what the real working mode in decoder
+	union {
+		SSysMEMBuffer sSystemBuffer;
+		SVideoMemBuffer sVideoBuffer;
+	}UsrData;	
+}SBufferInfo;
+
+/* Constants related to transmission rate at various resolutions */
+static const SRateThresholds ksRateThrMap[4] = {
+	// initial-maximal-minimal
+	{CIF_WIDTH, CIF_HEIGHT, 225000, 384000, 96000, 3, 1, 1},		// CIF
+	{QVGA_WIDTH, QVGA_HEIGHT, 192000, 320000, 80000, -1, -1, -1},	// QVGA
+	{QCIF_WIDTH, QCIF_HEIGHT, 150000, 256000, 64000, 8, 4, 2},		// QCIF
+	{SQCIF_WIDTH, SQCIF_HEIGHT, 120000, 192000, 48000, 5, 3, 1}	// SQCIF
+};
+
+
+// In a GOP, multiple of the key frame number, derived from 
+// the number of layers(index or array below)
+static const char kiKeyNumMultiple[] = {
+	1, 1, 2, 4, 8, 16,
+};
+
+#pragma pack()
+
+#endif//WELS_VIDEO_CODEC_DEFINITION_H__
--- /dev/null
+++ b/codec/build/linux/dec/makefile
@@ -1,0 +1,246 @@
+NAME= welsdec
+
+### include debug information: 1=yes, 0=no
+DBG= 0
+NASM = 1
+DEPEND= dependencies
+
+BINDIR= 	../bin
+OUTDIR= 	../../../../bin/linux
+INCLUDE= 	-I../../../api/svc -I../../../decoder/core/inc -I../../../decoder/plus/inc -I../../../console/dec/inc
+CORESRCDIR=	../../../decoder/core/src
+PLUSSRCDIR=	../../../decoder/plus/src
+ASMSRCDIR=	../../../decoder/core/asm
+MAINSRCDIR=	../../../console/dec/src
+
+OBJMAINDIR= ../obj
+OBJDIR= ../obj/dec
+
+CC= $(shell which gcc)
+AS= $(shell which nasm)
+CXX = g++ -m32
+GCC = gcc -m32
+
+ASFLAGS= -f elf -DNOPREFIX -I ../../../decoder/core/asm/
+
+LIBS= -lstdc++ -ldl
+#-lm 
+CFLAGS=  $(INCLUDE) -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DHAVE_CACHE_LINE_ALIGN
+
+ifeq ($(DBG),1)
+#SUFFIX= .dbg
+CFLAGS+= -g
+else
+#SUFFIX=
+CFLAGS+= -O3
+endif
+
+ifeq ($(NASM), 1)
+CFLAGS += -DX86_ASM
+endif
+
+OBJSUF= .o$(SUFFIX)
+
+DECODESRC=$(CORESRCDIR)/au_parser.cpp \
+$(CORESRCDIR)/bit_stream.cpp \
+$(CORESRCDIR)/cpu.cpp \
+$(CORESRCDIR)/deblocking.cpp \
+$(CORESRCDIR)/decode_mb_aux.cpp \
+$(CORESRCDIR)/decoder.cpp \
+$(CORESRCDIR)/decoder_data_tables.cpp \
+$(CORESRCDIR)/expand_pic.cpp \
+$(CORESRCDIR)/fmo.cpp \
+$(CORESRCDIR)/get_intra_predictor.cpp \
+$(CORESRCDIR)/manage_dec_ref.cpp \
+$(CORESRCDIR)/mc.cpp \
+$(CORESRCDIR)/mem_align.cpp \
+$(CORESRCDIR)/memmgr_nal_unit.cpp \
+$(CORESRCDIR)/mv_pred.cpp \
+$(CORESRCDIR)/parse_mb_syn_cavlc.cpp \
+$(CORESRCDIR)/pic_queue.cpp \
+$(CORESRCDIR)/rec_mb.cpp \
+$(CORESRCDIR)/decode_slice.cpp \
+$(CORESRCDIR)/decoder_core.cpp \
+$(CORESRCDIR)/utils.cpp \
+$(PLUSSRCDIR)/welsDecoderExt.cpp \
+$(PLUSSRCDIR)/welsCodecTrace.cpp
+
+ASMSRC= $(ASMSRCDIR)/block_add.asm \
+$(ASMSRCDIR)/cpuid.asm \
+$(ASMSRCDIR)/deblock.asm \
+$(ASMSRCDIR)/expand_picture.asm \
+$(ASMSRCDIR)/dct.asm \
+$(ASMSRCDIR)/intra_pred.asm \
+$(ASMSRCDIR)/mc_chroma.asm \
+$(ASMSRCDIR)/mb_copy.asm \
+$(ASMSRCDIR)/mc_luma.asm \
+$(ASMSRCDIR)/memzero.asm \
+$(ASMSRCDIR)/asm_inc.asm \
+ 
+MAINSRC= $(MAINSRCDIR)/d3d9_utils.cpp \
+$(MAINSRCDIR)/h264dec.cpp \
+$(MAINSRCDIR)/read_config.cpp
+
+OBJDEC=$(OBJDIR)/au_parser.o \
+$(OBJDIR)/bit_stream.o \
+$(OBJDIR)/cpu.o \
+$(OBJDIR)/deblocking.o \
+$(OBJDIR)/decode_mb_aux.o \
+$(OBJDIR)/decoder.o \
+$(OBJDIR)/decoder_data_tables.o \
+$(OBJDIR)/expand_pic.o \
+$(OBJDIR)/fmo.o \
+$(OBJDIR)/get_intra_predictor.o \
+$(OBJDIR)/manage_dec_ref.o \
+$(OBJDIR)/mc.o \
+$(OBJDIR)/mem_align.o \
+$(OBJDIR)/memmgr_nal_unit.o \
+$(OBJDIR)/mv_pred.o \
+$(OBJDIR)/parse_mb_syn_cavlc.o \
+$(OBJDIR)/pic_queue.o \
+$(OBJDIR)/rec_mb.o \
+$(OBJDIR)/decode_slice.o \
+$(OBJDIR)/decoder_core.o \
+$(OBJDIR)/utils.o \
+$(OBJDIR)/welsDecoderExt.o \
+$(OBJDIR)/welsCodecTrace.o
+
+ifeq ($(NASM), 1)
+OBJDEC+=$(OBJDIR)/block_add.o \
+$(OBJDIR)/cpuid.o \
+$(OBJDIR)/deblock.o \
+$(OBJDIR)/expand_picture.o \
+$(OBJDIR)/dct.o \
+$(OBJDIR)/intra_pred.o \
+$(OBJDIR)/mc_chroma.o \
+$(OBJDIR)/mb_copy.o \
+$(OBJDIR)/mc_luma.o \
+$(OBJDIR)/memzero.o \
+$(OBJDIR)/asm_inc.o 
+endif
+
+OBJBIN=	$(OBJDIR)/d3d9_utils.o \
+$(OBJDIR)/h264dec.o \
+$(OBJDIR)/read_config.o
+
+BINLIB=    	$(BINDIR)/$(NAME).a
+SHAREDLIB=  	$(BINDIR)/$(NAME).so
+BIN=    	$(BINDIR)/$(NAME).exe
+
+default: depend checkdir lib dylib exe release
+.PHONY: default depend checkdir lib dylib exe release clean tags # command targets, not files: always run them
+dependencies:
+	@echo "" >dependencies
+	
+checkdir:
+	@echo 'checkdir..'
+	@if test ! -d $(BINDIR) ; \
+	then \
+		mkdir -p $(BINDIR) ; \
+	fi
+	@if test ! -d $(OUTDIR) ; \
+	then \
+		mkdir -p $(OUTDIR) ; \
+	fi
+	@if test ! -d $(OBJMAINDIR) ; \
+	then \
+		mkdir -p $(OBJMAINDIR) ; \
+	fi
+	@if test ! -d $(OBJDIR) ; \
+	then \
+		mkdir -p $(OBJDIR) ; \
+	fi
+	@echo
+	
+release:
+	@echo 'release..'
+	@echo 'cp -f $(SHAREDLIB) $(OUTDIR)'
+	@cp -f $(SHAREDLIB) $(OUTDIR)
+	@echo 'cp -f $(BIN) $(OUTDIR)'
+	@cp -f $(BIN) $(OUTDIR)
+	@echo
+
+clean:
+	@echo remove all objects
+	@rm -f $(OBJDEC)
+	@rm -f $(OBJBIN)
+	@rm -f $(BINLIB)
+	@rm -f $(SHAREDLIB)
+	@rm -f $(BIN)    
+
+tags:
+	@echo update tag table
+	@etags $(CORESRCDIR)/*.c $(CORESRCDIR)/*.cpp $(PLUSSRCDIR)/*.cpp $(MAINSRCDIR)/*.cpp
+	
+	
+lib:   	$(OBJDEC) 
+	@echo '$(OBJDEC)'
+	@echo
+	@echo 'ar cr $(BINLIB) $(OBJDEC)'
+	@echo
+	@echo 'creating libraries "$(BINLIB)"'
+	@ar cr $(BINLIB) $(OBJDEC)
+	@echo '... done'
+	#@echo 'cp $(BINLIB) /usr/lib'
+	#@cp $(BINLIB) /usr/lib
+	@echo
+
+
+dylib:   $(OBJDEC)
+	@echo '$(OBJDEC)'
+	@echo
+	@echo '$(CXX) -shared -Wl,-Bsymbolic -o $(SHAREDLIB) $(OBJDEC) $(LIBS)'
+	@echo 'creating dynamic library "$(SHAREDLIB)"'
+	@$(CXX)  -shared -Wl,-Bsymbolic -o $(SHAREDLIB) $(OBJDEC)  $(LIBS)
+	@echo '... done'
+	@echo
+	
+
+exe:	$(OBJBIN)
+	@echo	
+	@echo '$(OBJBIN)'
+	@echo
+	@echo '$(CXX) $(LIBS) $(OBJBIN) $(BINLIB) -o $(BIN)'
+	@echo 'creating binary "$(BIN)"'
+	@$(CXX) $(OBJBIN) $(BINLIB) -o $(BIN) $(LIBS) 
+	@echo '... done'
+	@echo
+
+depend:
+	@echo
+	@echo 'checking dependencies'
+	@$(SHELL) -ec '$(CC) -m32 -MM $(CFLAGS) $(DECODESRC) $(ASMSRC) $(MAINSRC)\
+         | sed '\''s@\(.*\)\.o[ :]@$(OBJDIR)/\1.o$(SUFFIX):@g'\''               \
+         >$(DEPEND)'
+	@echo
+
+#$(OBJDIR)/%.o$(SUFFIX): $(COMMSRCDIR)/%.c
+#	@echo 'compiling object file "$@" ...'
+#	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
+$(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.c
+	@echo 'compiling object file "$@" ...'
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
+
+$(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.cpp
+	@echo 'compiling object file "$@" ...'
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+		
+$(OBJDIR)/%.o$(SUFFIX): $(PLUSSRCDIR)/%.cpp
+	@echo 'compiling object file "$@" ...'
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
+	
+$(OBJDIR)/%.o$(SUFFIX): $(ASMSRCDIR)/%.asm
+	@echo 'compiling object file "$@" ...'
+	@$(AS) $(ASFLAGS) -o $@ $<	
+
+#$(OBJDIR)/%.o$(SUFFIX): $(ASMCOMDIR)/%.asm
+#	@echo 'compiling object file "$@" ...'
+#	@$(AS) $(ASFLAGS) -o $@ $<
+	
+$(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
+	@echo 'compiling object file "$@" ...'
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
+	
+include $(DEPEND)
+
--- /dev/null
+++ b/codec/build/linux/enc/makefile
@@ -1,0 +1,270 @@
+NAME= welsenc
+
+### include debug information: 1=yes, 0=no
+DBG= 0
+NASM = 1
+DEPEND= dependencies
+
+OUTDIR=		../../../../bin/linux
+BINDIR= 	../bin
+INCLUDE=  -I../../../encoder/core/inc -I../../../encoder/plus/inc -I../../../api/svc -I../../../WelsThreadLib/api -I../../../console/enc/inc
+THREADLIBSRCDIR=../../../WelsThreadLib/src
+CORESRCDIR=	../../../encoder/core/src
+PLUSSRCDIR=	../../../encoder/plus/src
+ASMSRCDIR=	../../../encoder/core/asm
+MAINSRCDIR=	../../../console/enc/src
+
+OBJMAINDIR= ../obj
+OBJDIR= ../obj/enc
+
+CC= $(shell which gcc)
+AS= $(shell which nasm)
+CXX = g++ -m32
+GCC = gcc -m32
+
+ASFLAGS= -f elf -DNOPREFIX -I ../../../encoder/core/asm/
+
+LIBS= -lstdc++ -ldl -lpthread
+#-lm 
+CFLAGS=  $(INCLUDE) -m32 -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DWELS_SVC -DENCODER_CORE -DHAVE_CACHE_LINE_ALIGN -DWELS_TESTBED -DMT_ENABLED 
+
+ifeq ($(DBG),1)
+#SUFFIX= .dbg
+CFLAGS+= -g
+else
+#SUFFIX=
+CFLAGS+= -O3
+endif
+ifeq ($(NASM), 1)
+CFLAGS += -DX86_ASM
+endif
+OBJSUF= .o$(SUFFIX)
+
+ENCODESRC= $(CORESRCDIR)/wels_preprocess.cpp \
+$(CORESRCDIR)/au_set.cpp \
+$(CORESRCDIR)/cpu.cpp \
+$(CORESRCDIR)/deblocking.cpp \
+$(CORESRCDIR)/decode_mb_aux.cpp \
+$(CORESRCDIR)/encode_mb_aux.cpp \
+$(CORESRCDIR)/encoder.cpp \
+$(CORESRCDIR)/encoder_data_tables.cpp \
+$(CORESRCDIR)/encoder_ext.cpp \
+$(CORESRCDIR)/expand_pic.cpp \
+$(CORESRCDIR)/get_intra_predictor.cpp \
+$(CORESRCDIR)/mc.cpp \
+$(CORESRCDIR)/md.cpp \
+$(CORESRCDIR)/memory_align.cpp \
+$(CORESRCDIR)/mv_pred.cpp \
+$(CORESRCDIR)/nal_encap.cpp \
+$(CORESRCDIR)/picture_handle.cpp \
+$(CORESRCDIR)/property.cpp \
+$(CORESRCDIR)/ratectl.cpp \
+$(CORESRCDIR)/ref_list_mgr_svc.cpp \
+$(CORESRCDIR)/sample.cpp \
+$(CORESRCDIR)/set_mb_syn_cavlc.cpp \
+$(CORESRCDIR)/slice_multi_threading.cpp \
+$(CORESRCDIR)/svc_enc_slice_segment.cpp \
+$(CORESRCDIR)/svc_base_layer_md.cpp \
+$(CORESRCDIR)/svc_encode_mb.cpp \
+$(CORESRCDIR)/svc_encode_slice.cpp \
+$(CORESRCDIR)/svc_mode_decision.cpp \
+$(CORESRCDIR)/svc_motion_estimate.cpp \
+$(CORESRCDIR)/svc_set_mb_syn_cavlc.cpp \
+$(CORESRCDIR)/utils.cpp \
+$(THREADLIBSRCDIR)/WelsThreadLib.cpp \
+$(PLUSSRCDIR)/welsEncoderExt.cpp \
+$(PLUSSRCDIR)/welsCodecTrace.cpp
+
+ASMSRC=	$(ASMSRCDIR)/coeff.asm \
+$(ASMSRCDIR)/cpuid.asm \
+$(ASMSRCDIR)/dct.asm \
+$(ASMSRCDIR)/deblock.asm \
+$(ASMSRCDIR)/expand_picture.asm \
+$(ASMSRCDIR)/intra_pred.asm \
+$(ASMSRCDIR)/intra_pred_util.asm \
+$(ASMSRCDIR)/mb_copy.asm \
+$(ASMSRCDIR)/mc_chroma.asm \
+$(ASMSRCDIR)/mc_luma.asm \
+$(ASMSRCDIR)/memzero.asm \
+$(ASMSRCDIR)/quant.asm \
+$(ASMSRCDIR)/satd_sad.asm \
+$(ASMSRCDIR)/score.asm \
+$(ASMSRCDIR)/asm_inc.asm \
+$(ASMSRCDIR)/vaa.asm
+
+
+MAINSRC= $(MAINSRCDIR)/read_config.cpp \
+$(MAINSRCDIR)/welsenc.cpp
+
+OBJENC=	$(OBJDIR)/wels_preprocess.o \
+$(OBJDIR)/au_set.o \
+$(OBJDIR)/cpu.o \
+$(OBJDIR)/deblocking.o \
+$(OBJDIR)/decode_mb_aux.o \
+$(OBJDIR)/encode_mb_aux.o \
+$(OBJDIR)/encoder.o \
+$(OBJDIR)/encoder_data_tables.o \
+$(OBJDIR)/encoder_ext.o \
+$(OBJDIR)/expand_pic.o \
+$(OBJDIR)/get_intra_predictor.o \
+$(OBJDIR)/mc.o \
+$(OBJDIR)/md.o \
+$(OBJDIR)/memory_align.o \
+$(OBJDIR)/mv_pred.o \
+$(OBJDIR)/nal_encap.o \
+$(OBJDIR)/picture_handle.o \
+$(OBJDIR)/property.o \
+$(OBJDIR)/ratectl.o \
+$(OBJDIR)/ref_list_mgr_svc.o \
+$(OBJDIR)/sample.o \
+$(OBJDIR)/set_mb_syn_cavlc.o \
+$(OBJDIR)/slice_multi_threading.o \
+$(OBJDIR)/svc_enc_slice_segment.o \
+$(OBJDIR)/svc_base_layer_md.o \
+$(OBJDIR)/svc_encode_mb.o \
+$(OBJDIR)/svc_encode_slice.o \
+$(OBJDIR)/svc_mode_decision.o \
+$(OBJDIR)/svc_motion_estimate.o \
+$(OBJDIR)/svc_set_mb_syn_cavlc.o \
+$(OBJDIR)/utils.o \
+$(OBJDIR)/WelsThreadLib.o \
+$(OBJDIR)/welsEncoderExt.o \
+$(OBJDIR)/welsCodecTrace.o
+
+ifeq ($(NASM), 1)
+OBJENC += $(OBJDIR)/cpuid.o \
+$(OBJDIR)/coeff.o \
+$(OBJDIR)/dct.o \
+$(OBJDIR)/deblock.o \
+$(OBJDIR)/expand_picture.o \
+$(OBJDIR)/intra_pred_util.o \
+$(OBJDIR)/intra_pred.o \
+$(OBJDIR)/mb_copy.o \
+$(OBJDIR)/mc_chroma.o \
+$(OBJDIR)/mc_luma.o \
+$(OBJDIR)/memzero.o \
+$(OBJDIR)/quant.o \
+$(OBJDIR)/satd_sad.o \
+$(OBJDIR)/score.o \
+$(OBJDIR)/asm_inc.o \
+$(OBJDIR)/vaa.o 
+endif
+OBJBIN=	$(OBJDIR)/read_config.o \
+$(OBJDIR)/welsenc.o
+
+BINLIB=    	$(BINDIR)/$(NAME).a
+SHAREDLIB=  	$(BINDIR)/$(NAME).so
+BIN=    	$(BINDIR)/$(NAME).exe
+
+default: depend checkdir lib dylib exe release
+.PHONY: default depend checkdir lib dylib exe release clean tags # command targets, not files: always run them
+dependencies:
+	@echo "" >dependencies
+	
+checkdir:
+	@echo 'checkdir..'
+	@if test ! -d $(OUTDIR) ; \
+	then \
+		mkdir -p $(OUTDIR) ; \
+	fi
+	@if test ! -d $(BINDIR) ; \
+	then \
+		mkdir -p $(BINDIR) ; \
+	fi
+	@if test ! -d $(OBJMAINDIR) ; \
+	then \
+		mkdir -p $(OBJMAINDIR) ; \
+	fi
+	@if test ! -d $(OBJDIR) ; \
+	then \
+		mkdir -p $(OBJDIR) ; \
+	fi
+	@echo
+
+clean:
+	@echo remove all objects
+	@rm -f $(OBJENC)
+	@rm -f $(OBJBIN)
+	@rm -f $(BINLIB)
+	@rm -f $(SHAREDLIB)
+	@rm -f $(BIN)
+
+tags: # rebuild the etags table; the undefined $(COMMSRCDIR) entry (which expanded to /*.cpp) is dropped
+	@echo update tag table
+	@etags $(THREADLIBSRCDIR)/*.cpp $(CORESRCDIR)/*.cpp $(PLUSSRCDIR)/*.cpp $(MAINSRCDIR)/*.cpp
+	
+	
+lib:   	$(OBJENC) 
+	@echo '$(OBJENC)'
+	@echo
+	@echo 'ar cr $(BINLIB) $(OBJENC)'
+	@echo
+	@echo 'creating libraries "$(BINLIB)"'
+	@ar cr $(BINLIB) $(OBJENC)
+	@echo '... done'
+	#@echo 'cp $(BINLIB) /usr/lib'
+	#@cp $(BINLIB) /usr/lib
+	@echo
+
+
+dylib:   $(OBJENC) # encoder objects; $(OBJDEC) is not defined in this makefile
+	@echo '$(OBJENC)'
+	@echo
+	@echo '$(GCC) -shared -Wl,-Bsymbolic -m32 -o $(SHAREDLIB) $(OBJENC) $(LIBS)'
+	@echo 'creating dynamic library "$(SHAREDLIB)"'
+	@$(GCC)  -shared -Wl,-Bsymbolic -m32 -o $(SHAREDLIB) $(OBJENC)  $(LIBS)
+	@echo '... done'
+	@echo
+	
+release:
+	@echo 'release..'
+	@echo 'cp -f $(SHAREDLIB) $(OUTDIR)'
+	@cp -f $(SHAREDLIB) $(OUTDIR)
+	@echo 'cp -f $(BIN) $(OUTDIR)'
+	@cp -f $(BIN) $(OUTDIR)
+	@echo
+
+exe:	$(OBJBIN)
+	@echo	
+	@echo '$(OBJBIN)'
+	@echo
+	@echo '$(GCC) $(LIBS) $(OBJBIN) $(BINLIB) -m32 -o $(BIN)'
+	@echo 'creating binary "$(BIN)"'
+	@$(GCC)  $(OBJBIN) $(BINLIB) -m32 -o $(BIN) $(LIBS)
+	@echo '... done'
+	@echo
+
+depend:
+	@echo
+	@echo 'checking dependencies'
+	@$(SHELL) -ec '$(CC) -m32 -MM $(CFLAGS) $(ENCODESRC) $(ASMSRC) $(MAINSRC)\
+         | sed '\''s@\(.*\)\.o[ :]@$(OBJDIR)/\1.o$(SUFFIX):@g'\''               \
+         >$(DEPEND)'
+	@echo
+
+$(OBJDIR)/%.o$(SUFFIX): $(THREADLIBSRCDIR)/%.cpp
+	@echo 'compiling object file "$@" ...'
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
+$(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.cpp
+	@echo 'compiling object file "$@" ...'
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
+	
+$(OBJDIR)/%.o$(SUFFIX): $(PLUSSRCDIR)/%.cpp
+	@echo 'compiling object file "$@" ...'
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
+
+$(OBJDIR)/%.o$(SUFFIX): $(ASMSRCDIR)/%.asm
+	@echo 'compiling object file "$@" ...'
+	@$(AS) $(ASFLAGS) -o $@ $<	
+	
+$(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
+	@echo 'compiling object file "$@" ...'
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<	
+	
+$(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
+	@echo 'compiling object file "$@" ...'
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<	
+	
+include $(DEPEND)
+
--- /dev/null
+++ b/codec/build/win32/dec/WelsDecCore.vcproj
@@ -1,0 +1,683 @@
+<?xml version="1.0" encoding="gb2312"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="WelsDecCore"
+	ProjectGUID="{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}"
+	TargetFrameworkVersion="0"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+	</Platforms>
+	<ToolFiles>
+		<DefaultToolFile
+			FileName="masm.rules"
+		/>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory=".\..\..\..\..\bin\win32\Release"
+			IntermediateDirectory=".\..\..\..\obj\decoder\core\release"
+			ConfigurationType="4"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
+			UseOfMFC="0"
+			ATLMinimizesCRunTimeLibraryUsage="false"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="MASM"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				InlineFunctionExpansion="1"
+				AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
+				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;X86_ASM;HAVE_CACHE_LINE_ALIGN"
+				StringPooling="true"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="true"
+				PrecompiledHeaderFile=".\..\..\..\obj\decoder\core\release/WelsDecCore.pch"
+				AssemblerListingLocation=".\..\..\..\obj\decoder\core\release/"
+				ObjectFile=".\..\..\..\obj\decoder\core\release/"
+				ProgramDataBaseFileName=".\..\..\..\obj\decoder\core\release/"
+				WarningLevel="3"
+				SuppressStartupBanner="true"
+				DebugInformationFormat="0"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+				PreprocessorDefinitions="NDEBUG"
+				Culture="1033"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+				OutputFile="$(OutDir)\welsdcore.lib"
+				SuppressStartupBanner="true"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+				SuppressStartupBanner="true"
+				OutputFile=".\..\..\..\..\bin\win32\Release/WelsDecCore.bsc"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory=".\..\..\..\..\bin\win32\Debug"
+			IntermediateDirectory=".\..\..\..\obj\decoder\core\debug"
+			ConfigurationType="4"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
+			UseOfMFC="0"
+			ATLMinimizesCRunTimeLibraryUsage="false"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="MASM"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
+				PreprocessorDefinitions="WIN32;_DEBUG;_LIB;X86_ASM;HAVE_CACHE_LINE_ALIGN"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				PrecompiledHeaderFile=".\..\..\..\obj\decoder\core\debug/WelsDecCore.pch"
+				AssemblerListingLocation=".\..\..\..\obj\decoder\core\debug/"
+				ObjectFile=".\..\..\..\obj\decoder\core\debug/"
+				ProgramDataBaseFileName=".\..\..\..\obj\decoder\core\debug/"
+				WarningLevel="3"
+				SuppressStartupBanner="true"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+				PreprocessorDefinitions="_DEBUG"
+				Culture="1033"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+				OutputFile="$(OutDir)\welsdcore.lib"
+				SuppressStartupBanner="true"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+				SuppressStartupBanner="true"
+				OutputFile=".\..\..\..\..\bin\win32\Debug/WelsDecCore.bsc"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="SW"
+			>
+			<Filter
+				Name="asm"
+				Filter="*.asm;*.inc"
+				>
+				<File
+					RelativePath="..\..\..\decoder\core\asm\asm_inc.asm"
+					>
+					<FileConfiguration
+						Name="Release|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+					<FileConfiguration
+						Name="Debug|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+						/>
+					</FileConfiguration>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\asm\block_add.asm"
+					>
+					<FileConfiguration
+						Name="Release|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+					<FileConfiguration
+						Name="Debug|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\asm\cpuid.asm"
+					>
+					<FileConfiguration
+						Name="Release|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+					<FileConfiguration
+						Name="Debug|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\asm\dct.asm"
+					>
+					<FileConfiguration
+						Name="Release|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+					<FileConfiguration
+						Name="Debug|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\asm\deblock.asm"
+					>
+					<FileConfiguration
+						Name="Release|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+					<FileConfiguration
+						Name="Debug|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\asm\expand_picture.asm"
+					>
+					<FileConfiguration
+						Name="Release|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+					<FileConfiguration
+						Name="Debug|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\asm\intra_pred.asm"
+					>
+					<FileConfiguration
+						Name="Release|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+					<FileConfiguration
+						Name="Debug|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\asm\mb_copy.asm"
+					>
+					<FileConfiguration
+						Name="Release|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+					<FileConfiguration
+						Name="Debug|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\asm\mc_chroma.asm"
+					>
+					<FileConfiguration
+						Name="Release|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+					<FileConfiguration
+						Name="Debug|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\asm\mc_luma.asm"
+					>
+					<FileConfiguration
+						Name="Release|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+					<FileConfiguration
+						Name="Debug|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\asm\memzero.asm"
+					>
+					<FileConfiguration
+						Name="Release|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+					<FileConfiguration
+						Name="Debug|Win32"
+						>
+						<Tool
+							Name="VCCustomBuildTool"
+							CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							Outputs="$(IntDir)\$(InputName).obj"
+						/>
+					</FileConfiguration>
+				</File>
+			</Filter>
+			<Filter
+				Name="Header Files"
+				Filter="h;hpp;hxx;hm;inl"
+				>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\as264_common.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\au_parser.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\bit_stream.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\cpu.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\cpu_core.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\deblocking.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\dec_frame.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\dec_golomb.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\decode_mb_aux.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\decode_slice.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\decoder.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\decoder_context.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\decoder_core.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\error_code.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\expand_pic.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\fmo.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\get_intra_predictor.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\ls_defines.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\macros.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\manage_dec_ref.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\mb_cache.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\mc.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\measure_time.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\mem_align.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\memmgr_nal_unit.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\mv_pred.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\nal_prefix.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\nalu.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\parameter_sets.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\parse_mb_syn_cavlc.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\pic_queue.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\picture.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\rec_mb.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\slice.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\typedefs.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\utils.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\vlc_decoder.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\wels_common_basis.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\wels_const.h"
+					>
+				</File>
+			</Filter>
+			<Filter
+				Name="Source Files"
+				Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+				>
+				<File
+					RelativePath="..\..\..\decoder\core\src\au_parser.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\bit_stream.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\cpu.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\deblocking.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\decode_mb_aux.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\decode_slice.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\decoder.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\decoder_core.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\decoder_data_tables.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\expand_pic.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\fmo.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\get_intra_predictor.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\manage_dec_ref.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\mc.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\mem_align.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\memmgr_nal_unit.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\mv_pred.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\parse_mb_syn_cavlc.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\pic_queue.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\rec_mb.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\core\src\utils.cpp"
+					>
+				</File>
+			</Filter>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- /dev/null
+++ b/codec/build/win32/dec/WelsDecCore_2010.vcxproj
@@ -1,0 +1,268 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}</ProjectGuid>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.40219.1</_ProjectFileVersion>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\..\bin\win32\Release</OutDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\obj\decoder\core\release\</IntDir>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\..\bin\win32\Debug</OutDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\obj\decoder\core\debug\</IntDir>
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">welsdcore</TargetName>
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">welsdcore</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile><!-- Release compiler settings; X86_ASM is defined here just as in the Debug configuration. -->
+      <Optimization>MaxSpeed</Optimization>
+      <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
+      <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_LIB;X86_ASM;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\decoder\core\release/WelsDecCore.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\decoder\core\release/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\decoder\core\release/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\decoder\core\release/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>
+      </DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <OutputFile>$(OutDir)\welsdcore.lib</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+    </Lib>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\WelsDecCore.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_LIB;X86_ASM;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>true</MinimalRebuild>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\decoder\core\debug/WelsDecCore.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\decoder\core\debug/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\decoder\core\debug/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\decoder\core\debug/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <OutputFile>$(OutDir)\welsdcore.lib</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+    </Lib>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\WelsDecCore.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CustomBuild Include="..\..\..\decoder\core\asm\asm_inc.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\block_add.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\cpuid.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\dct.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\deblock.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\expand_picture.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\intra_pred.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\mb_copy.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\mc_chroma.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\mc_luma.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\memzero.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\..\decoder\core\inc\as264_common.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\au_parser.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\bit_stream.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\cpu.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\cpu_core.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\deblocking.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\decode_mb_aux.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\decoder.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\decoder_context.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\error_code.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\expand_pic.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\fmo.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\get_intra_predictor.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\ls_defines.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\macros.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\manage_dec_ref.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\mb_cache.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\mc.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\measure_time.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\mem_align.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\memmgr_nal_unit.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\mv_pred.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\nal_prefix.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\nalu.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\parameter_sets.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\parse_mb_syn_cavlc.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\pic_queue.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\picture.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\rec_mb.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\slice.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\decode_slice.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\dec_frame.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\dec_golomb.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\decoder_core.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\typedefs.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\utils.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\vlc_decoder.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\wels_common_basis.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\wels_const.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\..\decoder\core\src\au_parser.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\bit_stream.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\cpu.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\deblocking.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\decode_mb_aux.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\decoder.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\decoder_data_tables.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\expand_pic.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\fmo.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\get_intra_predictor.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\manage_dec_ref.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\mc.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\mem_align.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\memmgr_nal_unit.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\mv_pred.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\parse_mb_syn_cavlc.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\pic_queue.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\rec_mb.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\decode_slice.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\decoder_core.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\utils.cpp" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
+  </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/build/win32/dec/WelsDecCore_2012.vcxproj
@@ -1,0 +1,267 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}</ProjectGuid>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <_ProjectFileVersion>11.0.61030.0</_ProjectFileVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <OutDir>.\..\..\..\..\bin\win32\Release</OutDir>
+    <IntDir>.\..\..\..\obj\decoder\core\release\</IntDir>
+    <TargetName>welsdcore</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <OutDir>.\..\..\..\..\bin\win32\Debug</OutDir>
+    <IntDir>.\..\..\..\obj\decoder\core\debug\</IntDir>
+    <TargetName>welsdcore</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
+      <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_LIB;X86_ASM;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions> <!-- X86_ASM mirrors the Debug|Win32 definitions; the .asm CustomBuild items in this project are assembled for Release as well -->
+      <StringPooling>true</StringPooling>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\decoder\core\release/WelsDecCore.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\decoder\core\release/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\decoder\core\release/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\decoder\core\release/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat />
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <OutputFile>$(OutDir)\welsdcore.lib</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+    </Lib>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\welsdcore.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_LIB;X86_ASM;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>true</MinimalRebuild>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\decoder\core\debug/WelsDecCore.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\decoder\core\debug/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\decoder\core\debug/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\decoder\core\debug/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <OutputFile>$(OutDir)\welsdcore.lib</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+    </Lib>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\welsdcore.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CustomBuild Include="..\..\..\decoder\core\asm\asm_inc.asm"> <!-- NOTE(review): only a Release|Win32 command/output pair is defined for this item, unlike the other .asm entries in this ItemGroup - confirm the missing Debug|Win32 rule is intentional -->
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\block_add.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\cpuid.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\dct.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\deblock.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\expand_picture.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\intra_pred.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\mb_copy.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\mc_chroma.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\mc_luma.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\decoder\core\asm\memzero.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\..\decoder\core\inc\as264_common.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\au_parser.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\bit_stream.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\cpu.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\cpu_core.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\deblocking.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\decode_mb_aux.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\decoder.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\decoder_context.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\error_code.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\expand_pic.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\fmo.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\get_intra_predictor.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\ls_defines.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\macros.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\manage_dec_ref.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\mb_cache.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\mc.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\measure_time.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\mem_align.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\memmgr_nal_unit.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\mv_pred.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\nal_prefix.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\nalu.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\parameter_sets.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\parse_mb_syn_cavlc.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\pic_queue.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\picture.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\rec_mb.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\slice.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\decode_slice.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\dec_frame.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\dec_golomb.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\decoder_core.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\typedefs.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\utils.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\vlc_decoder.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\wels_common_basis.h" />
+    <ClInclude Include="..\..\..\decoder\core\inc\wels_const.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\..\decoder\core\src\au_parser.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\bit_stream.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\cpu.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\deblocking.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\decode_mb_aux.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\decoder.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\decoder_data_tables.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\expand_pic.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\fmo.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\get_intra_predictor.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\manage_dec_ref.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\mc.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\mem_align.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\memmgr_nal_unit.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\mv_pred.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\parse_mb_syn_cavlc.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\pic_queue.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\rec_mb.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\decode_slice.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\decoder_core.cpp" />
+    <ClCompile Include="..\..\..\decoder\core\src\utils.cpp" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
+  </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/build/win32/dec/WelsDecPlus.vcproj
@@ -1,0 +1,323 @@
+<?xml version="1.0" encoding="gb2312"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="WelsDecPlus"
+	ProjectGUID="{1131558A-9986-4F4B-A13F-8B7F4C8438BF}"
+	RootNamespace="WelsDecPlus"
+	TargetFrameworkVersion="0"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory=".\..\..\..\..\bin\win32\Release"
+			IntermediateDirectory=".\..\..\..\obj\decoder\plus\Release"
+			ConfigurationType="2"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
+			UseOfMFC="0"
+			ATLMinimizesCRunTimeLibraryUsage="false"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				PreprocessorDefinitions="NDEBUG"
+				MkTypLibCompatible="true"
+				SuppressStartupBanner="true"
+				TargetEnvironment="1"
+				TypeLibraryName=".\..\..\..\..\..\bin\win32\Release/WelsDecPlus.tlb"
+				HeaderFileName=""
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				InlineFunctionExpansion="1"
+				AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
+				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN"
+				StringPooling="true"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="true"
+				PrecompiledHeaderFile=".\..\..\..\obj\decoder\plus\Release/WelsDecPlus.pch"
+				AssemblerListingLocation=".\..\..\..\obj\decoder\plus\Release/"
+				ObjectFile=".\..\..\..\obj\decoder\plus\Release/"
+				ProgramDataBaseFileName=".\..\..\..\obj\decoder\plus\Release/"
+				WarningLevel="3"
+				SuppressStartupBanner="true"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+				PreprocessorDefinitions="NDEBUG"
+				Culture="1033"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies=".\..\..\..\..\bin\win32\Release\welsdcore.lib"
+				OutputFile="$(OutDir)\welsdec.dll"
+				LinkIncremental="1"
+				SuppressStartupBanner="true"
+				AdditionalLibraryDirectories=".\..\..\..\libs\Release\"
+				ModuleDefinitionFile="..\..\..\decoder\plus\src\wels_dec_export.def"
+				GenerateDebugInformation="true"
+				ProgramDatabaseFile=".\..\..\..\maps\Release\welsdec.pdb"
+				GenerateMapFile="true"
+				MapFileName=".\..\..\..\maps\Release\welsdec.map"
+				RandomizedBaseAddress="1"
+				DataExecutionPrevention="2"
+				ImportLibrary="$(OutDir)\welsdec.lib"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+				SuppressStartupBanner="true"
+				OutputFile=".\..\..\..\..\bin\win32\Release/WelsDecPlus.bsc"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory=".\..\..\..\..\bin\win32\Debug"
+			IntermediateDirectory=".\..\..\..\obj\decoder\plus\debug"
+			ConfigurationType="2"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
+			UseOfMFC="0"
+			ATLMinimizesCRunTimeLibraryUsage="false"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				PreprocessorDefinitions="_DEBUG"
+				MkTypLibCompatible="true"
+				SuppressStartupBanner="true"
+				TargetEnvironment="1"
+				TypeLibraryName=".\..\..\..\..\..\bin\win32\Debug/WelsDecPlus.tlb"
+				HeaderFileName=""
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
+				PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				PrecompiledHeaderFile=".\..\..\..\obj\decoder\plus\debug/WelsDecPlus.pch"
+				AssemblerListingLocation=".\..\..\..\obj\decoder\plus\debug/"
+				ObjectFile=".\..\..\..\obj\decoder\plus\debug/"
+				ProgramDataBaseFileName=".\..\..\..\obj\decoder\plus\debug/"
+				WarningLevel="3"
+				SuppressStartupBanner="true"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+				PreprocessorDefinitions="_DEBUG"
+				Culture="1033"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies=".\..\..\..\..\bin\win32\Debug\welsdcore.lib"
+				OutputFile="$(OutDir)\welsdec.dll"
+				LinkIncremental="1"
+				SuppressStartupBanner="true"
+				AdditionalLibraryDirectories="..\..\..\libs\debug"
+				ModuleDefinitionFile="..\..\..\decoder\plus\src\wels_dec_export.def"
+				GenerateDebugInformation="true"
+				ProgramDatabaseFile=".\..\..\..\..\bin\win32\Debug/welsdec.pdb"
+				RandomizedBaseAddress="1"
+				DataExecutionPrevention="2"
+				ImportLibrary="$(OutDir)\welsdec.lib"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+				SuppressStartupBanner="true"
+				OutputFile=".\..\..\..\..\bin\win32\Debug/WelsDecPlus.bsc"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="SW"
+			>
+			<Filter
+				Name="Resource Files"
+				Filter="ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+				>
+				<File
+					RelativePath="..\..\..\decoder\plus\res\welsdec.rc"
+					> <!-- Per-configuration resource include path is project-relative (same directory as the .rc above) rather than a drive-rooted checkout path -->
+					<FileConfiguration
+						Name="Release|Win32"
+						>
+						<Tool
+							Name="VCResourceCompilerTool"
+							PreprocessorDefinitions=""
+							AdditionalIncludeDirectories="..\..\..\decoder\plus\res"
+						/>
+					</FileConfiguration>
+					<FileConfiguration
+						Name="Debug|Win32"
+						>
+						<Tool
+							Name="VCResourceCompilerTool"
+							PreprocessorDefinitions=""
+							AdditionalIncludeDirectories="..\..\..\decoder\plus\res"
+						/>
+					</FileConfiguration>
+				</File>
+			</Filter>
+			<Filter
+				Name="Source Files"
+				Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+				>
+				<File
+					RelativePath="..\..\..\decoder\plus\src\wels_dec_export.def"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\plus\src\welsCodecTrace.cpp"
+					>
+					<FileConfiguration
+						Name="Release|Win32"
+						>
+						<Tool
+							Name="VCCLCompilerTool"
+							AdditionalIncludeDirectories=""
+							PreprocessorDefinitions=""
+						/>
+					</FileConfiguration>
+					<FileConfiguration
+						Name="Debug|Win32"
+						>
+						<Tool
+							Name="VCCLCompilerTool"
+							AdditionalIncludeDirectories=""
+							PreprocessorDefinitions=""
+						/>
+					</FileConfiguration>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\plus\src\welsDecoderExt.cpp"
+					>
+					<FileConfiguration
+						Name="Release|Win32"
+						>
+						<Tool
+							Name="VCCLCompilerTool"
+							AdditionalIncludeDirectories=""
+							PreprocessorDefinitions=""
+						/>
+					</FileConfiguration>
+					<FileConfiguration
+						Name="Debug|Win32"
+						>
+						<Tool
+							Name="VCCLCompilerTool"
+							AdditionalIncludeDirectories=""
+							PreprocessorDefinitions=""
+						/>
+					</FileConfiguration>
+				</File>
+			</Filter>
+			<Filter
+				Name="Header Files"
+				Filter="h;hpp;hxx;hm;inl"
+				>
+				<File
+					RelativePath="..\..\..\decoder\core\inc\mem_align.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\plus\inc\welsCodecTrace.h"
+					>
+				</File>
+				<File
+					RelativePath="..\..\..\decoder\plus\inc\welsDecoderExt.h"
+					>
+				</File>
+			</Filter>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- /dev/null
+++ b/codec/build/win32/dec/WelsDecPlus_2010.vcxproj
@@ -1,0 +1,177 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{1131558A-9986-4F4B-A13F-8B7F4C8438BF}</ProjectGuid>
+    <RootNamespace>WelsDecPlus</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup> <!-- OutDir and IntDir both end with a backslash; MSBuild warns (MSB8004) and appends one itself when it is missing. -->
+    <_ProjectFileVersion>10.0.40219.1</_ProjectFileVersion>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\..\bin\win32\Release\</OutDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\obj\decoder\plus\Release\</IntDir>
+    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\..\bin\win32\Debug\</OutDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\obj\decoder\plus\debug\</IntDir>
+    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">false</LinkIncremental>
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">welsdec</TargetName>
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">welsdec</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Midl>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MkTypLibCompatible>true</MkTypLibCompatible>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <TargetEnvironment>Win32</TargetEnvironment>
+      <TypeLibraryName>.\..\..\..\..\..\bin\Release/WelsDecPlus.tlb</TypeLibraryName>
+      <HeaderFileName>
+      </HeaderFileName>
+    </Midl>
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
+      <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\decoder\plus\Release/WelsDecPlus.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\decoder\plus\Release/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\decoder\plus\Release/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\decoder\plus\Release/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Link>
+      <AdditionalDependencies>$(OutDir)\welsdcore.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <OutputFile>$(OutDir)\welsdec.dll</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>.\..\..\..\libs\Release\;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <ModuleDefinitionFile>..\..\..\decoder\plus\src\wels_dec_export.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <ProgramDatabaseFile>$(OutDir)\welsdec.pdb</ProgramDatabaseFile>
+      <GenerateMapFile>true</GenerateMapFile>
+      <MapFileName>$(OutDir)\welsdec.map</MapFileName>
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <DataExecutionPrevention>true</DataExecutionPrevention>
+      <ImportLibrary>$(OutDir)\welsdec.lib</ImportLibrary>
+      <TargetMachine>MachineX86</TargetMachine>
+    </Link>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\WelsDecPlus.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Midl>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MkTypLibCompatible>true</MkTypLibCompatible>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <TargetEnvironment>Win32</TargetEnvironment>
+      <TypeLibraryName>.\..\..\..\..\..\bin\Debug/WelsDecPlus.tlb</TypeLibraryName>
+      <HeaderFileName>
+      </HeaderFileName>
+    </Midl>
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>true</MinimalRebuild>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\decoder\plus\debug/WelsDecPlus.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\decoder\plus\debug/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\decoder\plus\debug/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\decoder\plus\debug/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Link>
+      <AdditionalDependencies>$(OutDir)\welsdcore.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <OutputFile>$(OutDir)\welsdec.dll</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>..\..\..\libs\debug;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <ModuleDefinitionFile>..\..\..\decoder\plus\src\wels_dec_export.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <ProgramDatabaseFile>$(OutDir)\welsdec.pdb</ProgramDatabaseFile>
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <DataExecutionPrevention>true</DataExecutionPrevention>
+      <ImportLibrary>$(OutDir)\welsdec.lib</ImportLibrary>
+      <TargetMachine>MachineX86</TargetMachine>
+    </Link>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\WelsDecPlus.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ResourceCompile Include="..\..\..\decoder\plus\res\welsdec.rc"> <!-- Resource include directory is given relative to this project file so it resolves in any checkout. -->
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..\..\..\decoder\plus\res;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\..\decoder\plus\res;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ResourceCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\..\decoder\plus\src\welsCodecTrace.cpp" />
+    <ClCompile Include="..\..\..\decoder\plus\src\welsDecoderExt.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\..\decoder\plus\src\wels_dec_export.def" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\..\decoder\core\inc\mem_align.h" />
+    <ClInclude Include="..\..\..\decoder\plus\inc\welsCodecTrace.h" />
+    <ClInclude Include="..\..\..\decoder\plus\inc\welsDecoderExt.h" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/build/win32/dec/WelsDecPlus_2012.vcxproj
@@ -1,0 +1,182 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{1131558A-9986-4F4B-A13F-8B7F4C8438BF}</ProjectGuid>
+    <RootNamespace>WelsDecPlus</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <_ProjectFileVersion>11.0.61030.0</_ProjectFileVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <!-- OutDir ends with a backslash like IntDir; MSBuild warns (MSB8004) when it is missing. -->
+    <OutDir>.\..\..\..\..\bin\win32\Release\</OutDir>
+    <IntDir>.\..\..\..\obj\decoder\plus\Release\</IntDir>
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>welsdec</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <!-- OutDir ends with a backslash like IntDir; MSBuild warns (MSB8004) when it is missing. -->
+    <OutDir>.\..\..\..\..\bin\win32\Debug\</OutDir>
+    <IntDir>.\..\..\..\obj\decoder\plus\debug\</IntDir>
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>welsdec</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Midl>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MkTypLibCompatible>true</MkTypLibCompatible>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <TargetEnvironment>Win32</TargetEnvironment>
+      <TypeLibraryName>.\..\..\..\..\..\bin\Release/WelsDecPlus.tlb</TypeLibraryName>
+      <HeaderFileName />
+    </Midl>
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
+      <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\decoder\plus\Release/WelsDecPlus.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\decoder\plus\Release/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\decoder\plus\Release/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\decoder\plus\Release/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Link>
+      <AdditionalDependencies>.\..\..\..\..\bin\win32\Release\welsdcore.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <OutputFile>$(OutDir)\welsdec.dll</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>.\..\..\..\libs\Release\;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <ModuleDefinitionFile>..\..\..\decoder\plus\src\wels_dec_export.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <ProgramDatabaseFile>$(OutDir)\welsdec.pdb</ProgramDatabaseFile>
+      <GenerateMapFile>true</GenerateMapFile>
+      <MapFileName>$(OutDir)\welsdec.map</MapFileName>
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <DataExecutionPrevention>true</DataExecutionPrevention>
+      <ImportLibrary>$(OutDir)\welsdec.lib</ImportLibrary>
+      <TargetMachine>MachineX86</TargetMachine>
+      <ProfileGuidedDatabase>$(OutDir)\welsdec.pgd</ProfileGuidedDatabase>
+    </Link>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\welsdec.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Midl>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MkTypLibCompatible>true</MkTypLibCompatible>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <TargetEnvironment>Win32</TargetEnvironment>
+      <TypeLibraryName>.\..\..\..\..\..\bin\Debug/WelsDecPlus.tlb</TypeLibraryName>
+      <HeaderFileName />
+    </Midl>
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>true</MinimalRebuild>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\decoder\plus\debug/WelsDecPlus.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\decoder\plus\debug/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\decoder\plus\debug/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\decoder\plus\debug/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Link>
+      <AdditionalDependencies>.\..\..\..\..\bin\win32\Debug\welsdcore.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <OutputFile>$(OutDir)\welsdec.dll</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>..\..\..\libs\debug;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <ModuleDefinitionFile>..\..\..\decoder\plus\src\wels_dec_export.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <ProgramDatabaseFile>$(OutDir)\welsdec.pdb</ProgramDatabaseFile>
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <DataExecutionPrevention>true</DataExecutionPrevention>
+      <ImportLibrary>$(OutDir)\welsdec.lib</ImportLibrary>
+      <TargetMachine>MachineX86</TargetMachine>
+      <MapFileName>$(OutDir)\welsdec.map</MapFileName>
+      <ProfileGuidedDatabase>$(OutDir)\welsdec.pgd</ProfileGuidedDatabase>
+    </Link>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\welsdec.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ResourceCompile Include="..\..\..\decoder\plus\res\welsdec.rc"> <!-- Resource include directory is given relative to this project file so it resolves in any checkout. -->
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..\..\..\decoder\plus\res;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\..\decoder\plus\res;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ResourceCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\..\decoder\plus\src\welsCodecTrace.cpp" />
+    <ClCompile Include="..\..\..\decoder\plus\src\welsDecoderExt.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\..\decoder\plus\src\wels_dec_export.def" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\..\decoder\core\inc\mem_align.h" />
+    <ClInclude Include="..\..\..\decoder\plus\inc\welsCodecTrace.h" />
+    <ClInclude Include="..\..\..\decoder\plus\inc\welsDecoderExt.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="WelsDecCore_2012.vcxproj"> <!-- File name matches the project with this GUID in WelsDecoder_2012.sln. -->
+      <Project>{01b4ae41-6ad6-4caf-aeb3-c42f7f9121d5}</Project>
+      <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
+    </ProjectReference>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/build/win32/dec/WelsDecoder_2008.sln
@@ -1,0 +1,38 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsDecCore", "WelsDecCore.vcproj", "{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsDecPlus", "WelsDecPlus.vcproj", "{1131558A-9986-4F4B-A13F-8B7F4C8438BF}"
+	ProjectSection(ProjectDependencies) = postProject
+		{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5} = {01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "decConsole", "decConsole.vcproj", "{71973A8E-103D-4FB7-951F-55E35E7F60FA}"
+	ProjectSection(ProjectDependencies) = postProject
+		{1131558A-9986-4F4B-A13F-8B7F4C8438BF} = {1131558A-9986-4F4B-A13F-8B7F4C8438BF}
+	EndProjectSection
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}.Debug|Win32.ActiveCfg = Debug|Win32
+		{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}.Debug|Win32.Build.0 = Debug|Win32
+		{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}.Release|Win32.ActiveCfg = Release|Win32
+		{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}.Release|Win32.Build.0 = Release|Win32
+		{1131558A-9986-4F4B-A13F-8B7F4C8438BF}.Debug|Win32.ActiveCfg = Debug|Win32
+		{1131558A-9986-4F4B-A13F-8B7F4C8438BF}.Debug|Win32.Build.0 = Debug|Win32
+		{1131558A-9986-4F4B-A13F-8B7F4C8438BF}.Release|Win32.ActiveCfg = Release|Win32
+		{1131558A-9986-4F4B-A13F-8B7F4C8438BF}.Release|Win32.Build.0 = Release|Win32
+		{71973A8E-103D-4FB7-951F-55E35E7F60FA}.Debug|Win32.ActiveCfg = Debug|Win32
+		{71973A8E-103D-4FB7-951F-55E35E7F60FA}.Debug|Win32.Build.0 = Debug|Win32
+		{71973A8E-103D-4FB7-951F-55E35E7F60FA}.Release|Win32.ActiveCfg = Release|Win32
+		{71973A8E-103D-4FB7-951F-55E35E7F60FA}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- /dev/null
+++ b/codec/build/win32/dec/WelsDecoder_2010.sln
@@ -1,0 +1,38 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "decConsole_2010", "decConsole_2010.vcxproj", "{71973A8E-103D-4FB7-951F-55E35E7F60FA}"
+	ProjectSection(ProjectDependencies) = postProject
+		{1131558A-9986-4F4B-A13F-8B7F4C8438BF} = {1131558A-9986-4F4B-A13F-8B7F4C8438BF}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsDecPlus_2010", "WelsDecPlus_2010.vcxproj", "{1131558A-9986-4F4B-A13F-8B7F4C8438BF}"
+	ProjectSection(ProjectDependencies) = postProject
+		{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5} = {01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsDecCore_2010", "WelsDecCore_2010.vcxproj", "{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{71973A8E-103D-4FB7-951F-55E35E7F60FA}.Debug|Win32.ActiveCfg = Debug|Win32
+		{71973A8E-103D-4FB7-951F-55E35E7F60FA}.Debug|Win32.Build.0 = Debug|Win32
+		{71973A8E-103D-4FB7-951F-55E35E7F60FA}.Release|Win32.ActiveCfg = Release|Win32
+		{71973A8E-103D-4FB7-951F-55E35E7F60FA}.Release|Win32.Build.0 = Release|Win32
+		{1131558A-9986-4F4B-A13F-8B7F4C8438BF}.Debug|Win32.ActiveCfg = Debug|Win32
+		{1131558A-9986-4F4B-A13F-8B7F4C8438BF}.Debug|Win32.Build.0 = Debug|Win32
+		{1131558A-9986-4F4B-A13F-8B7F4C8438BF}.Release|Win32.ActiveCfg = Release|Win32
+		{1131558A-9986-4F4B-A13F-8B7F4C8438BF}.Release|Win32.Build.0 = Release|Win32
+		{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}.Debug|Win32.ActiveCfg = Debug|Win32
+		{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}.Debug|Win32.Build.0 = Debug|Win32
+		{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}.Release|Win32.ActiveCfg = Release|Win32
+		{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- /dev/null
+++ b/codec/build/win32/dec/WelsDecoder_2012.sln
@@ -1,0 +1,32 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsDecCore_2012", "WelsDecCore_2012.vcxproj", "{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsDecPlus_2012", "WelsDecPlus_2012.vcxproj", "{1131558A-9986-4F4B-A13F-8B7F4C8438BF}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "decConsole_2012", "decConsole_2012.vcxproj", "{71973A8E-103D-4FB7-951F-55E35E7F60FA}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}.Debug|Win32.ActiveCfg = Debug|Win32
+		{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}.Debug|Win32.Build.0 = Debug|Win32
+		{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}.Release|Win32.ActiveCfg = Release|Win32
+		{01B4AE41-6AD6-4CAF-AEB3-C42F7F9121D5}.Release|Win32.Build.0 = Release|Win32
+		{1131558A-9986-4F4B-A13F-8B7F4C8438BF}.Debug|Win32.ActiveCfg = Debug|Win32
+		{1131558A-9986-4F4B-A13F-8B7F4C8438BF}.Debug|Win32.Build.0 = Debug|Win32
+		{1131558A-9986-4F4B-A13F-8B7F4C8438BF}.Release|Win32.ActiveCfg = Release|Win32
+		{1131558A-9986-4F4B-A13F-8B7F4C8438BF}.Release|Win32.Build.0 = Release|Win32
+		{71973A8E-103D-4FB7-951F-55E35E7F60FA}.Debug|Win32.ActiveCfg = Debug|Win32
+		{71973A8E-103D-4FB7-951F-55E35E7F60FA}.Debug|Win32.Build.0 = Debug|Win32
+		{71973A8E-103D-4FB7-951F-55E35E7F60FA}.Release|Win32.ActiveCfg = Release|Win32
+		{71973A8E-103D-4FB7-951F-55E35E7F60FA}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- /dev/null
+++ b/codec/build/win32/dec/decConsole.vcproj
@@ -1,0 +1,282 @@
+<?xml version="1.0" encoding="gb2312"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="decConsole"
+	ProjectGUID="{71973A8E-103D-4FB7-951F-55E35E7F60FA}"
+	TargetFrameworkVersion="0"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory=".\..\..\..\..\bin\win32\Release"
+			IntermediateDirectory=".\..\..\..\obj\decConsole\Release"
+			ConfigurationType="1"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
+			UseOfMFC="0"
+			ATLMinimizesCRunTimeLibraryUsage="false"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TypeLibraryName=".\..\..\..\..\bin\win32\Release/decConsole.tlb"
+				HeaderFileName=""
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				InlineFunctionExpansion="1"
+				AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common\inc"
+				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
+				StringPooling="true"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="true"
+				PrecompiledHeaderFile=".\..\..\..\obj\decConsole\Release/decConsole.pch"
+				AssemblerListingLocation=".\..\..\..\obj\decConsole\Release/"
+				ObjectFile=".\..\..\..\obj\decConsole\Release/"
+				ProgramDataBaseFileName=".\..\..\..\obj\decConsole\Release/"
+				WarningLevel="3"
+				SuppressStartupBanner="true"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+				PreprocessorDefinitions="NDEBUG"
+				Culture="2052"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				OutputFile="$(OutDir)\decConsole.exe"
+				LinkIncremental="1"
+				SuppressStartupBanner="true"
+				AdditionalLibraryDirectories="..\..\..\..\bin\win32"
+				ProgramDatabaseFile="$(OutDir)\decConsole.pdb"
+				GenerateMapFile="false"
+				SubSystem="1"
+				RandomizedBaseAddress="1"
+				DataExecutionPrevention="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+				SuppressStartupBanner="true"
+				OutputFile="$(OutDir)\decConsole.bsc"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory=".\..\..\..\..\bin\win32\Debug"
+			IntermediateDirectory=".\..\..\..\obj\decConsole\Debug"
+			ConfigurationType="1"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
+			UseOfMFC="0"
+			ATLMinimizesCRunTimeLibraryUsage="false"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TypeLibraryName=".\..\..\..\..\bin\win32\Debug/decConsole.tlb"
+				HeaderFileName=""
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common\inc"
+				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				PrecompiledHeaderFile=".\..\..\..\obj\decConsole\Debug/decConsole.pch"
+				AssemblerListingLocation=".\..\..\..\obj\decConsole\Debug/"
+				ObjectFile=".\..\..\..\obj\decConsole\Debug/"
+				ProgramDataBaseFileName=".\..\..\..\obj\decConsole\Debug/"
+				BrowseInformation="1"
+				WarningLevel="3"
+				SuppressStartupBanner="true"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+				PreprocessorDefinitions="_DEBUG"
+				Culture="2052"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				OutputFile="$(OutDir)\decConsoled.exe"
+				LinkIncremental="2"
+				SuppressStartupBanner="true"
+				AdditionalLibraryDirectories="..\..\..\..\bin\win32"
+				GenerateDebugInformation="true"
+				ProgramDatabaseFile="$(OutDir)\decConsoled.pdb"
+				SubSystem="1"
+				RandomizedBaseAddress="1"
+				DataExecutionPrevention="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+				SuppressStartupBanner="true"
+				OutputFile="$(OutDir)\decConsole.bsc"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+			>
+			<File
+				RelativePath="..\..\..\console\dec\src\d3d9_utils.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\console\dec\src\h264dec.cpp"
+				>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\console\dec\src\read_config.cpp"
+				>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl"
+			>
+			<File
+				RelativePath="..\..\..\console\dec\inc\d3d9_utils.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\console\dec\inc\read_config.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+			>
+		</Filter>
+		<File
+			RelativePath="..\..\..\..\bin\win32\Release\welsdec.cfg"
+			>
+		</File>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- /dev/null
+++ b/codec/build/win32/dec/decConsole_2010.vcxproj
@@ -1,0 +1,172 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{71973A8E-103D-4FB7-951F-55E35E7F60FA}</ProjectGuid>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.40219.1</_ProjectFileVersion>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\..\bin\win32\Release\</OutDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\obj\decConsole\Release\</IntDir>
+    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\..\bin\win32\Debug\</OutDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\obj\decConsole\Debug\</IntDir>
+    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">decConsole</TargetName>
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">decConsole</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Midl>
+      <TypeLibraryName>.\..\..\..\..\bin\Release/decConsole.tlb</TypeLibraryName>
+      <HeaderFileName>
+      </HeaderFileName>
+    </Midl>
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
+      <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\decConsole\Release/decConsole.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\decConsole\Release/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\decConsole\Release/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\decConsole\Release/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0804</Culture>
+    </ResourceCompile>
+    <Link>
+      <OutputFile>$(OutDir)\decConsole.exe</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>..\..\..\..\bin;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <ProgramDatabaseFile>$(OutDir)\decConsole.pdb</ProgramDatabaseFile>
+      <GenerateMapFile>false</GenerateMapFile>
+      <SubSystem>Console</SubSystem>
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <DataExecutionPrevention>true</DataExecutionPrevention>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;.\..\..\..\..\bin\win32\Release\welsdec.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\decConsole.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Midl>
+      <TypeLibraryName>.\..\..\..\..\bin\Debug/decConsole.tlb</TypeLibraryName>
+      <HeaderFileName>
+      </HeaderFileName>
+    </Midl>
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>true</MinimalRebuild>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\decConsole\Debug/decConsole.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\decConsole\Debug/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\decConsole\Debug/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\decConsole\Debug/</ProgramDataBaseFileName>
+      <BrowseInformation>true</BrowseInformation>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0804</Culture>
+    </ResourceCompile>
+    <Link>
+      <OutputFile>$(OutDir)\decConsole.exe</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>..\..\..\..\bin;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <ProgramDatabaseFile>$(OutDir)\decConsole.pdb</ProgramDatabaseFile>
+      <SubSystem>Console</SubSystem>
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <DataExecutionPrevention>true</DataExecutionPrevention>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;.\..\..\..\..\bin\win32\debug\welsdec.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\decConsole.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\..\console\dec\src\d3d9_utils.cpp" />
+    <ClCompile Include="..\..\..\console\dec\src\h264dec.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\console\dec\src\read_config.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\..\console\dec\inc\d3d9_utils.h" />
+    <ClInclude Include="..\..\..\console\dec\inc\read_config.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\..\..\bin\Release\welsdec.cfg" />
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="WelsDecPlus_2010.vcxproj">
+      <Project>{1131558a-9986-4f4b-a13f-8b7f4c8438bf}</Project>
+      <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
+    </ProjectReference>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/build/win32/dec/decConsole_2012.vcxproj
@@ -1,0 +1,163 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{71973A8E-103D-4FB7-951F-55E35E7F60FA}</ProjectGuid>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <_ProjectFileVersion>11.0.61030.0</_ProjectFileVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <OutDir>.\..\..\..\..\bin\win32\Release\</OutDir>
+    <IntDir>.\..\..\..\obj\decConsole\Release\</IntDir>
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>decConsole</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <OutDir>.\..\..\..\..\bin\win32\Debug\</OutDir>
+    <IntDir>.\..\..\..\obj\decConsole\Debug\</IntDir>
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>decConsole</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Midl>
+      <TypeLibraryName>.\..\..\..\..\bin\Release/decConsole.tlb</TypeLibraryName>
+      <HeaderFileName />
+    </Midl>
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
+      <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\decConsole\Release/decConsole.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\decConsole\Release/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\decConsole\Release/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\decConsole\Release/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0804</Culture>
+    </ResourceCompile>
+    <Link>
+      <OutputFile>$(OutDir)\decConsole.exe</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>..\..\..\..\bin;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <ProgramDatabaseFile>$(OutDir)\decConsole.pdb</ProgramDatabaseFile>
+      <GenerateMapFile>false</GenerateMapFile>
+      <SubSystem>Console</SubSystem>
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <DataExecutionPrevention>true</DataExecutionPrevention>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;$(OutDir)welsdec.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <ProfileGuidedDatabase>$(OutDir)\decConsole.pgd</ProfileGuidedDatabase>
+    </Link>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\decConsole.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Midl>
+      <TypeLibraryName>.\..\..\..\..\bin\Debug/decConsole.tlb</TypeLibraryName>
+      <HeaderFileName />
+    </Midl>
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>true</MinimalRebuild>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\decConsole\Debug/decConsole.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\decConsole\Debug/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\decConsole\Debug/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\decConsole\Debug/</ProgramDataBaseFileName>
+      <BrowseInformation>true</BrowseInformation>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0804</Culture>
+    </ResourceCompile>
+    <Link>
+      <OutputFile>$(OutDir)\decConsole.exe</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>..\..\..\..\bin;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <ProgramDatabaseFile>$(OutDir)\decConsole.pdb</ProgramDatabaseFile>
+      <SubSystem>Console</SubSystem>
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <DataExecutionPrevention>true</DataExecutionPrevention>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;$(OutDir)welsdec.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <MapFileName>$(OutDir)\decConsole.map</MapFileName>
+      <ProfileGuidedDatabase>$(OutDir)\decConsole.pgd</ProfileGuidedDatabase>
+    </Link>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\decConsole.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\..\console\dec\src\d3d9_utils.cpp" />
+    <ClCompile Include="..\..\..\console\dec\src\h264dec.cpp" />
+    <ClCompile Include="..\..\..\console\dec\src\read_config.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\..\console\dec\inc\d3d9_utils.h" />
+    <ClInclude Include="..\..\..\console\dec\inc\read_config.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\..\..\bin\Release\welsdec.cfg" />
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="WelsDecPlus_2012.vcxproj">
+      <Project>{1131558a-9986-4f4b-a13f-8b7f4c8438bf}</Project>
+      <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
+    </ProjectReference>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/build/win32/enc/WelsEncCore.dsp
@@ -1,0 +1,1538 @@
+# Microsoft Developer Studio Project File - Name="WelsEncCore" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Static Library" 0x0104
+
+CFG=WelsEncCore - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE 
+!MESSAGE NMAKE /f "WelsEncCore.mak".
+!MESSAGE 
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE 
+!MESSAGE NMAKE /f "WelsEncCore.mak" CFG="WelsEncCore - Win32 Debug"
+!MESSAGE 
+!MESSAGE Possible choices for configuration are:
+!MESSAGE 
+!MESSAGE "WelsEncCore - Win32 Release" (based on "Win32 (x86) Static Library")
+!MESSAGE "WelsEncCore - Win32 Debug" (based on "Win32 (x86) Static Library")
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "..\..\..\..\..\bin\Release"
+# PROP Intermediate_Dir "..\..\..\obj\encoder\core\Release"
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_MBCS" /D "_LIB" /YX /FD /c
+# ADD CPP /nologo /MD /W3 /GX /Zd /O2 /I "..\..\..\encoder\core\inc" /I "..\..\..\api\svc" /I "..\..\..\WelsThreadLib\api" /D "WIN32" /D "NDEBUG" /D "_MBCS" /D "_LIB" /D "WELS_SVC" /D "ENCODER_CORE" /D "HAVE_MMX" /D "HAVE_CACHE_LINE_ALIGN" /YX /FD /c
+# ADD BASE RSC /l 0x804 /d "NDEBUG"
+# ADD RSC /l 0x409 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LIB32=link.exe -lib
+# ADD BASE LIB32 /nologo
+# ADD LIB32 /nologo /out:"..\..\..\..\..\bin\Release\welsecore.lib"
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "..\..\..\..\..\bin\Debug"
+# PROP Intermediate_Dir "..\..\..\obj\encoder\core\Debug"
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_MBCS" /D "_LIB" /YX /FD /GZ /c
+# ADD CPP /nologo /MDd /W3 /Gm /GX /ZI /Od /I "..\..\..\encoder\core\inc" /I "..\..\..\api\svc" /I "..\..\..\WelsThreadLib\api" /D "WIN32" /D "_DEBUG" /D "_MBCS" /D "_LIB" /D "WELS_SVC" /D "ENCODER_CORE" /D "HAVE_MMX" /D "HAVE_CACHE_LINE_ALIGN" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x804 /d "_DEBUG"
+# ADD RSC /l 0x409 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LIB32=link.exe -lib
+# ADD BASE LIB32 /nologo
+# ADD LIB32 /nologo /out:"..\..\..\..\..\bin\Debug\welsecore.lib"
+
+!ENDIF 
+
+# Begin Target
+
+# Name "WelsEncCore - Win32 Release"
+# Name "WelsEncCore - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\au_set.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\colorspace.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\cpu.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\deblocking.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\decode_mb_aux.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\downsample_yuv.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\encode_mb_aux.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\encoder.c
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# ADD CPP /D "OUPUT_REF_PIC"
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\encoder_data_tables.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\encoder_ext.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\expand_pic.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\get_intra_predictor.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\mc.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\md.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\mgs_layer_encode.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\mv_pred.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\nal_encap.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\picture_handle.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\pixel.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\property.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\ratectl.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\ref_list_mgr_svc.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\sei.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\set_mb_syn_cavlc.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\slice_multi_threading.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\svc_base_layer_md.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\svc_enc_slice_segment.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\svc_encode_mb.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\svc_encode_slice.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\svc_mode_decision.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\svc_motion_estimate.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\svc_preprocess.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\svc_set_mb_syn_cavlc.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\utils.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\src\vaa.c
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\WelsThreadLib\src\WelsThreadLib.c
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\as264_common.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\au_set.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\bit_stream.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\callback.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\colorspace.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\cpu.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\cpu_core.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\deblocking.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\decode_mb_aux.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\downsample_yuv.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\dq_map.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\encode_mb_aux.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\encoder.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\encoder_context.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\expand_pic.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\extern.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\get_intra_predictor.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\layered_pic_buffer.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\ls_defines.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\macros.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\mb_cache.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\mc.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\md.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\measure_time.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\mem_align.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\mgs_layer_encode.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\mt_defs.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\mv_pred.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\nal_encap.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\nal_prefix.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\param_svc.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\parameter_sets.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\picture.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\picture_handle.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\pixel.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\property.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\rc.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\ref_list_mgr_svc.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\sei.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\set_mb_syn_cavlc.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\slice.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\slice_multi_threading.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\stat.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\svc_base_layer_md.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\svc_config.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\svc_enc_frame.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\svc_enc_golomb.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\svc_enc_macroblock.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\svc_enc_slice_segment.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\svc_encode_mb.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\svc_encode_slice.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\svc_mode_decision.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\svc_motion_estimate.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\svc_preprocess.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\svc_set_mb_syn_cavlc.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\trace.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\typedefs.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\utils.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\vaa.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\vlc_encoder.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\wels_common_basis.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\inc\wels_const.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\WelsThreadLib\api\WelsThreadLib.h
+# End Source File
+# End Group
+# Begin Group "asm"
+
+# PROP Default_Filter "*.asm;*.inc"
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\accumulate_rs.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\accumulate_rs.asm
+InputName=accumulate_rs
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\accumulate_rs.asm
+InputName=accumulate_rs
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\coeff.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\coeff.asm
+InputName=coeff
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\coeff.asm
+InputName=coeff
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\coeff_level_to_dct.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\coeff_level_to_dct.asm
+InputName=coeff_level_to_dct
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\coeff_level_to_dct.asm
+InputName=coeff_level_to_dct
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\colorspace_rgb.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\colorspace_rgb.asm
+InputName=colorspace_rgb
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\colorspace_rgb.asm
+InputName=colorspace_rgb
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\colorspace_rgb_sse2.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\colorspace_rgb_sse2.asm
+InputName=colorspace_rgb_sse2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\colorspace_rgb_sse2.asm
+InputName=colorspace_rgb_sse2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\colorspace_yuv.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\colorspace_yuv.asm
+InputName=colorspace_yuv
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\colorspace_yuv.asm
+InputName=colorspace_yuv
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\cpu_mmx.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\cpu_mmx.asm
+InputName=cpu_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\cpu_mmx.asm
+InputName=cpu_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\dct_mmx.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\dct_mmx.asm
+InputName=dct_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\dct_mmx.asm
+InputName=dct_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\dct_sse2.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+InputDir=..\..\..\encoder\core\asm
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\dct_sse2.asm
+InputName=dct_sse2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -I.\..\..\..\common\asm\ -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+InputDir=..\..\..\encoder\core\asm
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\dct_sse2.asm
+InputName=dct_sse2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -I.\..\..\..\common\asm\ -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\deblock.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\deblock.asm
+InputName=deblock
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\deblock.asm
+InputName=deblock
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\dequant_sse2.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\dequant_sse2.asm
+InputName=dequant_sse2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\dequant_sse2.asm
+InputName=dequant_sse2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\downsampling.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\downsampling.asm
+InputName=downsampling
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\downsampling.asm
+InputName=downsampling
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\expand_picture.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\expand_picture.asm
+InputName=expand_picture
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\expand_picture.asm
+InputName=expand_picture
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\idct_mmx.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\idct_mmx.asm
+InputName=idct_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\idct_mmx.asm
+InputName=idct_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\intra_pred.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+InputDir=..\..\..\encoder\core\asm
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\intra_pred.asm
+InputName=intra_pred
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -I.\..\..\..\common\asm\ -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+InputDir=..\..\..\encoder\core\asm
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\intra_pred.asm
+InputName=intra_pred
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -I.\..\..\..\common\asm\ -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\intra_pred_util.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+InputDir=..\..\..\encoder\core\asm
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\intra_pred_util.asm
+InputName=intra_pred_util
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -I.\..\..\..\common\asm\ -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+InputDir=..\..\..\encoder\core\asm
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\intra_pred_util.asm
+InputName=intra_pred_util
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -I.\..\..\..\common\asm\ -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\mb_copy.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+InputDir=..\..\..\encoder\core\asm
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\mb_copy.asm
+InputName=mb_copy
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -I.\..\..\..\common\asm\ -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+InputDir=..\..\..\encoder\core\asm
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\mb_copy.asm
+InputName=mb_copy
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -I.\..\..\..\common\asm\ -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\mc_chroma_mmx.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+InputDir=..\..\..\encoder\core\asm
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\mc_chroma_mmx.asm
+InputName=mc_chroma_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -I.\..\..\..\common\asm\ -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+InputDir=..\..\..\encoder\core\asm
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\mc_chroma_mmx.asm
+InputName=mc_chroma_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -I.\..\..\..\common\asm\ -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\mc_copy_mmx.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\mc_copy_mmx.asm
+InputName=mc_copy_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\mc_copy_mmx.asm
+InputName=mc_copy_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\mc_hc_mmx.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\mc_hc_mmx.asm
+InputName=mc_hc_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\mc_hc_mmx.asm
+InputName=mc_hc_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\mc_mmx.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\mc_mmx.asm
+InputName=mc_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\mc_mmx.asm
+InputName=mc_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\mc_sse2.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\mc_sse2.asm
+InputName=mc_sse2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\mc_sse2.asm
+InputName=mc_sse2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\mc_sse2_1.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\mc_sse2_1.asm
+InputName=mc_sse2_1
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\mc_sse2_1.asm
+InputName=mc_sse2_1
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\memzero.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\memzero.asm
+InputName=memzero
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\memzero.asm
+InputName=memzero
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\pixel_mmx.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\pixel_mmx.asm
+InputName=pixel_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\pixel_mmx.asm
+InputName=pixel_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\pixel_sse2.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+InputDir=..\..\..\encoder\core\asm
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\pixel_sse2.asm
+InputName=pixel_sse2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -I.\..\..\..\common\asm\ -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+InputDir=..\..\..\encoder\core\asm
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\pixel_sse2.asm
+InputName=pixel_sse2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -I.\..\..\..\common\asm\ -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\predenoise.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\predenoise.asm
+InputName=predenoise
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\predenoise.asm
+InputName=predenoise
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\quant_mmx.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\quant_mmx.asm
+InputName=quant_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\quant_mmx.asm
+InputName=quant_mmx
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\quant_sse2.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+InputDir=..\..\..\encoder\core\asm
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\quant_sse2.asm
+InputName=quant_sse2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -I.\..\..\..\common\asm\ -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+InputDir=..\..\..\encoder\core\asm
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\quant_sse2.asm
+InputName=quant_sse2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -I.\..\..\..\common\asm\ -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\score.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\score.asm
+InputName=score
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\score.asm
+InputName=score
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\sse2inc.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\sse2inc.asm
+InputName=sse2inc
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\sse2inc.asm
+InputName=sse2inc
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\sub_scal_coeff2.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\sub_scal_coeff2.asm
+InputName=sub_scal_coeff2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\sub_scal_coeff2.asm
+InputName=sub_scal_coeff2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\core\asm\vaa_sse2.asm
+
+!IF  "$(CFG)" == "WelsEncCore - Win32 Release"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Release
+InputPath=..\..\..\encoder\core\asm\vaa_sse2.asm
+InputName=vaa_sse2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ELSEIF  "$(CFG)" == "WelsEncCore - Win32 Debug"
+
+# Begin Custom Build
+IntDir=.\..\..\..\obj\encoder\core\Debug
+InputPath=..\..\..\encoder\core\asm\vaa_sse2.asm
+InputName=vaa_sse2
+
+"$(IntDir)\$(InputName).obj" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	nasm -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# End Group
+# End Target
+# End Project
--- /dev/null
+++ b/codec/build/win32/enc/WelsEncCore.vcproj
@@ -1,0 +1,1405 @@
+<?xml version="1.0" encoding="gb2312"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="WelsEncCore"
+	ProjectGUID="{59208004-1774-4816-AC24-31FF44C324B4}"
+	RootNamespace="WelsEncCore"
+	TargetFrameworkVersion="0"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+	</Platforms>
+	<ToolFiles>
+		<DefaultToolFile
+			FileName="masm.rules"
+		/>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory=".\..\..\..\..\bin\win32\Debug"
+			IntermediateDirectory=".\..\..\..\obj\encoder\core\Debug"
+			ConfigurationType="4"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
+			UseOfMFC="0"
+			ATLMinimizesCRunTimeLibraryUsage="false"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="MASM"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="..\..\..\encoder\core\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api;"
+				PreprocessorDefinitions="WIN32;_DEBUG;_LIB;WELS_SVC;ENCODER_CORE;X86_ASM;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				PrecompiledHeaderFile=".\..\..\..\obj\encoder\core\Debug/WelsEncCore.pch"
+				AssemblerListingLocation=".\..\..\..\obj\encoder\core\Debug/"
+				ObjectFile=".\..\..\..\obj\encoder\core\Debug/"
+				ProgramDataBaseFileName=".\..\..\..\obj\encoder\core\Debug/"
+				WarningLevel="3"
+				SuppressStartupBanner="true"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+				PreprocessorDefinitions="_DEBUG"
+				Culture="1033"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+				OutputFile="$(OutDir)\welsecore.lib"
+				SuppressStartupBanner="true"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+				SuppressStartupBanner="true"
+				OutputFile="$(OutDir)\WelsEncCore.bsc"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+				CommandLine=""
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory=".\..\..\..\..\bin\win32\Release"
+			IntermediateDirectory=".\..\..\..\obj\encoder\core\Release"
+			ConfigurationType="4"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
+			UseOfMFC="0"
+			ATLMinimizesCRunTimeLibraryUsage="false"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="MASM"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="3"
+				InlineFunctionExpansion="2"
+				FavorSizeOrSpeed="1"
+				WholeProgramOptimization="true"
+				AdditionalIncludeDirectories="..\..\..\encoder\core\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api"
+				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;WELS_SVC;ENCODER_CORE;X86_ASM;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;"
+				StringPooling="true"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="true"
+				PrecompiledHeaderFile=".\..\..\..\obj\encoder\core\Release/WelsEncCore.pch"
+				AssemblerListingLocation=".\..\..\..\obj\encoder\core\Release/"
+				ObjectFile=".\..\..\..\obj\encoder\core\Release/"
+				ProgramDataBaseFileName=".\..\..\..\obj\encoder\core\Release/"
+				WarningLevel="3"
+				SuppressStartupBanner="true"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+				PreprocessorDefinitions="NDEBUG"
+				Culture="1033"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+				AdditionalOptions="/LTCG"
+				OutputFile="$(OutDir)\welsecore.lib"
+				SuppressStartupBanner="true"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+				SuppressStartupBanner="true"
+				OutputFile="$(OutDir)\WelsEncCore.bsc"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+				CommandLine=""
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+			>
+			<File
+				RelativePath="..\..\..\encoder\core\src\au_set.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\cpu.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\deblocking.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\decode_mb_aux.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\encode_mb_aux.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\encoder.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions="OUPUT_REF_PIC"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\encoder_data_tables.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\encoder_ext.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\expand_pic.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\get_intra_predictor.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\mc.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\md.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\memory_align.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\mv_pred.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\nal_encap.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\picture_handle.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\property.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\ratectl.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\ref_list_mgr_svc.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\sample.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\set_mb_syn_cavlc.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\slice_multi_threading.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\svc_base_layer_md.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\svc_enc_slice_segment.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\svc_encode_mb.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\svc_encode_slice.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\svc_mode_decision.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\svc_motion_estimate.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\svc_set_mb_syn_cavlc.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\utils.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\WelsThreadLib\src\WelsThreadLib.cpp"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl"
+			>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\array_stack_align.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\as264_common.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\au_set.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\bit_stream.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\cpu.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\cpu_core.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\deblocking.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\decode_mb_aux.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\dq_map.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\encode_mb_aux.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\encoder.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\encoder_context.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\expand_pic.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\extern.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\get_intra_predictor.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\ls_defines.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\macros.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\mb_cache.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\mc.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\md.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\measure_time.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\memory_align.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\mt_defs.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\mv_pred.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\nal_encap.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\nal_prefix.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\param_svc.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\parameter_sets.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\picture.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\picture_handle.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\property.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\rc.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\ref_list_mgr_svc.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\sample.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\set_mb_syn_cavlc.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\slice.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\slice_multi_threading.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\stat.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\svc_base_layer_md.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\svc_enc_frame.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\svc_enc_golomb.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\svc_enc_macroblock.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\svc_enc_slice_segment.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\svc_encode_mb.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\svc_encode_slice.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\svc_mode_decision.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\svc_motion_estimate.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\svc_set_mb_syn_cavlc.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\trace.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\typedefs.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\utils.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\vlc_encoder.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\wels_common_basis.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\wels_const.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\wels_func_ptr_def.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\WelsThreadLib\api\WelsThreadLib.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="asm"
+			Filter="*.asm;*.inc"
+			>
+			<File
+				RelativePath="..\..\..\encoder\core\asm\asm_inc.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\asm\coeff.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\asm\cpuid.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\asm\dct.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\asm\deblock.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\asm\expand_picture.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\asm\intra_pred.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\asm\intra_pred_util.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\asm\mb_copy.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\asm\mc_chroma.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\asm\mc_luma.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\asm\memzero.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\asm\quant.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\asm\satd_sad.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\asm\score.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\asm\vaa.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+		</Filter>
+		<Filter
+			Name="PreProcess"
+			>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\IWelsVP.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\src\wels_preprocess.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\core\inc\wels_preprocess.h"
+				>
+			</File>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- /dev/null
+++ b/codec/build/win32/enc/WelsEncCore_2010.vcxproj
@@ -1,0 +1,485 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{59208004-1774-4816-AC24-31FF44C324B4}</ProjectGuid>
+    <RootNamespace>WelsEncCore</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.40219.1</_ProjectFileVersion>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\..\bin\win32\Debug\</OutDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\obj\encoder\core\Debug\</IntDir>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\..\bin\win32\Release\</OutDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\obj\encoder\core\Release\</IntDir>
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">welsecore</TargetName>
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">welsecore</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_LIB;WELS_SVC;ENCODER_CORE;X86_ASM;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>true</MinimalRebuild>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\encoder\core\Debug/WelsEncCore.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\encoder\core\Debug/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\encoder\core\Debug/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\encoder\core\Debug/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <OutputFile>..\..\..\..\libs\welsecore.lib</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+    </Lib>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>.\..\..\..\..\..\bin\Debug/WelsEncCore.bsc</OutputFile>
+    </Bscmake>
+    <PostBuildEvent>
+      <Command>IF EXIST "$(SolutionDir)..\..\bin\$(Configuration)"  copy $(SolutionDir)..\..\bin\$(Configuration)\*.*  $(SolutionDir)\..\..\..\..\..\bin\$(Configuration)\</Command>
+    </PostBuildEvent>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <Optimization>Full</Optimization>
+      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <WholeProgramOptimization>true</WholeProgramOptimization>
+      <AdditionalIncludeDirectories>..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_LIB;WELS_SVC;ENCODER_CORE;X86_ASM;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\encoder\core\Release/WelsEncCore.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\encoder\core\Release/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\encoder\core\Release/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\encoder\core\Release/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <AdditionalOptions>/LTCG %(AdditionalOptions)</AdditionalOptions>
+      <OutputFile>..\..\..\..\libs\welsecore.lib</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+    </Lib>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>.\..\..\..\..\..\bin\Release/WelsEncCore.bsc</OutputFile>
+    </Bscmake>
+    <PostBuildEvent>
+      <Command>IF EXIST "$(SolutionDir)..\..\bin\$(Configuration)"  copy $(SolutionDir)..\..\bin\$(Configuration)\*.*  $(SolutionDir)\..\..\..\..\..\bin\$(Configuration)\</Command>
+    </PostBuildEvent>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\..\encoder\core\src\au_set.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\cpu.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\deblocking.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\decode_mb_aux.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\encode_mb_aux.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\encoder.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">OUPUT_REF_PIC;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\encoder_data_tables.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\encoder_ext.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\expand_pic.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\get_intra_predictor.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\mc.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\md.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\memory_align.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\mv_pred.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\nal_encap.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\picture_handle.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\property.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\ratectl.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\ref_list_mgr_svc.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\sample.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\set_mb_syn_cavlc.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\slice_multi_threading.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\svc_base_layer_md.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\svc_enc_slice_segment.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\svc_encode_mb.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\svc_encode_slice.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\svc_mode_decision.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\svc_motion_estimate.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\svc_set_mb_syn_cavlc.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\utils.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\WelsThreadLib\src\WelsThreadLib.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\wels_preprocess.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\..\encoder\core\inc\array_stack_align.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\as264_common.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\au_set.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\bit_stream.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\cpu.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\cpu_core.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\deblocking.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\decode_mb_aux.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\dq_map.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\encode_mb_aux.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\encoder.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\encoder_context.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\expand_pic.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\extern.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\get_intra_predictor.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\ls_defines.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\macros.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\mb_cache.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\mc.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\md.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\measure_time.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\memory_align.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\mgs_layer_encode.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\mt_defs.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\mv_pred.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\nal_encap.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\nal_prefix.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\param_svc.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\parameter_sets.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\picture.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\picture_handle.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\property.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\rc.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\ref_list_mgr_svc.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\sample.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\set_mb_syn_cavlc.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\slice.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\slice_multi_threading.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\stat.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_base_layer_md.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_enc_frame.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_enc_golomb.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_enc_macroblock.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_enc_slice_segment.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_encode_mb.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_encode_slice.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_mode_decision.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_motion_estimate.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_set_mb_syn_cavlc.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\trace.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\typedefs.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\utils.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\vlc_encoder.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\wels_common_basis.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\wels_const.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\wels_func_ptr_def.h" />
+    <ClInclude Include="..\..\..\WelsThreadLib\api\WelsThreadLib.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\IWelsVP.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\wels_preprocess.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="..\..\..\encoder\core\asm\asm_inc.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\coeff.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\cpuid.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\dct.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\deblock.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\expand_picture.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\intra_pred.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\intra_pred_util.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\mb_copy.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\mc_chroma.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\mc_luma.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\memzero.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\quant.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\satd_sad.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\score.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\vaa.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
+  </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/build/win32/enc/WelsEncCore_2012.vcxproj
@@ -1,0 +1,352 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{59208004-1774-4816-AC24-31FF44C324B4}</ProjectGuid>
+    <RootNamespace>WelsEncCore</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>StaticLibrary</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <_ProjectFileVersion>11.0.61030.0</_ProjectFileVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <!-- Debug|Win32 output locations; both directory values end with a backslash so MSBuild does not have to append one (warning MSB8004) -->
+    <OutDir>.\..\..\..\..\bin\win32\Debug\</OutDir>
+    <IntDir>.\..\..\..\obj\encoder\core\Debug\</IntDir>
+    <TargetName>welsecore</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <!-- Release|Win32 output locations; both directory values end with a backslash so MSBuild does not have to append one (warning MSB8004) -->
+    <OutDir>.\..\..\..\..\bin\win32\Release\</OutDir>
+    <IntDir>.\..\..\..\obj\encoder\core\Release\</IntDir>
+    <TargetName>welsecore</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <!-- Debug|Win32 compiler, resource compiler, librarian, browse-info and post-build settings -->
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_LIB;WELS_SVC;ENCODER_CORE;X86_ASM;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>true</MinimalRebuild>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\encoder\core\Debug/WelsEncCore.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\encoder\core\Debug/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\encoder\core\Debug/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\encoder\core\Debug/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <OutputFile>$(OutDir)\welsecore.lib</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+    </Lib>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\welsecore.bsc</OutputFile>
+    </Bscmake>
+    <PostBuildEvent> <!-- when $(SolutionDir)..\..\bin\$(Configuration) exists, copy its contents to the bin\$(Configuration) directory five levels above the solution; every path is quoted so a solution directory containing spaces does not split the arguments -->
+      <Command>IF EXIST "$(SolutionDir)..\..\bin\$(Configuration)"  copy "$(SolutionDir)..\..\bin\$(Configuration)\*.*"  "$(SolutionDir)\..\..\..\..\..\bin\$(Configuration)\"</Command>
+    </PostBuildEvent>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <!-- Release|Win32 compiler, resource compiler, librarian, browse-info and post-build settings -->
+    <ClCompile>
+      <Optimization>Full</Optimization>
+      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <WholeProgramOptimization>true</WholeProgramOptimization>
+      <AdditionalIncludeDirectories>..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_LIB;WELS_SVC;ENCODER_CORE;X86_ASM;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\encoder\core\Release/WelsEncCore.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\encoder\core\Release/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\encoder\core\Release/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\encoder\core\Release/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Lib>
+      <AdditionalOptions>/LTCG %(AdditionalOptions)</AdditionalOptions>
+      <OutputFile>$(OutDir)\welsecore.lib</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+    </Lib>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\welsecore.bsc</OutputFile>
+    </Bscmake>
+    <PostBuildEvent> <!-- when $(SolutionDir)..\..\bin\$(Configuration) exists, copy its contents to the bin\$(Configuration) directory five levels above the solution; every path is quoted so a solution directory containing spaces does not split the arguments -->
+      <Command>IF EXIST "$(SolutionDir)..\..\bin\$(Configuration)"  copy "$(SolutionDir)..\..\bin\$(Configuration)\*.*"  "$(SolutionDir)\..\..\..\..\..\bin\$(Configuration)\"</Command>
+    </PostBuildEvent>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\..\encoder\core\src\au_set.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\cpu.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\deblocking.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\decode_mb_aux.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\encode_mb_aux.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\encoder.cpp"> <!-- the only entry in this ClCompile list with a per-file preprocessor define; it applies to Debug|Win32 only -->
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">OUPUT_REF_PIC;%(PreprocessorDefinitions)</PreprocessorDefinitions> <!-- NOTE(review): "OUPUT" may be a misspelling of "OUTPUT"; confirm against the macro actually tested in encoder.cpp before renaming -->
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\core\src\encoder_data_tables.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\encoder_ext.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\expand_pic.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\get_intra_predictor.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\mc.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\md.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\memory_align.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\mv_pred.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\nal_encap.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\picture_handle.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\property.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\ratectl.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\ref_list_mgr_svc.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\sample.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\set_mb_syn_cavlc.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\slice_multi_threading.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\svc_base_layer_md.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\svc_enc_slice_segment.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\svc_encode_mb.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\svc_encode_slice.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\svc_mode_decision.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\svc_motion_estimate.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\svc_set_mb_syn_cavlc.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\utils.cpp" />
+    <ClCompile Include="..\..\..\WelsThreadLib\src\WelsThreadLib.cpp" />
+    <ClCompile Include="..\..\..\encoder\core\src\wels_preprocess.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\..\encoder\core\inc\array_stack_align.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\as264_common.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\au_set.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\bit_stream.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\cpu.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\cpu_core.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\deblocking.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\decode_mb_aux.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\dq_map.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\encode_mb_aux.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\encoder.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\encoder_context.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\expand_pic.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\extern.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\get_intra_predictor.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\ls_defines.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\macros.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\mb_cache.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\mc.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\md.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\measure_time.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\memory_align.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\mgs_layer_encode.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\mt_defs.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\mv_pred.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\nal_encap.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\nal_prefix.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\param_svc.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\parameter_sets.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\picture.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\picture_handle.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\property.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\rc.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\ref_list_mgr_svc.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\sample.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\set_mb_syn_cavlc.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\slice.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\slice_multi_threading.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\stat.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_base_layer_md.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_enc_frame.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_enc_golomb.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_enc_macroblock.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_enc_slice_segment.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_encode_mb.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_encode_slice.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_mode_decision.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_motion_estimate.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\svc_set_mb_syn_cavlc.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\trace.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\typedefs.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\utils.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\vlc_encoder.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\wels_common_basis.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\wels_const.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\wels_func_ptr_def.h" />
+    <ClInclude Include="..\..\..\WelsThreadLib\api\WelsThreadLib.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\IWelsVP.h" />
+    <ClInclude Include="..\..\..\encoder\core\inc\wels_preprocess.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="..\..\..\encoder\core\asm\asm_inc.asm"> <!-- NOTE(review): only a Release|Win32 command and output are defined for this file, while the other .asm items in this group also define Debug|Win32; confirm this is intended -->
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\coeff.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\cpuid.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\dct.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\deblock.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\expand_picture.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\intra_pred.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\intra_pred_util.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\mb_copy.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\mc_chroma.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\mc_luma.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\memzero.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\quant.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\satd_sad.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\score.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\..\encoder\core\asm\vaa.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
+  </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/build/win32/enc/WelsEncPlus.dsp
@@ -1,0 +1,139 @@
+# Microsoft Developer Studio Project File - Name="WelsEncPlus" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Dynamic-Link Library" 0x0102
+
+CFG=WelsEncPlus - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE 
+!MESSAGE NMAKE /f "WelsEncPlus.mak".
+!MESSAGE 
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE 
+!MESSAGE NMAKE /f "WelsEncPlus.mak" CFG="WelsEncPlus - Win32 Debug"
+!MESSAGE 
+!MESSAGE Possible choices for configuration are:
+!MESSAGE 
+!MESSAGE "WelsEncPlus - Win32 Release" (based on "Win32 (x86) Dynamic-Link Library")
+!MESSAGE "WelsEncPlus - Win32 Debug" (based on "Win32 (x86) Dynamic-Link Library")
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=xicl6.exe
+MTL=midl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "WelsEncPlus - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "..\..\..\..\..\bin\Release"
+# PROP Intermediate_Dir "..\..\..\obj\encoder\plus\Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /MT /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "WELSENCPLUS_EXPORTS" /YX /FD /c
+# ADD CPP /nologo /MD /W3 /GX /Zd /O2 /I "..\..\..\encoder\plus\inc" /I "..\..\..\encoder\core\inc" /I "..\..\..\api\svc" /I "..\..\..\WelsThreadLib\api" /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "WELSENCPLUS_EXPORTS" /D "ENCODER_CORE" /D "HAVE_CACHE_LINE_ALIGN" /FD /c
+# SUBTRACT CPP /X /YX
+# ADD BASE MTL /nologo /D "NDEBUG" /mktyplib203 /win32
+# ADD MTL /nologo /D "NDEBUG" /mktyplib203 /win32
+# ADD BASE RSC /l 0x804 /d "NDEBUG"
+# ADD RSC /l 0x409 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=xilink6.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib welsecore.lib /nologo /dll /map:"..\..\..\..\..\maps\Release/welsenc.map" /machine:I386 /out:"..\..\..\..\..\bin\Release\welsenc.dll" /libpath:"..\..\..\..\..\bin\Release" /MAPINFO:lines /MAPINFO:exports
+# SUBTRACT LINK32 /pdb:none /debug
+
+!ELSEIF  "$(CFG)" == "WelsEncPlus - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "..\..\..\..\..\bin\Debug"
+# PROP Intermediate_Dir "..\..\..\obj\encoder\plus\Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "WELSENCPLUS_EXPORTS" /YX /FD /GZ /c
+# ADD CPP /nologo /MDd /W3 /Gm /GX /ZI /Od /I "..\..\..\encoder\plus\inc" /I "..\..\..\encoder\core\inc" /I "..\..\..\api\svc" /I "..\..\..\WelsThreadLib\api" /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "WELSENCPLUS_EXPORTS" /D "ENCODER_CORE" /D "HAVE_MMX" /D "HAVE_CACHE_LINE_ALIGN" /FD /GZ /c
+# SUBTRACT CPP /YX
+# ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32
+# ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32
+# ADD BASE RSC /l 0x804 /d "_DEBUG"
+# ADD RSC /l 0x409 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=xilink6.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /dll /debug /machine:I386 /out:"..\..\..\..\..\bin\Debug\welsenc.dll" /pdbtype:sept
+# SUBTRACT LINK32 /nodefaultlib
+
+!ENDIF 
+
+# Begin Target
+
+# Name "WelsEncPlus - Win32 Release"
+# Name "WelsEncPlus - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\encoder\plus\src\DllEntry.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\plus\src\welsEncoderExt.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\plus\src\welsCodecTrace.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\plus\src\wels_enc_export.def
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\common\inc\mem_align.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\plus\inc\welsEncoderExt.h
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\encoder\plus\inc\welsCodecTrace.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# Begin Source File
+
+SOURCE=..\..\..\encoder\plus\res\welsenc.rc
+# End Source File
+# End Group
+# End Target
+# End Project
--- /dev/null
+++ b/codec/build/win32/enc/WelsEncPlus.vcproj
@@ -1,0 +1,344 @@
+<?xml version="1.0" encoding="gb2312"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="WelsEncPlus"
+	ProjectGUID="{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}"
+	RootNamespace="WelsEncPlus"
+	TargetFrameworkVersion="0"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory=".\..\..\..\..\bin\win32\Debug"
+			IntermediateDirectory=".\..\..\..\obj\encoder\plus\Debug"
+			ConfigurationType="2"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
+			UseOfMFC="0"
+			ATLMinimizesCRunTimeLibraryUsage="false"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				PreprocessorDefinitions="_DEBUG"
+				MkTypLibCompatible="true"
+				SuppressStartupBanner="true"
+				TargetEnvironment="1"
+				TypeLibraryName=".\..\..\..\..\..\bin\Debug/WelsEncPlus.tlb"
+				HeaderFileName=""
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api"
+				PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				PrecompiledHeaderFile=".\..\..\..\obj\encoder\plus\Debug/WelsEncPlus.pch"
+				AssemblerListingLocation=".\..\..\..\obj\encoder\plus\Debug/"
+				ObjectFile=".\..\..\..\obj\encoder\plus\Debug/"
+				ProgramDataBaseFileName=".\..\..\..\obj\encoder\plus\Debug/"
+				WarningLevel="3"
+				SuppressStartupBanner="true"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+				PreprocessorDefinitions="_DEBUG"
+				Culture="1033"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="$(OutDir)\welsecore.lib"
+				OutputFile="$(OutDir)\welsenc.dll"
+				LinkIncremental="2"
+				SuppressStartupBanner="true"
+				AdditionalLibraryDirectories="..\..\..\..\libs"
+				ModuleDefinitionFile="..\..\..\encoder\plus\src\wels_enc_export.def"
+				GenerateDebugInformation="true"
+				ProgramDatabaseFile="$(OutDir)\welsenc.pdb"
+				GenerateMapFile="true"
+				MapFileName="$(OutDir)\welsenc.map"
+				RandomizedBaseAddress="1"
+				DataExecutionPrevention="2"
+				ImportLibrary="$(OutDir)\welsenc.lib"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+				SuppressStartupBanner="true"
+				OutputFile="$(OutDir)/WelsEncPlus.bsc"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory=".\..\..\..\..\bin\win32\Release"
+			IntermediateDirectory=".\..\..\..\obj\encoder\plus\Release"
+			ConfigurationType="2"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
+			UseOfMFC="0"
+			ATLMinimizesCRunTimeLibraryUsage="false"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				PreprocessorDefinitions="NDEBUG"
+				MkTypLibCompatible="true"
+				SuppressStartupBanner="true"
+				TargetEnvironment="1"
+				TypeLibraryName=".\..\..\..\..\..\bin\Release/WelsEncPlus.tlb"
+				HeaderFileName=""
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="3"
+				InlineFunctionExpansion="2"
+				FavorSizeOrSpeed="1"
+				EnableFiberSafeOptimizations="true"
+				WholeProgramOptimization="true"
+				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api"
+				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;"
+				StringPooling="true"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="true"
+				PrecompiledHeaderFile=".\..\..\..\obj\encoder\plus\Release/WelsEncPlus.pch"
+				AssemblerListingLocation=".\..\..\..\obj\encoder\plus\Release/"
+				ObjectFile=".\..\..\..\obj\encoder\plus\Release/"
+				ProgramDataBaseFileName=".\..\..\..\obj\encoder\plus\Release/"
+				WarningLevel="3"
+				SuppressStartupBanner="true"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+				PreprocessorDefinitions="NDEBUG"
+				Culture="1033"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalOptions="/MAPINFO:exports /LTCG"
+				AdditionalDependencies="$(OutDir)\welsecore.lib"
+				OutputFile="$(OutDir)\welsenc.dll"
+				LinkIncremental="1"
+				SuppressStartupBanner="true"
+				AdditionalLibraryDirectories="..\..\..\..\libs"
+				ModuleDefinitionFile="..\..\..\encoder\plus\src\wels_enc_export.def"
+				GenerateDebugInformation="true"
+				ProgramDatabaseFile="$(OutDir)\welsenc.pdb"
+				GenerateMapFile="true"
+				MapFileName="$(OutDir)\welsenc.map"
+				MapExports="true"
+				RandomizedBaseAddress="1"
+				DataExecutionPrevention="2"
+				ImportLibrary="$(OutDir)\welsenc.lib"
+				TargetMachine="1"
+			/> <!-- Map file is generated because /MAPINFO:exports only annotates a map produced by /MAP. -->
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+				SuppressStartupBanner="true"
+				OutputFile="$(OutDir)/WelsEncPlus.bsc"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+			>
+			<File
+				RelativePath="..\..\..\encoder\plus\src\DllEntry.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\plus\src\wels_enc_export.def"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\plus\src\welsCodecTrace.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\plus\src\welsEncoderExt.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl"
+			>
+			<File
+				RelativePath="..\..\..\encoder\plus\inc\welsCodecTrace.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\..\encoder\plus\inc\welsEncoderExt.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+			>
+			<File
+				RelativePath="..\..\..\encoder\plus\res\welsenc.rc"
+				> <!-- Resource include directory is project-relative (the folder holding welsenc.rc) so the build does not depend on one machine's drive layout. -->
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCResourceCompilerTool"
+						PreprocessorDefinitions=""
+						AdditionalIncludeDirectories="..\..\..\encoder\plus\res"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCResourceCompilerTool"
+						PreprocessorDefinitions=""
+						AdditionalIncludeDirectories="..\..\..\encoder\plus\res"
+					/>
+				</FileConfiguration>
+			</File>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- /dev/null
+++ b/codec/build/win32/enc/WelsEncPlus_2010.vcxproj
@@ -1,0 +1,194 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}</ProjectGuid>
+    <RootNamespace>WelsEncPlus</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.40219.1</_ProjectFileVersion>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\..\bin\win32\Debug</OutDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\obj\encoder\plus\Debug\</IntDir>
+    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\..\bin\win32\Release</OutDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\obj\encoder\plus\Release\</IntDir>
+    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">welsenc</TargetName>
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">welsenc</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Midl>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MkTypLibCompatible>true</MkTypLibCompatible>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <TargetEnvironment>Win32</TargetEnvironment>
+      <TypeLibraryName>.\..\..\..\..\..\bin\Debug/WelsEncPlus.tlb</TypeLibraryName>
+      <HeaderFileName>
+      </HeaderFileName>
+    </Midl>
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>true</MinimalRebuild>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\encoder\plus\Debug/WelsEncPlus.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\encoder\plus\Debug/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\encoder\plus\Debug/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\encoder\plus\Debug/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Link> <!-- welsecore.lib is taken from $(OutDir), where the DLL, PDB and import library of this project are also placed. -->
+      <AdditionalDependencies>$(OutDir)\welsecore.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <OutputFile>$(OutDir)\welsenc.dll</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>..\..\..\..\libs;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <ModuleDefinitionFile>..\..\..\encoder\plus\src\wels_enc_export.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <ProgramDatabaseFile>$(OutDir)\welsenc.pdb</ProgramDatabaseFile>
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <DataExecutionPrevention>true</DataExecutionPrevention>
+      <ImportLibrary>$(OutDir)\welsenc.lib</ImportLibrary>
+      <TargetMachine>MachineX86</TargetMachine>
+    </Link>
+    <Bscmake> <!-- Browse information is written next to the other build outputs instead of a hard-coded bin\Debug path. -->
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\WelsEncPlus.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Midl>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MkTypLibCompatible>true</MkTypLibCompatible>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <TargetEnvironment>Win32</TargetEnvironment>
+      <TypeLibraryName>.\..\..\..\..\..\bin\Release/WelsEncPlus.tlb</TypeLibraryName>
+      <HeaderFileName>
+      </HeaderFileName>
+    </Midl>
+    <ClCompile>
+      <Optimization>Full</Optimization>
+      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <EnableFiberSafeOptimizations>true</EnableFiberSafeOptimizations>
+      <WholeProgramOptimization>true</WholeProgramOptimization>
+      <AdditionalIncludeDirectories>..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\encoder\plus\Release/WelsEncPlus.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\encoder\plus\Release/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\encoder\plus\Release/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\encoder\plus\Release/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Link> <!-- welsecore.lib is taken from $(OutDir), where the DLL, PDB, map and import library of this project are also placed. -->
+      <AdditionalOptions>/MAPINFO:exports /LTCG %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalDependencies>$(OutDir)\welsecore.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <OutputFile>$(OutDir)\welsenc.dll</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>..\..\..\..\libs;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <ModuleDefinitionFile>..\..\..\encoder\plus\src\wels_enc_export.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <ProgramDatabaseFile>$(OutDir)\welsenc.pdb</ProgramDatabaseFile>
+      <GenerateMapFile>true</GenerateMapFile>
+      <MapFileName>$(OutDir)\welsenc.map</MapFileName>
+      <MapExports>true</MapExports>
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <DataExecutionPrevention>true</DataExecutionPrevention>
+      <ImportLibrary>$(OutDir)\welsenc.lib</ImportLibrary>
+      <TargetMachine>MachineX86</TargetMachine>
+    </Link>
+    <Bscmake> <!-- Browse information is written next to the other build outputs instead of a hard-coded bin\Release path. -->
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\WelsEncPlus.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\..\encoder\plus\src\DllEntry.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\encoder\plus\src\welsCodecTrace.cpp" />
+    <ClCompile Include="..\..\..\encoder\plus\src\welsEncoderExt.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\..\encoder\plus\src\wels_enc_export.def" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\..\common\inc\mem_align.h" />
+    <ClInclude Include="..\..\..\encoder\plus\inc\welsCodecTrace.h" />
+    <ClInclude Include="..\..\..\encoder\plus\inc\welsEncoderExt.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <ResourceCompile Include="..\..\..\encoder\plus\res\welsenc.rc"> <!-- Resource include directory is project-relative (the folder holding welsenc.rc) so the build does not depend on one machine's drive layout. -->
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..\..\..\encoder\plus\res;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\..\encoder\plus\res;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ResourceCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="WelsEncCore_2010.vcxproj"> <!-- NOTE(review): file name assumes the core project follows the _2010 suffix used by the VS2010 projects in this directory; confirm it exists. -->
+      <Project>{59208004-1774-4816-ac24-31ff44c324b4}</Project>
+      <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
+    </ProjectReference>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/build/win32/enc/WelsEncPlus_2012.vcxproj
@@ -1,0 +1,181 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}</ProjectGuid>
+    <RootNamespace>WelsEncPlus</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <_ProjectFileVersion>11.0.61030.0</_ProjectFileVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <OutDir>.\..\..\..\..\bin\win32\Debug</OutDir>
+    <IntDir>.\..\..\..\obj\encoder\plus\Debug\</IntDir>
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>welsenc</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <OutDir>.\..\..\..\..\bin\win32\Release</OutDir>
+    <IntDir>.\..\..\..\obj\encoder\plus\Release\</IntDir>
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>welsenc</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Midl>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MkTypLibCompatible>true</MkTypLibCompatible>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <TargetEnvironment>Win32</TargetEnvironment>
+      <TypeLibraryName>.\..\..\..\..\..\bin\Debug/WelsEncPlus.tlb</TypeLibraryName>
+      <HeaderFileName />
+    </Midl>
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>true</MinimalRebuild>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\encoder\plus\Debug/WelsEncPlus.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\encoder\plus\Debug/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\encoder\plus\Debug/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\encoder\plus\Debug/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Link>
+      <AdditionalDependencies>$(OutDir)\welsecore.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <OutputFile>$(OutDir)\welsenc.dll</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>..\..\..\..\libs;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <ModuleDefinitionFile>..\..\..\encoder\plus\src\wels_enc_export.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <ProgramDatabaseFile>$(OutDir)\welsenc.pdb</ProgramDatabaseFile>
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <DataExecutionPrevention>true</DataExecutionPrevention>
+      <ImportLibrary>$(OutDir)\welsenc.lib</ImportLibrary>
+      <TargetMachine>MachineX86</TargetMachine>
+      <ProfileGuidedDatabase>$(OutDir)\welsenc.pgd</ProfileGuidedDatabase>
+    </Link>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\welsenc.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Midl>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MkTypLibCompatible>true</MkTypLibCompatible>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <TargetEnvironment>Win32</TargetEnvironment>
+      <TypeLibraryName>.\..\..\..\..\..\bin\Release/WelsEncPlus.tlb</TypeLibraryName>
+      <HeaderFileName />
+    </Midl>
+    <ClCompile>
+      <Optimization>Full</Optimization>
+      <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <EnableFiberSafeOptimizations>true</EnableFiberSafeOptimizations>
+      <WholeProgramOptimization>true</WholeProgramOptimization>
+      <AdditionalIncludeDirectories>..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\encoder\plus\Release/WelsEncPlus.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\encoder\plus\Release/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\encoder\plus\Release/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\encoder\plus\Release/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Link>
+      <AdditionalOptions>/MAPINFO:exports /LTCG %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalDependencies>$(OutDir)\welsecore.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <OutputFile>$(OutDir)\welsenc.dll</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>..\..\..\..\libs;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <ModuleDefinitionFile>..\..\..\encoder\plus\src\wels_enc_export.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <ProgramDatabaseFile>$(OutDir)\welsenc.pdb</ProgramDatabaseFile>
+      <GenerateMapFile>true</GenerateMapFile>
+      <MapFileName>$(OutDir)\welsenc.map</MapFileName>
+      <MapExports>true</MapExports>
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <DataExecutionPrevention>true</DataExecutionPrevention>
+      <ImportLibrary>$(OutDir)\welsenc.lib</ImportLibrary>
+      <TargetMachine>MachineX86</TargetMachine>
+      <ProfileGuidedDatabase>$(OutDir)\welsenc.pgd</ProfileGuidedDatabase>
+    </Link>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\welsenc.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\..\encoder\plus\src\DllEntry.cpp" />
+    <ClCompile Include="..\..\..\encoder\plus\src\welsCodecTrace.cpp" />
+    <ClCompile Include="..\..\..\encoder\plus\src\welsEncoderExt.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\..\encoder\plus\src\wels_enc_export.def" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\..\common\inc\mem_align.h" />
+    <ClInclude Include="..\..\..\encoder\plus\inc\welsEncoderExt.h" />
+    <ClInclude Include="..\..\..\encoder\plus\inc\welsCodecTrace.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <ResourceCompile Include="..\..\..\encoder\plus\res\welsenc.rc"> <!-- Resource include directory is project-relative (the folder holding welsenc.rc) so the build does not depend on one machine's drive layout. -->
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..\..\..\encoder\plus\res;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\..\encoder\plus\res;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+    </ResourceCompile>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/build/win32/enc/WelsEncoder.dsw
@@ -1,0 +1,59 @@
+Microsoft Developer Studio Workspace File, Format Version 6.00
+# WARNING: DO NOT EDIT OR DELETE THIS WORKSPACE FILE!
+
+###############################################################################
+
+Project: "WelsEncCore"=.\WelsEncCore.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+}}}
+
+###############################################################################
+
+Project: "WelsEncPlus"=.\WelsEncPlus.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+    Begin Project Dependency
+    Project_Dep_Name WelsEncCore
+    End Project Dependency
+}}}
+
+###############################################################################
+
+Project: "encConsole"=.\encConsole.dsp - Package Owner=<4>
+
+Package=<5>
+{{{
+}}}
+
+Package=<4>
+{{{
+    Begin Project Dependency
+    Project_Dep_Name WelsEncPlus
+    End Project Dependency
+}}}
+
+###############################################################################
+
+Global:
+
+Package=<5>
+{{{
+}}}
+
+Package=<3>
+{{{
+}}}
+
+###############################################################################
+
--- /dev/null
+++ b/codec/build/win32/enc/WelsEncoder_2008.sln
@@ -1,0 +1,48 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsEncCore", "WelsEncCore.vcproj", "{59208004-1774-4816-AC24-31FF44C324B4}"
+	ProjectSection(ProjectDependencies) = postProject
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562} = {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsEncPlus", "WelsEncPlus.vcproj", "{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}"
+	ProjectSection(ProjectDependencies) = postProject
+		{59208004-1774-4816-AC24-31FF44C324B4} = {59208004-1774-4816-AC24-31FF44C324B4}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "encConsole", "encConsole.vcproj", "{8509E2A8-2CBD-49E2-B564-3EFF1E927459}"
+	ProjectSection(ProjectDependencies) = postProject
+		{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F} = {1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562} = {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP", "..\..\..\..\processing\build\win32\WelsVP_2008.vcproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{59208004-1774-4816-AC24-31FF44C324B4}.Debug|Win32.ActiveCfg = Debug|Win32
+		{59208004-1774-4816-AC24-31FF44C324B4}.Debug|Win32.Build.0 = Debug|Win32
+		{59208004-1774-4816-AC24-31FF44C324B4}.Release|Win32.ActiveCfg = Release|Win32
+		{59208004-1774-4816-AC24-31FF44C324B4}.Release|Win32.Build.0 = Release|Win32
+		{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}.Debug|Win32.ActiveCfg = Debug|Win32
+		{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}.Debug|Win32.Build.0 = Debug|Win32
+		{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}.Release|Win32.ActiveCfg = Release|Win32
+		{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}.Release|Win32.Build.0 = Release|Win32
+		{8509E2A8-2CBD-49E2-B564-3EFF1E927459}.Debug|Win32.ActiveCfg = Debug|Win32
+		{8509E2A8-2CBD-49E2-B564-3EFF1E927459}.Debug|Win32.Build.0 = Debug|Win32
+		{8509E2A8-2CBD-49E2-B564-3EFF1E927459}.Release|Win32.ActiveCfg = Release|Win32
+		{8509E2A8-2CBD-49E2-B564-3EFF1E927459}.Release|Win32.Build.0 = Release|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- /dev/null
+++ b/codec/build/win32/enc/WelsEncoder_2010.sln
@@ -1,0 +1,41 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsEncCore_2010", "WelsEncCore_2010.vcxproj", "{59208004-1774-4816-AC24-31FF44C324B4}"
+	ProjectSection(ProjectDependencies) = postProject
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562} = {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsEncPlus_2010", "WelsEncPlus_2010.vcxproj", "{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "encConsole_2010", "encConsole_2010.vcxproj", "{8509E2A8-2CBD-49E2-B564-3EFF1E927459}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP_2010", "..\..\..\..\processing\build\win32\WelsVP_2010.vcxproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{59208004-1774-4816-AC24-31FF44C324B4}.Debug|Win32.ActiveCfg = Debug|Win32
+		{59208004-1774-4816-AC24-31FF44C324B4}.Debug|Win32.Build.0 = Debug|Win32
+		{59208004-1774-4816-AC24-31FF44C324B4}.Release|Win32.ActiveCfg = Release|Win32
+		{59208004-1774-4816-AC24-31FF44C324B4}.Release|Win32.Build.0 = Release|Win32
+		{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}.Debug|Win32.ActiveCfg = Debug|Win32
+		{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}.Debug|Win32.Build.0 = Debug|Win32
+		{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}.Release|Win32.ActiveCfg = Release|Win32
+		{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}.Release|Win32.Build.0 = Release|Win32
+		{8509E2A8-2CBD-49E2-B564-3EFF1E927459}.Debug|Win32.ActiveCfg = Debug|Win32
+		{8509E2A8-2CBD-49E2-B564-3EFF1E927459}.Debug|Win32.Build.0 = Debug|Win32
+		{8509E2A8-2CBD-49E2-B564-3EFF1E927459}.Release|Win32.ActiveCfg = Release|Win32
+		{8509E2A8-2CBD-49E2-B564-3EFF1E927459}.Release|Win32.Build.0 = Release|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- /dev/null
+++ b/codec/build/win32/enc/WelsEncoder_2012.sln
@@ -1,0 +1,44 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsEncPlus_2012", "WelsEncPlus_2012.vcxproj", "{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}"
+	ProjectSection(ProjectDependencies) = postProject
+		{59208004-1774-4816-AC24-31FF44C324B4} = {59208004-1774-4816-AC24-31FF44C324B4}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsEncCore_2012", "WelsEncCore_2012.vcxproj", "{59208004-1774-4816-AC24-31FF44C324B4}"
+	ProjectSection(ProjectDependencies) = postProject
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562} = {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP_2012", "..\..\..\..\processing\build\win32\WelsVP_2012.vcxproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "encConsole_2012", "encConsole_2012.vcxproj", "{8509E2A8-2CBD-49E2-B564-3EFF1E927459}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}.Debug|Win32.ActiveCfg = Debug|Win32
+		{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}.Debug|Win32.Build.0 = Debug|Win32
+		{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}.Release|Win32.ActiveCfg = Release|Win32
+		{1E7B4E9A-986E-4167-8C70-6E4F60EAEE7F}.Release|Win32.Build.0 = Release|Win32
+		{59208004-1774-4816-AC24-31FF44C324B4}.Debug|Win32.ActiveCfg = Debug|Win32
+		{59208004-1774-4816-AC24-31FF44C324B4}.Debug|Win32.Build.0 = Debug|Win32
+		{59208004-1774-4816-AC24-31FF44C324B4}.Release|Win32.ActiveCfg = Release|Win32
+		{59208004-1774-4816-AC24-31FF44C324B4}.Release|Win32.Build.0 = Release|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
+		{8509E2A8-2CBD-49E2-B564-3EFF1E927459}.Debug|Win32.ActiveCfg = Debug|Win32
+		{8509E2A8-2CBD-49E2-B564-3EFF1E927459}.Debug|Win32.Build.0 = Debug|Win32
+		{8509E2A8-2CBD-49E2-B564-3EFF1E927459}.Release|Win32.ActiveCfg = Release|Win32
+		{8509E2A8-2CBD-49E2-B564-3EFF1E927459}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- /dev/null
+++ b/codec/build/win32/enc/encConsole.dsp
@@ -1,0 +1,127 @@
+# Microsoft Developer Studio Project File - Name="encConsole" - Package Owner=<4>
+# Microsoft Developer Studio Generated Build File, Format Version 6.00
+# ** DO NOT EDIT **
+
+# TARGTYPE "Win32 (x86) Console Application" 0x0103
+
+CFG=encConsole - Win32 Debug
+!MESSAGE This is not a valid makefile. To build this project using NMAKE,
+!MESSAGE use the Export Makefile command and run
+!MESSAGE 
+!MESSAGE NMAKE /f "encConsole.mak".
+!MESSAGE 
+!MESSAGE You can specify a configuration when running NMAKE
+!MESSAGE by defining the macro CFG on the command line. For example:
+!MESSAGE 
+!MESSAGE NMAKE /f "encConsole.mak" CFG="encConsole - Win32 Debug"
+!MESSAGE 
+!MESSAGE Possible choices for configuration are:
+!MESSAGE 
+!MESSAGE "encConsole - Win32 Release" (based on "Win32 (x86) Console Application")
+!MESSAGE "encConsole - Win32 Debug" (based on "Win32 (x86) Console Application")
+!MESSAGE 
+
+# Begin Project
+# PROP AllowPerConfigDependencies 0
+# PROP Scc_ProjName ""
+# PROP Scc_LocalPath ""
+CPP=cl.exe
+RSC=rc.exe
+
+!IF  "$(CFG)" == "encConsole - Win32 Release"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 0
+# PROP BASE Output_Dir "Release"
+# PROP BASE Intermediate_Dir "Release"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 0
+# PROP Output_Dir "..\..\..\..\..\bin\Release"
+# PROP Intermediate_Dir "..\..\..\obj\encConsole\Release"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
+# ADD CPP /nologo /MD /W3 /GX /O2 /I "..\..\..\console\enc\inc" /I "..\..\..\api\svc" /I "..\..\..\encoder\core\inc" /I "..\..\..\common\inc" /I "..\..\..\WelsThreadLib\api" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /D "ENCODER_CORE" /YX /FD /c
+# ADD BASE RSC /l 0x804 /d "NDEBUG"
+# ADD RSC /l 0x409 /d "NDEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /libpath:"..\..\..\bin"
+# SUBTRACT LINK32 /debug
+
+!ELSEIF  "$(CFG)" == "encConsole - Win32 Debug"
+
+# PROP BASE Use_MFC 0
+# PROP BASE Use_Debug_Libraries 1
+# PROP BASE Output_Dir "Debug"
+# PROP BASE Intermediate_Dir "Debug"
+# PROP BASE Target_Dir ""
+# PROP Use_MFC 0
+# PROP Use_Debug_Libraries 1
+# PROP Output_Dir "..\..\..\..\..\bin\Debug"
+# PROP Intermediate_Dir "..\..\..\obj\encConsole\Debug"
+# PROP Ignore_Export_Lib 0
+# PROP Target_Dir ""
+# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
+# ADD CPP /nologo /MDd /W3 /Gm /GX /ZI /Od /I "..\..\..\console\enc\inc" /I "..\..\..\api\svc" /I "..\..\..\encoder\core\inc" /I "..\..\..\common\inc" /I "..\..\..\WelsThreadLib\api" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /D "ENCODER_CORE" /YX /FD /GZ /c
+# ADD BASE RSC /l 0x804 /d "_DEBUG"
+# ADD RSC /l 0x409 /d "_DEBUG"
+BSC32=bscmake.exe
+# ADD BASE BSC32 /nologo
+# ADD BSC32 /nologo
+LINK32=link.exe
+# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
+# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /libpath:"..\..\..\bin"
+
+!ENDIF 
+
+# Begin Target
+
+# Name "encConsole - Win32 Release"
+# Name "encConsole - Win32 Debug"
+# Begin Group "Source Files"
+
+# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+# Begin Source File
+
+SOURCE=..\..\..\console\enc\src\read_config.cpp
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\console\enc\src\welsenc.cpp
+# End Source File
+# End Group
+# Begin Group "Header Files"
+
+# PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=..\..\..\console\enc\inc\read_config.h
+# End Source File
+# End Group
+# Begin Group "Resource Files"
+
+# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+# End Group
+# Begin Source File
+
+SOURCE=..\..\..\bin\layer0.cfg
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\bin\layer1.cfg
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\bin\layer2.cfg
+# End Source File
+# Begin Source File
+
+SOURCE=..\..\..\bin\welsenc.cfg
+# End Source File
+# End Target
+# End Project
--- /dev/null
+++ b/codec/build/win32/enc/encConsole.vcproj
@@ -1,0 +1,278 @@
+<?xml version="1.0" encoding="gb2312"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="encConsole"
+	ProjectGUID="{8509E2A8-2CBD-49E2-B564-3EFF1E927459}"
+	RootNamespace="encConsole"
+	TargetFrameworkVersion="0"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory=".\..\..\..\..\bin\win32\Debug"
+			IntermediateDirectory=".\..\..\..\obj\encConsole\Debug"
+			ConfigurationType="1"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
+			UseOfMFC="0"
+			ATLMinimizesCRunTimeLibraryUsage="false"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TypeLibraryName=".\..\..\..\..\..\bin\Debug/encConsole.tlb"
+				HeaderFileName=""
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common\inc"
+				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE;ENCODER_CORE;MT_ENABLED;"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				PrecompiledHeaderFile=".\..\..\..\obj\encConsole\Debug/encConsole.pch"
+				AssemblerListingLocation=".\..\..\..\obj\encConsole\Debug/"
+				ObjectFile=".\..\..\..\obj\encConsole\Debug/"
+				ProgramDataBaseFileName=".\..\..\..\obj\encConsole\Debug/"
+				WarningLevel="3"
+				SuppressStartupBanner="true"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+				PreprocessorDefinitions="_DEBUG"
+				Culture="1033"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="$(OutDir)\welsenc.lib"
+				OutputFile="$(OutDir)\encConsole.exe"
+				LinkIncremental="2"
+				SuppressStartupBanner="true"
+				AdditionalLibraryDirectories="..\..\..\bin"
+				GenerateDebugInformation="true"
+				ProgramDatabaseFile="$(OutDir)\encConsole.pdb"
+				GenerateMapFile="true"
+				MapFileName="$(OutDir)\encConsole.map"
+				SubSystem="1"
+				RandomizedBaseAddress="1"
+				DataExecutionPrevention="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+				SuppressStartupBanner="true"
+				OutputFile="$(OutDir)\encConsole.bsc"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory=".\..\..\..\..\bin\win32\Release"
+			IntermediateDirectory=".\..\..\..\obj\encConsole\Release"
+			ConfigurationType="1"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
+			UseOfMFC="0"
+			ATLMinimizesCRunTimeLibraryUsage="false"
+			CharacterSet="2"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TypeLibraryName=".\..\..\..\..\..\bin\Release/encConsole.tlb"
+				HeaderFileName=""
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				InlineFunctionExpansion="1"
+				AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common\inc"
+				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;ENCODER_CORE;X86_ASM;MT_ENABLED;"
+				StringPooling="true"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="true"
+				PrecompiledHeaderFile=".\..\..\..\obj\encConsole\Release/encConsole.pch"
+				AssemblerListingLocation=".\..\..\..\obj\encConsole\Release/"
+				ObjectFile=".\..\..\..\obj\encConsole\Release/"
+				ProgramDataBaseFileName=".\..\..\..\obj\encConsole\Release/"
+				WarningLevel="3"
+				SuppressStartupBanner="true"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+				PreprocessorDefinitions="NDEBUG"
+				Culture="1033"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalOptions="/LTCG"
+				AdditionalDependencies="$(OutDir)\welsenc.lib"
+				OutputFile="$(OutDir)\encConsole.exe"
+				LinkIncremental="1"
+				SuppressStartupBanner="true"
+				AdditionalLibraryDirectories="..\..\..\bin"
+				GenerateDebugInformation="true"
+				ProgramDatabaseFile="$(OutDir)\encConsole.pdb"
+				GenerateMapFile="false"
+				MapExports="false"
+				SubSystem="1"
+				RandomizedBaseAddress="1"
+				DataExecutionPrevention="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+				SuppressStartupBanner="true"
+				OutputFile="$(OutDir)\encConsole.bsc"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
+			>
+			<File
+				RelativePath="..\..\..\console\enc\src\read_config.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\..\console\enc\src\welsenc.cpp"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+						AdditionalIncludeDirectories=""
+						PreprocessorDefinitions=""
+					/>
+				</FileConfiguration>
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl"
+			>
+			<File
+				RelativePath="..\..\..\console\enc\inc\read_config.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
+			>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- /dev/null
+++ b/codec/build/win32/enc/encConsole_2010.vcxproj
@@ -1,0 +1,171 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{8509E2A8-2CBD-49E2-B564-3EFF1E927459}</ProjectGuid>
+    <RootNamespace>encConsole</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.40219.1</_ProjectFileVersion>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\..\bin\win32\Debug</OutDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\obj\encConsole\Debug\</IntDir>
+    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\..\bin\win32\Release</OutDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\obj\encConsole\Release\</IntDir>
+    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">encConsole</TargetName>
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">encConsole</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Midl>
+      <TypeLibraryName>.\..\..\..\..\..\bin\Debug/encConsole.tlb</TypeLibraryName>
+      <HeaderFileName>
+      </HeaderFileName>
+    </Midl>
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>..\..\..\console\enc\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;..\..\..\encoder\core\inc;..\..\..\common\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;ENCODER_CORE;X86_ASM;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>true</MinimalRebuild>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\encConsole\Debug/encConsole.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\encConsole\Debug/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\encConsole\Debug/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\encConsole\Debug/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Link>
+      <OutputFile>$(OutDir)\encConsole.exe</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>..\..\..\bin;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <ProgramDatabaseFile>$(OutDir)\encConsole.pdb</ProgramDatabaseFile>
+      <SubSystem>Console</SubSystem>
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <DataExecutionPrevention>true</DataExecutionPrevention>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;.\..\..\..\..\bin\win32\Debug\welsenc.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\encConsole.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Midl>
+      <TypeLibraryName>.\..\..\..\..\..\bin\Release/encConsole.tlb</TypeLibraryName>
+      <HeaderFileName>
+      </HeaderFileName>
+    </Midl>
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
+      <AdditionalIncludeDirectories>..\..\..\console\enc\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;..\..\..\encoder\core\inc;..\..\..\common\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;ENCODER_CORE;X86_ASM;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\encConsole\Release/encConsole.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\encConsole\Release/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\encConsole\Release/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\encConsole\Release/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Link>
+      <AdditionalOptions>/LTCG %(AdditionalOptions)</AdditionalOptions>
+      <OutputFile>$(OutDir)\encConsole.exe</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>..\..\..\bin;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <ProgramDatabaseFile>$(OutDir)\encConsole.pdb</ProgramDatabaseFile>
+      <GenerateMapFile>false</GenerateMapFile>
+      <MapExports>false</MapExports>
+      <SubSystem>Console</SubSystem>
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <DataExecutionPrevention>true</DataExecutionPrevention>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;.\..\..\..\..\bin\win32\Release\welsenc.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\encConsole.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\..\console\enc\src\read_config.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+    <ClCompile Include="..\..\..\console\enc\src\welsenc.cpp">
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ClCompile>
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\..\console\enc\inc\read_config.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="WelsEncPlus_2010.vcxproj">
+      <Project>{1e7b4e9a-986e-4167-8c70-6e4f60eaee7f}</Project>
+      <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
+    </ProjectReference>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/build/win32/enc/encConsole_2012.vcxproj
@@ -1,0 +1,163 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{8509E2A8-2CBD-49E2-B564-3EFF1E927459}</ProjectGuid>
+    <RootNamespace>encConsole</RootNamespace>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <UseOfMfc>false</UseOfMfc>
+    <CharacterSet>MultiByte</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+    <Import Project="$(VCTargetsPath)Microsoft.CPP.UpgradeFromVC60.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <_ProjectFileVersion>11.0.61030.0</_ProjectFileVersion>
+  </PropertyGroup>
+  <!-- Debug|Win32: output directory, intermediate directory and target name; incremental linking is enabled. -->
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <OutDir>.\..\..\..\..\bin\win32\Debug</OutDir>
+    <IntDir>.\..\..\..\obj\encConsole\Debug</IntDir>
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>encConsole</TargetName>
+  </PropertyGroup>
+  <!-- Release|Win32: output directory, intermediate directory and target name; incremental linking is disabled. -->
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <OutDir>.\..\..\..\..\bin\win32\Release</OutDir>
+    <IntDir>.\..\..\..\obj\encConsole\Release</IntDir>
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>encConsole</TargetName>
+  </PropertyGroup>
+  <!-- Debug|Win32 tool settings for the encConsole application. -->
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Midl>
+      <!-- NOTE(review): this path climbs five directory levels while OutDir climbs four; confirm whether that is intended. -->
+      <TypeLibraryName>.\..\..\..\..\..\bin\Debug/encConsole.tlb</TypeLibraryName>
+      <HeaderFileName />
+    </Midl>
+    <ClCompile>
+      <!-- Unoptimised build with fast run-time checks, the debug DLL CRT and Edit-and-Continue debug information. -->
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>..\..\..\console\enc\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;..\..\..\encoder\core\inc;..\..\..\common\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;ENCODER_CORE;X86_ASM;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>true</MinimalRebuild>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+      <!-- Precompiled header, listings, objects and the compiler PDB all go to the Debug intermediate directory. -->
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\encConsole\Debug/encConsole.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\encConsole\Debug/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\encConsole\Debug/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\encConsole\Debug/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Link>
+      <!-- Console executable for x86; welsvp.lib and welsenc.lib are picked up from $(OutDir). -->
+      <OutputFile>$(OutDir)\encConsole.exe</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>..\..\..\bin;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <ProgramDatabaseFile>$(OutDir)\encConsole.pdb</ProgramDatabaseFile>
+      <SubSystem>Console</SubSystem>
+      <!-- NOTE(review): RandomizedBaseAddress=false switches ASLR off for this binary; confirm that is intended. -->
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <DataExecutionPrevention>true</DataExecutionPrevention>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;$(OutDir)welsvp.lib;$(OutDir)welsenc.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <MapFileName>$(OutDir)\encConsole.map</MapFileName>
+      <ProfileGuidedDatabase>$(OutDir)\encConsole.pgd</ProfileGuidedDatabase>
+    </Link>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\encConsole.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <!-- Release|Win32 tool settings for the encConsole application. -->
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Midl>
+      <!-- NOTE(review): this path climbs five directory levels while OutDir climbs four; confirm whether that is intended. -->
+      <TypeLibraryName>.\..\..\..\..\..\bin\Release/encConsole.tlb</TypeLibraryName>
+      <HeaderFileName />
+    </Midl>
+    <ClCompile>
+      <!-- Speed-optimised build using the release DLL CRT; debug information is still emitted as a program database. -->
+      <Optimization>MaxSpeed</Optimization>
+      <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
+      <AdditionalIncludeDirectories>..\..\..\console\enc\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;..\..\..\encoder\core\inc;..\..\..\common\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;ENCODER_CORE;X86_ASM;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <StringPooling>true</StringPooling>
+      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <!-- Precompiled header, listings, objects and the compiler PDB all go to the Release intermediate directory. -->
+      <PrecompiledHeaderOutputFile>.\..\..\..\obj\encConsole\Release/encConsole.pch</PrecompiledHeaderOutputFile>
+      <AssemblerListingLocation>.\..\..\..\obj\encConsole\Release/</AssemblerListingLocation>
+      <ObjectFileName>.\..\..\..\obj\encConsole\Release/</ObjectFileName>
+      <ProgramDataBaseFileName>.\..\..\..\obj\encConsole\Release/</ProgramDataBaseFileName>
+      <WarningLevel>Level3</WarningLevel>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+    </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <Culture>0x0409</Culture>
+    </ResourceCompile>
+    <Link>
+      <!-- Console executable for x86, linked with /LTCG and a map file; welsvp.lib and welsenc.lib are picked up from $(OutDir). -->
+      <AdditionalOptions>/LTCG %(AdditionalOptions)</AdditionalOptions>
+      <OutputFile>$(OutDir)\encConsole.exe</OutputFile>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <AdditionalLibraryDirectories>..\..\..\bin;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <ProgramDatabaseFile>$(OutDir)\encConsole.pdb</ProgramDatabaseFile>
+      <GenerateMapFile>true</GenerateMapFile>
+      <MapExports>false</MapExports>
+      <SubSystem>Console</SubSystem>
+      <!-- NOTE(review): RandomizedBaseAddress=false switches ASLR off for the shipped binary; confirm that is intended. -->
+      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <DataExecutionPrevention>true</DataExecutionPrevention>
+      <TargetMachine>MachineX86</TargetMachine>
+      <AdditionalDependencies>kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;$(OutDir)welsvp.lib;$(OutDir)welsenc.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <MapFileName>$(OutDir)\encConsole.map</MapFileName>
+      <ProfileGuidedDatabase>$(OutDir)\encConsole.pgd</ProfileGuidedDatabase>
+    </Link>
+    <Bscmake>
+      <SuppressStartupBanner>true</SuppressStartupBanner>
+      <OutputFile>$(OutDir)\encConsole.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\..\console\enc\src\read_config.cpp" />
+    <ClCompile Include="..\..\..\console\enc\src\welsenc.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\..\..\console\enc\inc\read_config.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="WelsEncPlus.vcxproj">
+      <Project>{1e7b4e9a-986e-4167-8c70-6e4f60eaee7f}</Project>
+      <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
+    </ProjectReference>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/console/dec/inc/d3d9_utils.h
@@ -1,0 +1,143 @@
+/*!
+ * \copy
+ *     Copyright (c)  2010-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	d3d9_utils.h
+ *
+ * \brief	interface of d3d9 render module
+ *
+ * \date	Created 12/14/2010
+ *
+ * \description : 1. Rendering on Vista and later : D3D9Ex method, supports host memory / shared surface input
+ *                2. Rendering on XP : D3D9 method without device-lost handling, supports host memory input
+ *                3. File dump : supports host memory / shared surface input
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_D3D9_UTILS_H__
+#define WELS_D3D9_UTILS_H__
+
+//#pragma once	// do not use this because of cross-platform concerns, esp. for Solaris
+
+#include <stdio.h>
+#include "codec_def.h"
+
+#if defined(_MSC_VER) && (_MSC_VER>=1500) // vs2008 and upper
+#define ENABLE_DISPLAY_MODULE // enable/disable the render feature 
+#endif
+
+#ifdef ENABLE_DISPLAY_MODULE
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+#include <d3d9.h>
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Frame presenter / file dumper built on the plain Direct3D 9 interfaces
+// (IDirect3D9 / IDirect3DDevice9). Implemented in d3d9_utils.cpp.
+class CD3D9Utils
+{
+public:
+	CD3D9Utils();
+	~CD3D9Utils();
+
+public:
+	// Loads d3d9.dll and creates the IDirect3D9 object; when bWindowed is
+	// TRUE it additionally calls InitWindow() to obtain m_hWnd.
+	HRESULT Init(BOOL bWindowed);
+	// Releases the surface, the device and the D3D object, frees m_pDumpYUV
+	// and unloads d3d9.dll.
+	HRESULT Uninit(void);
+	// Handles one frame: renders it when pFile is NULL, otherwise dumps it
+	// to pFile. Performs initialisation lazily on first use.
+	HRESULT Process(void *pDst[3], SBufferInfo *Info, FILE *pFile = NULL);
+
+private:
+	// Creates the D3D device and the off-screen surface on first use.
+	HRESULT InitResource(void *pSharedHandle, SBufferInfo *pInfo);
+	// Copies a host-memory frame into the surface and presents it.
+	HRESULT Render(void *pDst[3], SBufferInfo *pInfo);
+	// Writes the three planes addressed by pDst to pFile.
+	HRESULT Dump(void *pDst[3], SBufferInfo *pInfo, FILE *pFile);
+                  
+private:
+	HMODULE               m_hDll;       // handle of the loaded d3d9.dll
+	HWND                  m_hWnd;       // window handle filled in by InitWindow()
+	unsigned char        *m_pDumpYUV;   // heap buffer released with free() in Uninit()
+	BOOL                  m_bInitDone;  // set by Process() once initialisation is done or not needed
+
+	LPDIRECT3D9           m_lpD3D9;        // D3D object created in Init()
+	LPDIRECT3DDEVICE9     m_lpD3D9Device;  // device created in InitResource()
+
+	D3DPRESENT_PARAMETERS m_d3dpp;                  // presentation parameters used to create the device
+	LPDIRECT3DSURFACE9    m_lpD3D9RawSurfaceShare;  // off-screen surface that receives the frame
+};
+
+// Frame presenter / file dumper built on the Direct3D 9Ex interfaces
+// (IDirect3D9Ex / IDirect3DDevice9Ex). Unlike CD3D9Utils it also accepts
+// BUFFER_DEVICE (shared surface) frames. Implemented in d3d9_utils.cpp.
+class CD3D9ExUtils
+{
+public:
+	CD3D9ExUtils();
+	~CD3D9ExUtils();
+
+public:
+	// Loads d3d9.dll and creates the IDirect3D9Ex object; when bWindowed is
+	// TRUE it additionally calls InitWindow() to obtain m_hWnd.
+	HRESULT Init(BOOL bWindowed);
+	// Releases the surface, the device and the D3D object, frees m_pDumpYUV
+	// and unloads d3d9.dll.
+	HRESULT Uninit(void);
+	// Handles one frame: renders it when fp is NULL, otherwise dumps it to
+	// fp. Performs initialisation lazily on first use.
+	HRESULT Process(void *dst[3], SBufferInfo *Info, FILE *fp = NULL);
+
+private:
+	// Creates the D3D device and the off-screen surface on first use;
+	// pSharedHandle is forwarded to CreateOffscreenPlainSurface().
+	HRESULT InitResource(void *pSharedHandle, SBufferInfo *Info);
+	// Fills (host input) or opens (device input) the surface and presents it.
+	HRESULT Render(void *pDst[3], SBufferInfo *Info);
+	// Writes the frame to fp; device surfaces are first copied into m_pDumpYUV.
+	HRESULT Dump(void *pDst[3], SBufferInfo *Info, FILE *fp);
+
+private:
+	HMODULE               m_hDll;       // handle of the loaded d3d9.dll
+	HWND                  m_hWnd;       // window handle filled in by InitWindow()
+	unsigned char        *m_pDumpYUV;   // staging buffer used by Dump() for device surfaces
+	BOOL                  m_bInitDone;  // set by Process() once initialisation is done or not needed
+
+	LPDIRECT3D9EX         m_lpD3D9;        // D3D object created in Init()
+	LPDIRECT3DDEVICE9EX   m_lpD3D9Device;  // device created in InitResource()
+
+	D3DPRESENT_PARAMETERS m_d3dpp;                  // presentation parameters used to create the device
+	LPDIRECT3DSURFACE9    m_lpD3D9RawSurfaceShare;  // off-screen surface that holds the frame
+};
+#endif
+
+// Operating-system classes distinguished by this module.
+// Declared as a plain anonymous enum: a `typedef` without a typedef name
+// declares nothing and only triggers a compiler warning.
+enum
+{
+  OS_UNSUPPORTED = 0,
+  OS_XP,
+  OS_VISTA_UPPER
+};
+
+// Platform-neutral front end declared for the decoder console; unlike the
+// D3D9 classes it is available even when ENABLE_DISPLAY_MODULE is not defined.
+// NOTE(review): the implementation is not visible here; the member notes
+// below are inferred from names - confirm against the definitions.
+class CUtils
+{
+public:
+	CUtils();
+	~CUtils();
+
+	// Handles one decoded frame (plane pointers in dst, description in Info);
+	// fp is the optional output file.
+	int Process(void *dst[3], SBufferInfo *Info, FILE *fp);
+
+private:
+	// presumably returns one of the OS_* constants - TODO confirm
+	int CheckOS(void);
+
+private:
+	int iOSType;    // presumably the value obtained from CheckOS()
+	void *hHandle;  // opaque handle; presumably the OS-specific renderer object
+};
+
+#endif//WELS_D3D9_UTILS_H__
+
--- /dev/null
+++ b/codec/console/dec/inc/dec_console.h
@@ -1,0 +1,53 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  dec_console.h
+ *  h264decConsole
+ *
+ *  Created on 11-3-15.
+ *
+ */
+#pragma once
+
+#include "code_api.h"
+
+/////////////////////////////////////////////////////////////////////////////////////
+
+
+
+/////////////////////////////////////////////////////////////////////////////////////
+
+// Helpers for binding to the decoder bundle at run time.
+// NOTE(review): the implementations are not part of this header; the
+// descriptions are inferred from the names - confirm against the definitions.
+
+// presumably loads the decoder bundle; returns true on success
+bool load_bundle_welsdec();
+// presumably unloads what load_bundle_welsdec() loaded
+void free_bundle_welsdec();
+// presumably resolves the bundle's decoder-destroy entry point and applies it to pDecoder
+bool get_functions_address_free_decoder(ISVCDecoder* pDecoder);
+// presumably resolves the bundle's decoder-create entry point and stores the new decoder in *ppDecoder
+bool get_functions_address_create_decoder(ISVCDecoder** ppDecoder);
+
+
+
--- /dev/null
+++ b/codec/console/dec/inc/read_config.h
@@ -1,0 +1,66 @@
+/*!
+ * \copy
+ *     Copyright (c)  2008-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  read_config.h
+ *
+ *  Abstract
+ *      Class for reading parameter settings in a configure file.
+ *
+ *  History
+ *      08/18/2008 Created
+ *
+ *****************************************************************************/
+#ifndef READ_CONFIG_H__
+#define READ_CONFIG_H__
+
+#include <stdlib.h>
+#include <string>
+using namespace std;
+
+// Reader for parameter settings stored in a configuration file.
+// NOTE(review): FILE is used below, but this header includes only <stdlib.h>
+// and <string>; it relies on <stdio.h> being pulled in by someone else.
+// NOTE(review): method descriptions are inferred from the signatures; the
+// definitions are not visible here - confirm against the .cpp file.
+class CReadConfig
+{
+public:
+	// kpConfigFileName: path of the configuration file to read.
+	CReadConfig( const char *kpConfigFileName );
+	virtual ~CReadConfig();
+	
+	// Reads the next line into the string array val; kiValSize is presumably
+	// the number of elements available in val (default 4).
+	long ReadLine( string* val, const int kiValSize = 4 );
+	// presumably true once the end of the file has been reached
+	const bool EndOfFile();
+	// presumably the number of lines read so far (see m_ulLines)
+	const int GetLines();
+	// presumably true when the configuration file could be opened
+	const bool ExistFile();
+	// presumably returns m_strCfgFileName
+	const string& GetFileName();
+	
+private:
+	FILE			*m_pCfgFile;        // stream of the configuration file
+	string			m_strCfgFileName;   // name of the configuration file
+	unsigned long	m_ulLines;          // line counter
+};
+
+#endif	// READ_CONFIG_H__
+
--- /dev/null
+++ b/codec/console/dec/src/d3d9_utils.cpp
@@ -1,0 +1,778 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "d3d9_utils.h"
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Defined outside the visible part of this file. The Dump() methods call it
+// with the output file, three plane pointers, two strides and the frame size.
+void Write2File(FILE *pFp, unsigned char* pData[3], int iStride[2], int iWidth, int iHeight);
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#ifdef ENABLE_DISPLAY_MODULE
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Numeric identifiers; NOTE(review): presumably menu / icon / class resource
+// IDs used by the window code further down this file - confirm.
+#define IDM_ABOUT						104
+#define IDM_EXIT						105
+#define IDI_TESTSHARESURFACE	        107
+#define IDI_SMALL						108
+#define IDC_TESTSHARESURFACE	        109
+
+// FOURCC 'NV12', cast to D3DFORMAT when the off-screen surface is created for host-memory input.
+#define NV12_FORMAT  MAKEFOURCC('N','V','1','2')
+
+// Size, format and memory pool of a D3D surface.
+// NOTE(review): no use of this struct is visible in this part of the file.
+typedef struct
+{
+	UINT      uiWidth;    // surface width
+	UINT      uiHeight;   // surface height
+	D3DFORMAT D3Dformat;  // surface pixel format
+	D3DPOOL   D3DPool;    // memory pool of the surface
+} SHandleInfo;
+
+#define SAFE_RELEASE(p) if(p) { (p)->Release(); (p) = NULL; }
+#define SAFE_FREE(p)    if(p) { free (p); (p) = NULL; }
+
+// Copies the content of a D3D surface (pSurface) into the three planes addressed by pDst.
+HRESULT Dump2YUV(void *pDst[3], void *pSurface, int iWidth, int iHeight, int iStride[2]);
+// Copies the frame addressed by pDst into a D3D surface (pSurface).
+HRESULT Dump2Surface(void *pDst[3], void *pSurface, int iWidth, int iHeight, int iStride[2]);
+// NOTE(review): defined outside the visible part of this file; presumably
+// creates the render window and returns its handle through hWnd - confirm.
+HRESULT InitWindow(HWND *hWnd);
+// Window procedure; NOTE(review): definition not visible here.
+LRESULT CALLBACK WndProc(HWND, UINT, WPARAM, LPARAM);
+
+// Signatures of the d3d9.dll entry points that the Init() methods resolve with GetProcAddress().
+typedef HRESULT (WINAPI *pFnCreateD3D9Ex) (UINT SDKVersion, IDirect3D9Ex** );
+typedef LPDIRECT3D9 (WINAPI *pFnCreateD3D9)(UINT SDKVersion);
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Starts with every handle and interface pointer cleared and the
+// presentation parameters zeroed; nothing is loaded until Init() runs.
+CD3D9Utils::CD3D9Utils()
+	: m_hDll(NULL)
+	, m_hWnd(NULL)
+	, m_pDumpYUV(NULL)
+	, m_bInitDone(FALSE)
+	, m_lpD3D9(NULL)
+	, m_lpD3D9Device(NULL)
+	, m_lpD3D9RawSurfaceShare(NULL)
+{
+	// keeps static analysers from flagging m_d3dpp as uninitialised
+	ZeroMemory(&m_d3dpp, sizeof(m_d3dpp));
+}
+
+// Releases all D3D resources and unloads d3d9.dll via Uninit().
+CD3D9Utils::~CD3D9Utils()
+{
+	Uninit();
+}
+
+// Loads d3d9.dll, creates the IDirect3D9 object and, for windowed use,
+// obtains the render window through InitWindow().
+//
+// bWindowed: TRUE when frames will be rendered to a window.
+// Returns S_OK when already initialised or on success, E_FAIL when the DLL,
+// its Direct3DCreate9 export or the D3D object is unavailable, otherwise the
+// result of InitWindow().
+//
+// Safe to call again after a failure: the DLL and the D3D object obtained by
+// an earlier attempt are reused instead of being loaded a second time.
+HRESULT CD3D9Utils::Init(BOOL bWindowed)
+{
+	if (m_bInitDone)
+		return S_OK;
+
+	if (m_hDll == NULL)
+		m_hDll = LoadLibrary(TEXT("d3d9.dll"));
+	if (m_hDll == NULL)
+		return E_FAIL;
+
+	if (m_lpD3D9 == NULL)
+	{
+		// GetProcAddress() only accepts an ANSI name, so no TEXT() here.
+		pFnCreateD3D9 pCreateD3D9 = (pFnCreateD3D9) GetProcAddress(m_hDll, "Direct3DCreate9");
+		if (pCreateD3D9 == NULL)
+			return E_FAIL;
+
+		m_lpD3D9 = pCreateD3D9(D3D_SDK_VERSION);
+		if (m_lpD3D9 == NULL)
+			return E_FAIL;
+	}
+
+	return bWindowed ? InitWindow(&m_hWnd) : S_OK;
+}
+
+// Releases the surface, the device and the D3D object, frees the dump
+// buffer and unloads d3d9.dll. Always returns S_OK.
+HRESULT CD3D9Utils::Uninit()
+{
+	SAFE_RELEASE(m_lpD3D9RawSurfaceShare);
+	SAFE_RELEASE(m_lpD3D9Device);
+	SAFE_RELEASE(m_lpD3D9);
+	SAFE_FREE(m_pDumpYUV);
+
+	if(m_hDll)
+	{
+		FreeLibrary(m_hDll);
+		m_hDll = NULL;
+	}
+
+	// Everything Init() set up is gone; clear the flag so that a later
+	// Process() call re-initialises instead of using released pointers.
+	m_bInitDone = FALSE;
+
+	return S_OK;
+}
+
+// Per-frame entry point: without an output file the frame is rendered to
+// the window, otherwise it is dumped to pFp. Initialisation happens lazily
+// on the first call.
+//
+// Returns E_FAIL for NULL pDst / pInfo, the Init() result when
+// initialisation fails, otherwise the result of Render() or Dump().
+HRESULT CD3D9Utils::Process(void *pDst[3], SBufferInfo *pInfo, FILE *pFp)
+{
+	HRESULT hRet = E_FAIL;
+
+	if (pDst == NULL || pInfo == NULL)
+		return hRet;
+
+	const BOOL bShowInWindow = (pFp == NULL) ? TRUE : FALSE;
+	// Direct3D can be skipped only when a host-memory frame goes to a file.
+	const BOOL bD3DRequired = (bShowInWindow || pInfo->eBufferProperty != BUFFER_HOST) ? TRUE : FALSE;
+
+	if (!m_bInitDone)
+	{
+		if (!bD3DRequired)
+		{
+			m_bInitDone = TRUE;
+		}
+		else
+		{
+			hRet = Init(bShowInWindow);
+			if (SUCCEEDED(hRet))
+				m_bInitDone = TRUE;
+		}
+	}
+
+	if (!m_bInitDone)
+		return hRet;
+
+	if (bShowInWindow)
+	{
+		hRet = Render(pDst, pInfo);
+		Sleep(30);
+	}
+	else
+	{
+		// bShowInWindow is FALSE exactly when pFp is non-NULL
+		hRet = Dump(pDst, pInfo, pFp);
+		Sleep(0);
+	}
+
+	return hRet;
+}
+
+// Uploads a host-memory frame into the off-screen surface, stretches it
+// onto the back buffer and presents it.
+//
+// pDst:  plane pointers of the frame.
+// pInfo: frame description; only BUFFER_HOST frames are supported here.
+// Returns E_FAIL for other buffer kinds, otherwise the first failing
+// HRESULT or the result of Present().
+HRESULT CD3D9Utils::Render(void *pDst[3], SBufferInfo *pInfo)
+{
+	HRESULT hResult = E_FAIL;
+
+	if (pInfo->eBufferProperty == BUFFER_HOST)
+	{
+		hResult = InitResource(NULL, pInfo);
+		if (SUCCEEDED(hResult))
+			hResult = Dump2Surface(pDst, m_lpD3D9RawSurfaceShare, pInfo->UsrData.sSystemBuffer.iWidth, pInfo->UsrData.sSystemBuffer.iHeight, pInfo->UsrData.sSystemBuffer.iStride);
+	}
+
+	if (FAILED(hResult))
+		return hResult;
+
+	IDirect3DSurface9 *pBackBuffer = NULL;
+	hResult = m_lpD3D9Device->GetBackBuffer(0, 0, D3DBACKBUFFER_TYPE_MONO, &pBackBuffer);
+	if (FAILED(hResult) || pBackBuffer == NULL)
+		return FAILED(hResult) ? hResult : E_FAIL;
+
+	hResult = m_lpD3D9Device->StretchRect(m_lpD3D9RawSurfaceShare, NULL, pBackBuffer, NULL, D3DTEXF_NONE);
+	// GetBackBuffer() added a reference; drop it every frame or it leaks.
+	pBackBuffer->Release();
+	pBackBuffer = NULL;
+
+	if (SUCCEEDED(hResult))
+		hResult = m_lpD3D9Device->Present(0, 0, NULL, NULL);
+
+	return hResult;
+}
+
+// Writes a host-memory frame to pFp.
+//
+// pDst:  the three plane pointers; all must be non-NULL.
+// pInfo: frame description; size and strides are read from sSystemBuffer.
+// pFp:   output file.
+// Returns S_OK once the frame has been handed to Write2File(), E_FAIL when
+// the file or a plane pointer is missing. Write2File() returns void, so I/O
+// errors cannot be reported from here.
+HRESULT CD3D9Utils::Dump(void *pDst[3], SBufferInfo *pInfo, FILE *pFp)
+{
+	HRESULT hResult = E_FAIL;
+	int iStride[2];
+	const int iWidth  = pInfo->UsrData.sSystemBuffer.iWidth;
+	const int iHeight = pInfo->UsrData.sSystemBuffer.iHeight;
+
+	iStride[0] = pInfo->UsrData.sSystemBuffer.iStride[0];
+	iStride[1] = pInfo->UsrData.sSystemBuffer.iStride[1];
+
+	if (pFp && pDst[0] && pDst[1] && pDst[2])
+	{
+		Write2File(pFp, (unsigned char **)pDst, iStride, iWidth, iHeight);
+		hResult = S_OK;
+	}
+
+	return hResult;
+}
+
+// Lazily creates the D3D device and the NV12 off-screen surface that
+// receives host-memory frames.
+//
+// pSharedHandle: unused by the plain D3D9 path (no shared surfaces).
+// pInfo:         frame description; must describe a BUFFER_HOST frame when
+//                the surface still has to be created.
+// Returns S_OK when both device and surface exist afterwards, otherwise
+// E_FAIL or the failing D3D HRESULT.
+HRESULT CD3D9Utils::InitResource(void *pSharedHandle, SBufferInfo *pInfo)
+{
+	HRESULT hResult = S_OK;
+
+	// m_lpD3D9 is NULL when Init() has not run or has failed
+	if (pInfo == NULL || m_lpD3D9 == NULL)
+		return E_FAIL;
+
+	if (m_lpD3D9Device == NULL)
+	{
+		// pick the adapter whose monitor shows the render window
+		HMONITOR hMonitorWnd = MonitorFromWindow(m_hWnd, MONITOR_DEFAULTTONULL);
+
+		UINT uiAdapter = D3DADAPTER_DEFAULT;
+		UINT uiCnt = m_lpD3D9->GetAdapterCount();
+		for(UINT i=0; i<uiCnt; i++)
+		{
+			HMONITOR hMonitor = m_lpD3D9->GetAdapterMonitor(i);
+			if(hMonitor == hMonitorWnd)
+			{
+				uiAdapter = i;
+				break;
+			}
+		}
+
+		D3DDISPLAYMODE D3DDisplayMode;
+		hResult = m_lpD3D9->GetAdapterDisplayMode(uiAdapter, &D3DDisplayMode);
+		if (FAILED(hResult))
+			return hResult;	// D3DDisplayMode.Format would be garbage
+
+		D3DDEVTYPE D3DDevType = D3DDEVTYPE_HAL;
+		DWORD dwBehaviorFlags = D3DCREATE_SOFTWARE_VERTEXPROCESSING | D3DCREATE_MULTITHREADED;
+
+		ZeroMemory(&m_d3dpp, sizeof(m_d3dpp));
+		m_d3dpp.Flags = D3DPRESENTFLAG_VIDEO;
+		m_d3dpp.SwapEffect = D3DSWAPEFFECT_DISCARD;
+		m_d3dpp.BackBufferFormat = D3DDisplayMode.Format;
+		m_d3dpp.Windowed = TRUE;
+		m_d3dpp.hDeviceWindow = m_hWnd;
+		m_d3dpp.PresentationInterval = D3DPRESENT_INTERVAL_IMMEDIATE;
+		hResult = m_lpD3D9->CreateDevice(uiAdapter, D3DDevType, NULL, dwBehaviorFlags, &m_d3dpp, &m_lpD3D9Device);
+		if (FAILED(hResult) || m_lpD3D9Device == NULL)
+			return FAILED(hResult) ? hResult : E_FAIL;
+	}
+
+	if (m_lpD3D9RawSurfaceShare == NULL)
+	{
+		// only host-memory input carries a usable size on this path
+		if (pInfo->eBufferProperty != BUFFER_HOST)
+			return E_FAIL;
+
+		hResult = m_lpD3D9Device->CreateOffscreenPlainSurface(pInfo->UsrData.sSystemBuffer.iWidth, pInfo->UsrData.sSystemBuffer.iHeight, (D3DFORMAT)NV12_FORMAT, D3DPOOL_DEFAULT, &m_lpD3D9RawSurfaceShare, NULL);
+		if (FAILED(hResult))
+			return hResult;
+	}
+
+	if (m_lpD3D9Device == NULL || m_lpD3D9RawSurfaceShare == NULL)
+		hResult = E_FAIL;
+
+	return hResult;
+}
+
+// Starts with every handle and interface pointer cleared and the
+// presentation parameters zeroed; nothing is loaded until Init() runs.
+CD3D9ExUtils::CD3D9ExUtils()
+	: m_hDll(NULL)
+	, m_hWnd(NULL)
+	, m_pDumpYUV(NULL)
+	, m_bInitDone(FALSE)
+	, m_lpD3D9(NULL)
+	, m_lpD3D9Device(NULL)
+	, m_lpD3D9RawSurfaceShare(NULL)
+{
+	// keeps static analysers from flagging m_d3dpp as uninitialised
+	ZeroMemory(&m_d3dpp, sizeof(m_d3dpp));
+}
+
+// Releases all D3D resources and unloads d3d9.dll via Uninit().
+CD3D9ExUtils::~CD3D9ExUtils()
+{
+	Uninit();
+}
+
+// Loads d3d9.dll, creates the IDirect3D9Ex object and, for windowed use,
+// obtains the render window through InitWindow().
+//
+// bWindowed: TRUE when frames will be rendered to a window.
+// Returns S_OK when already initialised or on success, E_FAIL when the DLL
+// or its Direct3DCreate9Ex export is unavailable, the failing HRESULT of
+// Direct3DCreate9Ex, otherwise the result of InitWindow().
+//
+// Safe to call again after a failure: the DLL and the D3D object obtained by
+// an earlier attempt are reused instead of being loaded a second time.
+HRESULT CD3D9ExUtils::Init(BOOL bWindowed)
+{
+	if (m_bInitDone)
+		return S_OK;
+
+	if (m_hDll == NULL)
+		m_hDll = LoadLibrary(TEXT("d3d9.dll"));
+	if (m_hDll == NULL)
+		return E_FAIL;
+
+	if (m_lpD3D9 == NULL)
+	{
+		// GetProcAddress() only accepts an ANSI name, so no TEXT() here.
+		pFnCreateD3D9Ex pCreateD3D9Ex = (pFnCreateD3D9Ex) GetProcAddress(m_hDll, "Direct3DCreate9Ex");
+		if (pCreateD3D9Ex == NULL)
+			return E_FAIL;
+
+		HRESULT hResult = pCreateD3D9Ex(D3D_SDK_VERSION, &m_lpD3D9);
+		if (FAILED(hResult) || m_lpD3D9 == NULL)
+		{
+			m_lpD3D9 = NULL;
+			return FAILED(hResult) ? hResult : E_FAIL;
+		}
+	}
+
+	return bWindowed ? InitWindow(&m_hWnd) : S_OK;
+}
+
+// Releases the surface, the device and the D3D object, frees the dump
+// buffer and unloads d3d9.dll. Always returns S_OK.
+HRESULT CD3D9ExUtils::Uninit()
+{
+	SAFE_RELEASE(m_lpD3D9RawSurfaceShare);
+	SAFE_RELEASE(m_lpD3D9Device);
+	SAFE_RELEASE(m_lpD3D9);
+	SAFE_FREE(m_pDumpYUV);
+
+	if(m_hDll)
+	{
+		FreeLibrary(m_hDll);
+		m_hDll = NULL;
+	}
+
+	// Everything Init() set up is gone; clear the flag so that a later
+	// Process() call re-initialises instead of using released pointers.
+	m_bInitDone = FALSE;
+
+	return S_OK;
+}
+
+// Per-frame entry point: without an output file the frame is rendered to
+// the window, otherwise it is dumped to pFp. Initialisation happens lazily
+// on the first call.
+//
+// Returns E_FAIL for NULL pDst / pInfo, the Init() result when
+// initialisation fails, otherwise the result of Render() or Dump().
+HRESULT CD3D9ExUtils::Process(void *pDst[3], SBufferInfo *pInfo, FILE *pFp)
+{
+	HRESULT hRet = E_FAIL;
+
+	if (pDst == NULL || pInfo == NULL)
+		return hRet;
+
+	const BOOL bShowInWindow = (pFp == NULL) ? TRUE : FALSE;
+	// Direct3D can be skipped only when a host-memory frame goes to a file.
+	const BOOL bD3DRequired = (bShowInWindow || pInfo->eBufferProperty != BUFFER_HOST) ? TRUE : FALSE;
+
+	if (!m_bInitDone)
+	{
+		if (!bD3DRequired)
+		{
+			m_bInitDone = TRUE;
+		}
+		else
+		{
+			hRet = Init(bShowInWindow);
+			if (SUCCEEDED(hRet))
+				m_bInitDone = TRUE;
+		}
+	}
+
+	if (!m_bInitDone)
+		return hRet;
+
+	if (bShowInWindow)
+	{
+		hRet = Render(pDst, pInfo);
+		Sleep(30); // crude pacing: one 30 ms pause per rendered frame
+	}
+	else
+	{
+		// bShowInWindow is FALSE exactly when pFp is non-NULL
+		hRet = Dump(pDst, pInfo, pFp);
+		Sleep(0);
+	}
+
+	return hRet;
+}
+
+// Makes the frame available in the off-screen surface, stretches it onto
+// the back buffer and presents it.
+//
+// pDst:  plane pointers for BUFFER_HOST frames; for BUFFER_DEVICE frames
+//        pDst[0] is used as the shared handle of the surface.
+// pInfo: frame description.
+// Returns E_FAIL for other buffer kinds, otherwise the first failing
+// HRESULT or the result of PresentEx().
+HRESULT CD3D9ExUtils::Render(void *pDst[3], SBufferInfo *pInfo)
+{
+	HRESULT hResult = E_FAIL;
+	EBufferProperty eBufferProperty = pInfo->eBufferProperty;
+
+	if (eBufferProperty == BUFFER_HOST)
+	{
+		hResult = InitResource(NULL, pInfo);
+		if (SUCCEEDED(hResult))
+			hResult = Dump2Surface(pDst, m_lpD3D9RawSurfaceShare, pInfo->UsrData.sSystemBuffer.iWidth, pInfo->UsrData.sSystemBuffer.iHeight, pInfo->UsrData.sSystemBuffer.iStride);
+	}
+	else if (eBufferProperty == BUFFER_DEVICE)
+	{
+		VOID * pSharedHandle = pDst[0];
+		hResult = InitResource(pSharedHandle, pInfo);
+	}
+
+	if (FAILED(hResult))
+		return hResult;
+
+	IDirect3DSurface9 *pBackBuffer = NULL;
+	hResult = m_lpD3D9Device->GetBackBuffer(0, 0, D3DBACKBUFFER_TYPE_MONO, &pBackBuffer);
+	if (FAILED(hResult) || pBackBuffer == NULL)
+		return FAILED(hResult) ? hResult : E_FAIL;
+
+	hResult = m_lpD3D9Device->StretchRect(m_lpD3D9RawSurfaceShare, NULL, pBackBuffer, NULL, D3DTEXF_NONE);
+	// GetBackBuffer() added a reference; drop it every frame or it leaks.
+	pBackBuffer->Release();
+	pBackBuffer = NULL;
+
+	if (SUCCEEDED(hResult))
+		hResult = m_lpD3D9Device->PresentEx(0, 0, NULL, NULL, 0);
+
+	return hResult;
+}
+
+// Writes one frame to pFp.
+//
+// Host-memory frames are written directly from pDst. For any other buffer
+// kind pDst[1] is taken as the D3D surface, its content is copied into the
+// staging buffer m_pDumpYUV and pDst[0..2] are redirected to the planes
+// inside that buffer (the caller's array is modified).
+//
+// Returns S_OK once the frame has been handed to Write2File(),
+// E_OUTOFMEMORY when the staging buffer cannot be allocated, the failing
+// HRESULT of Dump2YUV(), or E_FAIL when a plane pointer is missing.
+// Write2File() returns void, so I/O errors cannot be reported from here.
+HRESULT CD3D9ExUtils::Dump(void *pDst[3], SBufferInfo *pInfo, FILE *pFp)
+{
+	HRESULT hResult = E_FAIL;
+	int iStride[2];
+	int iWidth;
+	int iHeight;
+
+	if (pInfo->eBufferProperty != BUFFER_HOST)
+	{
+		iWidth = pInfo->UsrData.sVideoBuffer.iSurfaceWidth;
+		iHeight = pInfo->UsrData.sVideoBuffer.iSurfaceHeight;
+		iStride[0] = iWidth;
+		iStride[1] = iWidth / 2;
+
+		// NOTE(review): the buffer is sized for the first frame only; a later,
+		// larger frame would overflow it. Tracking the size needs a new member.
+		if (m_pDumpYUV == NULL)
+		{
+			m_pDumpYUV = (unsigned char *)malloc(iWidth * iHeight * 3 / 2 * sizeof(unsigned char));
+		}
+
+		// Without the staging buffer pDst still holds surface handles, not
+		// pixel data, so nothing must be written.
+		if (m_pDumpYUV == NULL)
+			return E_OUTOFMEMORY;
+
+		void *pSurface = pDst[1];
+		pDst[0] = m_pDumpYUV;
+		pDst[1] = m_pDumpYUV + iHeight * iStride[0] * sizeof(unsigned char);
+		pDst[2] = m_pDumpYUV + iHeight * iStride[0] * 5 / 4 * sizeof(unsigned char);
+		hResult = Dump2YUV(pDst, pSurface, iWidth, iHeight, iStride);
+		if (FAILED(hResult))
+			return hResult;	// staging buffer was not filled
+	}
+	else
+	{
+		iWidth = pInfo->UsrData.sSystemBuffer.iWidth;
+		iHeight = pInfo->UsrData.sSystemBuffer.iHeight;
+		iStride[0] = pInfo->UsrData.sSystemBuffer.iStride[0];
+		iStride[1] = pInfo->UsrData.sSystemBuffer.iStride[1];
+		hResult = S_OK;
+	}
+
+	if (pFp && pDst[0] && pDst[1] && pDst[2])
+		Write2File(pFp, (unsigned char **)pDst, iStride, iWidth, iHeight);
+	else
+		hResult = E_FAIL;
+
+	return hResult;
+}
+
+// Lazily creates the D3D9Ex device and the off-screen surface.
+//
+// pSharedHandle: passed by address to CreateOffscreenPlainSurface(); NULL
+//                for host-memory input, the shared handle for device input.
+// pInfo:         frame description; supplies size, format and pool.
+// Returns S_OK when both device and surface exist afterwards, otherwise
+// E_FAIL or the failing D3D HRESULT.
+HRESULT CD3D9ExUtils::InitResource(void *pSharedHandle, SBufferInfo *pInfo)
+{
+	HRESULT hResult = S_OK;
+
+	// m_lpD3D9 is NULL when Init() has not run or has failed
+	if (pInfo == NULL || m_lpD3D9 == NULL)
+		return E_FAIL;
+
+	if (m_lpD3D9Device == NULL)
+	{
+		// pick the adapter whose monitor shows the render window
+		HMONITOR hMonitorWnd = MonitorFromWindow(m_hWnd, MONITOR_DEFAULTTONULL);
+
+		UINT uiAdapter = D3DADAPTER_DEFAULT;
+		UINT uiCnt = m_lpD3D9->GetAdapterCount();
+		for(UINT i=0; i<uiCnt; i++)
+		{
+			HMONITOR hMonitor = m_lpD3D9->GetAdapterMonitor(i);
+			if(hMonitor == hMonitorWnd)
+			{
+				uiAdapter = i;
+				break;
+			}
+		}
+
+		D3DDISPLAYMODEEX D3DDisplayMode;
+		D3DDisplayMode.Size = sizeof(D3DDISPLAYMODEEX);
+		hResult = m_lpD3D9->GetAdapterDisplayModeEx(uiAdapter, &D3DDisplayMode, NULL);
+		if (FAILED(hResult))
+			return hResult;	// D3DDisplayMode.Format would be garbage
+
+		D3DDEVTYPE D3DDevType = D3DDEVTYPE_HAL;
+		DWORD dwBehaviorFlags = D3DCREATE_SOFTWARE_VERTEXPROCESSING | D3DCREATE_MULTITHREADED;
+
+		ZeroMemory(&m_d3dpp, sizeof(m_d3dpp));
+		m_d3dpp.Flags = D3DPRESENTFLAG_VIDEO;
+		m_d3dpp.SwapEffect = D3DSWAPEFFECT_DISCARD;
+		m_d3dpp.BackBufferFormat = D3DDisplayMode.Format;
+		m_d3dpp.Windowed = TRUE;
+		m_d3dpp.hDeviceWindow = m_hWnd;
+		m_d3dpp.PresentationInterval = D3DPRESENT_INTERVAL_IMMEDIATE;
+		hResult = m_lpD3D9->CreateDeviceEx(uiAdapter, D3DDevType, NULL, dwBehaviorFlags, &m_d3dpp, NULL, &m_lpD3D9Device);
+		if (FAILED(hResult) || m_lpD3D9Device == NULL)
+			return FAILED(hResult) ? hResult : E_FAIL;
+	}
+
+	if (m_lpD3D9RawSurfaceShare == NULL)
+	{
+		int iWidth;
+		int iHeight;
+		D3DFORMAT D3Dformat;
+		D3DPOOL D3Dpool;
+
+		if (pInfo->eBufferProperty == BUFFER_HOST)
+		{
+			iWidth = pInfo->UsrData.sSystemBuffer.iWidth;
+			iHeight = pInfo->UsrData.sSystemBuffer.iHeight;
+			D3Dformat = (D3DFORMAT)NV12_FORMAT;
+			D3Dpool = (D3DPOOL)D3DPOOL_DEFAULT;
+		}
+		else
+		{
+			iWidth = pInfo->UsrData.sVideoBuffer.iSurfaceWidth;
+			iHeight = pInfo->UsrData.sVideoBuffer.iSurfaceHeight;
+			D3Dformat = (D3DFORMAT)pInfo->UsrData.sVideoBuffer.D3Dformat;
+			D3Dpool = (D3DPOOL)pInfo->UsrData.sVideoBuffer.D3DPool;
+		}
+
+		hResult = m_lpD3D9Device->CreateOffscreenPlainSurface(iWidth, iHeight, (D3DFORMAT)D3Dformat, (D3DPOOL)D3Dpool, &m_lpD3D9RawSurfaceShare, &pSharedHandle);
+		if (FAILED(hResult))
+			return hResult;
+	}
+
+	if (m_lpD3D9Device == NULL || m_lpD3D9RawSurfaceShare == NULL)
+		hResult = E_FAIL;
+
+	return hResult;
+}
+
+
+/*
+ * Copies a locked D3D surface into three separate planes.
+ *
+ * The surface is read as iHeight rows of luma followed, at offset
+ * Pitch * iHeight, by iHeight/2 rows of byte-interleaved chroma. The first
+ * byte of every chroma pair goes to pDst[1], the second one to pDst[2].
+ *
+ * pDst		destination planes: [0] luma, [1]/[2] de-interleaved chroma
+ * pSurface	IDirect3DSurface9 to read from
+ * iWidth	picture width in bytes of luma
+ * iHeight	picture height in rows of luma
+ * iStride	destination strides: [0] luma, [1] both chroma planes
+ *
+ * Returns E_FAIL on NULL arguments, the LockRect() error when the surface
+ * cannot be locked (nothing is copied then), else the LockRect() result.
+ */
+HRESULT Dump2YUV(void *pDst[3], void *pSurface, int iWidth, int iHeight, int iStride[2])
+{
+	HRESULT hResult = E_FAIL;
+
+	if (!pDst || !iStride || !pDst[0] || !pDst[1] || !pDst[2] || !pSurface)
+		return hResult;
+
+	IDirect3DSurface9 *pSurfaceData = (IDirect3DSurface9 *)pSurface;
+	D3DLOCKED_RECT sD3DLockedRect = {0};
+	hResult = pSurfaceData->LockRect(&sD3DLockedRect, NULL, 0);
+	if (FAILED(hResult))
+		return hResult;	// pBits is not valid; do not touch it and do not unlock
+
+	unsigned char * pInY = (unsigned char *)sD3DLockedRect.pBits;
+	unsigned char * pOutY = (unsigned char *)pDst[0];
+	int iInStride = sD3DLockedRect.Pitch;
+	int iOutStride = iStride[0];
+
+	for (int j=0; j<iHeight; j++)
+		memcpy(pOutY+j*iOutStride, pInY+j*iInStride, iWidth);//confirmed_safe_unsafe_usage
+
+	unsigned char * pOutV = (unsigned char *)pDst[1];
+	unsigned char * pOutU = (unsigned char *)pDst[2];
+	unsigned char * pInC = pInY + iInStride * iHeight;	// chroma rows start right after the luma rows
+	iOutStride = iStride[1];
+	for (int i=0; i<iHeight/2; i++)
+	{
+		for (int j=0; j<iWidth; j+=2)
+		{
+			pOutV[i*iOutStride+j/2] = pInC[i*iInStride+j  ];
+			pOutU[i*iOutStride+j/2] = pInC[i*iInStride+j+1];
+		}
+	}
+
+	pSurfaceData->UnlockRect();
+
+	return hResult;
+}
+
+/*
+ * Copies three separate planes into a locked D3D surface (inverse of Dump2YUV).
+ *
+ * iHeight rows of luma are written first; iHeight/2 rows of byte-interleaved
+ * chroma follow at offset Pitch * iHeight, each pair being built from one
+ * byte of pDst[1] followed by one byte of pDst[2].
+ *
+ * pDst		source planes: [0] luma, [1]/[2] chroma
+ * pSurface	IDirect3DSurface9 to write to
+ * iWidth	picture width in bytes of luma
+ * iHeight	picture height in rows of luma
+ * iStride	source strides: [0] luma, [1] both chroma planes
+ *
+ * Returns E_FAIL on NULL arguments, the LockRect() error when the surface
+ * cannot be locked (nothing is written then), else the LockRect() result.
+ */
+HRESULT Dump2Surface(void *pDst[3], void *pSurface, int iWidth, int iHeight, int iStride[2])
+{
+	HRESULT hResult = E_FAIL;
+
+	if (!pDst || !iStride || !pDst[0] || !pDst[1] || !pDst[2] || !pSurface)
+		return hResult;
+
+	IDirect3DSurface9 *pSurfaceData = (IDirect3DSurface9 *)pSurface;
+	D3DLOCKED_RECT sD3DLockedRect = {0};
+	hResult = pSurfaceData->LockRect(&sD3DLockedRect, NULL, 0);
+	if (FAILED(hResult))
+		return hResult;	// pBits is not valid; do not write through it and do not unlock
+
+	unsigned char * pInY = (unsigned char *)pDst[0];
+	unsigned char * pOutY = (unsigned char *)sD3DLockedRect.pBits;
+	int iOutStride = sD3DLockedRect.Pitch;
+
+	for (int j=0; j<iHeight; j++)
+		memcpy(pOutY+j*iOutStride, pInY+j*iStride[0], iWidth);//confirmed_safe_unsafe_usage
+
+	unsigned char * pInV = (unsigned char *)pDst[1];
+	unsigned char * pInU = (unsigned char *)pDst[2];
+	unsigned char * pOutC = pOutY + iOutStride * iHeight;	// chroma rows start right after the luma rows
+	for (int i=0; i<iHeight/2; i++)
+	{
+		for (int j=0; j<iWidth; j+=2)
+		{
+			pOutC[i*iOutStride+j  ] = pInV[i*iStride[1]+j/2];
+			pOutC[i*iOutStride+j+1] = pInU[i*iStride[1]+j/2];
+		}
+	}
+
+	pSurfaceData->UnlockRect();
+
+	return hResult;
+}
+
+/*
+ * Registers the window class of the decoder application, creates its
+ * top-level window and shows it.
+ *
+ * hWnd	[out] receives the created window handle (NULL when creation failed)
+ *
+ * Returns S_OK on success; E_FAIL when hWnd is NULL, the class cannot be
+ * registered or the window cannot be created.
+ */
+HRESULT InitWindow(HWND *hWnd)
+{
+	const TCHAR kszWindowTitle[] = TEXT("Wels Decoder Application");
+	const TCHAR kszWindowClass[] = TEXT("Wels Decoder Class");
+
+	if (hWnd == NULL)
+		return E_FAIL;
+
+	WNDCLASSEX sWndClassEx = {0};
+	sWndClassEx.cbSize          = sizeof(WNDCLASSEX);
+	sWndClassEx.style			= CS_HREDRAW | CS_VREDRAW;
+	sWndClassEx.lpfnWndProc	    = (WNDPROC)WndProc;
+	sWndClassEx.cbClsExtra		= 0;
+	sWndClassEx.cbWndExtra		= 0;
+	sWndClassEx.hInstance		= GetModuleHandle(NULL);
+	sWndClassEx.hIcon			= LoadIcon(sWndClassEx.hInstance, (LPCTSTR)IDI_TESTSHARESURFACE);
+	sWndClassEx.hCursor		    = LoadCursor(NULL, IDC_ARROW);
+	sWndClassEx.hbrBackground	= (HBRUSH)(COLOR_WINDOW + 1);
+	// LPCTSTR (not LPCSTR) so the resource id matches the TCHAR flavour of WNDCLASSEX
+	sWndClassEx.lpszMenuName	= (LPCTSTR)IDC_TESTSHARESURFACE;
+	sWndClassEx.lpszClassName	= kszWindowClass;
+	sWndClassEx.hIconSm		    = LoadIcon(sWndClassEx.hInstance, (LPCTSTR)IDI_SMALL);
+
+	if (!RegisterClassEx(&sWndClassEx))
+		return E_FAIL;
+
+	HWND hTmpWnd = CreateWindow(kszWindowClass, kszWindowTitle, WS_OVERLAPPEDWINDOW,
+		CW_USEDEFAULT, 0, CW_USEDEFAULT, 0, NULL, NULL, sWndClassEx.hInstance, NULL);
+
+	// the handle is reported even when it is NULL, so the caller never sees a stale value
+	*hWnd = hTmpWnd;
+	if (!hTmpWnd)
+		return E_FAIL;
+
+	ShowWindow(hTmpWnd, SW_SHOWDEFAULT);
+	UpdateWindow(hTmpWnd);
+
+	return S_OK;
+}
+
+/*
+ * Window procedure of the decoder window.
+ *
+ * Handles the IDM_ABOUT / IDM_EXIT menu commands, validates the client area
+ * on WM_PAINT and posts the quit message on WM_DESTROY. Everything else is
+ * forwarded to DefWindowProc().
+ */
+LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
+{
+	if (message == WM_COMMAND)
+	{
+		const INT kiCommandId = LOWORD(wParam);
+
+		if (kiCommandId == IDM_EXIT)
+			DestroyWindow(hWnd);
+		else if (kiCommandId != IDM_ABOUT)	// IDM_ABOUT is accepted but does nothing
+			return DefWindowProc(hWnd, message, wParam, lParam);
+
+		return 0;
+	}
+
+	if (message == WM_PAINT)
+	{
+		ValidateRect(hWnd , NULL);
+		return 0;
+	}
+
+	if (message == WM_DESTROY)
+	{
+		PostQuitMessage(0);
+		return 0;
+	}
+
+	return DefWindowProc(hWnd, message, wParam, lParam);
+}
+
+#endif
+
+/*
+ * Detects the OS flavour and, when the display module is compiled in,
+ * allocates the matching D3D9 helper. Without a helper the instance is
+ * downgraded to OS_UNSUPPORTED.
+ */
+CUtils::CUtils()
+{
+	hHandle = NULL;
+	iOSType = CheckOS();
+
+#ifdef ENABLE_DISPLAY_MODULE
+	switch (iOSType)
+	{
+	case OS_XP:
+		hHandle = (void *) new CD3D9Utils;
+		break;
+	case OS_VISTA_UPPER:
+		hHandle = (void *) new CD3D9ExUtils;
+		break;
+	default:
+		break;
+	}
+#endif
+
+	// no helper object means there is nothing to render with
+	if (!hHandle)
+		iOSType = OS_UNSUPPORTED;
+}
+
+/*
+ * Destroys the D3D9 helper through its concrete type (hHandle is stored as
+ * void *, so the cast selects the destructor that runs).
+ */
+CUtils::~CUtils()
+{
+#ifdef ENABLE_DISPLAY_MODULE
+	if (hHandle == NULL)
+		return;
+
+	switch (iOSType)
+	{
+	case OS_XP:
+		delete (CD3D9Utils *) hHandle;
+		break;
+	case OS_VISTA_UPPER:
+		delete (CD3D9ExUtils *) hHandle;
+		break;
+	default:
+		break;
+	}
+	hHandle = NULL;
+#endif
+}
+
+/*
+ * Outputs one decoded picture.
+ *
+ * On OS_UNSUPPORTED the planes are appended to pFp with Write2File(), using
+ * the system-buffer geometry from pInfo; nothing happens when pFp, pInfo or
+ * one of the planes is NULL. Otherwise (display module only) pending window
+ * messages are dispatched first and the picture is then handed to the D3D9
+ * helper; a WM_QUIT seen while pumping ends the loop without processing.
+ *
+ * Returns 0 on success (or when nothing was done), 1 when the D3D9 helper failed.
+ */
+int CUtils::Process(void *pDst[3], SBufferInfo *pInfo, FILE *pFp)
+{
+	int iRet = 0;
+
+	if (iOSType == OS_UNSUPPORTED)
+	{
+		const bool kbCanDump = pFp && pDst[0] && pDst[1] && pDst[2] && pInfo;
+		if (kbCanDump)
+		{
+			int iPlaneStride[2];
+			iPlaneStride[0] = pInfo->UsrData.sSystemBuffer.iStride[0];
+			iPlaneStride[1] = pInfo->UsrData.sSystemBuffer.iStride[1];
+
+			Write2File(pFp, (unsigned char **)pDst, iPlaneStride,
+					   pInfo->UsrData.sSystemBuffer.iWidth,
+					   pInfo->UsrData.sSystemBuffer.iHeight);
+		}
+		return iRet;
+	}
+
+#ifdef ENABLE_DISPLAY_MODULE
+	MSG sMsg;
+	ZeroMemory( &sMsg, sizeof(sMsg) );
+	for ( ; sMsg.message != WM_QUIT; )
+	{
+		if( !PeekMessage( &sMsg, NULL, 0U, 0U, PM_REMOVE ) )
+		{
+			// message queue is drained: render the picture and leave
+			HRESULT hStatus = S_OK;
+			switch (iOSType)
+			{
+			case OS_XP:
+				hStatus = ((CD3D9Utils *)hHandle)->Process(pDst, pInfo, pFp);
+				break;
+			case OS_VISTA_UPPER:
+				hStatus = ((CD3D9ExUtils *)hHandle)->Process(pDst, pInfo, pFp);
+				break;
+			default:
+				break;
+			}
+
+			iRet = !SUCCEEDED(hStatus);
+			break;
+		}
+
+		TranslateMessage( &sMsg );
+		DispatchMessage( &sMsg );
+	}
+#endif
+
+	return iRet;
+}
+
+/*
+ * Classifies the running Windows version.
+ *
+ * Returns OS_VISTA_UPPER for NT major version >= 6, OS_XP for NT major
+ * version 5 and OS_UNSUPPORTED otherwise (including every build without
+ * ENABLE_DISPLAY_MODULE and a failing GetVersionEx()).
+ */
+int CUtils::CheckOS()
+{
+#ifdef ENABLE_DISPLAY_MODULE
+	OSVERSIONINFOEX sVersion;
+	ZeroMemory(&sVersion, sizeof(OSVERSIONINFOEX));
+	sVersion.dwOSVersionInfoSize = sizeof(OSVERSIONINFOEX);
+
+	if( !GetVersionEx ((OSVERSIONINFO *) &sVersion) )
+	{
+		// retry with the smaller, older structure size
+		sVersion.dwOSVersionInfoSize = sizeof (OSVERSIONINFO);
+		if( !GetVersionEx ((OSVERSIONINFO *) &sVersion) )
+			return OS_UNSUPPORTED;
+	}
+
+	if (sVersion.dwPlatformId == VER_PLATFORM_WIN32_NT)
+	{
+		if (sVersion.dwMajorVersion >= 6)
+			return OS_VISTA_UPPER;
+		if (sVersion.dwMajorVersion == 5)
+			return OS_XP;
+	}
+#endif
+
+	return OS_UNSUPPORTED;
+}
+
+/*
+ * Appends one picture to pFp as three consecutive planes without padding.
+ *
+ * Plane 0 is written as iHeight rows of iWidth bytes using iStride[0];
+ * planes 1 and 2 are written as iHeight/2 rows of iWidth/2 bytes each,
+ * both using iStride[1].
+ *
+ * Nothing is written when pFp, pData, iStride or any plane pointer is NULL,
+ * or when iWidth/iHeight is not positive (callers in this file may pass a
+ * NULL file when no output file could be opened).
+ */
+void Write2File(FILE *pFp, unsigned char* pData[3], int iStride[2], int iWidth, int iHeight)
+{
+	if (pFp == NULL || pData == NULL || iStride == NULL)
+		return;
+	if (pData[0] == NULL || pData[1] == NULL || pData[2] == NULL)
+		return;
+	if (iWidth <= 0 || iHeight <= 0)
+		return;
+
+	for (int iPlane = 0; iPlane < 3; iPlane++)
+	{
+		// planes 1 and 2 are half size in both directions and share one stride
+		const int kiRowBytes = (iPlane == 0) ? iWidth      : iWidth / 2;
+		const int kiRows     = (iPlane == 0) ? iHeight     : iHeight / 2;
+		const int kiStride   = (iPlane == 0) ? iStride[0]  : iStride[1];
+		const unsigned char *pRow = pData[iPlane];
+
+		for (int i = 0; i < kiRows; i++)
+		{
+			fwrite(pRow, 1, kiRowBytes, pFp);
+			pRow += kiStride;
+		}
+	}
+}
--- /dev/null
+++ b/codec/console/dec/src/h264dec.cpp
@@ -1,0 +1,547 @@
+/*!
+ * \copy
+ *     Copyright (c)  2004-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * h264dec.cpp:		Wels Decoder Console Implementation file
+ */
+
+#if defined (WIN32)
+#include <windows.h>
+#include <tchar.h>
+#else
+#include <string.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "codec_def.h"
+#include "codec_app_def.h"
+#include "codec_api.h"
+#include "read_config.h"
+#include "../../decoder/core/inc/typedefs.h"
+#include "../../decoder/core/inc/measure_time.h"
+#include "d3d9_utils.h"
+
+// Signatures of the decoder create/destroy entry points; in the _MSC_VER build
+// main() resolves them from welsdec.dll by name with GetProcAddress().
+typedef long   (*PCreateDecoderFunc) (ISVCDecoder** ppDecoder);
+typedef void_t (*PDestroyDecoderFunc)(ISVCDecoder* pDecoder);
+
+
+#if defined(__APPLE__)
+#include "dec_console.h"
+#endif
+using namespace std;
+
+//using namespace WelsDec;
+
+//#define STICK_STREAM_SIZE	// For Demo interfaces test with track file of integrated frames
+
+/*!
+ * \brief	Decode one H.264 bitstream file and pass every output picture to the output module.
+ *
+ * The whole input file is loaded into memory and a 4-byte start code
+ * (00 00 00 01) is appended as a sentinel. The buffer is then cut at start
+ * codes and fed chunk by chunk to pDecoder->DecodeFrame(). Each picture the
+ * decoder reports (iBufferStatus == 1) goes to CUtils::Process(); after the
+ * last chunk DecodeFrame(NULL, 0, ...) is called once more to fetch a pending
+ * picture. Timing statistics are printed to stderr at the end.
+ *
+ * \param	pDecoder		decoder to drive; the function returns at once when it is NULL
+ * \param	kpH264FileName	path of the input bitstream (required)
+ * \param	kpOuputFileName	path of the picture output file; may be NULL or unopenable, decoding continues without it
+ * \param	iWidth			[out] width of the most recent output picture
+ * \param	iHeight			[out] height of the most recent output picture
+ * \param	pOptionFileName	optional path (char *) of a file that receives
+ *							(frame index, width, height) records whenever the
+ *							picture size changes, plus one for the final pending picture
+ */
+void_t H264DecodeInstance( ISVCDecoder* pDecoder, const char* kpH264FileName, const char* kpOuputFileName, int32_t& iWidth, int32_t& iHeight, void_t* pOptionFileName )
+{
+	// every local is declared up front: the error paths jump to label_exit
+	FILE *pH264File	  = NULL;
+	FILE *pYuvFile	  = NULL;
+	FILE *pOptionFile = NULL;
+	int64_t iStart = 0, iEnd = 0, iTotal = 0;
+	int32_t iSliceSize;
+	int32_t iSliceIndex = 0;
+	uint8_t* pBuf = NULL;
+	uint8_t uiStartCode[4] = {0, 0, 0, 1};
+
+	void_t *pData[3] = {NULL};
+	uint8_t *pDst[3] = {NULL};
+	SBufferInfo sDstBufInfo;
+
+	int32_t iBufPos = 0;
+	int32_t iFileSize;
+	int32_t i = 0;
+	int32_t iLastWidth = 0, iLastHeight = 0;
+	int32_t iFrameCount = 0;
+	int32_t iEndOfStreamFlag = 0;
+	int32_t iColorFormat = videoFormatInternal;
+	static int32_t iFrameNum = 0;
+
+	EDecodeMode     eDecoderMode    = AUTO_MODE;
+	EBufferProperty	eOutputProperty = BUFFER_DEVICE;
+
+	CUtils cOutputModule;
+	double dElapsed = 0;
+
+	if (pDecoder == NULL) return;
+	if (kpH264FileName)
+	{
+		pH264File = fopen(kpH264FileName,"rb");
+		if (pH264File == NULL){
+			fprintf(stderr, "Can not open h264 source file, check its legal path related please..\n");
+			return;
+		}
+		fprintf(stderr, "H264 source file name: %s..\n",kpH264FileName);
+	}
+	else
+	{
+		fprintf(stderr, "Can not find any h264 bitstream file to read..\n");
+		fprintf(stderr, "----------------decoder return------------------------\n" );
+		return;
+	}
+
+	if (kpOuputFileName){
+		pYuvFile = fopen(kpOuputFileName, "wb");
+		if (pYuvFile == NULL){
+			fprintf(stderr, "Can not open yuv file to output result of decoding..\n");
+			// any options
+			//return;	// can let decoder work in quiet mode, no writing any output
+		}
+		else
+			fprintf(stderr, "Sequence output file name: %s..\n", kpOuputFileName);
+	}
+	else{
+		fprintf(stderr, "Can not find any output file to write..\n");
+		// any options
+	}
+
+	if (pOptionFileName){
+		pOptionFile = fopen((char*)pOptionFileName, "wb");
+		if ( pOptionFile == NULL ){
+			fprintf(stderr, "Can not open optional file for write..\n");
+		}
+		else
+			fprintf(stderr, "Extra optional file: %s..\n", (char*)pOptionFileName);
+	}
+
+	printf( "------------------------------------------------------\n" );
+
+	fseek(pH264File, 0L, SEEK_END);
+	iFileSize = ftell(pH264File);
+	if (iFileSize<=0) {
+		fprintf(stderr, "Current Bit Stream File is too small, read error!!!!\n");
+		goto label_exit;
+	}
+	fseek(pH264File, 0L, SEEK_SET);
+
+	// 4 extra bytes hold the sentinel start code appended below
+	pBuf = new uint8_t[iFileSize+4];
+	if (pBuf == NULL){
+		fprintf(stderr, "new buffer failed!\n");
+		goto label_exit;
+	}
+
+	// a short read would leave uninitialized bytes in front of the sentinel
+	if (fread(pBuf, 1, iFileSize, pH264File) != (size_t)iFileSize) {
+		fprintf(stderr, "Unable to read whole file\n");
+		goto label_exit;
+	}
+	memcpy(pBuf+iFileSize, &uiStartCode[0], 4);//confirmed_safe_unsafe_usage
+
+	if( pDecoder->SetOption( DECODER_OPTION_DATAFORMAT,  &iColorFormat ) ){
+		fprintf(stderr, "SetOption() failed, opt_id : %d  ..\n", DECODER_OPTION_DATAFORMAT);
+		goto label_exit;
+	}
+
+	if( pDecoder->SetOption( DECODER_OPTION_MODE,  &eDecoderMode ) ){
+		fprintf(stderr, "SetOption() failed, opt_id : %d  ..\n", DECODER_OPTION_MODE);
+		goto label_exit;
+	}
+
+	// set the output buffer property
+	if(pYuvFile)
+	{
+		pDecoder->SetOption( DECODER_OPTION_OUTPUT_PROPERTY,  &eOutputProperty );
+	}
+
+#if defined ( STICK_STREAM_SIZE )
+	FILE *fpTrack = fopen("3.len", "rb");
+
+#endif// STICK_STREAM_SIZE
+
+
+	while ( true ) {
+
+		if ( iBufPos >= iFileSize ){
+			iEndOfStreamFlag = true;
+			pDecoder->SetOption( DECODER_OPTION_END_OF_STREAM, (void_t*)&iEndOfStreamFlag );
+			break;
+		}
+
+#if defined ( STICK_STREAM_SIZE )
+		if ( fpTrack )
+			fread(&iSliceSize, 1, sizeof(int32_t), fpTrack);
+#else
+		// look for the next start code after the current one (i > 0); the
+		// sentinel at pBuf + iFileSize guarantees the scan stops inside the buffer
+		for (i=0; i<iFileSize; i++) {
+			if (pBuf[iBufPos+i]==0 && pBuf[iBufPos+i+1]==0 && pBuf[iBufPos+i+2]==0 &&
+				pBuf[iBufPos+i+3]==1 && i>0) {
+				break;
+			}
+		}
+		iSliceSize = i;
+#endif
+
+//for coverage test purpose
+		// the values are only fetched, never used; names differ from the
+		// function-level variables so nothing is shadowed
+		int32_t iOutputColorFormat = 0;
+		pDecoder->GetOption(DECODER_OPTION_DATAFORMAT, &iOutputColorFormat);
+		int32_t iEndOfStreamOption = 0;
+		pDecoder->GetOption(DECODER_OPTION_END_OF_STREAM, &iEndOfStreamOption);
+		int32_t iCurIdrPicId = 0;
+		pDecoder->GetOption(DECODER_OPTION_IDR_PIC_ID, &iCurIdrPicId);
+		int32_t iFrameNumOption = 0;
+		pDecoder->GetOption(DECODER_OPTION_FRAME_NUM, &iFrameNumOption);
+		int32_t bCurAuContainLtrMarkSeFlag = 0;
+		pDecoder->GetOption(DECODER_OPTION_LTR_MARKING_FLAG, &bCurAuContainLtrMarkSeFlag);
+		int32_t iFrameNumOfAuMarkedLtr = 0;
+		pDecoder->GetOption(DECODER_OPTION_LTR_MARKED_FRAME_NUM, &iFrameNumOfAuMarkedLtr);
+		int32_t iFeedbackVclNalInAu = 0;
+		pDecoder->GetOption(DECODER_OPTION_VCL_NAL, &iFeedbackVclNalInAu);
+		int32_t iFeedbackTidInAu = 0;
+		pDecoder->GetOption(DECODER_OPTION_TEMPORAL_ID, &iFeedbackTidInAu);
+		int32_t iSetMode = 0;
+		pDecoder->GetOption(DECODER_OPTION_MODE, &iSetMode);
+		int32_t iDeviceInfo = 0;
+		pDecoder->GetOption(DECODER_OPTION_DEVICE_INFO, &iDeviceInfo);
+//~end for
+
+		iStart = WelsTime();
+		pData[0] = NULL;
+		pData[1] = NULL;
+		pData[2] = NULL;
+		memset(&sDstBufInfo, 0, sizeof(SBufferInfo));
+
+		pDecoder->DecodeFrame( pBuf + iBufPos, iSliceSize, pData, &sDstBufInfo );
+
+		if(sDstBufInfo.iBufferStatus == 1)
+		{
+			pDst[0] = (uint8_t *)pData[0];
+			pDst[1] = (uint8_t *)pData[1];
+			pDst[2] = (uint8_t *)pData[2];
+		}
+		iEnd	= WelsTime();
+		iTotal	+= iEnd - iStart;
+		if ( (sDstBufInfo.iBufferStatus==1) )
+		{
+			iFrameNum++;
+			cOutputModule.Process((void_t **)pDst, &sDstBufInfo, pYuvFile);
+			if (sDstBufInfo.eBufferProperty == BUFFER_HOST)
+			{
+				iWidth  = sDstBufInfo.UsrData.sSystemBuffer.iWidth;
+				iHeight = sDstBufInfo.UsrData.sSystemBuffer.iHeight;
+			}
+			else
+			{
+				iWidth  = sDstBufInfo.UsrData.sVideoBuffer.iSurfaceWidth;
+				iHeight = sDstBufInfo.UsrData.sVideoBuffer.iSurfaceHeight;
+			}
+
+			if ( pOptionFile != NULL )
+			{
+				// record a size change even when only one dimension differs
+				if ( iWidth != iLastWidth || iHeight != iLastHeight )
+				{
+					fwrite(&iFrameCount, sizeof(iFrameCount), 1, pOptionFile);
+					fwrite(&iWidth , sizeof(iWidth) , 1, pOptionFile);
+					fwrite(&iHeight, sizeof(iHeight), 1, pOptionFile);
+					iLastWidth  = iWidth;
+					iLastHeight = iHeight;
+				}
+			}
+			++ iFrameCount;
+		}
+
+		iBufPos += iSliceSize;
+		++ iSliceIndex;
+	}
+
+	// Get pending last frame
+	pData[0] = NULL;
+	pData[1] = NULL;
+	pData[2] = NULL;
+	memset(&sDstBufInfo, 0, sizeof(SBufferInfo));
+
+	pDecoder->DecodeFrame( NULL, 0, pData, &sDstBufInfo );
+	if(sDstBufInfo.iBufferStatus == 1)
+	{
+		pDst[0] = (uint8_t *)pData[0];
+		pDst[1] = (uint8_t *)pData[1];
+		pDst[2] = (uint8_t *)pData[2];
+	}
+
+	if ((sDstBufInfo.iBufferStatus==1))
+	{
+		cOutputModule.Process((void_t **)pDst, &sDstBufInfo, pYuvFile);
+		if (sDstBufInfo.eBufferProperty == BUFFER_HOST)
+		{
+			iWidth  = sDstBufInfo.UsrData.sSystemBuffer.iWidth;
+			iHeight = sDstBufInfo.UsrData.sSystemBuffer.iHeight;
+		}
+		else
+		{
+			iWidth  = sDstBufInfo.UsrData.sVideoBuffer.iSurfaceWidth;
+			iHeight = sDstBufInfo.UsrData.sVideoBuffer.iSurfaceHeight;
+		}
+
+		if ( pOptionFile != NULL )
+		{
+			/* Anyway, we need write in case of final frame decoding */
+			fwrite(&iFrameCount, sizeof(iFrameCount), 1, pOptionFile);
+			fwrite(&iWidth , sizeof(iWidth) , 1, pOptionFile);
+			fwrite(&iHeight, sizeof(iHeight), 1, pOptionFile);
+			iLastWidth	= iWidth;
+			iLastHeight	= iHeight;
+		}
+		++ iFrameCount;
+	}
+
+
+#if defined ( STICK_STREAM_SIZE )
+	if ( fpTrack ){
+		fclose( fpTrack );
+		fpTrack = NULL;
+	}
+#endif// STICK_STREAM_SIZE
+
+	// iTotal is divided by 1e6 to obtain seconds
+	dElapsed = iTotal / 1e6;
+	fprintf( stderr, "-------------------------------------------------------\n" );
+	fprintf( stderr, "iWidth:		%d\nheight:		%d\nFrames:		%d\ndecode time:	%f sec\nFPS:		%f fps\n",
+			 iWidth, iHeight, iFrameCount, dElapsed, (iFrameCount * 1.0)/dElapsed );
+	fprintf( stderr, "-------------------------------------------------------\n" );
+
+	// coverity scan uninitial
+label_exit:
+	if (pBuf)
+	{
+		delete[] pBuf;
+		pBuf = NULL;
+	}
+	if ( pH264File )
+	{
+		fclose(pH264File);
+		pH264File = NULL;
+	}
+	if ( pYuvFile )
+	{
+		fclose(pYuvFile);
+		pYuvFile = NULL;
+	}
+	if ( pOptionFile )
+	{
+		fclose(pOptionFile);
+		pOptionFile = NULL;
+	}
+}
+
+
+/*!
+ * \brief	Entry point of the decoder console (named DecoderMain on Apple builds).
+ *
+ * Accepted command lines:
+ *   <file>.cfg            read input/output names and decoder settings from a config file
+ *   <file>.264            decode only, no output file
+ *   <in> <out> [<opt>]    decode <in> to <out>, optionally writing size records to <opt>
+ *
+ * \return	0 on success, 1 on a usage error or when the decoder cannot be created/initialized
+ */
+#if !defined(__APPLE__)
+int32_t main(int32_t iArgC, char* pArgV[])
+#else
+int32_t DecoderMain(int32_t iArgC, char * pArgV[])
+#endif
+{
+	ISVCDecoder *pDecoder = NULL;
+
+	SDecodingParam sDecParam = {0};
+	string strInputFile(""), strOutputFile(""), strOptionFile("");
+
+	if (iArgC < 2)
+	{
+		printf( "usage 1: h264dec.exe welsdec.cfg\n" );
+		printf( "usage 2: h264dec.exe welsdec.264 out.yuv\n" );
+		printf( "usage 3: h264dec.exe welsdec.264\n" );
+		return 1;
+	}
+	else if (iArgC == 2)
+	{
+		if (strstr(pArgV[1], ".cfg")) // read config file //confirmed_safe_unsafe_usage
+		{
+			CReadConfig cReadCfg(pArgV[1]);
+			string strTag[4];
+			string strReconFile("");
+
+			if ( !cReadCfg.ExistFile() ){
+				printf("Specified file: %s not exist, maybe invalid path or parameter settting.\n", cReadCfg.GetFileName().c_str());
+				return 1;
+			}
+			memset(&sDecParam, 0, sizeof(sDecParam));
+
+			while ( !cReadCfg.EndOfFile() ){
+				long nRd = cReadCfg.ReadLine(&strTag[0]);
+				if (nRd > 0){
+					if (strTag[0].compare("InputFile") == 0){
+						strInputFile	= strTag[1];
+					}
+					else if (strTag[0].compare("OutputFile") == 0){
+						strOutputFile	= strTag[1];
+					}
+					else if (strTag[0].compare("RestructionFile") == 0){
+						strReconFile	= strTag[1];
+						int32_t iLen = strReconFile.length();
+						// drop the name of an earlier "RestructionFile" line, if any
+						delete []sDecParam.pFileNameRestructed;
+						sDecParam.pFileNameRestructed	= new char[iLen + 1];
+						if (sDecParam.pFileNameRestructed != NULL){
+							strncpy(sDecParam.pFileNameRestructed, strReconFile.c_str(), iLen);//confirmed_safe_unsafe_usage
+							sDecParam.pFileNameRestructed[iLen] = 0;
+						}
+					}
+					else if (strTag[0].compare("TargetDQID") == 0){
+						sDecParam.uiTargetDqLayer	= (uint8_t)atol(strTag[1].c_str());
+					}
+					else if (strTag[0].compare("OutColorFormat") == 0){
+						sDecParam.iOutputColorFormat = atol(strTag[1].c_str());
+					}
+					else if (strTag[0].compare("ErrorConcealmentFlag") == 0){
+						sDecParam.uiEcActiveFlag	= (uint8_t)atol(strTag[1].c_str());
+					}
+					else if (strTag[0].compare("CPULoad") == 0){
+						sDecParam.uiCpuLoad	= (uint32_t)atol(strTag[1].c_str());
+					}
+					else if (strTag[0].compare("VideoBitstreamType") == 0){
+						sDecParam.sVideoProperty.eVideoBsType = (VIDEO_BITSTREAM_TYPE)atol(strTag[1].c_str());
+					}
+				}
+			}
+			if (strOutputFile.empty())
+			{
+				printf( "No output file specified in configuration file.\n" );
+				return 1;
+			}
+		}
+		else if (strstr(pArgV[1], ".264")) // no output dump yuv file, just try to render the decoded pictures //confirmed_safe_unsafe_usage
+		{
+			strInputFile	= pArgV[1];
+			memset(&sDecParam, 0, sizeof(sDecParam));
+			sDecParam.iOutputColorFormat          = videoFormatI420;
+			sDecParam.uiTargetDqLayer	          = (uint8_t)-1;
+			sDecParam.uiEcActiveFlag	          = 1;
+			sDecParam.sVideoProperty.eVideoBsType = VIDEO_BITSTREAM_DEFAULT;
+		}
+	}
+	else //iArgC > 2
+	{
+		strInputFile	= pArgV[1];
+		strOutputFile	= pArgV[2];
+		memset(&sDecParam, 0, sizeof(sDecParam));
+		sDecParam.iOutputColorFormat	= videoFormatI420;
+		sDecParam.uiTargetDqLayer	= (uint8_t)-1;
+		sDecParam.uiEcActiveFlag	= 1;
+		sDecParam.sVideoProperty.eVideoBsType = VIDEO_BITSTREAM_DEFAULT;
+		if (iArgC > 3)
+			strOptionFile	= pArgV[3];
+
+		if (strOutputFile.empty())
+		{
+			printf( "No output file specified in configuration file.\n" );
+			return 1;
+		}
+	}
+
+	// set only now: every branch above clears sDecParam with memset()
+	sDecParam.sVideoProperty.size = sizeof( sDecParam.sVideoProperty );
+
+	if (strInputFile.empty())
+	{
+		printf( "No input file specified in configuration file.\n" );
+		return 1;
+	}
+
+
+
+
+#if defined(_MSC_VER)
+
+	HMODULE hModule = LoadLibraryA(".\\welsdec.dll");
+
+	PCreateDecoderFunc  pCreateDecoderFunc				= NULL;
+	PDestroyDecoderFunc pDestroyDecoderFunc				= NULL;
+
+
+	pCreateDecoderFunc  = (PCreateDecoderFunc)::GetProcAddress(hModule, "CreateDecoder");
+	pDestroyDecoderFunc = (PDestroyDecoderFunc)::GetProcAddress(hModule, "DestroyDecoder");
+
+	if ((hModule != NULL) && (pCreateDecoderFunc != NULL) && (pDestroyDecoderFunc != NULL))
+	{
+		printf("load library sw function successfully\n");
+
+		if ( pCreateDecoderFunc( &pDecoder )  || (NULL == pDecoder) )
+		{
+			printf( "Create Decoder failed.\n" );
+			return 1;
+		}
+	}
+	else
+	{
+		printf("load library sw function failed\n");
+		return 1;
+	}
+
+
+#elif defined(__APPLE__)
+
+
+	bool flag_load_bundle = load_bundle_welsdec();
+
+	get_functions_address_create_decoder(&pDecoder);
+
+	// pDecoder stays NULL when the bundle or its create function is missing
+	if (flag_load_bundle == false || pDecoder == NULL) {
+		printf( "Create Decoder failed.\n" );
+		return 1;
+	}
+
+#else
+
+
+	if ( CreateDecoder( &pDecoder )  || (NULL == pDecoder) )
+	{
+		printf( "Create Decoder failed.\n" );
+		return 1;
+	}
+
+#endif
+
+
+	if ( pDecoder->Initialize( &sDecParam, INIT_TYPE_PARAMETER_BASED ) )
+	{
+		printf( "Decoder initialization failed.\n" );
+		return 1;
+	}
+
+
+	int32_t iWidth = 0;
+	int32_t iHeight= 0;
+
+
+	H264DecodeInstance( pDecoder, strInputFile.c_str(), strOutputFile.c_str(), iWidth, iHeight, (!strOptionFile.empty() ? (void_t*)(const_cast<char*>(strOptionFile.c_str())) : NULL) );
+
+	if (sDecParam.pFileNameRestructed != NULL){
+		delete []sDecParam.pFileNameRestructed;
+		sDecParam.pFileNameRestructed = NULL;
+	}
+
+	if ( pDecoder ){
+		pDecoder->Unintialize();
+
+#if defined(_MSC_VER)
+		pDestroyDecoderFunc( pDecoder );
+#elif defined(__APPLE__)
+		get_functions_address_free_decoder(pDecoder);
+#else
+		DestroyDecoder(pDecoder);
+#endif
+	}
+
+	return 0;
+}
+
--- /dev/null
+++ b/codec/console/dec/src/load_bundle_functions.cpp
@@ -1,0 +1,269 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	load_bundle_functions.cpp
+ *
+ * \brief	load bundle and function on Mac platform
+ *
+ * \date	Created on 03/15/2011
+ *
+ * \description : 1. Load bundle: welsdec.bundle
+ *                2. Load address of function  
+ *                3. Create or destroy decoder
+ *
+ *************************************************************************************
+ */
+
+
+
+
+#include <string.h>
+#include <Carbon/Carbon.h>
+#include <CoreFoundation/CFBundle.h>
+
+#include <dlfcn.h>
+#include <string>
+
+#include "dec_console.h"
+#include "codec_api.h"
+
+// Signatures of the decoder create/destroy functions looked up by name in the bundle.
+typedef long (*LPCreateWelsCSDecoder)(ISVCDecoder** ppDecoder);
+typedef void (*LPDestroyWelsCSDecoder)(ISVCDecoder* pDecoder);
+
+
+// Signatures of the "VHD controller" create/destroy functions looked up in the same bundle.
+typedef long (*LPCreateVHDController)();
+typedef void (*LPDestroyVHDController)();
+
+// Bundle loaded by load_bundle_welsdec(); nil until then.
+CFBundleRef g_at264Module = nil;
+
+// File name of the decoder bundle, appended to the directory of the current module.
+const char H264DecoderDLL[] = "welsdec.bundle";
+
+// NOTE(review): not referenced by any function visible in this file section - confirm it is still needed.
+CFBundleRef g_at264ModuleHWD = nil;
+
+
+////////////////////////////////////////////////////////////////////////////////////////
+/*
+ * Writes the directory that contains the current module, with a trailing
+ * '/', into lpModulePath.
+ *
+ * The module is found by asking dladdr() for the image that holds a static
+ * variable of this function; the last path component(s) are then stripped.
+ *
+ * lpModulePath	[out] buffer receiving the NUL-terminated directory (truncated to fit)
+ * iPathMax		capacity of lpModulePath in bytes
+ *
+ * Returns 0 on success, -1 on invalid arguments, -2 when the module path
+ * cannot be determined or contains no '/'.
+ */
+int GetCurrentModulePath(char* lpModulePath, const int iPathMax)
+{
+	if(lpModulePath == NULL || iPathMax <= 0)
+	{
+		return -1;
+	}
+
+	memset(lpModulePath, 0, iPathMax);
+
+	char cCurrentPath[PATH_MAX];
+	memset(cCurrentPath, 0, PATH_MAX);
+
+	Dl_info 	dlInfo;
+	static int  sDummy;
+	// dladdr() returns 0 on failure and may leave dli_fname unset
+	if(dladdr((void*)&sDummy, &dlInfo) == 0 || dlInfo.dli_fname == NULL)
+	{
+		return -2;
+	}
+
+	strlcpy(cCurrentPath, dlInfo.dli_fname, PATH_MAX);
+
+	// NOTE(review): the macro below is spelled in lower case; the compiler-defined
+	// macro is __APPLE__, so this branch looks like it is never compiled - confirm intent.
+#if defined(__apple__)
+	// whether is self a framework ?
+	int locateNumber = 1;
+	struct FSRef currentPath;
+	OSStatus iStatus = FSPathMakeRef((unsigned char*)cCurrentPath, &currentPath, NULL);
+	if(noErr == iStatus)
+	{
+		LSItemInfoRecord  info;
+		iStatus = LSCopyItemInfoForRef(&currentPath, kLSRequestExtension, &info);
+		if(noErr == iStatus && NULL == info.extension)
+		{
+			locateNumber = 4;
+		}
+	}
+#else
+	int locateNumber = 1;
+#endif
+
+	// strip locateNumber trailing components; pos ends up as the cut position
+	std::string strPath(cCurrentPath);
+	std::string::size_type pos = std::string::npos;
+	for(int i = 0; i < locateNumber; i++)
+	{
+		pos = strPath.rfind('/');
+		if(std::string::npos == pos)
+		{
+			break;
+		}
+		strPath.erase(pos);
+	}
+	if(std::string::npos == pos)
+	{
+		return -2;
+	}
+	cCurrentPath[pos] = 0;
+
+	strlcpy(lpModulePath, cCurrentPath, iPathMax);
+	strlcat(lpModulePath, "/", iPathMax);
+
+	return 0;
+}
+
+/*
+ * Creates a CFBundle for the bundle directory at lpBundlePath.
+ *
+ * lpBundlePath	POSIX file-system path of the bundle
+ *
+ * Returns the bundle reference (owned by the caller), or NULL when the path
+ * is NULL or any CoreFoundation object cannot be created.
+ */
+CFBundleRef LoadBundle(const char* lpBundlePath)
+{
+	if(lpBundlePath == NULL)
+	{
+		return NULL;
+	}
+
+	// 1.build a file URL from the path
+	CFStringRef bundlePath = CFStringCreateWithCString(kCFAllocatorSystemDefault, lpBundlePath, CFStringGetSystemEncoding());
+	if(NULL == bundlePath)
+	{
+		return NULL;
+	}
+
+	// lpBundlePath is a file-system path, not a URL string: characters such as
+	// spaces are legal in it, so it must not go through a URL string parser
+	CFURLRef bundleURL = CFURLCreateWithFileSystemPath(kCFAllocatorSystemDefault, bundlePath, kCFURLPOSIXPathStyle, true);
+	CFRelease(bundlePath);
+	if(NULL == bundleURL)
+	{
+		return NULL;
+	}
+
+	// 2.get bundle ref
+	CFBundleRef bundleRef = CFBundleCreate(kCFAllocatorSystemDefault, bundleURL);
+	CFRelease(bundleURL);
+
+	return bundleRef;
+}
+
+/*
+ * Looks up the function named lpProcName in bundleRef.
+ *
+ * Returns the function pointer, or NULL when the bundle or the name is NULL,
+ * the name cannot be converted to a CFString, or the lookup itself yields NULL.
+ */
+void* GetProcessAddress(CFBundleRef bundleRef, const char* lpProcName)
+{
+	if(NULL == bundleRef || NULL == lpProcName)
+	{
+		return NULL;
+	}
+
+	CFStringRef cfProcName = CFStringCreateWithCString(kCFAllocatorSystemDefault, lpProcName, CFStringGetSystemEncoding());
+	if(NULL == cfProcName)
+	{
+		return NULL;	// CFRelease() must not be called with NULL
+	}
+
+	void *processAddress = CFBundleGetFunctionPointerForName(bundleRef, cfProcName);
+	CFRelease(cfProcName);
+
+	return processAddress;
+}
+
+
+////////////////////////
+
+/*
+ * Loads welsdec.bundle from the directory of the current module into
+ * g_at264Module. Returns true when the bundle reference was obtained.
+ */
+bool load_bundle_welsdec()
+{
+	char achBundlePath[512] = {0};
+
+	// directory of this module (with trailing '/') followed by the bundle name
+	GetCurrentModulePath(achBundlePath, 512);
+	strlcat(achBundlePath, H264DecoderDLL, 512);
+
+	g_at264Module = LoadBundle(achBundlePath);
+
+	return (g_at264Module != NULL);
+}
+
+/*
+ * Unloads the decoder bundle and gives up the reference held in
+ * g_at264Module. Safe to call when no bundle is loaded, and safe to call twice.
+ */
+void free_bundle_welsdec()
+{
+	if(g_at264Module != NULL)
+	{
+		CFBundleUnloadExecutable(g_at264Module);
+		// the reference came from CFBundleCreate(), so it is ours to release
+		CFRelease(g_at264Module);
+		g_at264Module = NULL;
+	}
+}
+
+/*
+ * Resolves "CreateSVCDecoder" and "CreateSVCVHDController" in the loaded
+ * bundle and calls them in that order.
+ *
+ * ppDecoder	forwarded to the decoder create function
+ *
+ * Returns false when no bundle is loaded or one of the two functions is
+ * missing (the decoder create function has already been called when only the
+ * second one is missing); true otherwise. The values returned by the two
+ * functions themselves are not inspected.
+ */
+bool get_functions_address_create_decoder(ISVCDecoder** ppDecoder)
+{
+	if(g_at264Module == NULL)
+		return false;
+
+	// both symbols are resolved before either of them is called
+	LPCreateWelsCSDecoder pfnCreateDecoder =
+	(LPCreateWelsCSDecoder)GetProcessAddress(g_at264Module, "CreateSVCDecoder");
+	LPCreateVHDController pfnCreateController =
+	(LPCreateVHDController)GetProcessAddress(g_at264Module, "CreateSVCVHDController");
+
+	if(pfnCreateDecoder == NULL)
+		return false;
+	pfnCreateDecoder( ppDecoder );
+
+	if(pfnCreateController == NULL)
+		return false;
+	pfnCreateController();
+
+	return true;
+}
+
+/*
+ * Resolves "DestroySVCDecoder" and "DestroySVCVHDController" in the loaded
+ * bundle and calls them in that order.
+ *
+ * pDecoder	forwarded to the decoder destroy function
+ *
+ * Returns false when no bundle is loaded or one of the two functions is
+ * missing (the decoder destroy function has already been called when only
+ * the second one is missing); true otherwise.
+ */
+bool get_functions_address_free_decoder(ISVCDecoder* pDecoder)
+{
+	if(g_at264Module == NULL)
+		return false;
+
+	// both symbols are resolved before either of them is called
+	LPDestroyWelsCSDecoder pfnDestroyDecoder =
+	(LPDestroyWelsCSDecoder)GetProcessAddress(g_at264Module, "DestroySVCDecoder");
+	LPDestroyVHDController pfnDestroyController =
+	(LPDestroyVHDController)GetProcessAddress(g_at264Module, "DestroySVCVHDController");
+
+	if(pfnDestroyDecoder == NULL)
+		return false;
+	pfnDestroyDecoder( pDecoder );
+
+	if(pfnDestroyController == NULL)
+		return false;
+	pfnDestroyController();
+
+	return true;
+}
+
+
--- /dev/null
+++ b/codec/console/dec/src/read_config.cpp
@@ -1,0 +1,128 @@
+/*!
+ * \copy
+ *     Copyright (c)  2008-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  read_config.h
+ *
+ *  Abstract
+ *      Class for reading parameter settings in a configure file.
+ *
+ *  History
+ *      08/18/2008 Created
+ *
+ *****************************************************************************/
+#if !defined(WIN32)
+#include <string.h>
+#include <stdio.h>
+#endif
+
+
+
+#include "read_config.h"
+
+/*!
+ * \brief	Construct a reader and try to open the given configuration file.
+ *
+ * \param	kpConfigFileName	path of the file; NULL is treated like ""
+ *
+ * No file is opened for an empty name. Whether fopen() succeeded can be
+ * queried with ExistFile().
+ */
+CReadConfig::CReadConfig( const char *kpConfigFileName )
+: m_pCfgFile(0)
+, m_strCfgFileName(kpConfigFileName != NULL ? kpConfigFileName : "")	// std::string must not be built from NULL
+, m_ulLines(0)
+{
+	if ( !m_strCfgFileName.empty() ){
+		m_pCfgFile = fopen(m_strCfgFileName.c_str(), "r");
+	}
+}
+
+/*!
+ * \brief	Close the configuration file if one is open.
+ */
+CReadConfig::~CReadConfig()
+{
+	if ( NULL == m_pCfgFile )
+		return;
+	
+	fclose( m_pCfgFile );
+	m_pCfgFile = NULL;
+}
+	
+/*!
+ * \brief	Read one line and split it into blank/tab separated tokens.
+ *
+ * Everything from a '#' up to the end of the line is ignored. Tokens beyond
+ * the kiValSize-th one are read and discarded.
+ *
+ * \param	pStr		array of at least kiValSize strings; all of them are
+ *						cleared before the line is read
+ * \param	kiValSize	number of elements in pStr, must be > 1
+ *
+ * \return	0 when no file is open, pStr is NULL or kiValSize <= 1;
+ *			otherwise 1 + the number of stored tokens that were followed by
+ *			a blank or tab
+ */
+long CReadConfig::ReadLine( string* pStr, const int kiValSize/* = 4*/ )
+{
+	if ( m_pCfgFile == NULL || pStr == NULL || kiValSize <= 1)
+		return 0;
+	
+	for (int iNum = 0; iNum < kiValSize; ++ iNum)
+		pStr[iNum]	= "";
+	
+	int iTagNum = 0;
+	bool bCommentFlag = false;
+	
+	for (;;) {
+		// keep the int result: EOF also covers read errors, where feof() stays false
+		const int kiChar = fgetc(m_pCfgFile);
+		
+		if ( kiChar == EOF || kiChar == '\n' ){
+			++ m_ulLines;
+			break;
+		}
+		if ( kiChar == '#' )
+			bCommentFlag = true;
+		if ( bCommentFlag )
+			continue;
+		
+		if ( kiChar == '\t' || kiChar == ' ' ){
+			// a separator closes the current token only if it is not empty
+			if ( iTagNum < kiValSize && !pStr[iTagNum].empty() )
+				++ iTagNum;
+		}
+		else if ( iTagNum < kiValSize ){
+			// never write past the last slot handed in by the caller
+			pStr[iTagNum] += (char)kiChar;
+		}
+	}
+	
+	return 1+iTagNum;
+}
+
+/*!
+ * \brief	Tell whether reading has hit the end of the configuration file.
+ *
+ * \return	true when no file is open or feof() reports end of file
+ */
+const bool CReadConfig::EndOfFile()
+{
+	if (NULL == m_pCfgFile)
+		return true;
+	
+	return (feof(m_pCfgFile) != 0);
+}
+
+/*!
+ * \brief	Return the line counter maintained by ReadLine(), converted to int.
+ */
+const int CReadConfig::GetLines()
+{
+	const int kiLines = m_ulLines;
+	return kiLines;
+}
+
+/*!
+ * \brief	Tell whether a configuration file is currently open.
+ */
+const bool CReadConfig::ExistFile()
+{
+	return (NULL != m_pCfgFile);
+}
+
+/*!
+ * \brief	Return the file name this reader was given.
+ */
+const string& CReadConfig::GetFileName()
+{
+	const string& krName = m_strCfgFileName;
+	return krName;
+}
--- /dev/null
+++ b/codec/console/enc/inc/read_config.h
@@ -1,0 +1,82 @@
+/*!
+ * \copy
+ *     Copyright (c)  2008-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  read_config.h
+ *
+ *  Abstract
+ *      Class for reading parameter settings in a configure file.
+ *
+ *  History
+ *      08/18/2008 Created
+ *
+ *****************************************************************************/
+#ifndef READ_CONFIG_H__
+#define READ_CONFIG_H__
+
+#include <stdlib.h>
+#include <string>
+#include "wels_const.h"
+using namespace std;
+
+// File names collected while parsing the encoder configuration / command line.
+typedef struct tagFilesSet
+{
+	string strBsFile;	// output bitstream file ("OutputFile" tag, "-bf" option)
+	string strSeqFile;	// for cmd lines
+	// one entry per dependency (spatial) layer
+	struct
+	{
+		string strLayerCfgFile;	// layer configuration file ("LayerCfg" tag)
+		string strSeqFile;	// input sequence of the layer ("InputFile" tag)
+	} sSpatialLayers[MAX_DEPENDENCY_LAYER];
+} SFilesSet;
+
+
+// Line oriented reader for configuration files: each line is split into
+// blank/tab separated tokens and '#' starts a comment.
+// NOTE(review): the class owns a raw FILE* that the destructor closes and it
+// declares no copy constructor/assignment, so copying an instance would
+// presumably close the same stream twice -- confirm that no caller copies it.
+class CReadConfig
+{
+public:
+	CReadConfig();	// no file opened; use Openf()
+	CReadConfig( const char *pConfigFileName );	// opens the file for reading if the name is not empty
+	CReadConfig( const string& pConfigFileName );	// opens the file for reading if the name is not empty
+	virtual ~CReadConfig();	// closes the file if open
+	
+	void Openf(const char * strFile);	// stores the name and opens the file; ignored for NULL or ""
+	long ReadLine( string* strVal, const int iValSize = 4 );	// reads one line into strVal[0..iValSize-1]; 0 on invalid state/arguments
+	const bool EndOfFile();	// true if no file is open or feof() is set
+	const int GetLines();	// number of lines read so far
+	const bool ExistFile();	// true if a file is open
+	const string& GetFileName();	// name passed to the constructor or Openf()
+	
+private:
+	FILE			*m_pCfgFile;	// NULL while no file is open
+	string			m_strCfgFileName;
+	unsigned long	m_iLines;	// incremented by ReadLine() for every line read
+};
+
+#endif	// READ_CONFIG_H__
+
--- /dev/null
+++ b/codec/console/enc/src/bundlewelsenc.cpp
@@ -1,0 +1,149 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include <string.h>
+#include <Carbon/Carbon.h>
+#include <CoreFoundation/CFBundle.h>
+
+#include <dlfcn.h>
+#include <string>
+
+#include "bundleloader.h"
+#include "codec_api.h"
+
+// Signatures the "CreateSVCEncoder" / "DestroySVCEncoder" symbols of the
+// encoder bundle are cast to before being called.
+typedef long (*LPCreateWelsCSEncoder)(ISVCEncoder** ppEncoder);
+typedef void (*LPDestroyWelsCSEncoder)(ISVCEncoder* pEncoder);
+
+// Handle of the encoder bundle; set by WelsEncBundleLoad(), nil until then.
+CFBundleRef g_at264Module = nil;
+
+// File name of the encoder bundle, appended to the current module path.
+const char H264EncoderDLL[] = "welsenc.bundle";
+
+/*!
+ * \brief	Write the directory of the module containing this code, including
+ *			a trailing '/', into lpModulePath.
+ *
+ * \param	lpModulePath	[out] destination buffer, zero filled first
+ * \param	iPathMax		size of lpModulePath in bytes
+ *
+ * \return	0 on success, -1 on invalid arguments, -2 when the module path
+ *			cannot be determined or contains no '/'
+ */
+int WelsEncGetCurrentModulePath(char* lpModulePath, const int iPathMax)
+{
+	if(lpModulePath == NULL || iPathMax <= 0)
+	{
+		return -1;
+	}
+	
+	memset(lpModulePath, 0, iPathMax);
+	
+	Dl_info 	dlInfo;
+	static int  sDummy;	// any object of this module works as lookup address
+	// dladdr() returns 0 on failure and does not fill in dlInfo then
+	if(0 == dladdr((void*)&sDummy, &dlInfo) || NULL == dlInfo.dli_fname)
+	{
+		return -2;
+	}
+	
+	char cCurrentPath[PATH_MAX];
+	memset(cCurrentPath, 0, PATH_MAX);
+	strlcpy(cCurrentPath, dlInfo.dli_fname, PATH_MAX);
+	
+	// strip the file name: keep everything in front of the last '/'
+	std::string strPath(cCurrentPath);
+	const std::string::size_type kuiPos = strPath.rfind('/');	// size_type, so npos is compared without narrowing
+	if(std::string::npos == kuiPos)
+	{
+		return -2;
+	}
+	strPath.erase(kuiPos);
+	
+	strlcpy(lpModulePath, strPath.c_str(), iPathMax);
+	strlcat(lpModulePath, "/", iPathMax);
+	
+	return 0;
+}
+
+/*!
+ * \brief	Load the encoder bundle that lies next to the current module and
+ *			store its handle in g_at264Module.
+ *
+ * \return	0 when LoadBundle() returned a handle, 1 otherwise
+ */
+int32_t WelsEncBundleLoad()
+{
+	const int kiPathMax = 512;
+	char achBundlePath[kiPathMax] = {0};
+	
+	WelsEncGetCurrentModulePath(achBundlePath, kiPathMax);
+	strlcat(achBundlePath, H264EncoderDLL, kiPathMax);
+	
+	g_at264Module = LoadBundle(achBundlePath);
+	
+	return (g_at264Module != NULL) ? 0 : 1;
+}
+
+/*!
+ * \brief	Unload the executable of the encoder bundle, if one is loaded.
+ *
+ * g_at264Module itself is left untouched.
+ */
+void WelsEncBundleFree()
+{
+	if(NULL == g_at264Module)
+	{
+		return;
+	}
+	
+	CFBundleUnloadExecutable(g_at264Module);
+}
+
+/*!
+ * \brief	Call the "CreateSVCEncoder" entry point of the loaded bundle.
+ *
+ * \param	ppEncoder	forwarded unchanged to "CreateSVCEncoder"
+ *
+ * \return	1 when no bundle is loaded or the entry point is missing,
+ *			otherwise the value returned by "CreateSVCEncoder"
+ */
+int32_t WelsEncBundleCreateEncoder(ISVCEncoder** ppEncoder)
+{
+	if(NULL == g_at264Module)
+		return 1;
+	
+	LPCreateWelsCSEncoder pfnCreate = (LPCreateWelsCSEncoder)GetProcessAddress(g_at264Module, "CreateSVCEncoder");
+	if(NULL == pfnCreate)
+		return 1;
+	
+	return pfnCreate( ppEncoder );
+}
+
+/*!
+ * \brief	Call the "DestroySVCEncoder" entry point of the loaded bundle.
+ *
+ * \param	pEncoder	forwarded unchanged to "DestroySVCEncoder"
+ *
+ * \return	0 when the entry point was called, 1 when no bundle is loaded or
+ *			the entry point is missing
+ */
+int32_t WelsEncBundleDestroyEncoder(ISVCEncoder* pEncoder)
+{
+	if(NULL == g_at264Module)
+		return 1;
+	
+	LPDestroyWelsCSEncoder pfnDestroy = (LPDestroyWelsCSEncoder)GetProcessAddress(g_at264Module, "DestroySVCEncoder");
+	if(NULL == pfnDestroy)
+		return 1;
+	
+	pfnDestroy( pEncoder );
+	return 0;
+}
+
+
--- /dev/null
+++ b/codec/console/enc/src/read_config.cpp
@@ -1,0 +1,160 @@
+/*!
+ * \copy
+ *     Copyright (c)  2008-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  read_config.h
+ *
+ *  Abstract
+ *      Class for reading parameter settings in a configure file.
+ *
+ *  History
+ *      08/18/2008 Created
+ *
+ *****************************************************************************/
+
+#include <stdio.h>
+#include <string.h>
+#include "read_config.h"
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable:4996)
+#endif
+
+/*!
+ * \brief	Construct a reader without a file: empty name, zero lines read.
+ */
+CReadConfig::CReadConfig()
+: m_pCfgFile( NULL )
+, m_strCfgFileName()
+, m_iLines( 0 )
+{
+}
+
+/*!
+ * \brief	Construct a reader and try to open the given configuration file.
+ *
+ * \param	kpConfigFileName	path of the file; NULL is treated like "",
+ *								the same way Openf() ignores NULL
+ *
+ * No file is opened for an empty name. Whether fopen() succeeded can be
+ * queried with ExistFile().
+ */
+CReadConfig::CReadConfig( const char *kpConfigFileName )
+: m_pCfgFile(0)
+, m_strCfgFileName(kpConfigFileName != NULL ? kpConfigFileName : "")	// std::string must not be built from NULL
+, m_iLines(0)
+{
+	if ( !m_strCfgFileName.empty() ){
+		m_pCfgFile = fopen(m_strCfgFileName.c_str(), "r");
+	}
+}
+
+/*!
+ * \brief	Construct a reader and try to open the given configuration file.
+ *
+ * \param	kpConfigFileName	path of the file; nothing is opened when empty
+ */
+CReadConfig::CReadConfig( const string& kpConfigFileName )
+: m_pCfgFile(0)
+, m_strCfgFileName(kpConfigFileName)
+, m_iLines(0)
+{
+	if ( kpConfigFileName.empty() )
+		return;
+	
+	m_pCfgFile = fopen(kpConfigFileName.c_str(), "r");
+}
+
+/*!
+ * \brief	Close the configuration file if one is open.
+ */
+CReadConfig::~CReadConfig()
+{
+	if ( NULL == m_pCfgFile )
+		return;
+	
+	fclose( m_pCfgFile );
+	m_pCfgFile = NULL;
+}
+
+/*!
+ * \brief	Remember a new file name and open that file for reading.
+ *
+ * \param	kpStrFile	path of the file; the call is ignored for NULL or ""
+ *
+ * A file that is still open from the constructor or an earlier call is closed
+ * first. The line counter is not reset.
+ */
+void CReadConfig::Openf(const char *kpStrFile)
+{
+	if ( kpStrFile == NULL || strlen(kpStrFile) == 0 )	// confirmed_safe_unsafe_usage
+		return;
+	
+	// do not leak the stream opened previously
+	if ( m_pCfgFile != NULL ){
+		fclose( m_pCfgFile );
+		m_pCfgFile = NULL;
+	}
+	
+	m_strCfgFileName = kpStrFile;
+	m_pCfgFile = fopen(kpStrFile, "r");
+}
+
+/*!
+ * \brief	Read one line and split it into blank/tab separated tokens.
+ *
+ * Everything from a '#' up to the end of the line is ignored. Tokens beyond
+ * the kiValSize-th one are read and discarded.
+ *
+ * \param	pVal		array of at least kiValSize strings; all of them are
+ *						cleared before the line is read
+ * \param	kiValSize	number of elements in pVal, must be > 1
+ *
+ * \return	0 when no file is open, pVal is NULL or kiValSize <= 1;
+ *			otherwise 1 + the number of stored tokens that were followed by
+ *			a blank or tab
+ */
+long CReadConfig::ReadLine( string* pVal, const int kiValSize/* = 4*/ )
+{
+	if ( m_pCfgFile == NULL || pVal == NULL || kiValSize <= 1)
+		return 0;
+	
+	for (int n = 0; n < kiValSize; ++ n)
+		pVal[n]	= "";
+	
+	int nTagNum = 0;
+	bool bCommentFlag = false;
+	
+	for (;;) {
+		// keep the int result: EOF also covers read errors, where feof() stays false
+		const int kiCh = fgetc(m_pCfgFile);
+		
+		if ( kiCh == EOF || kiCh == '\n' ){
+			++ m_iLines;
+			break;
+		}
+		if ( kiCh == '#' )
+			bCommentFlag = true;
+		if ( bCommentFlag )
+			continue;
+		
+		if ( kiCh == '\t' || kiCh == ' ' ){
+			// a separator closes the current token only if it is not empty
+			if ( nTagNum < kiValSize && !pVal[nTagNum].empty() )
+				++ nTagNum;
+		}
+		else if ( nTagNum < kiValSize ){
+			// never write past the last slot handed in by the caller
+			pVal[nTagNum] += (char)kiCh;
+		}
+	}
+	
+	return 1+nTagNum;
+}
+
+/*!
+ * \brief	Tell whether reading has hit the end of the configuration file.
+ *
+ * \return	true when no file is open or feof() reports end of file
+ */
+const bool CReadConfig::EndOfFile()
+{
+	if (NULL == m_pCfgFile)
+		return true;
+	
+	return (feof(m_pCfgFile) != 0);
+}
+
+/*!
+ * \brief	Return the line counter maintained by ReadLine(), converted to int.
+ */
+const int CReadConfig::GetLines()
+{
+	const int kiLines = m_iLines;
+	return kiLines;
+}
+
+/*!
+ * \brief	Tell whether a configuration file is currently open.
+ */
+const bool CReadConfig::ExistFile()
+{
+	return (NULL != m_pCfgFile);
+}
+
+/*!
+ * \brief	Return the file name this reader was given.
+ */
+const string& CReadConfig::GetFileName()
+{
+	const string& krName = m_strCfgFileName;
+	return krName;
+}
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
--- /dev/null
+++ b/codec/console/enc/src/welsenc.cpp
@@ -1,0 +1,1564 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <signal.h>
+
+#ifdef ONLY_ENC_FRAMES_NUM
+#undef ONLY_ENC_FRAMES_NUM
+#endif//ONLY_ENC_FRAMES_NUM
+#define ONLY_ENC_FRAMES_NUM		INT_MAX // 2, INT_MAX	// type the num you try to encode here, 2, 10, etc
+
+
+
+//#define STICK_STREAM_SIZE
+
+#if defined(__GNUC__)
+#if !defined(MACOS)
+#if !defined(_MATH_H_MATHDEF)
+#define _MATH_H_MATHDEF
+//#else
+//#error "warning: have defined _MATH_H_MATHDEF!!"	// to check
+#endif//_MATH_H_MATHDEF
+#endif//MACOS
+#endif//__GNUC__
+
+#include "measure_time.h"
+#include "param_svc.h"
+//#include "layered_pic_buffer.h"
+#include "read_config.h"
+
+#if defined(MACOS)
+#include "bundlewelsenc.h"
+#else
+#include "typedefs.h"
+#endif//MACOS
+
+#ifdef _MSC_VER
+#include <io.h>     /* _setmode() */
+#include <fcntl.h>  /* _O_BINARY */
+#endif//_MSC_VER
+
+#include "codec_def.h"
+#include "codec_api.h"
+#include "extern.h"
+#include "macros.h"
+#include "wels_const.h"
+
+#ifdef MT_ENABLED
+#include "mt_defs.h"
+#include "WelsThreadLib.h"
+#endif//MT_ENABLED
+
+#include <iostream>
+using namespace std;
+using namespace WelsSVCEnc;
+
+/*
+ *	Layer Context
+ */
+// Per layer values gathered while a layer configuration file is parsed and
+// copied into the layer parameters afterwards.
+typedef struct LayerpEncCtx_s {
+	int32_t				iDLayerQp;	// value of the "InitialQP" tag
+	SMulSliceOption	sMso;	// slice settings ("SliceMode", "SliceSize", "SliceNum", "SlicesAssign*")
+} SLayerPEncCtx;
+
+
+
+/* Ctrl-C handler */
+/* The flag is written from a signal handler, so it has to be a
+ * volatile sig_atomic_t object to be accessed safely. */
+static volatile sig_atomic_t g_iCtrlC = 0;
+
+/* Record that the signal arrived; the signal number is not used. */
+static void    SigIntHandler( int a )
+{
+    (void)a;
+    g_iCtrlC = 1;
+}
+
+/*!
+ * \brief	Fill the coding parameters and file names from the main
+ *			configuration file and from the layer configuration files that
+ *			its "LayerCfg" lines name.
+ *
+ * Unknown tags are ignored. The number of dependency layers is lowered to
+ * the number of "LayerCfg" lines if fewer of them were found than announced
+ * by "NumLayers". A layer whose configuration file cannot be opened is
+ * reported on stderr and skipped.
+ *
+ * \param	cRdCfg		reader of the main configuration file
+ * \param	pSvcParam	[out] coding parameters, updated tag by tag
+ * \param	sFileSet	[out] bitstream, layer configuration and input file names
+ *
+ * \return	0 on success, 1 when a value is out of range, too many
+ *			"LayerCfg" lines are given or a slice index is invalid
+ */
+int ParseConfig(CReadConfig& cRdCfg, SWelsSvcCodingParam& pSvcParam, SFilesSet& sFileSet)
+{
+	string strTag[4];
+	int32_t iLeftTargetBitrate = 0;
+	int32_t	iLeftSpatialBitrate[MAX_DEPENDENCY_LAYER] = { 0 };
+	int32_t iRet = 0;
+	int8_t iLayerCount = 0;
+	string str_("SlicesAssign");
+	const int kiSize = str_.size();
+	
+	while ( !cRdCfg.EndOfFile() ){
+		long iRd = cRdCfg.ReadLine(&strTag[0]);
+		if (iRd <= 0 || strTag[0].empty())
+			continue;
+		
+		if (strTag[0].compare("OutputFile") == 0){
+			sFileSet.strBsFile	= strTag[1];
+		}
+		else if (strTag[0].compare("MaxFrameRate") == 0){
+			pSvcParam.fMaxFrameRate	= (float)atof(strTag[1].c_str());
+		}
+		else if (strTag[0].compare("FramesToBeEncoded") == 0){
+			pSvcParam.uiFrameToBeCoded	= atoi(strTag[1].c_str());
+		}
+		else if ( strTag[0].compare("SourceSequenceInRGB24") == 0 ){
+			pSvcParam.iInputCsp	= atoi(strTag[1].c_str()) == 0 ? videoFormatI420 : videoFormatRGB;
+		}
+		else if (strTag[0].compare("GOPSize") == 0){
+			pSvcParam.uiGopSize	= atoi(strTag[1].c_str());
+		}
+		else if (strTag[0].compare("IntraPeriod") == 0){
+			pSvcParam.uiIntraPeriod	= atoi(strTag[1].c_str());
+		}
+		else if (strTag[0].compare("EnableSpsPpsIDAddition") == 0){
+			pSvcParam.bEnableSpsPpsIdAddition	= atoi(strTag[1].c_str())?true:false;
+		}
+		else if (strTag[0].compare("EnableScalableSEI") == 0){
+			pSvcParam.bEnableSSEI	= atoi(strTag[1].c_str())?true:false;
+		}
+		else if (strTag[0].compare("EnableFrameCropping") == 0){
+			pSvcParam.bEnableFrameCroppingFlag = (atoi(strTag[1].c_str()) != 0);
+		}
+		else if (strTag[0].compare("LoopFilterDisableIDC") == 0){
+			pSvcParam.iLoopFilterDisableIdc	= (int8_t)atoi(strTag[1].c_str());
+			if (pSvcParam.iLoopFilterDisableIdc > 6 || pSvcParam.iLoopFilterDisableIdc < 0){
+				fprintf(stderr, "Invalid parameter in iLoopFilterDisableIdc: %d.\n", pSvcParam.iLoopFilterDisableIdc);
+				iRet = 1;
+				break;
+			}
+		}
+		else if (strTag[0].compare("LoopFilterAlphaC0Offset") == 0){
+			pSvcParam.iLoopFilterAlphaC0Offset	= (int8_t)atoi(strTag[1].c_str());
+			if ( pSvcParam.iLoopFilterAlphaC0Offset < -6 )
+				pSvcParam.iLoopFilterAlphaC0Offset	= -6;
+			else if ( pSvcParam.iLoopFilterAlphaC0Offset > 6 )
+				pSvcParam.iLoopFilterAlphaC0Offset	= 6;
+		}
+		else if (strTag[0].compare("LoopFilterBetaOffset") == 0){
+			pSvcParam.iLoopFilterBetaOffset	= (int8_t)atoi(strTag[1].c_str());
+			if ( pSvcParam.iLoopFilterBetaOffset < -6 )
+				pSvcParam.iLoopFilterBetaOffset	= -6;
+			else if ( pSvcParam.iLoopFilterBetaOffset > 6 )
+				pSvcParam.iLoopFilterBetaOffset	= 6;
+		}
+		else if (strTag[0].compare("InterLayerLoopFilterDisableIDC") == 0){
+			pSvcParam.iInterLayerLoopFilterDisableIdc = (int8_t)atoi(strTag[1].c_str());
+			if (pSvcParam.iInterLayerLoopFilterDisableIdc > 6 || pSvcParam.iInterLayerLoopFilterDisableIdc < 0){
+				fprintf(stderr, "Invalid parameter in iInterLayerLoopFilterDisableIdc: %d.\n", pSvcParam.iInterLayerLoopFilterDisableIdc);
+				iRet = 1;
+				break;
+			}
+		}
+		else if (strTag[0].compare("InterLayerLoopFilterAlphaC0Offset") == 0){
+			pSvcParam.iInterLayerLoopFilterAlphaC0Offset	= (int8_t)atoi(strTag[1].c_str());
+			if ( pSvcParam.iInterLayerLoopFilterAlphaC0Offset < -6 )
+				pSvcParam.iInterLayerLoopFilterAlphaC0Offset	= -6;
+			else if ( pSvcParam.iInterLayerLoopFilterAlphaC0Offset > 6 )
+				pSvcParam.iInterLayerLoopFilterAlphaC0Offset	= 6;
+		}
+		else if (strTag[0].compare("InterLayerLoopFilterBetaOffset") == 0){
+			pSvcParam.iInterLayerLoopFilterBetaOffset	= (int8_t)atoi(strTag[1].c_str());
+			if ( pSvcParam.iInterLayerLoopFilterBetaOffset < -6 )
+				pSvcParam.iInterLayerLoopFilterBetaOffset	= -6;
+			else if ( pSvcParam.iInterLayerLoopFilterBetaOffset > 6 )
+				pSvcParam.iInterLayerLoopFilterBetaOffset	= 6;
+		}
+		else if ( strTag[0].compare("MultipleThreadIdc") == 0 ){
+			// # 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
+			pSvcParam.iMultipleThreadIdc	= atoi( strTag[1].c_str() );
+			if ( pSvcParam.iMultipleThreadIdc < 0 )
+				pSvcParam.iMultipleThreadIdc = 0;
+			else if ( pSvcParam.iMultipleThreadIdc > MAX_THREADS_NUM )
+				pSvcParam.iMultipleThreadIdc = MAX_THREADS_NUM;
+		}
+		else if (strTag[0].compare("EnableRC") == 0){
+			pSvcParam.bEnableRc	= atoi(strTag[1].c_str())?true:false;
+		}
+		else if (strTag[0].compare("RCMode") == 0){
+			pSvcParam.iRCMode	= atoi(strTag[1].c_str());
+		}
+		else if (strTag[0].compare("TargetBitrate") == 0){
+			// the check below only sees bEnableRc if "EnableRC" precedes this tag in the file
+			pSvcParam.iTargetBitrate	= 1000 * atoi(strTag[1].c_str());
+			if ( pSvcParam.bEnableRc && pSvcParam.iTargetBitrate <= 0 ){
+				fprintf(stderr, "Invalid target bitrate setting due to RC enabled. Check TargetBitrate field please!\n");
+				return 1;
+			}
+			if ( pSvcParam.bEnableRc ){
+				iLeftTargetBitrate	= pSvcParam.iTargetBitrate;
+			}
+		}
+		else if (strTag[0].compare("EnableDenoise") == 0){
+			pSvcParam.bEnableDenoise	= atoi(strTag[1].c_str())?true:false;
+		}
+		else if (strTag[0].compare("EnableSceneChangeDetection") == 0){
+			pSvcParam.bEnableSceneChangeDetect	= atoi(strTag[1].c_str())?true:false;
+		}
+		else if (strTag[0].compare("EnableBackgroundDetection") == 0){
+			pSvcParam.bEnableBackgroundDetection	= atoi(strTag[1].c_str())?true:false;
+		}
+		else if (strTag[0].compare("EnableAdaptiveQuantization") == 0){
+			pSvcParam.bEnableAdaptiveQuant	= atoi(strTag[1].c_str())?true:false;
+		}
+		else if (strTag[0].compare("EnableLongTermReference") == 0){
+			pSvcParam.bEnableLongTermReference	= atoi(strTag[1].c_str())?true:false;
+		}
+		else if (strTag[0].compare("LtrMarkPeriod") == 0){
+			pSvcParam.uiLtrMarkPeriod	= (uint32_t)atoi(strTag[1].c_str());
+		}
+		else if (strTag[0].compare("NumLayers") == 0){
+			pSvcParam.iNumDependencyLayer	= (int8_t)atoi(strTag[1].c_str());
+			if (pSvcParam.iNumDependencyLayer > MAX_DEPENDENCY_LAYER || pSvcParam.iNumDependencyLayer <= 0){
+				fprintf(stderr, "Invalid parameter in iNumDependencyLayer: %d.\n", pSvcParam.iNumDependencyLayer);
+				iRet = 1;
+				break;
+			}
+		}
+		else if (strTag[0].compare("LayerCfg") == 0){
+			// sSpatialLayers has MAX_DEPENDENCY_LAYER entries only
+			if ( iLayerCount >= MAX_DEPENDENCY_LAYER ){
+				fprintf(stderr, "Too many LayerCfg entries: at most %d layers are supported.\n", (int)MAX_DEPENDENCY_LAYER);
+				iRet = 1;
+				break;
+			}
+			if ( strTag[1].length() > 0 )
+				sFileSet.sSpatialLayers[iLayerCount].strLayerCfgFile	= strTag[1];
+			++ iLayerCount;
+		}
+		else if (strTag[0].compare("PrefixNALAddingCtrl") == 0){
+			// values above 1 count as 1, negative values as 0
+			pSvcParam.bPrefixNalAddingCtrl	= (atoi(strTag[1].c_str()) > 0);
+		}
+	}
+
+	const int8_t kiActualLayerNum = WELS_MIN(pSvcParam.iNumDependencyLayer, iLayerCount);
+	if (pSvcParam.iNumDependencyLayer > kiActualLayerNum){	// fixed number of dependency layer due to parameter error in settings
+		pSvcParam.iNumDependencyLayer	= kiActualLayerNum;
+	}
+	
+	assert( kiActualLayerNum <= MAX_DEPENDENCY_LAYER );
+
+	for (int8_t iLayer = 0; iLayer < kiActualLayerNum; ++ iLayer){
+		SLayerPEncCtx sLayerCtx;
+
+		SDLayerParam *pDLayer = &pSvcParam.sDependencyLayers[iLayer];
+		CReadConfig cRdLayerCfg( sFileSet.sSpatialLayers[iLayer].strLayerCfgFile );
+
+		memset(&sLayerCtx, 0, sizeof(SLayerPEncCtx));
+
+		if ( !cRdLayerCfg.ExistFile() ){
+			fprintf(stderr, "Unabled to open layer #%d configuration file: %s.\n", iLayer, cRdLayerCfg.GetFileName().c_str());
+			continue;
+		}
+		
+		while ( !cRdLayerCfg.EndOfFile() ){
+			long iLayerRd = cRdLayerCfg.ReadLine(&strTag[0]);
+			if (iLayerRd <= 0 || strTag[0].empty())
+				continue;
+			
+			if (strTag[0].compare("SourceWidth") == 0){
+				pDLayer->iFrameWidth	= atoi(strTag[1].c_str());
+				pDLayer->iActualWidth= pDLayer->iFrameWidth;
+			}
+			else if (strTag[0].compare("SourceHeight") == 0){
+				pDLayer->iFrameHeight	= atoi(strTag[1].c_str());
+				pDLayer->iActualHeight	= pDLayer->iFrameHeight;
+			}
+			else if (strTag[0].compare("FrameRateIn") == 0){
+				pDLayer->fInputFrameRate	= (float)atof(strTag[1].c_str());
+			}
+			else if (strTag[0].compare("FrameRateOut") == 0){
+				pDLayer->fOutputFrameRate = (float)atof(strTag[1].c_str());
+			}
+			else if (strTag[0].compare("InputFile") == 0){
+				if ( strTag[1].length() > 0 )
+					sFileSet.sSpatialLayers[iLayer].strSeqFile	= strTag[1];
+			}
+			else if (strTag[0].compare("ReconFile") == 0){
+				const int kiLen = strTag[1].length();
+				if (kiLen >= MAX_FNAME_LEN)
+					return 1;
+#ifdef ENABLE_FRAME_DUMP
+				pDLayer->sRecFileName[kiLen] = '\0';
+				strncpy(pDLayer->sRecFileName, strTag[1].c_str(), kiLen);	// confirmed_safe_unsafe_usage
+#endif//ENABLE_FRAME_DUMP
+			}
+			else if (strTag[0].compare("ProfileIdc") == 0){
+				pDLayer->uiProfileIdc	= atoi(strTag[1].c_str());
+			}
+			else if (strTag[0].compare("FRExt") == 0){
+				// recognised but currently without effect
+			}
+			else if (strTag[0].compare("SpatialBitrate") == 0){
+				pDLayer->iSpatialBitrate	= 1000 * atoi(strTag[1].c_str());
+				if ( pSvcParam.bEnableRc && pDLayer->iSpatialBitrate <= 0 ){
+					fprintf(stderr, "Invalid spatial bitrate(%d) in dependency layer #%d.\n", pDLayer->iSpatialBitrate, iLayer);
+					return 1;
+				}
+				if ( pSvcParam.bEnableRc &&pDLayer->iSpatialBitrate > iLeftTargetBitrate ){
+					fprintf(stderr, "Invalid spatial(#%d) bitrate(%d) setting due to unavailable left(%d)!\n", iLayer, pDLayer->iSpatialBitrate, iLeftTargetBitrate);
+					return 1;
+				}
+				iLeftSpatialBitrate[iLayer]	= pDLayer->iSpatialBitrate;
+			}
+			else if (strTag[0].compare("InitialQP") == 0){
+				sLayerCtx.iDLayerQp	= atoi(strTag[1].c_str());
+			}
+			else if (strTag[0].compare("SliceMode") == 0){
+				sLayerCtx.sMso.uiSliceMode	= (SliceMode)atoi(strTag[1].c_str());
+			}
+			else if (strTag[0].compare("SliceSize") == 0){//SM_DYN_SLICE
+				sLayerCtx.sMso.sSliceArgument.uiSliceSizeConstraint	= (SliceMode)atoi(strTag[1].c_str());
+			}
+			else if (strTag[0].compare("SliceNum") == 0){
+				sLayerCtx.sMso.sSliceArgument.iSliceNum = atoi(strTag[1].c_str());
+			}
+			else if ( strTag[0].compare(0, kiSize, str_ ) == 0 ){
+				// the slice index is the number that follows the "SlicesAssign" prefix
+				const char* kpString = strTag[0].c_str();
+				const int kiSliceIdx = atoi(&kpString[kiSize]);
+				// checked at run time: an assert vanishes in release builds and misses negative values
+				if ( kiSliceIdx < 0 || kiSliceIdx >= MAX_SLICES_NUM ){
+					fprintf(stderr, "Invalid slice index %d in configuration of layer #%d.\n", kiSliceIdx, iLayer);
+					return 1;
+				}
+				sLayerCtx.sMso.sSliceArgument.uiSliceMbNum[kiSliceIdx] = atoi( strTag[1].c_str() );
+			}
+		}
+		pDLayer->iDLayerQp	= sLayerCtx.iDLayerQp;
+		pDLayer->sMso.uiSliceMode		= sLayerCtx.sMso.uiSliceMode;
+
+		memcpy( &pDLayer->sMso, &sLayerCtx.sMso, sizeof(SMulSliceOption) );	// confirmed_safe_unsafe_usage
+		memcpy( &pDLayer->sMso.sSliceArgument.uiSliceMbNum[0], &sLayerCtx.sMso.sSliceArgument.uiSliceMbNum[0], sizeof(sLayerCtx.sMso.sSliceArgument.uiSliceMbNum) );	// confirmed_safe_unsafe_usage
+	}
+
+	return iRet;
+}
+
+/*!
+ * \brief	Check that the option at index kiIdx is followed by kiNeeded values.
+ *
+ * \return	true if argv[kiIdx + 1] .. argv[kiIdx + kiNeeded] exist; otherwise
+ *			false after an error message was written to stderr
+ */
+static bool HasOptionValues(const char* kpOption, const int kiIdx, const int kiArgc, const int kiNeeded)
+{
+	if ( kiIdx + kiNeeded < kiArgc )
+		return true;
+	
+	fprintf(stderr, "Missing value for option %s.\n", kpOption);
+	return false;
+}
+
+/*!
+ * \brief	Apply command line options to the encoding parameters.
+ *
+ * Handled options: -numl, -numt, -iper, -spsid, -denois, -bgd, -aq, -ltr,
+ * -ltrper, -rcm, -tarb (one value each) and -ltarb (layer index and bitrate).
+ * Any other argument is skipped.
+ *
+ * \param	argc	number of entries in argv
+ * \param	argv	arguments to parse
+ * \param	sParam	[out] parameters updated by the handled options
+ *
+ * \return	0 on success, 1 when an option lacks its value(s) or -ltarb names
+ *			a negative layer
+ */
+int ParseCommandLine( int argc, char ** argv, SVCEncodingParam & sParam)
+{
+	if (argc <= 0) // no additional pCmd parameters 
+		return 0;
+
+	int i = 0;
+	while ( i < argc )
+	{
+		const char * pCmd = argv[i];
+
+		if( !strcmp(pCmd, "-numl") ) {	// confirmed_safe_unsafe_usage
+			if ( !HasOptionValues(pCmd, i, argc, 1) )
+				return 1;
+			sParam.iSpatialLayerNum = atoi(argv[i+1]);
+			i += 2;
+		} else if( !strcmp(pCmd, "-numt") ) {	// confirmed_safe_unsafe_usage
+			if ( !HasOptionValues(pCmd, i, argc, 1) )
+				return 1;
+			sParam.iTemporalLayerNum = atoi(argv[i+1]);
+			i += 2;
+		} else if( !strcmp(pCmd,"-iper") ) {	// confirmed_safe_unsafe_usage
+			if ( !HasOptionValues(pCmd, i, argc, 1) )
+				return 1;
+			sParam.iIntraPeriod = atoi(argv[i+1]);
+			i += 2;
+		} else if( !strcmp(pCmd,"-spsid") ) {	// confirmed_safe_unsafe_usage
+			if ( !HasOptionValues(pCmd, i, argc, 1) )
+				return 1;
+			sParam.bEnableSpsPpsIdAddition = atoi(argv[i+1])?true:false;
+			i += 2;
+		} else if( !strcmp(pCmd,"-denois") ) {	// confirmed_safe_unsafe_usage
+			if ( !HasOptionValues(pCmd, i, argc, 1) )
+				return 1;
+			sParam.bEnableDenoise = atoi(argv[i+1])?true:false;
+			i += 2;
+		} else if( !strcmp(pCmd,"-bgd") ) {	// confirmed_safe_unsafe_usage
+			if ( !HasOptionValues(pCmd, i, argc, 1) )
+				return 1;
+			sParam.bEnableBackgroundDetection = atoi(argv[i+1])?true:false;
+			i += 2;
+		} else if( !strcmp(pCmd,"-aq") ) {	// confirmed_safe_unsafe_usage
+			if ( !HasOptionValues(pCmd, i, argc, 1) )
+				return 1;
+			sParam.bEnableAdaptiveQuant = atoi(argv[i+1])?true:false;
+			i += 2;
+		} else if( !strcmp(pCmd,"-ltr") ) {	// confirmed_safe_unsafe_usage
+			if ( !HasOptionValues(pCmd, i, argc, 1) )
+				return 1;
+			sParam.bEnableLongTermReference = atoi(argv[i+1])?true:false;
+			i += 2;
+		} else if( !strcmp(pCmd,"-ltrper") ) {	// confirmed_safe_unsafe_usage
+			if ( !HasOptionValues(pCmd, i, argc, 1) )
+				return 1;
+			sParam.iLtrMarkPeriod = atoi(argv[i+1]);
+			i += 2;
+		} else if( !strcmp(pCmd,"-rcm") ) {	// confirmed_safe_unsafe_usage
+			if ( !HasOptionValues(pCmd, i, argc, 1) )
+				return 1;
+			sParam.iRCMode = atoi(argv[i+1]);
+			i += 2;
+		} else if( !strcmp(pCmd,"-tarb") ) {	// confirmed_safe_unsafe_usage
+			if ( !HasOptionValues(pCmd, i, argc, 1) )
+				return 1;
+			sParam.iTargetBitrate = atoi(argv[i+1]);
+			i += 2;
+		} else if( !strcmp(pCmd,"-ltarb") ) {	// confirmed_safe_unsafe_usage
+			if ( !HasOptionValues(pCmd, i, argc, 2) )
+				return 1;
+			const int kiLayer = atoi( argv[i+1] );
+			// NOTE(review): the capacity of sParam.sSpatialLayers is not visible
+			// here, so only negative indices are rejected -- add the upper bound.
+			if ( kiLayer < 0 ){
+				fprintf(stderr, "Invalid layer index %d for option %s.\n", kiLayer, pCmd);
+				return 1;
+			}
+			sParam.sSpatialLayers[kiLayer].iSpatialBitrate	= atoi( argv[i+2] );
+			i += 3;
+		} else {
+			i ++;
+		}
+	}
+
+    return 0;
+}
+
+/*!
+ * \brief	Print the command line syntax and the list of supported options
+ *			of the encoder console to stdout.
+ */
+void PrintHelp()
+{
+	printf("\n Wels SVC Encoder Usage:\n\n");
+	printf(" Syntax: welsenc.exe welsenc.cfg\n");
+	printf(" Syntax: welsenc.exe welsenc.cfg [options]\n");
+
+	// options that apply to the whole encoder
+	printf("\n Supported Options:\n");
+	printf("  -h      Print Help\n");
+	printf("  -bf     Bit Stream File\n");
+	printf("  -frms   Number of total frames to be encoded\n");
+	printf("  -gop    GOPSize - GOP size (2,4,8,16,32,64, default: 1)\n");
+	printf("  -iper   Intra period (default: -1) : must be a power of 2 of GOP size (or -1)\n");
+	printf("  -spsid   Enable id adding in SPS/PPS per IDR \n");
+	printf("  -denois Control denoising  (default: 0)\n");
+	printf("  -scene  Control scene change detection (default: 0)\n");
+	printf("  -bgd    Control background detection (default: 0)\n");
+	printf("  -aq     Control adaptive quantization (default: 0)\n");
+	printf("  -ltr    Control long term reference (default: 0)\n");
+	printf("  -rc	  Control rate control: 0-disable; 1-enable \n");
+	printf("  -tarb	  Overall target bitrate\n");
+	printf("  -numl   Number Of Layers: Must exist with layer_cfg file and the number of input layer_cfg file must equal to the value set by this command\n");
+	// options that take a layer id as their first value
+	printf("  The options below are layer-based: (need to be set with layer id)\n");
+	printf("  -org		(Layer) (original file); example: -org 0 src.yuv\n");
+	printf("  -drec		(Layer) (reconstruction file); Setting the reconstruction file, this will only functioning when dumping reconstruction is enabled\n");
+	printf("  -sw		(Layer) (source width)\n");
+	printf("  -sh		(Layer) (source height)\n");
+	printf("  -frin		(Layer) (input frame rate)\n");
+	printf("  -frout  	(Layer) (output frame rate)\n");
+	printf("  -lqp		(Layer) (base quality layer qp : must work with -ldeltaqp or -lqparr)\n");
+	printf("  -ltarb	    (Layer) (spatial layer target bitrate)\n");
+	printf("  -slcmd   (Layer) (spatial layer slice mode): pls refer to layerX.cfg for details ( -slcnum: set target slice num; -slcsize: set target slice size constraint ) \n");
+	printf("\n");
+}
+
+// Reports a malformed command line option on stderr.
+// Always returns 1, so callers can write "return WelsEncCmdError (...)".
+static int WelsEncCmdError (const char* kpOption, const char* kpReason)
+{
+	fprintf (stderr, "Invalid command line option %s: %s.\n", kpOption, kpReason);
+	return 1;
+}
+
+// Returns true when kpOption equals one of the kiCount entries of kppTable.
+static bool WelsEncIsOptionIn (const char* kpOption, const char* const* kppTable, const int kiCount)
+{
+	for (int i = 0; i < kiCount; ++ i)
+	{
+		if (!strcmp (kpOption, kppTable[i]))
+			return true;
+	}
+	return false;
+}
+
+// Reads the configuration file registered for dependency layer kiLayer and
+// applies its settings to that layer.
+// A file that cannot be opened is reported and skipped (the layer is left
+// untouched, return value 0). Returns 1 when the file holds an invalid entry.
+static int WelsEncParseLayerCfg (const int kiLayer, SWelsSvcCodingParam& sSvcParam, SFilesSet& sFileSet)
+{
+	const string kstrSlicesAssign ("SlicesAssign");
+	const int kiPrefixLen = (int)kstrSlicesAssign.size();
+	SDLayerParam* pDLayer = &sSvcParam.sDependencyLayers[kiLayer];
+	CReadConfig cRdLayerCfg (sFileSet.sSpatialLayers[kiLayer].strLayerCfgFile);
+	SLayerPEncCtx sLayerCtx;
+	string strTag[4];
+
+	memset (&sLayerCtx, 0, sizeof (SLayerPEncCtx));
+
+	if (!cRdLayerCfg.ExistFile())
+	{
+		fprintf (stderr, "Unabled to open layer #%d configuration file: %s.\n", kiLayer, cRdLayerCfg.GetFileName().c_str());
+		return 0;
+	}
+
+	while (!cRdLayerCfg.EndOfFile())
+	{
+		const long kiRead = cRdLayerCfg.ReadLine (&strTag[0]);
+		if (kiRead <= 0 || strTag[0].empty())
+			continue;
+
+		const string& kstrKey = strTag[0];
+		const char* kpValue = strTag[1].c_str();	// valid until the next ReadLine()
+
+		if (kstrKey == "SourceWidth")
+		{
+			pDLayer->iFrameWidth	= atoi (kpValue);
+			pDLayer->iActualWidth	= pDLayer->iFrameWidth;
+		}
+		else if (kstrKey == "SourceHeight")
+		{
+			pDLayer->iFrameHeight	= atoi (kpValue);
+			pDLayer->iActualHeight	= pDLayer->iFrameHeight;
+		}
+		else if (kstrKey == "FrameRateIn")
+		{
+			pDLayer->fInputFrameRate = (float)atof (kpValue);
+		}
+		else if (kstrKey == "FrameRateOut")
+		{
+			pDLayer->fOutputFrameRate = (float)atof (kpValue);
+		}
+		else if (kstrKey == "InputFile")
+		{
+			if (strTag[1].length() > 0)
+				sFileSet.sSpatialLayers[kiLayer].strSeqFile = strTag[1];
+		}
+		else if (kstrKey == "ReconFile")
+		{
+#ifdef ENABLE_FRAME_DUMP
+			const int kiLen = (int)strTag[1].length();
+			if (kiLen >= MAX_FNAME_LEN)
+				return 1;
+			strncpy (pDLayer->sRecFileName, kpValue, kiLen);	// confirmed_safe_unsafe_usage
+			pDLayer->sRecFileName[kiLen] = '\0';
+#endif//ENABLE_FRAME_DUMP
+		}
+		else if (kstrKey == "ProfileIdc")
+		{
+			pDLayer->uiProfileIdc = atoi (kpValue);
+		}
+		else if (kstrKey == "SpatialBitrate")
+		{
+			pDLayer->iSpatialBitrate = 1000 * atoi (kpValue);
+		}
+		else if (kstrKey == "InitialQP")
+		{
+			sLayerCtx.iDLayerQp = atoi (kpValue);
+		}
+		else if (kstrKey == "SliceMode")
+		{
+			sLayerCtx.sMso.uiSliceMode = (SliceMode)atoi (kpValue);
+		}
+		else if (kstrKey == "SliceSize")	// SM_DYN_SLICE
+		{
+			// A size is a plain number; it must not be squeezed through the SliceMode enum.
+			sLayerCtx.sMso.sSliceArgument.uiSliceSizeConstraint = atoi (kpValue);
+		}
+		else if (kstrKey == "SliceNum")
+		{
+			sLayerCtx.sMso.sSliceArgument.iSliceNum = atoi (kpValue);
+		}
+		else if (kstrKey.compare (0, kiPrefixLen, kstrSlicesAssign) == 0)
+		{
+			// "SlicesAssign<idx>": the slice index follows the tag prefix.
+			const int kiSliceIdx = atoi (kstrKey.c_str() + kiPrefixLen);
+			if (kiSliceIdx < 0 || kiSliceIdx >= MAX_SLICES_NUM)
+			{
+				fprintf (stderr, "Slice index %d out of range in layer #%d configuration file.\n", kiSliceIdx, kiLayer);
+				return 1;
+			}
+			sLayerCtx.sMso.sSliceArgument.uiSliceMbNum[kiSliceIdx] = atoi (kpValue);
+		}
+		// "FRExt" and any unknown tag are ignored.
+	}
+
+	// QP and slice options are taken from the file only; tags that were not
+	// present leave the zero values of sLayerCtx in the layer.
+	pDLayer->iDLayerQp			= sLayerCtx.iDLayerQp;
+	pDLayer->sMso.uiSliceMode	= sLayerCtx.sMso.uiSliceMode;
+	memcpy (&pDLayer->sMso, &sLayerCtx.sMso, sizeof (SMulSliceOption));	// confirmed_safe_unsafe_usage
+	memcpy (&pDLayer->sMso.sSliceArgument.uiSliceMbNum[0], &sLayerCtx.sMso.sSliceArgument.uiSliceMbNum[0], sizeof (sLayerCtx.sMso.sSliceArgument.uiSliceMbNum));	// confirmed_safe_unsafe_usage
+
+	return 0;
+}
+
+/*
+ * Applies command line overrides to the coding parameters and file set.
+ *
+ * argc/argv : the options only (program name and config file already stripped).
+ * pSvcParam : coding parameters updated in place.
+ * sFileSet  : bit stream / source / layer configuration file names updated in place.
+ *
+ * Global options take one value, layer-based options take a layer id plus
+ * one value, "-numl" takes a count followed by that many layer configuration
+ * files. Unknown tokens are skipped.
+ *
+ * Returns 0 on success and 1 when an option lacks its operands, names a
+ * layer outside [0, MAX_DEPENDENCY_LAYER), or a layer file is invalid.
+ */
+int ParseCommandLine(int argc, char** argv, SWelsSvcCodingParam & pSvcParam, SFilesSet& sFileSet) 
+{
+	static const char* const kppGlobalOptions[] = {
+		"-bf", "-frms", "-gop", "-iper", "-spsid", "-denois", "-scene",
+		"-bgd", "-aq", "-ltr", "-ltrper", "-rc", "-tarb"
+	};
+	static const char* const kppLayerOptions[] = {
+		"-org", "-drec", "-sw", "-sh", "-frin", "-frout", "-lqp",
+		"-ltarb", "-slcmd", "-slcsize", "-slcnum"
+	};
+	const int kiNumGlobalOptions	= (int)(sizeof(kppGlobalOptions) / sizeof(kppGlobalOptions[0]));
+	const int kiNumLayerOptions		= (int)(sizeof(kppLayerOptions) / sizeof(kppLayerOptions[0]));
+	int n = 0;
+
+	if (argc <= 0) // no additional pCmd parameters
+		return 0;
+
+	// NOTE(review): layer ids are bounded by MAX_DEPENDENCY_LAYER on the assumption that
+	// sDependencyLayers[] and sFileSet.sSpatialLayers[] hold that many entries - confirm
+	// against the structure declarations.
+	while (n < argc)
+	{
+		const char* kpCommand = argv[n++];
+
+		if (!strcmp (kpCommand, "-h"))	// confirmed_safe_unsafe_usage
+		{
+			PrintHelp();
+			continue;
+		}
+
+		if (!strcmp (kpCommand, "-numl"))	// confirmed_safe_unsafe_usage
+		{
+			if (n >= argc)
+				return WelsEncCmdError (kpCommand, "missing layer count");
+
+			const int kiNumLayers = atoi (argv[n++]);
+			if (kiNumLayers < 1 || kiNumLayers > MAX_DEPENDENCY_LAYER)
+				return WelsEncCmdError (kpCommand, "layer count out of range");
+			if (argc - n < kiNumLayers)
+				return WelsEncCmdError (kpCommand, "missing layer configuration file");
+
+			pSvcParam.iNumDependencyLayer = kiNumLayers;
+			for (int iLayer = 0; iLayer < kiNumLayers; ++ iLayer)
+				sFileSet.sSpatialLayers[iLayer].strLayerCfgFile.assign (argv[n++]);
+
+			for (int iLayer = 0; iLayer < kiNumLayers; ++ iLayer)
+			{
+				if (WelsEncParseLayerCfg (iLayer, pSvcParam, sFileSet) != 0)
+					return 1;
+			}
+			continue;
+		}
+
+		if (WelsEncIsOptionIn (kpCommand, kppGlobalOptions, kiNumGlobalOptions))
+		{
+			if (n >= argc)
+				return WelsEncCmdError (kpCommand, "missing value");
+
+			const char* kpValue = argv[n++];
+
+			if (!strcmp (kpCommand, "-bf"))
+				sFileSet.strBsFile.assign (kpValue);
+			else if (!strcmp (kpCommand, "-frms"))
+				pSvcParam.uiFrameToBeCoded = atoi (kpValue);
+			else if (!strcmp (kpCommand, "-gop"))
+				pSvcParam.uiGopSize = atoi (kpValue);
+			else if (!strcmp (kpCommand, "-iper"))
+				pSvcParam.uiIntraPeriod = atoi (kpValue);
+			else if (!strcmp (kpCommand, "-spsid"))
+				pSvcParam.bEnableSpsPpsIdAddition = atoi (kpValue) ? true : false;
+			else if (!strcmp (kpCommand, "-denois"))
+				pSvcParam.bEnableDenoise = atoi (kpValue) ? true : false;
+			else if (!strcmp (kpCommand, "-scene"))
+				pSvcParam.bEnableSceneChangeDetect = atoi (kpValue) ? true : false;
+			else if (!strcmp (kpCommand, "-bgd"))
+				pSvcParam.bEnableBackgroundDetection = atoi (kpValue) ? true : false;
+			else if (!strcmp (kpCommand, "-aq"))
+				pSvcParam.bEnableAdaptiveQuant = atoi (kpValue) ? true : false;
+			else if (!strcmp (kpCommand, "-ltr"))
+				pSvcParam.bEnableLongTermReference = atoi (kpValue) ? true : false;
+			else if (!strcmp (kpCommand, "-ltrper"))
+				pSvcParam.uiLtrMarkPeriod = atoi (kpValue);
+			else if (!strcmp (kpCommand, "-rc"))
+				pSvcParam.bEnableRc = atoi (kpValue) ? true : false;
+			else	// "-tarb"
+				pSvcParam.iTargetBitrate = atoi (kpValue);
+			continue;
+		}
+
+		if (WelsEncIsOptionIn (kpCommand, kppLayerOptions, kiNumLayerOptions))
+		{
+			if (argc - n < 2)
+				return WelsEncCmdError (kpCommand, "missing layer id or value");
+
+			const int kiLayer = atoi (argv[n++]);
+			if (kiLayer < 0 || kiLayer >= MAX_DEPENDENCY_LAYER)
+				return WelsEncCmdError (kpCommand, "layer id out of range");
+
+			const char* kpValue = argv[n++];
+			SDLayerParam* pDLayer = &pSvcParam.sDependencyLayers[kiLayer];
+
+			if (!strcmp (kpCommand, "-org"))
+			{
+				sFileSet.sSpatialLayers[kiLayer].strSeqFile.assign (kpValue);
+			}
+			else if (!strcmp (kpCommand, "-drec"))
+			{
+#ifdef ENABLE_FRAME_DUMP
+				// Same length limit as the "ReconFile" tag of the layer configuration file.
+				const int kiLen = (int)strlen (kpValue);	// confirmed_safe_unsafe_usage
+				if (kiLen >= MAX_FNAME_LEN)
+					return WelsEncCmdError (kpCommand, "file name too long");
+				strncpy (pDLayer->sRecFileName, kpValue, kiLen);	// confirmed_safe_unsafe_usage
+				pDLayer->sRecFileName[kiLen] = '\0';
+#endif//ENABLE_FRAME_DUMP
+			}
+			else if (!strcmp (kpCommand, "-sw"))
+			{
+				pDLayer->iFrameWidth	= atoi (kpValue);
+				pDLayer->iActualWidth	= pDLayer->iFrameWidth;
+			}
+			else if (!strcmp (kpCommand, "-sh"))
+			{
+				pDLayer->iFrameHeight	= atoi (kpValue);
+				pDLayer->iActualHeight	= pDLayer->iFrameHeight;
+			}
+			else if (!strcmp (kpCommand, "-frin"))
+			{
+				pDLayer->fInputFrameRate = (float)atof (kpValue);
+			}
+			else if (!strcmp (kpCommand, "-frout"))
+			{
+				pDLayer->fOutputFrameRate = (float)atof (kpValue);
+			}
+			else if (!strcmp (kpCommand, "-lqp"))
+			{
+				pDLayer->iDLayerQp = atoi (kpValue);
+			}
+			else if (!strcmp (kpCommand, "-ltarb"))
+			{
+				pDLayer->iSpatialBitrate = 1000 * atoi (kpValue);
+			}
+			else if (!strcmp (kpCommand, "-slcmd"))
+			{
+				switch (atoi (kpValue))
+				{
+				case 0:
+					pDLayer->sMso.uiSliceMode = SM_SINGLE_SLICE;
+					break;
+				case 1:
+					pDLayer->sMso.uiSliceMode = SM_FIXEDSLCNUM_SLICE;
+					break;
+				case 2:
+					pDLayer->sMso.uiSliceMode = SM_RASTER_SLICE;
+					break;
+				case 3:
+					pDLayer->sMso.uiSliceMode = SM_ROWMB_SLICE;
+					break;
+				case 4:
+					pDLayer->sMso.uiSliceMode = SM_DYN_SLICE;
+					break;
+				default:
+					pDLayer->sMso.uiSliceMode = SM_RESERVED;
+					break;
+				}
+			}
+			else if (!strcmp (kpCommand, "-slcsize"))
+			{
+				pDLayer->sMso.sSliceArgument.uiSliceSizeConstraint = atoi (kpValue);
+			}
+			else	// "-slcnum"
+			{
+				pDLayer->sMso.sSliceArgument.iSliceNum = atoi (kpValue);
+			}
+			continue;
+		}
+
+		// Any other token is not an option known here and is skipped.
+	}
+	return 0;
+}
+
+
+
+/*
+ * Fills sParam with the fixed demo configuration used for parameter based
+ * encoding: a 1280x720 input with three temporal layers and four spatial
+ * layers (160x90, 320x180, 640x360, 1280x720).
+ * The overall frame rate is finally set to the highest spatial layer rate.
+ * Always returns 0.
+ */
+int FillSpecificParameters( SVCEncodingParam &sParam )
+{
+	// One row per spatial layer, lowest resolution first.
+	struct SLayerPreset
+	{
+		int		iWidth;
+		int		iHeight;
+		float	fRate;
+		int		iBitrate;
+	};
+	static const SLayerPreset kasPreset[4] =
+	{
+		{  160,  90,  7.5f,   64000 },
+		{  320, 180, 15.0f,  160000 },
+		{  640, 360, 30.0f,  512000 },
+		{ 1280, 720, 30.0f, 1500000 }
+	};
+	const int kiPresetNum = (int)(sizeof(kasPreset) / sizeof(kasPreset[0]));
+
+	/* Test for temporal, spatial, SNR scalability */
+	// input picture and rate control
+	sParam.fFrameRate		= 30.0f;
+	sParam.iPicWidth		= 1280;
+	sParam.iPicHeight		= 720;
+	sParam.iTargetBitrate	= 2500000;
+	sParam.iRCMode			= 0;
+	sParam.iInputCsp		= videoFormatI420;
+
+	// scalability layout
+	sParam.iTemporalLayerNum	= 3;
+	sParam.iSpatialLayerNum		= 4;
+
+	// pre-processing and reference control
+	sParam.bEnableDenoise				= 0;
+	sParam.bEnableBackgroundDetection	= 1;
+	sParam.bEnableAdaptiveQuant			= 1;
+	sParam.bEnableLongTermReference		= 0;
+	sParam.iLtrMarkPeriod				= 30;
+
+	// stream structure
+	sParam.iKeyPicCodingMode		= 1;
+	sParam.iIntraPeriod				= 320;
+	sParam.bEnableSpsPpsIdAddition	= 1;
+	sParam.bPrefixNalAddingCtrl		= 1;
+
+	for (int iLayer = 0; iLayer < kiPresetNum; ++ iLayer)
+	{
+		sParam.sSpatialLayers[iLayer].iVideoWidth		= kasPreset[iLayer].iWidth;
+		sParam.sSpatialLayers[iLayer].iVideoHeight		= kasPreset[iLayer].iHeight;
+		sParam.sSpatialLayers[iLayer].fFrameRate		= kasPreset[iLayer].fRate;
+		sParam.sSpatialLayers[iLayer].iQualityLayerNum	= 1;
+		sParam.sSpatialLayers[iLayer].iSpatialBitrate	= kasPreset[iLayer].iBitrate;
+		sParam.sSpatialLayers[iLayer].iCgsSnrRefined	= 0;
+		sParam.sSpatialLayers[iLayer].iInterSpatialLayerPredFlag = 0;
+#ifdef MT_ENABLED
+		sParam.sSpatialLayers[iLayer].sSliceCfg.uiSliceMode = 0;
+		// only the two largest layers get an explicit slice count
+		if (iLayer >= 2)
+			sParam.sSpatialLayers[iLayer].sSliceCfg.sSliceArgument.uiSliceNum = 1;
+#endif
+	}
+
+	// Walk from the top layer downwards and keep the highest frame rate.
+	int32_t iIdx = sParam.iSpatialLayerNum - 1;
+	float fHighest = sParam.sSpatialLayers[iIdx].fFrameRate;
+	while (-- iIdx >= 0)
+	{
+		if (sParam.sSpatialLayers[iIdx].fFrameRate > fHighest + EPSN)
+			fHighest = sParam.sSpatialLayers[iIdx].fFrameRate;
+	}
+	sParam.fFrameRate = fHighest;
+
+	return 0;
+}
+
+/* For SVC Demo test */
+/*
+ * Encodes a raw source file with the built-in demo parameters.
+ *
+ * argv[1] is the source sequence, argv[2] the bit stream file to write and
+ * argv[3..] optional overrides handled by ParseCommandLine().
+ * Frames are read one at a time, encoded, and every non-skipped frame is
+ * appended to the bit stream file; timing statistics are printed at the end.
+ *
+ * Returns 0 once the encoding loop has run (even if it stopped on an encoder
+ * error) and 1 on any setup failure. Files and the frame buffer are released
+ * on every path.
+ */
+int ProcessEncodingSvcWithParam ( ISVCEncoder *pPtrEnc, int argc, char ** argv )
+{
+	// argv[1] and argv[2] are read below, so both must exist.
+	if ( pPtrEnc == NULL || argv == NULL || argc < 3 )
+		return 1;
+
+	const char * kpSrcFile = argv[1];
+	const char * kpStrBsFile = argv[2];
+	if ( kpSrcFile == NULL || kpStrBsFile == NULL )
+		return 1;
+
+	// Everything is declared before the first goto so that no jump bypasses an initialization.
+	const int kiParsedNum = 3;	// program name, source file, bit stream file
+	FILE *pFpBs = NULL;
+	FILE *pFpSrc= NULL;
+	SFrameBSInfo sFbi;
+	SVCEncodingParam sSvcParam;
+	int64_t iStart = 0, iTotal = 0;
+	int32_t iPicLumaSize = 0;
+	int32_t iFrameSize = 0;
+	int32_t iFrame = 0;
+	int iStride = 0;
+	int iRet = 1;
+	uint8_t *pPlanes[3] = { 0 };
+#if defined ( STICK_STREAM_SIZE )
+	FILE *fTrackStream = fopen("coding_size.stream", "wb");
+#endif
+
+	pFpSrc	= fopen(kpSrcFile, "rb");
+	if ( NULL == pFpSrc )
+		goto PARAM_ENC_CLEANUP;
+	pFpBs	= fopen(kpStrBsFile, "wb");
+	if ( NULL == pFpBs )
+		goto PARAM_ENC_CLEANUP;
+
+	memset( &sFbi, 0, sizeof(SFrameBSInfo) );
+	memset( &sSvcParam, 0, sizeof(SVCEncodingParam) );
+
+	FillSpecificParameters(sSvcParam);
+
+	if( ParseCommandLine(argc-kiParsedNum, argv+kiParsedNum, sSvcParam) != 0 )
+	{
+		printf("parse pCommand line failed\n");
+		goto PARAM_ENC_CLEANUP;
+	}
+
+	if ( cmResultSuccess != pPtrEnc->Initialize( &sSvcParam, INIT_TYPE_PARAMETER_BASED ) )
+	{
+		fprintf(stderr, "Encoder Initialization failed!\n");
+		goto PARAM_ENC_CLEANUP;
+	}
+
+	// One frame buffer sized for the configured input color space.
+	iPicLumaSize = sSvcParam.iPicWidth * sSvcParam.iPicHeight;
+	switch( sSvcParam.iInputCsp ) {
+	case videoFormatI420:
+	case videoFormatYV12:
+		iFrameSize  = (3 * iPicLumaSize)>>1;
+		pPlanes[0]	= new uint8_t[iFrameSize];
+		pPlanes[1]	= pPlanes[0] + iPicLumaSize;
+		pPlanes[2]	= pPlanes[1] + (iPicLumaSize>>2);
+		break;
+	case videoFormatYUY2:
+	case videoFormatYVYU:
+	case videoFormatUYVY:
+		iStride		= CALC_BI_STRIDE(sSvcParam.iPicWidth,  16);
+		iFrameSize  = iStride * sSvcParam.iPicHeight;
+		pPlanes[0]	= new uint8_t[iFrameSize];
+		break;
+	case videoFormatRGB:
+	case videoFormatBGR:
+		iStride		= CALC_BI_STRIDE(sSvcParam.iPicWidth,  24);
+		iFrameSize  = iStride * sSvcParam.iPicHeight;
+		pPlanes[0]	= new uint8_t[iFrameSize];
+		break;
+	case videoFormatBGRA:
+	case videoFormatRGBA:
+	case videoFormatARGB:
+	case videoFormatABGR:
+		iStride		= 4 * sSvcParam.iPicWidth;
+		iFrameSize  = iStride * sSvcParam.iPicHeight;
+		pPlanes[0]	= new uint8_t[iFrameSize];
+		break;
+	default:
+		goto PARAM_ENC_CLEANUP;
+	}
+
+	while ( !feof(pFpSrc) )
+	{
+#ifdef ONLY_ENC_FRAMES_NUM
+		if ( iFrame >= ONLY_ENC_FRAMES_NUM )
+			break;
+#endif//ONLY_ENC_FRAMES_NUM
+		// Stop at end of file or on a truncated trailing frame: a short read
+		// would leave stale data of the previous frame in the buffer.
+		if ( fread(pPlanes[0], sizeof(uint8_t), iFrameSize, pFpSrc) != (size_t)iFrameSize )
+			break;
+
+		iStart	= WelsTime();
+		long iEncode = pPtrEnc->EncodeFrame( pPlanes[0], &sFbi);
+		iTotal += WelsTime() - iStart;
+		if ( videoFrameTypeInvalid == iEncode ){
+			fprintf(stderr, "EncodeFrame() failed: %d.\n", (int)iEncode);
+			break;
+		}
+
+		/* Write bit-stream */
+		if ( videoFrameTypeSkip != iEncode ){
+			for ( int iLayer = 0; iLayer < sFbi.iLayerNum; ++ iLayer ){
+				SLayerBSInfo *pLayerBsInfo = &sFbi.sLayerInfo[iLayer];
+				int iLayerSize = 0;
+				// Sum the NAL sizes of this layer; a layer without NALs contributes nothing.
+				for ( int iNalIdx = 0; iNalIdx < pLayerBsInfo->iNalCount; ++ iNalIdx )
+					iLayerSize += pLayerBsInfo->iNalLengthInByte[iNalIdx];
+				fwrite(pLayerBsInfo->pBsBuf, 1, iLayerSize, pFpBs);	// write pure bit stream into file
+			}
+			++ iFrame;
+		}
+	}
+
+	if (iFrame > 0){
+		double dElapsed = iTotal / 1e6;
+		printf( "Frames:		%d\nencode time:	%f sec\nFPS:		%f fps\n", iFrame, dElapsed, (iFrame * 1.0)/dElapsed );
+	}
+	iRet = 0;
+
+PARAM_ENC_CLEANUP:
+	delete [] pPlanes[0];
+	pPlanes[0] = NULL;
+
+	if ( pFpBs ){
+		fclose( pFpBs );
+		pFpBs = NULL;
+	}
+	if ( pFpSrc ){
+		fclose( pFpSrc );
+		pFpSrc= NULL;
+	}
+#if defined ( STICK_STREAM_SIZE )
+	if ( fTrackStream ){
+		fclose( fTrackStream );
+		fTrackStream = NULL;
+	}
+#endif
+
+	return iRet;
+}
+
+
+/*
+ * Encodes the sequence(s) described by a configuration file.
+ *
+ * argv[1] names the encoder configuration file; argv[2..] are optional
+ * overrides handled by ParseCommandLine(). One I420 source file is read per
+ * dependency layer; each encoded layer is appended to the bit stream file
+ * when one was configured (on the platforms where file output is compiled in).
+ *
+ * Returns 0 on success and 1 on any setup failure. Every file and buffer
+ * acquired here is released at INSIDE_MEM_FREE, whatever path is taken.
+ */
+int ProcessEncodingSvcWithConfig ( ISVCEncoder *pPtrEnc, int argc, char **argv )
+{
+	int iRet				= 0;
+
+	// argv[1] (the configuration file) is read below.
+	if ( pPtrEnc == NULL || argv == NULL || argc < 2 )
+		return 1;
+
+	SFrameBSInfo sFbi;
+	SWelsSvcCodingParam sSvcParam;
+	int64_t iStart = 0, iTotal = 0;
+
+	// Preparing encoding process
+	FILE* pFileYUV[MAX_DEPENDENCY_LAYER] = {0};
+	int32_t iActualFrameEncodedCount = 0;
+	int32_t iFrameIdx = 0;
+	int32_t	iTotalFrameMax = -1;
+	int8_t  iDlayerIdx = 0;
+	uint8_t * pYUV[MAX_DEPENDENCY_LAYER] = { 0 };
+	SSourcePicture  **  pSrcPicList = NULL;
+#if (defined(RUN_SIMULATOR) || defined(WIN32)||defined(_MACH_PLATFORM) || (defined(__GNUC__)))
+	// Inactive with sink with output file handler
+	FILE *pFpBs = NULL;
+#endif
+#if defined(COMPARE_DATA)
+	//For getting the golden file handle
+	FILE *fpGolden = NULL;
+#endif
+#if defined ( STICK_STREAM_SIZE )
+	FILE *fTrackStream = fopen("coding_size.stream", "wb");
+#endif
+	SFilesSet fs;
+	// for configuration file
+	CReadConfig cRdCfg;
+	int iParsedNum = 2;
+
+	memset(&sFbi, 0, sizeof(SFrameBSInfo));
+	memset(&sSvcParam, 0, sizeof(SWelsSvcCodingParam));
+
+	sSvcParam.iInputCsp	= videoFormatI420;	// I420 in default
+	sSvcParam.sDependencyLayers[0].uiProfileIdc	= PRO_BASELINE;
+
+	// for configuration file
+	cRdCfg.Openf(argv[1]);
+	if ( !cRdCfg.ExistFile() ){
+		fprintf(stderr, "Specified file: %s not exist, maybe invalid path or parameter settting.\n", cRdCfg.GetFileName().c_str());
+		iRet = 1;
+		goto INSIDE_MEM_FREE;
+	}
+
+	iRet = ParseConfig(cRdCfg, sSvcParam, fs);
+	if ( iRet ){
+		fprintf(stderr, "parse svc parameter config file failed.\n");
+		iRet = 1;
+		goto INSIDE_MEM_FREE;
+	}
+
+	if ( ParseCommandLine(argc-iParsedNum, argv+iParsedNum, sSvcParam, fs) != 0 )
+	{
+		printf("parse pCommand line failed\n");
+		iRet = 1;
+		goto INSIDE_MEM_FREE;
+	}
+
+	// The layer count indexes sDependencyLayers[count-1] below and the
+	// MAX_DEPENDENCY_LAYER sized pFileYUV[] / pYUV[] arrays.
+	if ( sSvcParam.iNumDependencyLayer < 1 || sSvcParam.iNumDependencyLayer > MAX_DEPENDENCY_LAYER )
+	{
+		fprintf(stderr, "Invalid number of dependency layers: %d.\n", (int)sSvcParam.iNumDependencyLayer);
+		iRet = 1;
+		goto INSIDE_MEM_FREE;
+	}
+
+	// The used picture rectangle is the full frame of the highest layer.
+	iTotalFrameMax = (int32_t)sSvcParam.uiFrameToBeCoded;
+	sSvcParam.SUsedPicRect.iLeft = 0;
+	sSvcParam.SUsedPicRect.iTop = 0;
+	sSvcParam.iActualPicWidth =
+	sSvcParam.SUsedPicRect.iWidth = sSvcParam.sDependencyLayers[sSvcParam.iNumDependencyLayer-1].iFrameWidth;
+	sSvcParam.iActualPicHeight =
+	sSvcParam.SUsedPicRect.iHeight = sSvcParam.sDependencyLayers[sSvcParam.iNumDependencyLayer-1].iFrameHeight;
+
+	if ( cmResultSuccess != pPtrEnc->Initialize((void *)&sSvcParam, INIT_TYPE_CONFIG_BASED) )	// SVC encoder initialization
+	{
+		fprintf( stderr, "SVC encoder Initialize failed\n");
+		iRet = 1;
+		goto INSIDE_MEM_FREE;
+	}
+#if (defined(RUN_SIMULATOR) || defined(WIN32)||defined(_MACH_PLATFORM) || (defined(__GNUC__)))
+	// Inactive with sink with output file handler
+	if ( fs.strBsFile.length() > 0 ){
+		pFpBs = fopen (fs.strBsFile.c_str(), "wb");
+		if (pFpBs == NULL){
+			fprintf( stderr, "Can not open file (%s) to write bitstream!\n", fs.strBsFile.c_str() );
+			iRet = 1;
+			goto INSIDE_MEM_FREE;
+		}
+	}
+#endif
+
+#if defined(COMPARE_DATA)
+	//For getting the golden file handle (argv[3] must exist)
+	if( argc < 4 || (fpGolden = fopen(argv[3], "rb")) == NULL )
+	{
+		fprintf(stderr, "Unable to open golden sequence file, check corresponding path!\n");
+		iRet = 1;
+		goto INSIDE_MEM_FREE;
+	}
+#endif
+
+	// Clear the list first: the cleanup code deletes every entry, also after
+	// a failure in the middle of the set-up loop below.
+	pSrcPicList = new SSourcePicture * [sSvcParam.iNumDependencyLayer];
+	for( int32_t i=0;i<sSvcParam.iNumDependencyLayer;i++ )
+		pSrcPicList[i] = NULL;
+
+	while (iDlayerIdx < sSvcParam.iNumDependencyLayer) {
+		SDLayerParam *pDLayer = &sSvcParam.sDependencyLayers[iDlayerIdx];
+		const int kiPicResSize = pDLayer->iFrameWidth * pDLayer->iFrameHeight;
+
+		// A zero sized picture would make the frame count division below divide by zero.
+		if( kiPicResSize <= 0 ){
+			fprintf(stderr, "Invalid picture size %dx%d for layer %d.\n", pDLayer->iFrameWidth, pDLayer->iFrameHeight, (int)iDlayerIdx);
+			iRet = 1;
+			goto INSIDE_MEM_FREE;
+		}
+
+		SSourcePicture * pSrcPic = new SSourcePicture;
+		if( pSrcPic == NULL ){
+			iRet = 1;
+			goto INSIDE_MEM_FREE;
+		}
+		memset(pSrcPic, 0, sizeof(SSourcePicture));
+		pSrcPicList[iDlayerIdx] = pSrcPic;	// owned by the list from here on
+
+		pYUV[iDlayerIdx] = new uint8_t [(3*kiPicResSize)>>1];
+		if (pYUV[iDlayerIdx] == NULL)
+		{
+			iRet = 1;
+			goto INSIDE_MEM_FREE;
+		}
+
+		pSrcPic->iColorFormat = videoFormatI420;
+		pSrcPic->iPicWidth = pDLayer->iFrameWidth;
+		pSrcPic->iPicHeight = pDLayer->iFrameHeight;
+		pSrcPic->iStride[0] = pDLayer->iFrameWidth;
+		pSrcPic->iStride[1] = pSrcPic->iStride[2] = pDLayer->iFrameWidth >> 1;
+
+		pFileYUV[iDlayerIdx]	= fopen( fs.sSpatialLayers[iDlayerIdx].strSeqFile.c_str(), "rb");
+		if (pFileYUV[iDlayerIdx] != NULL){
+			// Derive the number of frames in the file from its size.
+			if( !fseek( pFileYUV[iDlayerIdx], 0, SEEK_END ) )
+			{
+				int64_t i_size = ftell( pFileYUV[iDlayerIdx] );
+				fseek( pFileYUV[iDlayerIdx], 0, SEEK_SET );
+				iTotalFrameMax = WELS_MAX( (int32_t)(i_size / ((3*kiPicResSize)>>1) ), iTotalFrameMax );
+			}
+		}
+		else{
+			fprintf(stderr, "Unable to open source sequence file (%s), check corresponding path!\n", fs.sSpatialLayers[iDlayerIdx].strSeqFile.c_str());
+			iRet = 1;
+			goto INSIDE_MEM_FREE;
+		}
+
+		++ iDlayerIdx;
+	}
+
+	iFrameIdx = 0;
+	while (iFrameIdx < iTotalFrameMax && (((int32_t)sSvcParam.uiFrameToBeCoded <= 0) || (iFrameIdx < (int32_t)sSvcParam.uiFrameToBeCoded)) ) {
+		bool_t bOnePicAvailableAtLeast = false;
+		bool_t bSomeSpatialUnavailable	  = false;
+
+#ifdef ONLY_ENC_FRAMES_NUM
+		// Only encoded some limited frames here
+		if ( iActualFrameEncodedCount >= ONLY_ENC_FRAMES_NUM )
+		{
+			break;
+		}
+#endif//ONLY_ENC_FRAMES_NUM
+
+		// Collect one source picture for every layer that is due at this frame index.
+		iDlayerIdx = 0;
+		int  nSpatialLayerNum = 0;
+		while (iDlayerIdx < sSvcParam.iNumDependencyLayer) {
+			SDLayerParam * pDLayer = &sSvcParam.sDependencyLayers[iDlayerIdx];
+			const int kiPicResSize = ((pDLayer->iFrameWidth * pDLayer->iFrameHeight)*3)>>1;
+			uint32_t uiSkipIdx = (1 << pDLayer->iTemporalResolution);
+
+			if ( iFrameIdx % uiSkipIdx == 0 )	// such layer is enabled to encode indeed
+			{
+				const bool_t bCanBeRead = (fread(pYUV[iDlayerIdx], 1, kiPicResSize, pFileYUV[iDlayerIdx]) == (size_t)kiPicResSize);
+
+				if ( !bCanBeRead )	// file end while reading
+				{
+					bSomeSpatialUnavailable = true;
+					break;
+				}
+
+				bOnePicAvailableAtLeast	= true;
+
+				pSrcPicList[nSpatialLayerNum]->pData[0] = pYUV[iDlayerIdx];
+				pSrcPicList[nSpatialLayerNum]->pData[1] = pSrcPicList[nSpatialLayerNum]->pData[0] +
+					(pDLayer->iFrameWidth * pDLayer->iFrameHeight);
+				pSrcPicList[nSpatialLayerNum]->pData[2] = pSrcPicList[nSpatialLayerNum]->pData[1] +
+					((pDLayer->iFrameWidth * pDLayer->iFrameHeight)>>2);
+
+				pSrcPicList[nSpatialLayerNum]->iPicWidth = pDLayer->iFrameWidth;
+				pSrcPicList[nSpatialLayerNum]->iPicHeight = pDLayer->iFrameHeight;
+				pSrcPicList[nSpatialLayerNum]->iStride[0] = pDLayer->iFrameWidth;
+				pSrcPicList[nSpatialLayerNum]->iStride[1] = pSrcPicList[nSpatialLayerNum]->iStride[2]
+				  = pDLayer->iFrameWidth >> 1;
+
+				++ nSpatialLayerNum;
+			}
+
+			++ iDlayerIdx;
+		}
+
+		if ( bSomeSpatialUnavailable )
+			break;
+
+		if ( !bOnePicAvailableAtLeast ){
+			++ iFrameIdx;
+			continue;
+		}
+
+		// To encoder this frame
+		iStart	= WelsTime();
+		int iEncFrames = pPtrEnc->EncodeFrame(const_cast<const SSourcePicture**>(pSrcPicList), nSpatialLayerNum, &sFbi);
+		iTotal += WelsTime() - iStart;
+
+		// fixed issue in case dismatch source picture introduced by frame skipped, 1/12/2010
+		// (the frame index is deliberately not advanced for a skipped frame)
+		if ( videoFrameTypeSkip == iEncFrames )
+		{
+			continue;
+		}
+
+		if ( iEncFrames != videoFrameTypeInvalid )
+		{
+			int iFrameSize = 0;
+			for ( int iLayer = 0; iLayer < sFbi.iLayerNum; ++ iLayer ){
+				SLayerBSInfo *pLayerBsInfo = &sFbi.sLayerInfo[iLayer];
+				int iLayerSize = 0;
+				// Sum the NAL sizes of this layer; a layer without NALs contributes nothing.
+				for ( int iNalIdx = 0; iNalIdx < pLayerBsInfo->iNalCount; ++ iNalIdx )
+					iLayerSize += pLayerBsInfo->iNalLengthInByte[iNalIdx];
+#if defined(COMPARE_DATA)
+				//Comparing the result of encoder with golden pData
+				{
+					unsigned char *pUCArry = new unsigned char [iLayerSize];
+					const size_t kuiGolden = fread(pUCArry, 1, iLayerSize, fpGolden);
+
+					if ( kuiGolden != (size_t)iLayerSize )
+						fprintf(stderr, "golden file too short @frame%d/layer%d!\n", iFrameIdx, iLayer);
+
+					// Only bytes that were really read from the golden file are compared.
+					for (int w=0; w<(int)kuiGolden; w++) {
+						if (pUCArry[w] != pLayerBsInfo->pBsBuf[w]) {
+							fprintf(stderr, "error @frame%d/layer%d/byte%d!!!!!!!!!!!!!!!!!!!!!!!!\n", iFrameIdx, iLayer, w);
+							break;
+						}
+					}
+					fprintf( stderr, "frame%d/layer%d comparation completed!\n", iFrameIdx, iLayer);
+
+					delete [] pUCArry;
+				}
+#endif
+#if (defined(RUN_SIMULATOR) || defined(WIN32)||defined(_MACH_PLATFORM) || (defined(__GNUC__)))
+				// pFpBs stays NULL when no bit stream file was configured.
+				if ( pFpBs != NULL )
+					fwrite(pLayerBsInfo->pBsBuf, 1, iLayerSize, pFpBs);	// write pure bit stream into file
+#endif
+				iFrameSize += iLayerSize;
+			}
+#if defined (STICK_STREAM_SIZE)
+			if ( fTrackStream ){
+				fwrite( &iFrameSize, 1, sizeof(int), fTrackStream );
+			}
+#endif//STICK_STREAM_SIZE
+			++ iActualFrameEncodedCount;	// excluding skipped frame time
+		}
+		else{
+			fprintf(stderr, "EncodeFrame(), ret: %d, frame index: %d.\n", iEncFrames, iFrameIdx);
+		}
+
+		++ iFrameIdx;
+	}
+
+	if (iActualFrameEncodedCount > 0){
+		double dElapsed = iTotal / 1e6;
+		printf( "Width:		%d\nHeight:		%d\nFrames:		%d\nencode time:	%f sec\nFPS:		%f fps\n",
+			sSvcParam.iActualPicWidth, sSvcParam.iActualPicHeight,
+			iActualFrameEncodedCount, dElapsed, (iActualFrameEncodedCount * 1.0)/dElapsed );
+	}
+
+INSIDE_MEM_FREE:
+	{
+#if (defined(RUN_SIMULATOR) || defined(WIN32)||defined(_MACH_PLATFORM) || (defined(__GNUC__)))
+	if (pFpBs)
+	{
+		fclose(pFpBs);
+		pFpBs = NULL;
+	}
+#endif
+#if defined (STICK_STREAM_SIZE)
+	if ( fTrackStream ){
+		fclose( fTrackStream );
+		fTrackStream = NULL;
+	}
+#endif
+#if defined (COMPARE_DATA)
+	if ( fpGolden ){
+		fclose(fpGolden);
+		fpGolden = NULL;
+	}
+#endif
+	// Destruction memory introduced in this routine.
+	// The whole array is walked so that this stays in bounds even when the
+	// layer count itself was rejected above.
+	for( int32_t i=0;i<MAX_DEPENDENCY_LAYER;i++ ){
+		if (pFileYUV[i] != NULL){
+			fclose(pFileYUV[i]);
+			pFileYUV[i] = NULL;
+		}
+	}
+
+	// pSrcPicList is only allocated once the layer count has been validated.
+	if( pSrcPicList ){
+		for( int32_t i=0;i<sSvcParam.iNumDependencyLayer;i++ )
+		{
+			delete pSrcPicList[i];
+			pSrcPicList[i] = NULL;
+		}
+		delete [] pSrcPicList;	// allocated with new[]
+		pSrcPicList = NULL;
+	}
+
+	for( int32_t i=0;i<MAX_DEPENDENCY_LAYER;i++ ){
+		delete [] pYUV[i];
+		pYUV[i] = NULL;
+	}
+	}
+
+	return iRet;
+}
+
+//  Merge from Heifei's Wonder.  Lock process to a single core
+// Pins the process to a single CPU core and raises its priority class so
+// that timing measurements are less disturbed. Only implemented for MSVC
+// builds; a no-op everywhere else.
+void LockToSingleCore()
+{
+#ifdef _MSC_VER
+	// GetProcessAffinityMask() takes DWORD_PTR masks; plain DWORD does not
+	// compile for 64-bit targets.
+	DWORD_PTR ProcessAffMask = 0, SystemAffMask = 0;
+	HANDLE hProcess = GetCurrentProcess();
+
+	// Leave the affinity alone when the current mask cannot be queried.
+	if (GetProcessAffinityMask(hProcess, &ProcessAffMask, &SystemAffMask) && ProcessAffMask > 1)
+	{
+		// more than one CPU core available. Fix to only one:
+		// the second core when it is allowed, otherwise the first.
+		ProcessAffMask = (ProcessAffMask & 2) ? 2 : 1;
+
+		// Lock process to a single CPU core
+		SetProcessAffinityMask(hProcess, ProcessAffMask);
+	}
+
+	// set high priority to avoid interrupts during test
+	SetPriorityClass(hProcess, REALTIME_PRIORITY_CLASS);
+#endif
+	return ;
+}
+
+// Creates an encoder instance and stores it in *ppEncoder.
+// MACOS builds go through the encoder bundle and report the result of
+// loading it; all other builds return the result of CreateSVCEncoder().
+long CreateSVCEncHandle(ISVCEncoder** ppEncoder)
+{
+#if defined(MACOS)
+	const long kiLoadResult = WelsEncBundleLoad();
+	WelsEncBundleCreateEncoder(ppEncoder);
+	return kiLoadResult;
+#else
+	const long kiCreateResult = CreateSVCEncoder( ppEncoder );
+	return kiCreateResult;
+#endif//MACOS
+}
+
+// Releases an encoder obtained from CreateSVCEncHandle().
+// A NULL handle is ignored.
+void DestroySVCEncHanlde(ISVCEncoder* pEncoder)
+{
+	if (pEncoder == NULL)
+		return;
+
+#if defined(MACOS)
+	WelsEncBundleDestroyEncoder(pEncoder);
+#else
+	DestroySVCEncoder( pEncoder );
+#endif//MACOS
+}
+
+/****************************************************************************
+ * main:
+ ****************************************************************************/
+// Entry point of the encoder console (main_demo on MACOS builds).
+// A first argument containing ".cfg" selects configuration based encoding;
+// otherwise at least a source and a bit stream file are expected and the
+// built-in parameters are used. Returns 0 on success; on any failure the
+// help text is printed and 1 is returned.
+#if (defined(MACOS))
+int main_demo( int argc, char **argv )
+#else
+int main( int argc, char **argv )
+#endif
+{
+	ISVCEncoder* pEncoder	= NULL;
+	bool bSucceeded			= false;
+
+#ifdef _MSC_VER
+	_setmode(_fileno(stdin), _O_BINARY);    /* thanks to Marcoss Morais <morais at dee.ufcg.edu.br> */
+	_setmode(_fileno(stdout), _O_BINARY);
+
+	// remove the LOCK_TO_SINGLE_CORE micro, user need to enable it with manual
+	// LockToSingleCore();
+#endif
+
+	/* Control-C handler */
+	signal( SIGINT, SigIntHandler );
+
+	const int kiCreateRet = CreateSVCEncHandle( &pEncoder );
+	if ( kiCreateRet )
+	{
+		cout << "CreateSVCEncoder() failed!!" << endl;
+	}
+	else if ( argc >= 2 )
+	{
+		const string kstrFirstArg = argv[1];
+		// check configuration type (like .cfg?)
+		const bool kbConfigBased = ( kstrFirstArg.rfind(".cfg") != string::npos );
+
+		if ( kbConfigBased )
+		{
+			// only a positive result counts as a failure here
+			bSucceeded = !( ProcessEncodingSvcWithConfig( pEncoder, argc, argv ) > 0 );
+		}
+		else if ( argc > 2 )
+		{
+			bSucceeded = ( 0 == ProcessEncodingSvcWithParam( pEncoder, argc, argv ) );
+		}
+		else
+		{
+			cout << "You specified pCommand is invalid!!" << endl;
+		}
+	}
+
+	// single exit: the encoder is released on every path
+	DestroySVCEncHanlde( pEncoder );
+	if ( bSucceeded )
+		return 0;
+
+	PrintHelp();
+	return 1;
+}
--- /dev/null
+++ b/codec/decoder/core/asm/asm_inc.asm
@@ -1,0 +1,235 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  asm_inc.asm
+;*
+;*  Abstract
+;*      macro and constant
+;*
+;*  History
+;*      8/5/2009 Created
+;*
+;*
+;*************************************************************************/
+;***********************************************************************
+; Build-time switches (flip the %if conditions by hand, e.g. for DEBUG)
+;***********************************************************************
+; MOVDQ: 128-bit move mnemonic - movdqa (aligned) by default, movdqu (unaligned) otherwise
+%if 1 
+	%define MOVDQ movdqa
+%else
+	%define MOVDQ movdqu
+%endif
+; WELSEMMS: expands to emms by default, or to nothing when the switch is flipped
+%if 1
+	%define WELSEMMS	emms
+%else
+	%define WELSEMMS
+%endif
+
+BITS 32
+
+;***********************************************************************
+; Macros 
+;***********************************************************************
+
+%macro WELS_EXTERN 1	; export symbol %1; when PREFIX is defined the exported name is _%1
+	%ifdef PREFIX
+		global _%1		; underscore-decorated name
+		%define %1 _%1
+	%else
+		global %1		; plain name
+	%endif
+%endmacro
+
+%macro WELS_AbsW 2	; %1 = max(%1, -%1) per signed word (absolute value); %2 is clobbered
+	pxor        %2, %2	; %2 = 0
+    psubw       %2, %1	; %2 = -%1
+    pmaxsw      %1, %2	; keep the larger of %1 and -%1
+%endmacro 	
+
+%macro MMX_XSwap  4	; %1 = unpack suffix (wd, dq, ...); %2/%3 = inputs; %4 = output for the high half
+    movq		%4, %2	; copy %2 so both halves can be produced
+    punpckh%1   %4, %3	; %4 = high-half interleave of (old %2, %3)
+    punpckl%1   %2, %3	; %2 = low-half interleave of (old %2, %3)
+%endmacro
+
+; 4x4 word transpose of %1..%4 (%5 = scratch); results end up in %1, %4, %5, %3 (e.g. pOut mm1, mm4, mm5, mm3)
+%macro MMX_Trans4x4W 5
+    MMX_XSwap wd, %1, %2, %5	; interleave words of rows %1/%2 -> %1 (low), %5 (high)
+    MMX_XSwap wd, %3, %4, %2	; interleave words of rows %3/%4 -> %3 (low), %2 (high)
+    MMX_XSwap dq, %1, %3, %4	; interleave dwords -> %1 (low), %4 (high)
+    MMX_XSwap dq, %5, %2, %3	; interleave dwords -> %5 (low), %3 (high)
+%endmacro
+
+;for TRANSPOSE: %1 = unpack suffix (bw, wd, dq, qdq); %2 = low-half result, %4 = high-half result, %3 unchanged
+%macro SSE2_XSawp 4
+    movdqa      %4, %2	; copy %2 so both halves can be produced
+    punpckl%1   %2, %3	; %2 = low-half interleave of (old %2, %3)
+    punpckh%1   %4, %3	; %4 = high-half interleave of (old %2, %3)
+%endmacro
+
+; in: xmm1, xmm2, xmm3, xmm4 (%5 = scratch xmm5)  pOut:  xmm1, xmm4, xmm5, xmm3  (i.e. %1, %4, %5, %3)
+%macro SSE2_Trans4x4D 5
+    SSE2_XSawp dq,  %1, %2, %5	; dwords of %1/%2 -> %1 (low), %5 (high)
+    SSE2_XSawp dq,  %3, %4, %2	; dwords of %3/%4 -> %3 (low), %2 (high)
+    SSE2_XSawp qdq, %1, %3, %4	; qwords -> %1 (low), %4 (high)
+    SSE2_XSawp qdq, %5, %2, %3	; qwords -> %5 (low), %3 (high)
+%endmacro
+
+;in: xmm0, xmm1, xmm2, xmm3 (%5 = scratch xmm4)  pOut:  xmm0, xmm1, xmm3, xmm4  (i.e. %1, %2, %4, %5)
+%macro SSE2_TransTwo4x4W 5
+    SSE2_XSawp wd,  %1, %2, %5	; word interleave -> %1 (low), %5 (high)
+    SSE2_XSawp wd,  %3, %4, %2	; word interleave -> %3 (low), %2 (high)
+    SSE2_XSawp dq,  %1, %3, %4	; dword interleave -> %1 (low), %4 (high)
+    SSE2_XSawp dq,  %5, %2, %3	; dword interleave -> %5 (low), %3 (high)
+    SSE2_XSawp qdq, %1, %5, %2	; qword interleave -> %1 (low), %2 (high)
+    SSE2_XSawp qdq, %4, %3, %5	; qword interleave -> %4 (low), %5 (high)
+%endmacro
+
+;in:  m1, m2, m3, m4, m5, m6, m7, m8  (%1..%8); %9 is one extra scratch register used to park values
+;pOut: m5, m3, m4, m8, m6, m2, m7, m1  (result register order as stated by the original comment)
+%macro SSE2_TransTwo8x8B 9
+	movdqa	%9,	%8		; park %8: the next unpack overwrites %8 with its high half
+	SSE2_XSawp bw,  %1, %2, %8
+	SSE2_XSawp bw,  %3, %4, %2
+	SSE2_XSawp bw,  %5, %6, %4
+	movdqa	%6, %9		; bring the parked original %8 back, now in %6
+	movdqa	%9, %4		; park %4 before it is reused as an output below
+	SSE2_XSawp bw,  %7, %6, %4
+	; second stage: interleave words
+	SSE2_XSawp wd,  %1, %3, %6	
+	SSE2_XSawp wd,  %8, %2, %3
+	SSE2_XSawp wd,  %5, %7, %2
+	movdqa	%7, %9
+	movdqa	%9, %3	
+	SSE2_XSawp wd,  %7, %4, %3
+	; third stage: interleave dwords
+	SSE2_XSawp dq,  %1, %5, %4	
+	SSE2_XSawp dq,  %6, %2, %5
+	SSE2_XSawp dq,  %8, %7, %2
+	movdqa	%7, %9
+	movdqa	%9, %5		
+	SSE2_XSawp dq,  %7, %3, %5
+	; fourth stage: interleave qwords
+	SSE2_XSawp qdq,  %1, %8, %3
+	SSE2_XSawp qdq,  %4, %2, %8
+	SSE2_XSawp qdq,  %6, %7, %2
+	movdqa	%7, %9
+	movdqa	%9, %1		
+	SSE2_XSawp qdq,  %7, %5, %1
+	movdqa	%5, %9		; last parked value ends up in %5
+%endmacro
+
+;SSE2_LoadDiff8P dst, tmp, zero, mem1, mem2   e.g. xmm0, xmm6, xmm7, [eax], [ecx]
+;xmm7 = 0, eax = pix1, ecx = pix2, xmm0 save the result: 8 word differences pix1[i] - pix2[i]
+%macro SSE2_LoadDiff8P 5
+    movq         %1, %4	; load 8 bytes from the first source
+    punpcklbw    %1, %3	; widen to words (%3 must hold zero)
+    movq         %2, %5	; load 8 bytes from the second source
+    punpcklbw    %2, %3	; widen to words
+    psubw        %1, %2	; %1 = first - second
+%endmacro
+
+; SSE2_SumSub m1, m2, tmp:  m2 = m1 + m2, m1 = m1 - m2 (old m2), per word; tmp (%3) is clobbered
+%macro SSE2_SumSub 3
+	movdqa  %3, %2	; keep the old %2 for the subtraction
+    paddw   %2, %1	; %2 = %1 + %2
+    psubw   %1, %3	; %1 = %1 - old %2
+%endmacro
+
+
+%macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
+	mov %3h, %3l		; duplicate the low byte into the high byte, so the 16-bit register holds b0 b0
+	movd %1, e%3x		; i.e, %1 = eax (low word = b0 b0)
+	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0	
+	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0	
+%endmacro  
+
+;copy a dw into a xmm for 8 times: %1 = xmm destination, %2 = 32-bit source whose low word is replicated
+%macro  SSE2_Copy8Times 2
+		movd	%1, %2		; low dword of %1 = %2
+		punpcklwd %1, %1	; low dword becomes w0 w0
+		pshufd	%1,	%1,	0	; broadcast that dword: w0 in all 8 word lanes
+%endmacro
+
+;copy a db into a xmm for 16 times: %1 = xmm destination, %2 = 32-bit source (low word taken; packuswb saturates it to 0..255)
+%macro  SSE2_Copy16Times 2
+		movd		%1, %2
+		pshuflw		%1, %1, 0	; w0 in the low four word lanes
+		punpcklqdq	%1, %1		; w0 in all eight word lanes
+		packuswb	%1,	%1		; words -> unsigned-saturated bytes, 16 copies
+%endmacro
+
+
+
+;***********************************************************************
+;preprocessor constants
+;***********************************************************************
+;dw 32,32,32,32,32,32,32,32 for xmm (built in the register, no memory constant needed)
+;dw 32,32,32,32 for mm
+%macro WELS_DW32 1
+	pcmpeqw %1,%1	; all bits set
+	psrlw %1,15		; 1 in every word
+	psllw %1,5		; 1 << 5 = 32 in every word
+%endmacro
+
+;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm (built in the register, no memory constant needed)
+;dw 1, 1, 1, 1 for mm
+%macro WELS_DW1 1
+	pcmpeqw %1,%1	; all bits set
+	psrlw %1,15		; keep only the top bit of each word, shifted down: 1 per word
+%endmacro
+
+;all 0 for xmm and mm: clears register %1
+%macro	WELS_Zero 1
+	pxor %1, %1		; x xor x = 0
+%endmacro
+
+;dd 1, 1, 1, 1 for xmm (built in the register, no memory constant needed)
+;dd 1, 1 for mm
+%macro WELS_DD1 1
+	pcmpeqw %1,%1	; all bits set
+	psrld %1,31		; 1 in every dword
+%endmacro
+
+;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 (1 in every byte lane of %1)
+%macro WELS_DB1 1
+	pcmpeqw %1,%1	; all bits set
+	psrlw %1,15		; 1 in every word
+	packuswb %1,%1	; pack words to bytes: 1 in every byte
+%endmacro
+
+
+
+
+
+
--- /dev/null
+++ b/codec/decoder/core/asm/block_add.asm
@@ -1,0 +1,413 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  block_add.asm
+;*
+;*  Abstract
+;*      add block
+;*
+;*  History
+;*      09/21/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include  "asm_inc.asm"
+
+BITS 32
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+%macro   BLOCK_ADD_16_SSE2   4 	; one 16-pixel row: [%1] = sat_u8([%2] bytes + [%3] words); %4 = row step; expects xmm7 = 0
+	movdqa    xmm0,       [%2]		; 16 source bytes
+	movdqa    xmm1,       [%3]		; words 0-7 to add
+    movdqa    xmm2,       [%3+10h]	; words 8-15 to add
+	movdqa    xmm6,       xmm0
+
+	punpcklbw    xmm0,    xmm7		; widen low 8 bytes to words
+	punpckhbw    xmm6,    xmm7		; widen high 8 bytes to words
+
+	paddw        xmm0,    xmm1
+	paddw        xmm6,    xmm2
+
+	packuswb     xmm0,    xmm6		; back to bytes with unsigned saturation
+	movdqa       [%1],    xmm0
+
+	lea          %2,      [%2+%4]		; next row of bytes
+	lea          %3,      [%3+%4*2]	; next row of words (2 bytes per element)
+	lea          %1,      [%1+%4] 
+%endmacro
+
+%macro    BLOCK_ADD_8_MMXEXT   4	; one 8-pixel row: [%1] = sat_u8([%2] bytes + [%3] words); %4 = row step; expects mm7 = 0
+    movq       mm0,       [%2]		; 8 source bytes
+	movq       mm1,       [%3]		; words 0-3 to add
+	movq       mm2,       [%3+08h]	; words 4-7 to add
+	movq       mm6,       mm0
+
+	punpcklbw    mm0,     mm7		; widen low 4 bytes to words
+	punpckhbw    mm6,     mm7		; widen high 4 bytes to words
+
+	paddw        mm0,     mm1
+	paddw        mm6,     mm2
+
+	packuswb     mm0,     mm6		; back to bytes with unsigned saturation
+	movq         [%1],    mm0
+
+	lea          %2,      [%2+%4]		; next row of bytes
+	lea          %3,      [%3+%4*2]	; next row of words (2 bytes per element)
+	lea          %1,      [%1+%4]
+%endmacro
+
+
+%macro    BLOCK_ADD_16_STRIDE_SSE2  5	; as BLOCK_ADD_16_SSE2, but the word source %3 steps by its own stride %5; expects xmm7 = 0
+    movdqa    xmm0,       [%2]		; 16 source bytes
+	movdqa    xmm1,       [%3]		; words 0-7 to add
+    movdqa    xmm2,       [%3+10h]	; words 8-15 to add
+	movdqa    xmm6,       xmm0
+
+	punpcklbw    xmm0,    xmm7		; widen low 8 bytes to words
+	punpckhbw    xmm6,    xmm7		; widen high 8 bytes to words
+
+	paddw        xmm0,    xmm1
+	paddw        xmm6,    xmm2
+
+	packuswb     xmm0,    xmm6		; back to bytes with unsigned saturation
+	movdqa       [%1],    xmm0
+
+	lea          %2,      [%2+%4]		; byte rows step by %4
+	lea          %3,      [%3+%5*2]	; word rows step by %5 elements (2 bytes each)
+	lea          %1,      [%1+%4] 
+%endmacro
+
+
+%macro    BLOCK_ADD_8_STRIDE_MMXEXT   5	; as BLOCK_ADD_8_MMXEXT, but the word source %3 steps by its own stride %5; expects mm7 = 0
+    movq       mm0,       [%2]		; 8 source bytes
+	movq       mm1,       [%3]		; words 0-3 to add
+	movq       mm2,       [%3+08h]	; words 4-7 to add
+	movq       mm6,       mm0
+
+	punpcklbw    mm0,     mm7		; widen low 4 bytes to words
+	punpckhbw    mm6,     mm7		; widen high 4 bytes to words
+
+	paddw        mm0,     mm1
+	paddw        mm6,     mm2
+
+	packuswb     mm0,     mm6		; back to bytes with unsigned saturation
+	movq         [%1],    mm0
+
+	lea          %2,      [%2+%4]		; byte rows step by %4
+	lea          %3,      [%3+%5*2]	; word rows step by %5 elements (2 bytes each)
+	lea          %1,      [%1+%4]
+%endmacro
+
+%macro    BLOCK_ADD_8_STRIDE_2_LINES_SSE2   5    ; two 8-pixel rows per call: %1 = dst, %2 = byte src, %3 = word src, %4/%5 = their strides; expects xmm7 = 0
+	movdqa xmm1, [%3]			; row 0: 8 words to add
+	movq xmm0, [%2]				; row 0: 8 source bytes
+	punpcklbw xmm0, xmm7		; widen to words
+	paddw xmm0, xmm1
+	packuswb xmm0, xmm7			; low 8 bytes = saturated result
+	movq [%1], xmm0	
+	
+	movdqa xmm3, [%3+%5*2]		; row 1: 8 words to add
+	movq xmm2, [%2+%4]			; row 1: 8 source bytes
+	punpcklbw xmm2, xmm7
+	paddw xmm2, xmm3
+	packuswb xmm2, xmm7	
+	movq [%1+%4], xmm2	
+	
+	lea %1, [%1+%4*2]			; advance all three pointers by two rows
+	lea %2, [%2+%4*2]
+	lea %3, [%3+%5*4]	
+%endmacro
+
+%macro   CHECK_DATA_16_ZERO_SSE4     3	; %1 = ptr to 16 words, %2 = flag array base, %3 = value stored when any word is nonzero; clobbers eax, ebx
+    mov        eax,      0h			; default flag: all zero
+	movdqa     xmm0,     [%1]
+	movdqa     xmm1,     [%1+10h]
+	mov        ebx,       [ecx]		; ecx walks a dword index table; ebx = slot in the flag array
+
+	por		   xmm0,	 xmm1		; OR of all 16 words
+	ptest      xmm7,     xmm0		; with xmm7 = 0 (presumed), CF = 1 only if xmm0 is all zero
+	cmovae     eax,      %3		; CF = 0 -> some word is nonzero
+	
+	add        %1,       20h		; next group of 16 words
+	add        ecx,      04h		; next table entry
+	mov        byte [%2+ebx],  al
+%endmacro
+
+%macro  CHECK_RS_4x4_BLOCK_2_ZERO_SSE4   5	; %1 = src, %2 = 2-byte flag dst, %3 = row step, %4 = offset of the 4th row, %5 = nonzero value; clobbers eax, ebx
+    movdqa     xmm0,      [%1]			; row 0 (8 words)
+    movdqa     xmm1,      [%1+%3]		; row 1
+    movdqa     xmm2,      [%1+%3*2]		; row 2
+    movdqa     xmm3,      [%1+%4]		; row 3 (presumably %4 = 3 * %3 - confirm at call sites)
+    
+    mov        eax,       0h
+    mov        ebx,       0h
+    movdqa     xmm4,      xmm0
+    movdqa     xmm5,      xmm2
+    
+    punpcklqdq  xmm0,     xmm1			; low halves of rows 0/1
+    punpckhqdq  xmm4,     xmm1			; high halves of rows 0/1
+    punpcklqdq  xmm2,     xmm3			; low halves of rows 2/3
+    punpckhqdq  xmm5,     xmm3			; high halves of rows 2/3
+
+	por			xmm0,	  xmm2			; OR of the first 4 words of every row
+	por			xmm4,	  xmm5			; OR of the last 4 words of every row
+    
+    ptest       xmm7,     xmm0			; with xmm7 = 0 (presumed), CF = 1 only if xmm0 is all zero
+    cmovae      eax,      %5
+    ptest       xmm7,     xmm4
+    cmovae      ebx,      %5    
+    
+    mov     byte [%2],    al			; flag for the first 4 columns
+    mov     byte [%2+1],  bl			; flag for the last 4 columns
+%endmacro
+
+%macro   DATA_COPY_16x2_SSE2      3	; copies two 32-byte rows from %1 (row step %3 bytes) to 64 contiguous bytes at %2
+    movdqa     xmm0,    [%1]			; row 0, bytes 0-15
+	movdqa     xmm1,    [%1+10h]		; row 0, bytes 16-31
+	movdqa     xmm2,    [%1+%3]		; row 1, bytes 0-15
+	movdqa     xmm3,    [%1+%3+10h]	; row 1, bytes 16-31
+
+	movdqa     [%2],    xmm0
+	movdqa     [%2+10h],  xmm1
+	movdqa     [%2+20h],  xmm2
+	movdqa     [%2+30h],  xmm3
+
+	lea        %1,      [%1+%3*2]		; source advances two rows
+	lea        %2,      [%2+40h]		; destination advances 64 bytes
+%endmacro
+
+
+%macro   DATA_COPY_8x4_SSE2      4	; copies four 16-byte rows from %1 (row step %3, 4th row at offset %4) to 64 contiguous bytes at %2
+    movdqa     xmm0,         [%1]
+	movdqa     xmm1,         [%1+%3]
+	movdqa     xmm2,         [%1+%3*2]
+	movdqa     xmm3,         [%1+%4]		; presumably %4 = 3 * %3 - confirm at call sites
+
+	movdqa     [%2],         xmm0
+	movdqa     [%2+10h],     xmm1
+	movdqa     [%2+20h],     xmm2
+	movdqa     [%2+30h],     xmm3
+
+	lea        %1,           [%1+%3*4]	; source advances four rows
+	lea        %2,           [%2+40h]		; destination advances 64 bytes
+%endmacro
+
+
+%macro   CHECK_DATA_16_ZERO_SSE2   3	; %1 = ptr to 16 words, %2 = flag array base, %3 = value stored when any word is nonzero; clobbers eax, ebx, edx
+    mov        eax,       0h			; default flag: all zero
+    movdqa     xmm0,      [%1]
+    movdqa     xmm1,      [%1+10h]
+    mov        ebx,       [ecx]		; ecx walks a dword index table; ebx = slot in the flag array
+    
+    pcmpeqw    xmm0,      xmm7		; with xmm7 = 0 (presumed): 0xFFFF where a word is zero
+    pcmpeqw    xmm1,      xmm7
+    packsswb   xmm0,      xmm1		; one byte per word
+    pmovmskb   edx,       xmm0    		; edx = 0ffffh only when all 16 words are zero
+    sub        edx,       0ffffh		; borrow (CF = 1) when at least one word is nonzero
+    
+    cmovb      eax,       %3   		; use the macro argument (was hard-coded ebp), matching the SSE4 variant
+    add        ecx,       4			; next table entry
+    add        %1,        20h			; next group of 16 words
+    mov      byte [%2+ebx],    al
+%endmacro
+    
+
+
+%macro   CHECK_RS_4x4_BLOCK_2_ZERO_SSE2    5	; %1 = src, %2 = 2-byte flag dst, %3 = row step, %4 = offset of the 4th row, %5 = nonzero value; clobbers eax, ebx
+    movdqa    xmm0,      [%1]				; row 0 (8 words)
+    movdqa    xmm1,      [%1 + %3]			; row 1
+    movdqa    xmm2,      [%1 + %3*2]		; row 2
+    movdqa    xmm3,      [%1 + %4]    		; row 3 (presumably %4 = 3 * %3 - confirm at call sites)
+    
+    movdqa    xmm4,       xmm0
+    movdqa    xmm5,       xmm2
+    
+    punpcklqdq   xmm0,    xmm1			; low halves of rows 0/1
+    punpckhqdq   xmm4,    xmm1			; high halves of rows 0/1
+    punpcklqdq   xmm2,    xmm3			; low halves of rows 2/3
+    punpckhqdq   xmm5,    xmm3			; high halves of rows 2/3
+    
+    pcmpeqw      xmm0,    xmm7			; with xmm7 = 0 (presumed): 0xFFFF where a word is zero
+    pcmpeqw      xmm2,    xmm7
+    pcmpeqw      xmm4,    xmm7
+    pcmpeqw      xmm5,    xmm7
+    
+    packsswb     xmm0,    xmm2			; one byte per word, first 4 columns
+    packsswb     xmm4,    xmm5			; one byte per word, last 4 columns
+    pmovmskb     eax,     xmm0			; 0ffffh only when all 16 words are zero
+    pmovmskb     ebx,     xmm4
+    
+    sub          eax,     0ffffh			; CF = 1 when at least one word is nonzero
+    mov          eax,     0				; mov leaves the flags untouched
+    cmovb        eax,     %5
+    sub          ebx,     0ffffh
+    mov          ebx,     0
+    cmovb        ebx,     %5
+    mov       byte [%2],    al			; flag for the first 4 columns
+    mov       byte [%2+1],  bl        		; flag for the last 4 columns
+%endmacro
+
+;*******************************************************************************
+; Data
+;*******************************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata data
+%else
+SECTION .rodata align=16
+%endif
+
+ALIGN  16
+SubMbScanIdx:	; 24 dword indices, 16-byte aligned (presumably a sub-block scan order, judging by the name)
+     dd    0x0,  0x1,  0x4,  0x5, 
+	 dd    0x2,  0x3,  0x6,  0x7,
+	 dd    0x8,  0x9,  0xc,  0xd,
+	 dd    0xa,  0xb,  0xe,  0xf,
+	 dd    0x10, 0x11, 0x14, 0x15,	; entries 16-23 follow the same pattern as the first eight, offset by 0x10
+	 dd    0x12, 0x13, 0x16, 0x17,     
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+
+WELS_EXTERN   WelsResBlockZero16x16_sse2
+; Zeroes 16 rows of 32 bytes (16 int16_t each); rows are iStride elements apart; movdqa requires 16-byte aligned rows
+ALIGN    16
+;*******************************************************************************
+;  void_t WelsResBlockZero16x16_sse2(int16_t* pBlock,int32_t iStride)
+;*******************************************************************************
+WelsResBlockZero16x16_sse2:
+    push     esi	
+
+	mov      esi,        [esp+08h]	; pBlock (arguments sit 4 bytes higher because of the push)
+	mov      ecx,        [esp+0ch]	; iStride, in int16_t elements
+	lea      ecx,        [ecx*2]	; row pitch in bytes
+	lea      eax,        [ecx*3]	; three rows, in bytes
+
+	pxor     xmm7,       xmm7		; xmm7 = 0, the value stored everywhere
+
+    ; four  lines
+	movdqa   [esi],      xmm7
+	movdqa   [esi+10h],  xmm7
+
+	movdqa   [esi+ecx],  xmm7
+	movdqa   [esi+ecx+10h],     xmm7
+
+    movdqa   [esi+ecx*2],   xmm7
+	movdqa   [esi+ecx*2+10h],   xmm7
+
+	movdqa   [esi+eax],     xmm7
+	movdqa   [esi+eax+10h],     xmm7
+
+    ;  four lines
+	lea      esi,       [esi+ecx*4]	; advance four rows
+	movdqa   [esi],      xmm7
+	movdqa   [esi+10h],  xmm7
+
+	movdqa   [esi+ecx],  xmm7
+	movdqa   [esi+ecx+10h],     xmm7
+
+    movdqa   [esi+ecx*2],   xmm7
+	movdqa   [esi+ecx*2+10h],   xmm7
+
+	movdqa   [esi+eax],     xmm7
+	movdqa   [esi+eax+10h],     xmm7
+
+	;  four lines
+	lea      esi,       [esi+ecx*4]	; advance four rows
+	movdqa   [esi],      xmm7
+	movdqa   [esi+10h],  xmm7
+
+	movdqa   [esi+ecx],  xmm7
+	movdqa   [esi+ecx+10h],     xmm7
+
+    movdqa   [esi+ecx*2],   xmm7
+	movdqa   [esi+ecx*2+10h],   xmm7
+
+	movdqa   [esi+eax],     xmm7
+	movdqa   [esi+eax+10h],     xmm7
+
+	;  four lines
+	lea      esi,       [esi+ecx*4]	; advance four rows
+	movdqa   [esi],      xmm7
+	movdqa   [esi+10h],  xmm7
+
+	movdqa   [esi+ecx],  xmm7
+	movdqa   [esi+ecx+10h],     xmm7
+
+    movdqa   [esi+ecx*2],   xmm7
+	movdqa   [esi+ecx*2+10h],   xmm7
+
+	movdqa   [esi+eax],     xmm7
+	movdqa   [esi+eax+10h],     xmm7
+    
+    pop      esi
+	ret
+
+
+WELS_EXTERN   WelsResBlockZero8x8_sse2
+; Zeroes 8 rows of 16 bytes (8 int16_t each); rows are iStride elements apart; movdqa requires 16-byte aligned rows
+ALIGN    16
+;*******************************************************************************
+;  void_t WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)
+;*******************************************************************************
+WelsResBlockZero8x8_sse2: 
+	  push      esi
+
+      mov       esi,     [esp+08h]	; pBlock (arguments sit 4 bytes higher because of the push)
+	  mov       ecx,     [esp+0ch]	; iStride, in int16_t elements
+	  lea       ecx,     [ecx*2]	; row pitch in bytes
+	  lea       eax,     [ecx*3]	; three rows, in bytes
+
+	  pxor      xmm7,          xmm7	; xmm7 = 0, the value stored everywhere
+
+	  movdqa    [esi],         xmm7	; rows 0-3
+	  movdqa    [esi+ecx],     xmm7
+	  movdqa    [esi+ecx*2],   xmm7
+	  movdqa    [esi+eax],     xmm7
+
+	  lea       esi,     [esi+ecx*4]	; advance four rows
+	  movdqa    [esi],         xmm7	; rows 4-7
+	  movdqa    [esi+ecx],     xmm7
+	  movdqa    [esi+ecx*2],   xmm7
+	  movdqa    [esi+eax],     xmm7
+
+	  
+	  pop       esi
+	  ret
+
--- /dev/null
+++ b/codec/decoder/core/asm/cpuid.asm
@@ -1,0 +1,169 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	cpuid.asm
+;*
+;*  Abstract
+;*		verify cpuid feature support and cpuid detection
+;*
+;*  History
+;*      04/29/2009	Created
+;*
+;*************************************************************************/
+
+bits 32
+
+;******************************************************************************************
+; Macros
+;******************************************************************************************
+
+%macro WELS_EXTERN 1	; export symbol %1; when PREFIX is defined the exported name is _%1
+	%ifdef PREFIX
+		global _%1		; underscore-decorated name
+		%define %1 _%1
+	%else
+		global %1		; plain name
+	%endif
+%endmacro
+
+;******************************************************************************************
+; Code
+;******************************************************************************************
+
+SECTION .text
+
+; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
+; section CPUID - CPU Identification
+
+WELS_EXTERN WelsCPUIdVerify
+ALIGN 16
+;   int32_t WelsCPUIdVerify()  - returns nonzero iff the ID flag (bit 21) of EFLAGS can be toggled, i.e. cpuid is supported
+WelsCPUIdVerify:
+    pushfd							; keep the caller's EFLAGS on the stack; restored by the final popfd
+    pushfd							; second copy, used for the test
+    xor     dword [esp], 00200000h	; flip the ID flag (bit 21) in the test copy
+    popfd							; try to load the flipped value into EFLAGS
+    pushfd
+    pop     eax						; eax = EFLAGS as the processor actually accepted them
+    xor     eax, [esp]				; bits that differ from the saved original
+    and     eax, 00200000h			; eax - 0: ID flag did not stick (no cpuid); otherwise: support
+    popfd							; restore the caller's EFLAGS unchanged
+    ret
+
+WELS_EXTERN WelsCPUId
+ALIGN 16
+;****************************************************************************************************
+;   void WelsCPUId( int32_t index, int32_t *uiFeatureA, int32_t *uiFeatureB, int32_t *uiFeatureC, int32_t *uiFeatureD )
+;****************************************************************************************************
+WelsCPUId:
+	push	ebx				; cpuid overwrites ebx, which is callee-saved
+	push	edi
+	
+	mov     eax, [esp+12]	; operating index (arguments start at esp+12 after the two pushes)
+	xor     ecx, ecx		; always query sub-leaf 0: some cpuid leaves take a sub-leaf index in ecx
+    cpuid					; cpuid
+	; store eax/ebx/ecx/edx through the four output pointers
+	mov     edi, [esp+16]
+    mov     [edi], eax
+    mov     edi, [esp+20]
+    mov     [edi], ebx
+    mov     edi, [esp+24]
+    mov     [edi], ecx
+    mov     edi, [esp+28]
+    mov     [edi], edx
+
+	pop		edi	
+    pop     ebx
+	ret
+	
+WELS_EXTERN WelsCPUSupportAVX
+; must be called with the eax and ecx values obtained from cpuid leaf 1; returns 1 if AVX is usable, 0 otherwise
+ALIGN 16
+;****************************************************************************************************
+;   int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )
+;****************************************************************************************************
+WelsCPUSupportAVX:
+	mov eax, [esp+4]		; first argument; not consulted below (eax is overwritten by XGETBV or the result)
+	mov ecx, [esp+8]		; second argument: ecx feature flags of cpuid leaf 1
+
+	; refer to detection of AVX addressed in INTEL AVX manual document
+	and ecx, 018000000H		; keep bit 27 (OSXSAVE) and bit 28 (AVX)
+	cmp ecx, 018000000H		; check both OSXSAVE and AVX feature flags
+	jne avx_not_supported
+	; processor supports AVX instructions and XGETBV is enabled by OS
+	mov ecx, 0				; specify 0 for XFEATURE_ENABLED_MASK register
+	XGETBV					; result in EDX:EAX
+	and eax, 06H			; bits 1 and 2 of the mask
+	cmp eax, 06H			; check OS has enabled both XMM and YMM state support
+	jne avx_not_supported
+	mov eax, 1				; supported
+	ret
+avx_not_supported:
+	mov eax, 0				; not supported
+	ret
+
+WELS_EXTERN WelsCPUSupportFMA
+; must be called with the eax and ecx values obtained from cpuid leaf 1; returns 1 if FMA is usable, 0 otherwise
+ALIGN 16
+;****************************************************************************************************
+;   int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )
+;****************************************************************************************************
+WelsCPUSupportFMA:
+	mov eax, [esp+4]		; first argument; not consulted below (eax is overwritten by XGETBV or the result)
+	mov ecx, [esp+8]		; second argument: ecx feature flags of cpuid leaf 1
+	
+	; refer to detection of FMA addressed in INTEL AVX manual document
+	and ecx, 018001000H		; keep bit 12 (FMA), bit 27 (OSXSAVE) and bit 28 (AVX)
+	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
+	jne fma_not_supported
+	; processor supports AVX,FMA instructions and XGETBV is enabled by OS
+	mov ecx, 0				; specify 0 for XFEATURE_ENABLED_MASK register
+	XGETBV					; result in EDX:EAX
+	and eax, 06H			; bits 1 and 2 of the mask
+	cmp eax, 06H			; check OS has enabled both XMM and YMM state support
+	jne fma_not_supported
+	mov eax, 1				; supported
+	ret
+fma_not_supported:
+	mov eax, 0				; not supported
+	ret
+
+WELS_EXTERN WelsEmms
+ALIGN 16
+;******************************************************************************************
+;   void WelsEmms()  - callable wrapper around the emms instruction
+;******************************************************************************************
+WelsEmms:
+	emms	; empty mmx technology states
+	ret
+
+
+
--- /dev/null
+++ b/codec/decoder/core/asm/dct.asm
@@ -1,0 +1,129 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  dct.asm
+;*
+;*  Abstract
+;*      IdctResAddPred_mmx
+;*
+;*  History
+;*      8/4/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+%macro MMX_SumSubDiv2 3	; %3 = %1 + (%2 >> 1),  %1 = (%1 >> 1) - %2, per signed word; %2 is unchanged
+    movq    %3, %2
+    psraw   %3, $1		; %3 = %2 >> 1 (arithmetic)
+    paddw   %3, %1		; %3 = %1 + (%2 >> 1)
+    psraw   %1, $1		; %1 = %1 >> 1 (arithmetic)
+    psubw   %1, %2		; %1 = (%1 >> 1) - %2
+%endmacro
+
+%macro MMX_SumSub 3	; %1 = %1 + %2,  %2 = %2 - %1 (old values), per word; %3 is clobbered
+	movq    %3, %2		; keep the old %2 for the sum
+    psubw   %2, %1		; %2 = %2 - %1
+    paddw   %1, %3		; %1 = %1 + old %2
+%endmacro
+
+%macro MMX_IDCT 6	; butterfly pass over %2..%5; %1 is overwritten without being read, %6 is scratch
+    MMX_SumSub      %4, %5, %6	; %4 = %4 + %5, %5 = %5 - %4
+    MMX_SumSubDiv2  %3, %2, %1	; %1 = %3 + (%2 >> 1), %3 = (%3 >> 1) - %2
+    MMX_SumSub		%1, %4, %6	; %1 = %1 + %4, %4 = %4 - %1
+	MMX_SumSub		%3, %5, %6	; %3 = %3 + %5, %5 = %5 - %3
+%endmacro
+
+
+%macro MMX_StoreDiff4P 5	; %5 (4 bytes in memory) = sat_u8(%5 + ((%1 + %3) >> 6)); %2 = scratch, %4 must hold zero
+    movd       %2, %5		; load the 4 bytes currently stored at %5
+    punpcklbw  %2, %4		; widen them to words
+    paddw      %1, %3		; add the rounding term held in %3
+    psraw      %1, $6		; arithmetic shift right by 6
+    paddsw     %1, %2		; add the loaded values (signed saturation)
+    packuswb   %1, %2		; back to bytes with unsigned saturation
+    movd       %5, %1		; store the low 4 bytes
+%endmacro
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+WELS_EXTERN IdctResAddPred_mmx
+; Transforms the 16 int16_t values at pRs (two transpose + butterfly passes) and adds the rounded result, >> 6, to the 4x4 bytes at pPred
+ALIGN 16
+;*******************************************************************************
+;   void_t __cdecl IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
+;*******************************************************************************
+
+IdctResAddPred_mmx:
+
+%define	pushsize	0
+%define pPred       esp+pushsize+4
+%define kiStride     esp+pushsize+8
+%define pRs         esp+pushsize+12
+
+	mov     eax, [pRs   ] 
+    mov     edx, [pPred ]   
+    mov     ecx, [kiStride]   
+    movq    mm0, [eax+ 0]	; four rows of four words each
+    movq    mm1, [eax+ 8]
+    movq    mm2, [eax+16]
+    movq    mm3, [eax+24]
+
+	MMX_Trans4x4W        mm0, mm1, mm2, mm3, mm4	; transpose, mm4 = scratch
+	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6	; first butterfly pass
+    MMX_Trans4x4W        mm1, mm3, mm0, mm4, mm2	; transpose again
+	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6	; second butterfly pass
+
+    WELS_Zero			mm7		; zero, used to widen the pPred bytes
+    WELS_DW32			mm6		; rounding term 32 in every word, ahead of the >> 6
+    
+    MMX_StoreDiff4P    mm3, mm0, mm6, mm7, [edx]		; row 0 (mm0 is scratch from here on)
+    MMX_StoreDiff4P    mm4, mm0, mm6, mm7, [edx+ecx]	; row 1
+    lea     edx, [edx+2*ecx]
+    MMX_StoreDiff4P    mm1, mm0, mm6, mm7, [edx]		; row 2
+    MMX_StoreDiff4P    mm2, mm0, mm6, mm7, [edx+ecx]	; row 3
+    
+%undef	pushsize
+%undef  pPred
+%undef  kiStride
+%undef  pRs
+	emms		; leave MMX state before returning
+    ret
--- /dev/null
+++ b/codec/decoder/core/asm/deblock.asm
@@ -1,0 +1,2113 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  deblock.asm
+;*
+;*  Abstract
+;*      edge loop
+;*
+;*  History
+;*      08/07/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32                          ; everything below is assembled as 32-bit code
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata pData            ; section qualifier used when FORMAT_COFF is defined
+%else
+SECTION .rodata align=16         ; otherwise request 16-byte alignment for read-only data
+%endif
+
+SECTION .text                    ; nothing is emitted into .rodata before switching to code here
+
+;********************************************************************************
+;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                             int32_t iAlpha, int32_t iBeta)
+;  Filters 8 Cb and 8 Cr pixels across a horizontal edge. Rows: p1 = pPix-2*iStride, p0 = pPix-iStride, q0 = pPix, q1 = pPix+iStride.
+WELS_EXTERN   DeblockChromaEq4V_sse2
+;  Where alpha>|p0-q0|, beta>|p1-p0|, beta>|q1-q0|: p0=(2*p1+p0+q1+2)>>2, q0=(2*q1+q0+p1+2)>>2. NOTE(review): pabsw is SSSE3 despite the _sse2 name - confirm CPU dispatch.
+ALIGN  16
+DeblockChromaEq4V_sse2:
+  push        ebp  
+  mov         ebp,esp 
+  and         esp,0FFFFFFF0h     ; 16-byte align esp so movdqa can address the stack slots
+  sub         esp,68h            ; 68h plus the two pushes below = 70h, so esp stays 16-byte aligned
+  mov         edx,[ebp+10h]      ;  iStride
+  mov         eax,[ebp+8]        ;  pPixCb
+  mov         ecx,[ebp+0Ch]      ;  pPixCr
+  movq        xmm4,[ecx]         ; q0 row of Cr
+  movq        xmm5,[edx+ecx]     ; q1 row of Cr
+  push        esi  
+  push        edi  
+  lea         esi,[edx+edx]      ; 2*iStride
+  mov         edi,eax 
+  sub         edi,esi 
+  movq        xmm1,[edi]         ; p1 row of Cb
+  mov         edi,ecx 
+  sub         edi,esi 
+  movq        xmm2,[edi]         ; p1 row of Cr
+  punpcklqdq  xmm1,xmm2          ; xmm1 = p1 bytes [Cb | Cr]
+  mov         esi,eax 
+  sub         esi,edx            ; esi = pPixCb - iStride, kept for the p0 Cb store
+  movq        xmm2,[esi] 
+  mov         edi,ecx 
+  sub         edi,edx            ; edi = pPixCr - iStride, kept for the p0 Cr store
+  movq        xmm3,[edi] 
+  punpcklqdq  xmm2,xmm3          ; xmm2 = p0 bytes [Cb | Cr]
+  movq        xmm3,[eax] 
+  punpcklqdq  xmm3,xmm4          ; xmm3 = q0 bytes [Cb | Cr]
+  movq        xmm4,[edx+eax] 
+  mov       edx, [ebp + 14h]     ; iAlpha
+  punpcklqdq  xmm4,xmm5          ; xmm4 = q1 bytes [Cb | Cr]
+  movd        xmm5,edx 
+  mov       edx, [ebp + 18h]     ; iBeta
+  pxor        xmm0,xmm0 
+  movdqa      xmm6,xmm5 
+  punpcklwd   xmm6,xmm5 
+  pshufd      xmm5,xmm6,0        ; xmm5 = low 16 bits of iAlpha in all 8 words
+  movd        xmm6,edx 
+  movdqa      xmm7,xmm6 
+  punpcklwd   xmm7,xmm6 
+  pshufd      xmm6,xmm7,0        ; xmm6 = low 16 bits of iBeta in all 8 words
+  movdqa      xmm7,xmm1 
+  punpckhbw   xmm1,xmm0 
+  punpcklbw   xmm7,xmm0 
+  movdqa      [esp+40h],xmm1     ; p1 Cr as words
+  movdqa      [esp+60h],xmm7     ; p1 Cb as words
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm7,xmm0 
+  movdqa      [esp+10h],xmm7     ; p0 Cb as words
+  movdqa      xmm7,xmm3 
+  punpcklbw   xmm7,xmm0 
+  punpckhbw   xmm3,xmm0 
+  movdqa      [esp+50h],xmm7     ; q0 Cb as words
+  movdqa      xmm7,xmm4 
+  punpckhbw   xmm4,xmm0 
+  punpckhbw   xmm2,xmm0          ; xmm2 = p0 Cr as words
+  punpcklbw   xmm7,xmm0          ; xmm7 = q1 Cb as words
+  movdqa      [esp+30h],xmm3     ; q0 Cr as words
+  movdqa      xmm3,[esp+10h]     ; xmm3 = p0 Cb
+  movdqa      xmm1,xmm3 
+  psubw       xmm1,[esp+50h] 
+  pabsw       xmm1,xmm1          ; |p0 - q0| (Cb)
+  movdqa      [esp+20h],xmm4     ; q1 Cr as words
+  movdqa      xmm0,xmm5 
+  pcmpgtw     xmm0,xmm1          ; alpha > |p0 - q0|
+  movdqa      xmm1,[esp+60h] 
+  psubw       xmm1,xmm3 
+  pabsw       xmm1,xmm1          ; |p1 - p0| (Cb)
+  movdqa      xmm4,xmm6 
+  pcmpgtw     xmm4,xmm1          ; beta > |p1 - p0|
+  pand        xmm0,xmm4 
+  movdqa      xmm1,xmm7 
+  psubw       xmm1,[esp+50h] 
+  pabsw       xmm1,xmm1          ; |q1 - q0| (Cb)
+  movdqa      xmm4,xmm6 
+  pcmpgtw     xmm4,xmm1          ; beta > |q1 - q0|, merged into xmm0 a few lines below
+  movdqa      xmm1,xmm2 
+  psubw       xmm1,[esp+30h] 
+  pabsw       xmm1,xmm1          ; |p0 - q0| (Cr)
+  pcmpgtw     xmm5,xmm1 
+  movdqa      xmm1,[esp+40h] 
+  pand        xmm0,xmm4          ; xmm0 = complete filter mask for Cb
+  psubw       xmm1,xmm2 
+  pabsw       xmm1,xmm1          ; |p1 - p0| (Cr)
+  movdqa      xmm4,xmm6 
+  pcmpgtw     xmm4,xmm1 
+  movdqa      xmm1,[esp+20h] 
+  psubw       xmm1,[esp+30h] 
+  pand        xmm5,xmm4 
+  pabsw       xmm1,xmm1          ; |q1 - q0| (Cr)
+  pcmpgtw     xmm6,xmm1 
+  pand        xmm5,xmm6          ; xmm5 = complete filter mask for Cr
+  mov         edx,2 
+  movsx       edx,dx 
+  movd        xmm1,edx 
+  movdqa      xmm4,xmm1 
+  punpcklwd   xmm4,xmm1 
+  pshufd      xmm1,xmm4,0        ; rounding constant 2 in all 8 words
+  movdqa      xmm4,[esp+60h] 
+  movdqa      xmm6,xmm4 
+  paddw       xmm6,xmm4 
+  paddw       xmm6,xmm3 
+  paddw       xmm6,xmm7 
+  movdqa      [esp+10h],xmm1     ; slot 10h is reused for the rounding constant
+  paddw       xmm6,[esp+10h] 
+  psraw       xmm6,2             ; (2*p1 + p0 + q1 + 2) >> 2 (Cb)
+  movdqa      xmm4,xmm0 
+  pandn       xmm4,xmm3          ; original p0 where the mask is clear
+  movdqa      xmm3,[esp+40h] 
+  movdqa      xmm1,xmm0 
+  pand        xmm1,xmm6 
+  por         xmm1,xmm4          ; xmm1 = new p0 Cb
+  movdqa      xmm6,xmm3 
+  paddw       xmm6,xmm3 
+  movdqa      xmm3,[esp+10h]     ; xmm3 = rounding constant from here on
+  paddw       xmm6,xmm2 
+  paddw       xmm6,[esp+20h] 
+  paddw       xmm6,xmm3 
+  psraw       xmm6,2             ; (2*p1 + p0 + q1 + 2) >> 2 (Cr)
+  movdqa      xmm4,xmm5 
+  pand        xmm4,xmm6 
+  movdqa      xmm6,xmm5 
+  pandn       xmm6,xmm2 
+  por         xmm4,xmm6          ; xmm4 = new p0 Cr
+  packuswb    xmm1,xmm4          ; new p0 bytes [Cb | Cr]
+  movdqa      xmm4,[esp+50h] 
+  movdqa      xmm6,xmm7 
+  paddw       xmm6,xmm7 
+  paddw       xmm6,xmm4 
+  paddw       xmm6,[esp+60h] 
+  paddw       xmm6,xmm3 
+  psraw       xmm6,2             ; (2*q1 + q0 + p1 + 2) >> 2 (Cb)
+  movdqa      xmm2,xmm0 
+  pand        xmm2,xmm6 
+  pandn       xmm0,xmm4 
+  por         xmm2,xmm0          ; xmm2 = new q0 Cb
+  movdqa      xmm0,[esp+20h] 
+  movdqa      xmm6,xmm0 
+  paddw       xmm6,xmm0 
+  movdqa      xmm0,[esp+30h] 
+  paddw       xmm6,xmm0 
+  paddw       xmm6,[esp+40h] 
+  movdqa      xmm4,xmm5 
+  paddw       xmm6,xmm3 
+  movq        [esi],xmm1         ; store new p0 row of Cb
+  psraw       xmm6,2             ; (2*q1 + q0 + p1 + 2) >> 2 (Cr)
+  pand        xmm4,xmm6 
+  pandn       xmm5,xmm0 
+  por         xmm4,xmm5 
+  packuswb    xmm2,xmm4          ; new q0 bytes [Cb | Cr]
+  movq        [eax],xmm2         ; store new q0 row of Cb
+  psrldq      xmm1,8 
+  movq        [edi],xmm1         ; store new p0 row of Cr
+  pop         edi  
+  psrldq      xmm2,8 
+  movq        [ecx],xmm2         ; store new q0 row of Cr
+  pop         esi  
+  mov         esp,ebp 
+  pop         ebp  
+  ret              
+
+;******************************************************************************
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+; Filters 8 Cb and 8 Cr pixels across a horizontal edge (p1,p0 = rows pPix-2*iStride, pPix-iStride; q0,q1 = rows pPix, pPix+iStride); pTC[i] covers pixels 2i and 2i+1.
+; delta = clip(-tc, tc, (((q0-p0)<<2) + (p1-q1) + 4) >> 3), applied as p0 += delta, q0 -= delta where tc>0, alpha>|p0-q0|, beta>|p1-p0|, beta>|q1-q0|. NOTE(review): pabsw is SSSE3 despite the _sse2 name.
+WELS_EXTERN  DeblockChromaLt4V_sse2
+ALIGN  16
+DeblockChromaLt4V_sse2:
+  push        ebp  
+  mov         ebp,esp 
+  and         esp,0FFFFFFF0h      ; 16-byte align esp so movdqa can address the stack slots
+  sub         esp,0E4h            ; 0E4h plus the three pushes below = 0F0h, so esp stays 16-byte aligned
+  push        ebx  
+  push        esi  
+  mov         esi, [ebp+1Ch]      ;  pTC
+  movsx       ebx, byte [esi+2]   ; tc2
+  push        edi  
+  movsx       di,byte [esi+3]     ; tc3
+  mov         word [esp+0Ch],bx   ; spill tc2
+  movsx       bx,byte  [esi+1]    ; tc1
+  movsx       esi,byte  [esi]     ; tc0
+  mov         word  [esp+0Eh],si  ; spill tc0
+  movzx       esi,di 
+  movd        xmm1,esi            ; tc3
+  movzx       esi,di 
+  movd        xmm2,esi            ; tc3
+  mov         si,word  [esp+0Ch] 
+  mov         edx, [ebp + 10h]    ; iStride
+  mov         eax, [ebp + 08h]    ; pPixCb
+  movzx       edi,si 
+  movzx       esi,si 
+  mov         ecx, [ebp + 0Ch]    ; pPixCr
+  movd        xmm4,esi            ; tc2
+  movzx       esi,bx 
+  movd        xmm5,esi            ; tc1
+  movd        xmm3,edi            ; tc2
+  movzx       esi,bx 
+  movd        xmm6,esi            ; tc1
+  mov         si,word [esp+0Eh] 
+  movzx       edi,si 
+  movzx       esi,si 
+  punpcklwd   xmm6,xmm2 
+  pxor        xmm0,xmm0 
+  movdqa      [esp+40h],xmm0      ; zero vector, reloaded into xmm1 below
+  movd        xmm7,edi            ; tc0
+  movd        xmm0,esi            ; tc0
+  lea         esi,[edx+edx]       ; 2*iStride
+  mov         edi,eax 
+  sub         edi,esi             ; pPixCb - 2*iStride
+  punpcklwd   xmm5,xmm1 
+  movdqa      xmm1,[esp+40h]      ; xmm1 = 0
+  punpcklwd   xmm0,xmm4 
+  movq        xmm4,[edx+ecx]      ; q1 row of Cr
+  punpcklwd   xmm7,xmm3 
+  movq        xmm3,[eax]          ; q0 row of Cb
+  punpcklwd   xmm0,xmm6 
+  movq        xmm6,[edi]          ; p1 row of Cb
+  punpcklwd   xmm7,xmm5 
+  punpcklwd   xmm0,xmm7           ; xmm0 = tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3 (words)
+  mov         edi,ecx 
+  sub         edi,esi 
+  movdqa      xmm2,xmm1 
+  psubw       xmm2,xmm0 
+  movdqa      [esp+60h],xmm2      ; -tc
+  movq        xmm2, [edi]         ; p1 row of Cr
+  punpcklqdq  xmm6,xmm2           ; xmm6 = p1 bytes [Cb | Cr]
+  mov         esi,eax 
+  sub         esi,edx             ; esi = pPixCb - iStride, kept for the p0 Cb store
+  movq        xmm7,[esi] 
+  mov         edi,ecx 
+  sub         edi,edx             ; edi = pPixCr - iStride, kept for the p0 Cr store
+  movq        xmm2,[edi] 
+  punpcklqdq  xmm7,xmm2           ; xmm7 = p0 bytes [Cb | Cr]
+  movq        xmm2,[ecx] 
+  punpcklqdq  xmm3,xmm2           ; xmm3 = q0 bytes [Cb | Cr]
+  movq        xmm2,[edx+eax] 
+  movsx       edx,word [ebp + 14h] ; low 16 bits of iAlpha
+  punpcklqdq  xmm2,xmm4 
+  movdqa      [esp+0E0h],xmm2     ; q1 bytes [Cb | Cr]
+  movd        xmm2,edx 
+  movsx       edx,word [ebp + 18h] ; low 16 bits of iBeta
+  movdqa      xmm4,xmm2 
+  punpcklwd   xmm4,xmm2 
+  movd        xmm2,edx 
+  movdqa      xmm5,xmm2 
+  punpcklwd   xmm5,xmm2 
+  pshufd      xmm2,xmm5,0 
+  movdqa      [esp+50h],xmm2      ; beta in all 8 words
+  movdqa      xmm2,xmm6 
+  punpcklbw   xmm2,xmm1 
+  movdqa      [esp+0D0h],xmm3     ; q0 bytes [Cb | Cr]
+  pshufd      xmm4,xmm4,0         ; xmm4 = alpha in all 8 words
+  movdqa      [esp+30h],xmm2      ; p1 Cb as words
+  punpckhbw   xmm6,xmm1 
+  movdqa      [esp+80h],xmm6      ; p1 Cr as words
+  movdqa      xmm6,[esp+0D0h] 
+  punpckhbw   xmm6,xmm1 
+  movdqa      [esp+70h],xmm6      ; q0 Cr as words
+  movdqa      xmm6, [esp+0E0h] 
+  punpckhbw   xmm6,xmm1 
+  movdqa     [esp+90h],xmm6       ; q1 Cr as words
+  movdqa      xmm5, [esp+0E0h] 
+  movdqa      xmm2,xmm7 
+  punpckhbw   xmm7,xmm1 
+  punpcklbw   xmm5,xmm1           ; xmm5 = q1 Cb as words
+  movdqa       [esp+0A0h],xmm7    ; p0 Cr as words
+  punpcklbw   xmm3,xmm1           ; xmm3 = q0 Cb as words
+  mov         edx,4 
+  punpcklbw   xmm2,xmm1           ; xmm2 = p0 Cb as words
+  movsx       edx,dx 
+  movd        xmm6,edx 
+  movdqa      xmm7,xmm6 
+  punpcklwd   xmm7,xmm6 
+  pshufd      xmm6,xmm7,0 
+  movdqa      xmm7,[esp+30h] 
+  movdqa      [esp+20h],xmm6      ; rounding constant 4 in all 8 words
+  psubw       xmm7,xmm5           ; p1 - q1 (Cb)
+  movdqa      xmm6,xmm0 
+  pcmpgtw     xmm6,xmm1           ; tc > 0
+  movdqa      xmm1,[esp+60h]      ; xmm1 = -tc
+  movdqa      [esp+40h],xmm6      ; slot 40h now holds the tc > 0 mask
+  movdqa      xmm6,xmm3 
+  psubw       xmm6,xmm2 
+  psllw       xmm6,2 
+  paddw       xmm6,xmm7 
+  paddw       xmm6, [esp+20h] 
+  movdqa      xmm7, [esp+50h] 
+  psraw       xmm6,3              ; (((q0-p0)<<2) + (p1-q1) + 4) >> 3 (Cb)
+  pmaxsw      xmm1,xmm6           ; clamp from below at -tc
+  movdqa      [esp+10h],xmm0 
+  movdqa      xmm6, [esp+10h] 
+  pminsw      xmm6,xmm1           ; clamp from above at tc
+  movdqa      [esp+10h],xmm6      ; clipped delta (Cb)
+  movdqa      xmm1,xmm2 
+  psubw       xmm1,xmm3 
+  pabsw       xmm1,xmm1           ; |p0 - q0| (Cb)
+  movdqa      xmm6,xmm4 
+  pcmpgtw     xmm6,xmm1           ; alpha > |p0 - q0|
+  movdqa      xmm1, [esp+30h] 
+  psubw       xmm1,xmm2 
+  pabsw       xmm1,xmm1           ; |p1 - p0| (Cb)
+  pcmpgtw     xmm7,xmm1 
+  movdqa      xmm1,[esp+50h] 
+  pand        xmm6,xmm7 
+  movdqa      xmm7,[esp+50h] 
+  psubw       xmm5,xmm3 
+  pabsw       xmm5,xmm5           ; |q1 - q0| (Cb)
+  pcmpgtw     xmm1,xmm5 
+  movdqa      xmm5,[esp+80h] 
+  psubw       xmm5,[esp+90h]      ; p1 - q1 (Cr)
+  pand        xmm6,xmm1 
+  pand        xmm6,[esp+40h]      ; xmm6 = complete filter mask for Cb
+  movdqa      xmm1,[esp+10h] 
+  pand        xmm1,xmm6 
+  movdqa      xmm6,[esp+70h] 
+  movdqa      [esp+30h],xmm1      ; masked delta (Cb); slot 30h is reused
+  movdqa      xmm1,[esp+0A0h]     ; xmm1 = p0 Cr
+  psubw       xmm6,xmm1 
+  psllw       xmm6,2 
+  paddw       xmm6,xmm5 
+  paddw       xmm6,[esp+20h] 
+  movdqa      xmm5,[esp+60h] 
+  psraw       xmm6,3              ; (((q0-p0)<<2) + (p1-q1) + 4) >> 3 (Cr)
+  pmaxsw      xmm5,xmm6 
+  pminsw      xmm0,xmm5           ; xmm0 = clipped delta (Cr)
+  movdqa      xmm5,[esp+70h]      ; xmm5 = q0 Cr
+  movdqa      xmm6,xmm1 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6           ; |p0 - q0| (Cr)
+  pcmpgtw     xmm4,xmm6 
+  movdqa      xmm6,[esp+80h] 
+  psubw       xmm6,xmm1 
+  pabsw       xmm6,xmm6           ; |p1 - p0| (Cr)
+  pcmpgtw     xmm7,xmm6 
+  movdqa      xmm6,[esp+90h] 
+  pand        xmm4,xmm7 
+  movdqa      xmm7,[esp+50h] 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6           ; |q1 - q0| (Cr)
+  pcmpgtw     xmm7,xmm6 
+  pand        xmm4,xmm7 
+  pand        xmm4,[esp+40h]      ; xmm4 = complete filter mask for Cr
+  pand        xmm0,xmm4           ; masked delta (Cr)
+  movdqa      xmm4,[esp+30h] 
+  paddw       xmm2,xmm4           ; p0 Cb += delta
+  paddw       xmm1,xmm0           ; p0 Cr += delta
+  packuswb    xmm2,xmm1           ; new p0 bytes [Cb | Cr], saturated to 0..255
+  movq        [esi],xmm2          ; store new p0 row of Cb
+  psubw       xmm3,xmm4           ; q0 Cb -= delta
+  psubw       xmm5,xmm0           ; q0 Cr -= delta
+  packuswb    xmm3,xmm5           ; new q0 bytes [Cb | Cr]
+  movq        [eax],xmm3          ; store new q0 row of Cb
+  psrldq      xmm2,8 
+  movq        [edi],xmm2          ; store new p0 row of Cr
+  pop         edi  
+  pop         esi  
+  psrldq      xmm3,8 
+  movq        [ecx],xmm3          ; store new q0 row of Cr
+  pop         ebx  
+  mov         esp,ebp 
+  pop         ebp  
+  ret    
+  
+;***************************************************************************
+;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+;          int32_t iAlpha, int32_t iBeta)
+;  Filters 8 rows of Cb and 8 rows of Cr across a vertical edge. Per row the 4 bytes at pPix-2..pPix+1 are p1,p0,q0,q1; all 4 are read and written back.
+;  Where alpha>|p0-q0|, beta>|p1-p0|, beta>|q1-q0|: p0=(2*p1+p0+q1+2)>>2, q0=(2*q1+q0+p1+2)>>2. NOTE(review): pabsw is SSSE3 despite the _sse2 name - confirm CPU dispatch.
+WELS_EXTERN     DeblockChromaEq4H_sse2
+
+ALIGN  16
+  
+DeblockChromaEq4H_sse2:
+  push        ebp  
+  mov         ebp,esp 
+  and         esp,0FFFFFFF0h      ; 16-byte align esp so movdqa can address the stack slots
+  sub         esp,0C8h            ; 0C8h plus the two pushes below = 0D0h, so esp stays 16-byte aligned
+  mov         ecx,dword [ebp+8]   ; pPixCb
+  mov         edx,dword [ebp+0Ch] ; pPixCr
+  mov         eax,dword [ebp+10h] ; iStride
+  sub         ecx,2               ; start two pixels left of the edge
+  sub         edx,2 
+  push        esi  
+  lea         esi,[eax+eax*2]     ; 3*iStride
+  mov         dword [esp+18h],ecx ; pPixCb-2 (addressed as [esp+1Ch] after the push of edi)
+  mov         dword [esp+4],edx   ; pPixCr-2 (addressed as [esp+8] after the push of edi)
+  lea         ecx,[ecx+eax*4] 
+  lea         edx,[edx+eax*4] 
+  lea         eax,[esp+7Ch]       ; 64-byte transpose buffer (= esp+80h after the push of edi)
+  push        edi  
+  mov         dword [esp+14h],esi ; 3*iStride
+  mov         dword [esp+18h],ecx ; pPixCb-2 + 4*iStride (rows 4..7 of Cb)
+  mov         dword [esp+0Ch],edx ; pPixCr-2 + 4*iStride (rows 4..7 of Cr)
+  mov         dword [esp+10h],eax ; pointer to the transpose buffer
+  mov         esi,dword [esp+1Ch] 
+  mov         ecx,dword [ebp+10h] 
+  mov         edx,dword [esp+14h] 
+  movd        xmm0,dword [esi]    ; Cb rows 0..3, 4 bytes each
+  movd        xmm1,dword [esi+ecx] 
+  movd        xmm2,dword [esi+ecx*2] 
+  movd        xmm3,dword [esi+edx] 
+  mov         esi,dword  [esp+8] 
+  movd        xmm4,dword [esi]    ; Cr rows 0..3
+  movd        xmm5,dword [esi+ecx] 
+  movd        xmm6,dword [esi+ecx*2] 
+  movd        xmm7,dword [esi+edx] 
+  punpckldq   xmm0,xmm4 
+  punpckldq   xmm1,xmm5 
+  punpckldq   xmm2,xmm6 
+  punpckldq   xmm3,xmm7 
+  mov         esi,dword [esp+18h] 
+  mov         edi,dword [esp+0Ch] 
+  movd        xmm4,dword [esi] 
+  movd        xmm5,dword [edi] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm0,xmm4           ; xmm0 = [Cb row0, Cr row0, Cb row4, Cr row4]
+  movd        xmm4,dword [esi+ecx] 
+  movd        xmm5,dword [edi+ecx] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm1,xmm4           ; rows 1 and 5
+  movd        xmm4,dword [esi+ecx*2] 
+  movd        xmm5,dword [edi+ecx*2] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm2,xmm4           ; rows 2 and 6
+  movd        xmm4,dword [esi+edx] 
+  movd        xmm5,dword [edi+edx] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm3,xmm4           ; rows 3 and 7
+  movdqa      xmm6,xmm0           ; byte transpose: gather each of the 4 columns into its own register
+  punpcklbw   xmm0,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm2,xmm3 
+  punpckhbw   xmm7,xmm3 
+  movdqa      xmm4,xmm0 
+  movdqa      xmm5,xmm6 
+  punpcklwd   xmm0,xmm2 
+  punpckhwd   xmm4,xmm2 
+  punpcklwd   xmm6,xmm7 
+  punpckhwd   xmm5,xmm7 
+  movdqa      xmm1,xmm0 
+  movdqa      xmm2,xmm4 
+  punpckldq   xmm0,xmm6 
+  punpckhdq   xmm1,xmm6 
+  punpckldq   xmm4,xmm5 
+  punpckhdq   xmm2,xmm5 
+  movdqa      xmm5,xmm0 
+  movdqa      xmm6,xmm1 
+  punpcklqdq  xmm0,xmm4 
+  punpckhqdq  xmm5,xmm4 
+  punpcklqdq  xmm1,xmm2 
+  punpckhqdq  xmm6,xmm2 
+  mov         edi,dword [esp+10h] 
+  movdqa      [edi],xmm0          ; buffer+00h = p1 column, [Cb rows 0..7 | Cr rows 0..7]
+  movdqa      [edi+10h],xmm5      ; buffer+10h = p0 column
+  movdqa      [edi+20h],xmm1      ; buffer+20h = q0 column
+  movdqa      [edi+30h],xmm6      ; buffer+30h = q1 column
+  movsx       ecx,word [ebp+14h]  ; low 16 bits of iAlpha
+  movsx       edx,word [ebp+18h]  ; low 16 bits of iBeta
+  movdqa      xmm6,[esp+80h]      ; p1 bytes (the buffer lives at esp+80h)
+  movdqa      xmm4,[esp+90h]      ; p0 bytes
+  movdqa      xmm5,[esp+0A0h]     ; q0 bytes
+  movdqa      xmm7,[esp+0B0h]     ; q1 bytes
+  pxor        xmm0,xmm0 
+  movd        xmm1,ecx 
+  movdqa      xmm2,xmm1 
+  punpcklwd   xmm2,xmm1 
+  pshufd      xmm1,xmm2,0         ; xmm1 = alpha in all 8 words
+  movd        xmm2,edx 
+  movdqa      xmm3,xmm2 
+  punpcklwd   xmm3,xmm2 
+  pshufd      xmm2,xmm3,0         ; xmm2 = beta in all 8 words
+  movdqa      xmm3,xmm6 
+  punpckhbw   xmm6,xmm0 
+  movdqa      [esp+60h],xmm6      ; p1 Cr as words
+  movdqa      xmm6,[esp+90h] 
+  punpckhbw   xmm6,xmm0 
+  movdqa      [esp+30h],xmm6      ; p0 Cr as words
+  movdqa      xmm6,[esp+0A0h] 
+  punpckhbw   xmm6,xmm0 
+  movdqa      [esp+40h],xmm6      ; q0 Cr as words
+  movdqa      xmm6,[esp+0B0h] 
+  punpckhbw   xmm6,xmm0 
+  movdqa      [esp+70h],xmm6      ; q1 Cr as words
+  punpcklbw   xmm7,xmm0 
+  punpcklbw   xmm4,xmm0           ; xmm4 = p0 Cb as words
+  punpcklbw   xmm5,xmm0           ; xmm5 = q0 Cb as words
+  punpcklbw   xmm3,xmm0           ; xmm3 = p1 Cb as words
+  movdqa      [esp+50h],xmm7      ; q1 Cb as words
+  movdqa      xmm6,xmm4 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6           ; |p0 - q0| (Cb)
+  movdqa      xmm0,xmm1 
+  pcmpgtw     xmm0,xmm6           ; alpha > |p0 - q0|
+  movdqa      xmm6,xmm3 
+  psubw       xmm6,xmm4 
+  pabsw       xmm6,xmm6           ; |p1 - p0| (Cb)
+  movdqa      xmm7,xmm2 
+  pcmpgtw     xmm7,xmm6 
+  movdqa      xmm6,[esp+50h] 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6           ; |q1 - q0| (Cb)
+  pand        xmm0,xmm7 
+  movdqa      xmm7,xmm2 
+  pcmpgtw     xmm7,xmm6 
+  movdqa      xmm6,[esp+30h] 
+  psubw       xmm6,[esp+40h] 
+  pabsw       xmm6,xmm6           ; |p0 - q0| (Cr)
+  pcmpgtw     xmm1,xmm6 
+  movdqa      xmm6,[esp+60h] 
+  psubw       xmm6,[esp+30h] 
+  pabsw       xmm6,xmm6           ; |p1 - p0| (Cr)
+  pand        xmm0,xmm7           ; xmm0 = complete filter mask for Cb
+  movdqa      xmm7,xmm2 
+  pcmpgtw     xmm7,xmm6 
+  movdqa      xmm6,[esp+70h] 
+  psubw       xmm6,[esp+40h] 
+  pabsw       xmm6,xmm6           ; |q1 - q0| (Cr)
+  pand        xmm1,xmm7 
+  pcmpgtw     xmm2,xmm6 
+  pand        xmm1,xmm2           ; xmm1 = complete filter mask for Cr
+  mov         eax,2 
+  movsx       ecx,ax 
+  movd        xmm2,ecx 
+  movdqa      xmm6,xmm2 
+  punpcklwd   xmm6,xmm2 
+  pshufd      xmm2,xmm6,0 
+  movdqa      [esp+20h],xmm2      ; rounding constant 2 in all 8 words
+  movdqa      xmm2,xmm3 
+  paddw       xmm2,xmm3 
+  paddw       xmm2,xmm4 
+  paddw       xmm2,[esp+50h] 
+  paddw       xmm2,[esp+20h] 
+  psraw       xmm2,2              ; (2*p1 + p0 + q1 + 2) >> 2 (Cb)
+  movdqa      xmm6,xmm0 
+  pand        xmm6,xmm2 
+  movdqa      xmm2,xmm0 
+  pandn       xmm2,xmm4           ; original p0 where the mask is clear
+  por         xmm6,xmm2           ; xmm6 = new p0 Cb
+  movdqa      xmm2,[esp+60h] 
+  movdqa      xmm7,xmm2 
+  paddw       xmm7,xmm2 
+  paddw       xmm7,[esp+30h] 
+  paddw       xmm7,[esp+70h] 
+  paddw       xmm7,[esp+20h] 
+  movdqa      xmm4,xmm1 
+  movdqa      xmm2,xmm1 
+  pandn       xmm2,[esp+30h] 
+  psraw       xmm7,2              ; (2*p1 + p0 + q1 + 2) >> 2 (Cr)
+  pand        xmm4,xmm7 
+  por         xmm4,xmm2           ; xmm4 = new p0 Cr
+  movdqa      xmm2,[esp+50h] 
+  packuswb    xmm6,xmm4 
+  movdqa      [esp+90h],xmm6      ; overwrite the p0 column in the buffer
+  movdqa      xmm6,xmm2 
+  paddw       xmm6,xmm2 
+  movdqa      xmm2,[esp+20h] 
+  paddw       xmm6,xmm5 
+  paddw       xmm6,xmm3 
+  movdqa      xmm4,xmm0 
+  pandn       xmm0,xmm5 
+  paddw       xmm6,xmm2 
+  psraw       xmm6,2              ; (2*q1 + q0 + p1 + 2) >> 2 (Cb)
+  pand        xmm4,xmm6 
+  por         xmm4,xmm0           ; xmm4 = new q0 Cb
+  movdqa      xmm0,[esp+70h] 
+  movdqa      xmm5,xmm0 
+  paddw       xmm5,xmm0 
+  movdqa      xmm0,[esp+40h] 
+  paddw       xmm5,xmm0 
+  paddw       xmm5,[esp+60h] 
+  movdqa      xmm3,xmm1 
+  paddw       xmm5,xmm2 
+  psraw       xmm5,2              ; (2*q1 + q0 + p1 + 2) >> 2 (Cr)
+  pand        xmm3,xmm5 
+  pandn       xmm1,xmm0 
+  por         xmm3,xmm1           ; xmm3 = new q0 Cr
+  packuswb    xmm4,xmm3 
+  movdqa      [esp+0A0h],xmm4     ; overwrite the q0 column in the buffer
+  mov         esi,dword [esp+10h] 
+  movdqa      xmm0,[esi]          ; reload the 4 columns (p1 and q1 unchanged) and transpose back to rows
+  movdqa      xmm1,[esi+10h] 
+  movdqa      xmm2,[esi+20h] 
+  movdqa      xmm3,[esi+30h] 
+  movdqa      xmm6,xmm0 
+  punpcklbw   xmm0,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm2,xmm3 
+  punpckhbw   xmm7,xmm3 
+  movdqa      xmm4,xmm0 
+  movdqa      xmm5,xmm6 
+  punpcklwd   xmm0,xmm2 
+  punpckhwd   xmm4,xmm2 
+  punpcklwd   xmm6,xmm7 
+  punpckhwd   xmm5,xmm7 
+  movdqa      xmm1,xmm0 
+  movdqa      xmm2,xmm4 
+  punpckldq   xmm0,xmm6 
+  punpckhdq   xmm1,xmm6 
+  punpckldq   xmm4,xmm5 
+  punpckhdq   xmm2,xmm5 
+  movdqa      xmm5,xmm0 
+  movdqa      xmm6,xmm1 
+  punpcklqdq  xmm0,xmm4           ; xmm0 = [Cb row0, Cr row0, Cb row4, Cr row4]
+  punpckhqdq  xmm5,xmm4           ; rows 1 and 5
+  punpcklqdq  xmm1,xmm2           ; rows 2 and 6
+  punpckhqdq  xmm6,xmm2           ; rows 3 and 7
+  mov         esi,dword [esp+1Ch] ; pPixCb-2
+  mov         ecx,dword [ebp+10h] ; iStride
+  mov         edx,dword [esp+14h] ; 3*iStride
+  mov         edi,dword [esp+8]   ; pPixCr-2
+  movd        dword [esi],xmm0    ; Cb rows 0..3
+  movd        dword [esi+ecx],xmm5 
+  movd        dword [esi+ecx*2],xmm1 
+  movd        dword [esi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  mov         esi,dword [esp+18h] ; rows 4..7 of Cb
+  movd        dword [edi],xmm0    ; Cr rows 0..3
+  movd        dword [edi+ecx],xmm5 
+  movd        dword [edi+ecx*2],xmm1 
+  movd        dword [edi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  movd        dword [esi],xmm0    ; Cb rows 4..7
+  movd        dword [esi+ecx],xmm5 
+  movd        dword [esi+ecx*2],xmm1 
+  movd        dword [esi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  mov         edi,dword [esp+0Ch] ; rows 4..7 of Cr
+  movd        dword [edi],xmm0    ; Cr rows 4..7
+  movd        dword [edi+ecx],xmm5 
+  movd        dword [edi+ecx*2],xmm1 
+  movd        dword [edi+edx],xmm6 
+  pop         edi  
+  pop         esi  
+  mov         esp,ebp 
+  pop         ebp  
+  ret              
+  
+;*******************************************************************************
+;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;    Filters 8 rows of Cb and 8 rows of Cr across a vertical edge. Per row the 4 bytes at pPix-2..pPix+1 are p1,p0,q0,q1; pTC[i] covers rows 2i and 2i+1.
+;    delta = clip(-tc, tc, (((q0-p0)<<2) + (p1-q1) + 4) >> 3); p0 += delta, q0 -= delta where tc>0, alpha>|p0-q0|, beta>|p1-p0|, beta>|q1-q0|.
+WELS_EXTERN  DeblockChromaLt4H_sse2
+;    NOTE(review): pabsw below is an SSSE3 instruction although the symbol is suffixed _sse2 - confirm the CPU dispatch.
+ALIGN  16
+
+DeblockChromaLt4H_sse2:
+  push        ebp  
+  mov         ebp,esp 
+  and         esp,0FFFFFFF0h      ; 16-byte align esp so movdqa can address the stack slots
+  sub         esp,108h            ; 108h plus the two pushes below = 110h, so esp stays 16-byte aligned
+  mov         ecx,dword [ebp+8]   ; pPixCb
+  mov         edx,dword [ebp+0Ch] ; pPixCr
+  mov         eax,dword [ebp+10h] ; iStride
+  sub         ecx,2               ; start two pixels left of the edge
+  sub         edx,2 
+  push        esi  
+  lea         esi,[eax+eax*2]     ; 3*iStride
+  mov         dword [esp+10h],ecx ; pPixCb-2 (addressed as [esp+14h] after the push of edi)
+  mov         dword [esp+4],edx   ; pPixCr-2 (addressed as [esp+8] after the push of edi)
+  lea         ecx,[ecx+eax*4] 
+  lea         edx,[edx+eax*4] 
+  lea         eax,[esp+6Ch]       ; 64-byte transpose buffer (= esp+70h after the push of edi)
+  push        edi  
+  mov         dword [esp+0Ch],esi ; 3*iStride
+  mov         dword [esp+18h],ecx ; pPixCb-2 + 4*iStride (rows 4..7 of Cb)
+  mov         dword [esp+10h],edx ; pPixCr-2 + 4*iStride (rows 4..7 of Cr)
+  mov         dword [esp+1Ch],eax ; pointer to the transpose buffer
+  mov         esi,dword [esp+14h] 
+  mov         ecx,dword [ebp+10h] 
+  mov         edx,dword [esp+0Ch] 
+  movd        xmm0,dword [esi]    ; Cb rows 0..3, 4 bytes each
+  movd        xmm1,dword [esi+ecx] 
+  movd        xmm2,dword [esi+ecx*2] 
+  movd        xmm3,dword [esi+edx] 
+  mov         esi,dword [esp+8] 
+  movd        xmm4,dword [esi]    ; Cr rows 0..3
+  movd        xmm5,dword [esi+ecx] 
+  movd        xmm6,dword [esi+ecx*2] 
+  movd        xmm7,dword [esi+edx] 
+  punpckldq   xmm0,xmm4 
+  punpckldq   xmm1,xmm5 
+  punpckldq   xmm2,xmm6 
+  punpckldq   xmm3,xmm7 
+  mov         esi,dword [esp+18h] 
+  mov         edi,dword [esp+10h] 
+  movd        xmm4,dword [esi] 
+  movd        xmm5,dword [edi] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm0,xmm4           ; xmm0 = [Cb row0, Cr row0, Cb row4, Cr row4]
+  movd        xmm4,dword [esi+ecx] 
+  movd        xmm5,dword [edi+ecx] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm1,xmm4           ; rows 1 and 5
+  movd        xmm4,dword [esi+ecx*2] 
+  movd        xmm5,dword [edi+ecx*2] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm2,xmm4           ; rows 2 and 6
+  movd        xmm4,dword [esi+edx] 
+  movd        xmm5,dword [edi+edx] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm3,xmm4           ; rows 3 and 7
+  movdqa      xmm6,xmm0           ; byte transpose: gather each of the 4 columns into its own register
+  punpcklbw   xmm0,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm2,xmm3 
+  punpckhbw   xmm7,xmm3 
+  movdqa      xmm4,xmm0 
+  movdqa      xmm5,xmm6 
+  punpcklwd   xmm0,xmm2 
+  punpckhwd   xmm4,xmm2 
+  punpcklwd   xmm6,xmm7 
+  punpckhwd   xmm5,xmm7 
+  movdqa      xmm1,xmm0 
+  movdqa      xmm2,xmm4 
+  punpckldq   xmm0,xmm6 
+  punpckhdq   xmm1,xmm6 
+  punpckldq   xmm4,xmm5 
+  punpckhdq   xmm2,xmm5 
+  movdqa      xmm5,xmm0 
+  movdqa      xmm6,xmm1 
+  punpcklqdq  xmm0,xmm4 
+  punpckhqdq  xmm5,xmm4 
+  punpcklqdq  xmm1,xmm2 
+  punpckhqdq  xmm6,xmm2 
+  mov         edi,dword [esp+1Ch] 
+  movdqa      [edi],xmm0          ; buffer+00h = p1 column, [Cb rows 0..7 | Cr rows 0..7]
+  movdqa      [edi+10h],xmm5      ; buffer+10h = p0 column
+  movdqa      [edi+20h],xmm1      ; buffer+20h = q0 column
+  movdqa      [edi+30h],xmm6      ; buffer+30h = q1 column
+  mov         eax,dword [ebp+1Ch] ; pTC
+  movsx       cx,byte [eax+3]     ; tc3
+  movsx       dx,byte [eax+2]     ; tc2
+  movsx       si,byte [eax+1]     ; tc1
+  movsx       ax,byte [eax]       ; tc0
+  movzx       edi,cx 
+  movzx       ecx,cx 
+  movd        xmm2,ecx            ; tc3
+  movzx       ecx,dx 
+  movzx       edx,dx 
+  movd        xmm3,ecx            ; tc2
+  movd        xmm4,edx            ; tc2
+  movzx       ecx,si 
+  movzx       edx,si 
+  movd        xmm5,ecx            ; tc1
+  pxor        xmm0,xmm0 
+  movd        xmm6,edx            ; tc1
+  movzx       ecx,ax 
+  movdqa      [esp+60h],xmm0      ; zero vector, reloaded into xmm1 below
+  movzx       edx,ax 
+  movsx       eax,word [ebp+14h]  ; low 16 bits of iAlpha
+  punpcklwd   xmm6,xmm2 
+  movd        xmm1,edi            ; tc3
+  movd        xmm7,ecx            ; tc0
+  movsx       ecx,word [ebp+18h]  ; low 16 bits of iBeta
+  movd        xmm0,edx            ; tc0
+  punpcklwd   xmm7,xmm3 
+  punpcklwd   xmm5,xmm1 
+  movdqa      xmm1,[esp+60h]      ; xmm1 = 0
+  punpcklwd   xmm7,xmm5 
+  movdqa      xmm5,[esp+0A0h]     ; q1 bytes (the buffer lives at esp+70h)
+  punpcklwd   xmm0,xmm4 
+  punpcklwd   xmm0,xmm6 
+  movdqa      xmm6, [esp+70h]     ; p1 bytes
+  punpcklwd   xmm0,xmm7           ; xmm0 = tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3 (words)
+  movdqa      xmm7,[esp+80h]      ; p0 bytes
+  movdqa      xmm2,xmm1 
+  psubw       xmm2,xmm0 
+  movdqa      [esp+0D0h],xmm2     ; -tc
+  movd        xmm2,eax 
+  movdqa      xmm3,xmm2 
+  punpcklwd   xmm3,xmm2 
+  pshufd      xmm4,xmm3,0         ; xmm4 = alpha in all 8 words
+  movd        xmm2,ecx 
+  movdqa      xmm3,xmm2 
+  punpcklwd   xmm3,xmm2 
+  pshufd      xmm2,xmm3,0 
+  movdqa      xmm3, [esp+90h]     ; q0 bytes
+  movdqa      [esp+50h],xmm2      ; beta in all 8 words
+  movdqa      xmm2,xmm6 
+  punpcklbw   xmm2,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      [esp+40h],xmm2      ; p1 Cb as words
+  movdqa      [esp+0B0h],xmm6     ; p1 Cr as words
+  movdqa      xmm6,[esp+90h] 
+  movdqa      xmm2,xmm7 
+  punpckhbw   xmm7,xmm1 
+  punpckhbw   xmm6,xmm1 
+  punpcklbw   xmm2,xmm1           ; xmm2 = p0 Cb as words
+  punpcklbw   xmm3,xmm1           ; xmm3 = q0 Cb as words
+  punpcklbw   xmm5,xmm1           ; xmm5 = q1 Cb as words
+  movdqa      [esp+0F0h],xmm7     ; p0 Cr as words
+  movdqa      [esp+0C0h],xmm6     ; q0 Cr as words
+  movdqa      xmm6, [esp+0A0h] 
+  punpckhbw   xmm6,xmm1 
+  movdqa      [esp+0E0h],xmm6     ; q1 Cr as words
+  mov         edx,4 
+  movsx       eax,dx 
+  movd        xmm6,eax 
+  movdqa      xmm7,xmm6 
+  punpcklwd   xmm7,xmm6 
+  pshufd      xmm6,xmm7,0 
+  movdqa      [esp+30h],xmm6      ; rounding constant 4 in all 8 words
+  movdqa      xmm7, [esp+40h] 
+  psubw       xmm7,xmm5           ; p1 - q1 (Cb)
+  movdqa      xmm6,xmm0 
+  pcmpgtw     xmm6,xmm1           ; tc > 0
+  movdqa      [esp+60h],xmm6      ; slot 60h now holds the tc > 0 mask
+  movdqa      xmm1, [esp+0D0h]    ; xmm1 = -tc
+  movdqa      xmm6,xmm3 
+  psubw       xmm6,xmm2 
+  psllw       xmm6,2 
+  paddw       xmm6,xmm7 
+  paddw       xmm6,[esp+30h] 
+  psraw       xmm6,3              ; (((q0-p0)<<2) + (p1-q1) + 4) >> 3 (Cb)
+  pmaxsw      xmm1,xmm6           ; clamp from below at -tc
+  movdqa      xmm7,[esp+50h] 
+  movdqa      [esp+20h],xmm0 
+  movdqa      xmm6, [esp+20h] 
+  pminsw      xmm6,xmm1           ; clamp from above at tc
+  movdqa      [esp+20h],xmm6      ; clipped delta (Cb)
+  movdqa      xmm6,xmm4 
+  movdqa      xmm1,xmm2 
+  psubw       xmm1,xmm3 
+  pabsw       xmm1,xmm1           ; |p0 - q0| (Cb)
+  pcmpgtw     xmm6,xmm1           ; alpha > |p0 - q0|
+  movdqa      xmm1, [esp+40h] 
+  psubw       xmm1,xmm2 
+  pabsw       xmm1,xmm1           ; |p1 - p0| (Cb)
+  pcmpgtw     xmm7,xmm1 
+  movdqa      xmm1, [esp+50h] 
+  pand        xmm6,xmm7 
+  movdqa      xmm7, [esp+50h] 
+  psubw       xmm5,xmm3 
+  pabsw       xmm5,xmm5           ; |q1 - q0| (Cb)
+  pcmpgtw     xmm1,xmm5 
+  movdqa      xmm5, [esp+0B0h] 
+  psubw       xmm5,[esp+0E0h]     ; p1 - q1 (Cr)
+  pand        xmm6,xmm1 
+  pand        xmm6, [esp+60h]     ; xmm6 = complete filter mask for Cb
+  movdqa      xmm1, [esp+20h] 
+  pand        xmm1,xmm6 
+  movdqa      xmm6, [esp+0C0h] 
+  movdqa      [esp+40h],xmm1      ; masked delta (Cb); slot 40h is reused
+  movdqa      xmm1, [esp+0F0h]    ; xmm1 = p0 Cr
+  psubw       xmm6,xmm1 
+  psllw       xmm6,2 
+  paddw       xmm6,xmm5 
+  paddw       xmm6, [esp+30h] 
+  movdqa      xmm5, [esp+0D0h] 
+  psraw       xmm6,3              ; (((q0-p0)<<2) + (p1-q1) + 4) >> 3 (Cr)
+  pmaxsw      xmm5,xmm6 
+  pminsw      xmm0,xmm5           ; xmm0 = clipped delta (Cr)
+  movdqa      xmm5,[esp+0C0h]     ; xmm5 = q0 Cr
+  movdqa      xmm6,xmm1 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6           ; |p0 - q0| (Cr)
+  pcmpgtw     xmm4,xmm6 
+  movdqa      xmm6,[esp+0B0h] 
+  psubw       xmm6,xmm1 
+  pabsw       xmm6,xmm6           ; |p1 - p0| (Cr)
+  pcmpgtw     xmm7,xmm6 
+  movdqa      xmm6, [esp+0E0h] 
+  pand        xmm4,xmm7 
+  movdqa      xmm7, [esp+50h] 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6           ; |q1 - q0| (Cr)
+  pcmpgtw     xmm7,xmm6 
+  pand        xmm4,xmm7 
+  pand        xmm4,[esp+60h]      ; xmm4 = complete filter mask for Cr
+  pand        xmm0,xmm4           ; masked delta (Cr)
+  movdqa      xmm4, [esp+40h] 
+  paddw       xmm2,xmm4           ; p0 Cb += delta
+  paddw       xmm1,xmm0           ; p0 Cr += delta
+  psubw       xmm3,xmm4           ; q0 Cb -= delta
+  psubw       xmm5,xmm0           ; q0 Cr -= delta
+  packuswb    xmm2,xmm1           ; new p0 bytes, saturated to 0..255
+  packuswb    xmm3,xmm5           ; new q0 bytes
+  movdqa      [esp+80h],xmm2      ; overwrite the p0 column in the buffer
+  movdqa      [esp+90h],xmm3      ; overwrite the q0 column in the buffer
+  mov         esi,dword [esp+1Ch] 
+  movdqa      xmm0, [esi]         ; reload the 4 columns (p1 and q1 unchanged) and transpose back to rows
+  movdqa      xmm1, [esi+10h] 
+  movdqa      xmm2, [esi+20h] 
+  movdqa      xmm3, [esi+30h] 
+  movdqa      xmm6,xmm0 
+  punpcklbw   xmm0,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm2,xmm3 
+  punpckhbw   xmm7,xmm3 
+  movdqa      xmm4,xmm0 
+  movdqa      xmm5,xmm6 
+  punpcklwd   xmm0,xmm2 
+  punpckhwd   xmm4,xmm2 
+  punpcklwd   xmm6,xmm7 
+  punpckhwd   xmm5,xmm7 
+  movdqa      xmm1,xmm0 
+  movdqa      xmm2,xmm4 
+  punpckldq   xmm0,xmm6 
+  punpckhdq   xmm1,xmm6 
+  punpckldq   xmm4,xmm5 
+  punpckhdq   xmm2,xmm5 
+  movdqa      xmm5,xmm0 
+  movdqa      xmm6,xmm1 
+  punpcklqdq  xmm0,xmm4           ; xmm0 = [Cb row0, Cr row0, Cb row4, Cr row4]
+  punpckhqdq  xmm5,xmm4           ; rows 1 and 5
+  punpcklqdq  xmm1,xmm2           ; rows 2 and 6
+  punpckhqdq  xmm6,xmm2           ; rows 3 and 7
+  mov         esi,dword [esp+14h] ; pPixCb-2
+  mov         ecx,dword [ebp+10h] ; iStride
+  mov         edx,dword [esp+0Ch] ; 3*iStride
+  mov         edi,dword [esp+8]   ; pPixCr-2
+  movd        dword [esi],xmm0    ; Cb rows 0..3
+  movd        dword [esi+ecx],xmm5 
+  movd        dword [esi+ecx*2],xmm1 
+  movd        dword [esi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  mov         esi,dword [esp+18h] ; rows 4..7 of Cb
+  movd        dword [edi],xmm0    ; Cr rows 0..3
+  movd        dword [edi+ecx],xmm5 
+  movd        dword [edi+ecx*2],xmm1 
+  movd        dword [edi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  movd        dword [esi],xmm0    ; Cb rows 4..7
+  movd        dword [esi+ecx],xmm5 
+  movd        dword [esi+ecx*2],xmm1 
+  movd        dword [esi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  mov         edi,dword [esp+10h] ; rows 4..7 of Cr
+  movd        dword [edi],xmm0    ; Cr rows 4..7
+  movd        dword [edi+ecx],xmm5 
+  movd        dword [edi+ecx*2],xmm1 
+  movd        dword [edi+edx],xmm6  
+  pop         edi  
+  pop         esi   
+  mov         esp,ebp 
+  pop         ebp  
+  ret     
+  
+  
+  
+;*******************************************************************************
+;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
+;                                 int32_t iBeta, int8_t * pTC)
+;*******************************************************************************
+  
+
+WELS_EXTERN  DeblockLumaLt4V_sse2
+; Stack args read below: [ebp+8] pixel pointer, [ebp+12] stride, [ebp+16]/[ebp+20] 16-bit thresholds (presumably iAlpha/iBeta as in DeblockLumaEq4V_sse2), [ebp+24] pointer to 4 signed bytes.
+ALIGN  16
+; NOTE(review): pabsw used below is an SSSE3 instruction although this symbol is suffixed _sse2 - confirm callers gate on SSSE3. R(k) in comments = 16-byte row at pixel pointer + k*stride.
+DeblockLumaLt4V_sse2:
+    push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H
+	sub	esp, 420				; 000001a4H
+	mov	eax, dword [ebp+8]	; eax = pixel pointer (arg 1)
+	mov	ecx, dword [ebp+12]	; ecx = stride (arg 2)
+
+	pxor	xmm0, xmm0
+	push	ebx
+	mov	edx, dword [ebp+24]	; edx = pointer to the 4 signed byte limits (arg 5)
+	movdqa	[esp+424-384], xmm0	; zero vector kept on the stack (only ebx pushed so far, hence 424)
+	push	esi
+
+	lea	esi, [ecx+ecx*2]	; esi = 3*stride
+	push	edi
+	mov	edi, eax
+	sub	edi, esi
+	movdqa	xmm0, [edi]	; R(-3); movdqa requires 16-byte aligned rows
+
+	lea	esi, [ecx+ecx]	; esi = 2*stride
+	movdqa	[esp+432-208], xmm0	; slot 432-208 = R(-3)
+	mov	edi, eax
+	sub	edi, esi	; edi = address of R(-2), kept until the final store
+	movdqa	xmm0, [edi]
+	movdqa	[esp+448-208], xmm0	; slot 448-208 = R(-2)
+
+	mov	ebx, eax
+	sub	ebx, ecx	; ebx = address of R(-1)
+	movdqa	xmm0, [ebx]
+	movdqa	[esp+464-208], xmm0	; slot 464-208 = R(-1)
+
+	movdqa	xmm0, [eax]	; R(0)
+
+	add	ecx, eax	; ecx = address of R(+1)
+	movdqa	[esp+480-208], xmm0	; slot 480-208 = R(0)
+	movdqa	xmm0, [ecx]
+	mov	dword [esp+432-404], ecx	; remember address of R(+1) for the final store
+
+	movsx	ecx, word [ebp+16]	; first threshold (arg 3), low 16 bits sign-extended
+	movdqa	[esp+496-208], xmm0	; slot 496-208 = R(+1)
+	movdqa	xmm0, [esi+eax]	; R(+2)
+
+	movsx	si, byte [edx]	; limit byte 0
+	movdqa	[esp+512-208], xmm0	; slot 512-208 = R(+2)
+	movd	xmm0, ecx
+	movsx	ecx, word [ebp+20]	; second threshold (arg 4)
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	pshufd	xmm0, xmm1, 0
+	movdqa	[esp+432-112], xmm0	; slot 432-112 = first threshold broadcast to 8 words
+	movd	xmm0, ecx
+	movsx	cx, byte [edx+1]	; limit byte 1
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	mov	dword [esp+432-408], ebx	; remember address of R(-1) for the final store
+	movzx	ebx, cx
+	pshufd	xmm0, xmm1, 0	; xmm0 = second threshold broadcast to 8 words
+	movd	xmm1, ebx
+	movzx	ebx, cx
+	movd	xmm2, ebx
+	movzx	ebx, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, si
+	movd	xmm5, ecx
+	movzx	ecx, si
+	movd	xmm6, ecx
+	movzx	ecx, si
+	movd	xmm7, ecx
+	movzx	ecx, si
+	movdqa	[esp+432-336], xmm0	; slot 432-336 = second threshold broadcast
+	movd	xmm0, ecx
+
+	movsx	cx, byte [edx+3]	; limit byte 3
+	movsx	dx, byte [edx+2]	; limit byte 2 (edx no longer needed as a pointer)
+	movd	xmm3, ebx
+	punpcklwd xmm0, xmm4
+	movzx	esi, cx
+	punpcklwd xmm6, xmm2
+	punpcklwd xmm5, xmm1
+	punpcklwd xmm0, xmm6
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	punpcklwd xmm0, xmm7
+	movdqa	[esp+432-400], xmm0	; slot 432-400 = words {byte0 x4, byte1 x4}: limits for the low 8 columns
+	movd	xmm0, esi
+	movzx	esi, cx
+	movd	xmm2, esi
+	movzx	esi, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, dx
+	movd	xmm3, esi
+	movd	xmm5, ecx
+	punpcklwd xmm5, xmm0
+
+	movdqa	xmm0, [esp+432-384]	; xmm0 = zero vector for byte->word unpacking
+	movzx	ecx, dx
+	movd	xmm6, ecx
+	movzx	ecx, dx
+	movzx	edx, dx
+	punpcklwd xmm6, xmm2
+	movd	xmm7, ecx
+	movd	xmm1, edx
+
+	movdqa	xmm2, [esp+448-208]
+	punpcklbw xmm2, xmm0	; xmm2 = R(-2), low 8 columns as words
+
+	mov	ecx, 4
+	movsx	edx, cx	; edx = 4 (rounding constant, broadcast later)
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	movdqa	xmm5, [esp+496-208]
+	movdqa	xmm3, [esp+464-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-240], xmm5	; slot 432-240 = R(+1) low words
+	movdqa	xmm5, [esp+512-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-352], xmm5	; slot 432-352 = R(+2) low words
+	punpcklwd xmm1, xmm4
+	movdqa	xmm4, [esp+432-208]
+	punpcklwd xmm1, xmm6
+	movdqa	xmm6, [esp+480-208]
+	punpcklwd xmm1, xmm7	; xmm1 = words {byte2 x4, byte3 x4}: limits for the high 8 columns
+	punpcklbw xmm6, xmm0	; xmm6 = R(0) low words
+	punpcklbw xmm3, xmm0	; xmm3 = R(-1) low words
+	punpcklbw xmm4, xmm0	; xmm4 = R(-3) low words
+	movdqa	xmm7, xmm3
+	psubw	xmm7, xmm4
+	pabsw	xmm7, xmm7	; |R(-1) - R(-3)|
+	movdqa	[esp+432-272], xmm4	; slot 432-272 = R(-3) low words
+	movdqa	xmm4, [esp+432-336]	; xmm4 = second threshold (stays in xmm4 for the rest of the function)
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5	; slot 432-288 = mask: threshold2 > |R(-1) - R(-3)|
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-352]
+	pabsw	xmm7, xmm7
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5	; slot 432-256 = mask: threshold2 > |R(0) - R(+2)|
+	movdqa	xmm5, xmm3
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5	; slot 432-304 = rounded average of R(-1) and R(0)
+	movdqa	xmm5, [esp+432-400]
+	psubw	xmm5, [esp+432-288]	; true mask lanes are -1, so each satisfied condition adds 1 to the limit
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5	; slot 432-224 = extended limit
+	movdqa	xmm5, xmm6
+	psubw	xmm5, xmm3	; R(0) - R(-1)
+	movdqa	[esp+432-32], xmm6	; slot 432-32 = R(0) low words
+	psubw	xmm6, [esp+432-240]	; R(0) - R(+1)
+	movdqa	xmm7, xmm5
+	movdqa	[esp+432-384], xmm5	; slot 432-384 = R(0) - R(-1) (zero vector slot is reused)
+	movdqa	xmm5, [esp+432-112]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7	; threshold1 > |R(0) - R(-1)|
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6	; threshold2 > |R(0) - R(+1)|
+
+	pand	xmm5, xmm7
+	movdqa	xmm6, xmm3
+	psubw	xmm6, xmm2
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6	; threshold2 > |R(-1) - R(-2)|
+	movdqa	xmm6, [esp+432-400]
+	pand	xmm5, xmm7
+	movdqa	xmm7, xmm6
+	pcmpeqw	xmm6, xmm0
+	pcmpgtw	xmm7, xmm0
+	por	xmm7, xmm6	; limit >= 0
+	pand	xmm5, xmm7
+	movdqa	[esp+432-320], xmm5	; slot 432-320 = filter-enable mask for the low 8 columns
+	movd	xmm5, edx
+	movdqa	xmm6, xmm5
+	punpcklwd xmm6, xmm5
+	pshufd	xmm5, xmm6, 0
+	movdqa	[esp+432-336], xmm5	; slot 432-336 now holds the constant 4 in every word
+	movdqa	xmm5, [esp+432-224]
+	movdqa	[esp+432-368], xmm5
+	movdqa	xmm6, xmm0
+	psubw	xmm6, xmm5	; xmm6 = -(extended limit)
+	movdqa	xmm5, [esp+432-384]
+	psllw	xmm5, 2
+	movdqa	xmm7, xmm2
+	psubw	xmm7, [esp+432-240]
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+432-336]
+	movdqa	xmm5, [esp+432-368]
+	psraw	xmm7, 3	; (4*(R(0)-R(-1)) + (R(-2)-R(+1)) + 4) >> 3
+	pmaxsw	xmm6, xmm7
+	pminsw	xmm5, xmm6	; clamp to [-extended limit, +extended limit]
+
+	pand	xmm5, [esp+432-320]
+	movdqa	xmm6, [esp+432-400]
+	movdqa	[esp+432-64], xmm5	; slot 432-64 = delta applied to R(-1)/R(0), low columns
+	movdqa	[esp+432-384], xmm6
+	movdqa	xmm5, xmm0
+	psubw	xmm5, xmm6
+	movdqa	[esp+432-368], xmm5	; slot 432-368 = -limit
+	movdqa	xmm6, xmm5
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm5, [esp+432-304]
+	movdqa	xmm7, xmm2
+	paddw	xmm7, xmm2
+	psubw	xmm5, xmm7
+	psraw	xmm5, 1	; (R(-3) + avg - 2*R(-2)) >> 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-384]
+	pminsw	xmm5, xmm6	; clamp to [-limit, +limit]
+
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-288]
+	movdqa	xmm6, [esp+432-240]
+	movdqa	[esp+432-96], xmm5	; slot 432-96 = adjustment for R(-2), low columns
+	movdqa	xmm5, [esp+432-352]
+	paddw	xmm5, [esp+432-304]
+	movdqa	xmm7, xmm6
+	paddw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-368]
+	psubw	xmm5, xmm7
+
+	movdqa	xmm7, [esp+496-208]
+	psraw	xmm5, 1	; (R(+2) + avg - 2*R(+1)) >> 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-400]
+	pminsw	xmm5, xmm6
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-256]
+	movdqa	xmm6, [esp+448-208]
+	punpckhbw xmm7, xmm0
+	movdqa	[esp+432-352], xmm7	; from here the scratch slots are reused for the high 8 columns: 432-352 = R(+1)
+
+	movdqa	xmm7, [esp+512-208]
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-48], xmm5	; slot 432-48 = adjustment for R(+1), low columns
+	movdqa	xmm5, [esp+432-208]
+	movdqa	[esp+432-368], xmm6	; slot 432-368 = R(-2) high words
+	movdqa	xmm6, [esp+464-208]
+	punpckhbw xmm7, xmm0
+	punpckhbw xmm5, xmm0
+	movdqa	[esp+432-384], xmm7	; slot 432-384 = R(+2) high words
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-400], xmm6	; slot 432-400 = R(-1) high words
+
+	movdqa	xmm7, [esp+432-400]
+	movdqa	xmm6, [esp+480-208]
+	psubw	xmm7, xmm5
+	movdqa	[esp+432-16], xmm5	; slot 432-16 = R(-3) high words
+	pabsw	xmm7, xmm7
+	punpckhbw xmm6, xmm0	; xmm6 = R(0) high words
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5	; mask: threshold2 > |R(-1) - R(-3)| (high columns)
+
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-384]
+	pabsw	xmm7, xmm7
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5	; mask: threshold2 > |R(0) - R(+2)| (high columns)
+
+	movdqa	xmm5, [esp+432-400]
+	movdqa	[esp+432-80], xmm6	; slot 432-80 = R(0) high words
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5	; rounded average of R(-1) and R(0) (high columns)
+
+	movdqa	xmm5, xmm1
+	psubw	xmm5, [esp+432-288]
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5	; extended limit (high columns)
+	movdqa	xmm5, xmm6
+	psubw	xmm5, [esp+432-400]
+	psubw	xmm6, [esp+432-352]
+	movdqa	[esp+432-272], xmm5	; slot 432-272 = R(0) - R(-1) (high columns)
+	movdqa	xmm7, xmm5
+	movdqa	xmm5, [esp+432-112]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7
+	movdqa	xmm7, xmm4
+	pabsw	xmm6, xmm6
+	pcmpgtw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-368]
+
+	pand	xmm5, xmm7
+	movdqa	xmm7, [esp+432-400]
+	psubw	xmm7, xmm6
+	psubw	xmm6, [esp+432-352]	; xmm6 = R(-2) - R(+1) (high columns), used in the delta below
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7
+	pand	xmm5, xmm4
+
+	paddw	xmm2, [esp+432-96]	; xmm2 = filtered R(-2), low columns
+	movdqa	xmm4, xmm1
+	pcmpgtw	xmm4, xmm0
+	movdqa	xmm7, xmm1
+	pcmpeqw	xmm7, xmm0
+	por	xmm4, xmm7	; limit >= 0 (high columns)
+	pand	xmm5, xmm4
+	movdqa	xmm4, [esp+432-224]
+	movdqa	[esp+432-320], xmm5	; filter-enable mask for the high 8 columns
+	movdqa	xmm5, [esp+432-272]
+	movdqa	xmm7, xmm0
+	psubw	xmm7, xmm4
+	psubw	xmm0, xmm1	; xmm0 = -limit (high columns); the zero vector is gone after this
+	psllw	xmm5, 2
+	paddw	xmm6, xmm5
+	paddw	xmm6, [esp+432-336]
+	movdqa	xmm5, [esp+432-368]
+	movdqa	[esp+432-336], xmm0	; slot 432-336 = -limit (high columns)
+	psraw	xmm6, 3
+	pmaxsw	xmm7, xmm6
+	pminsw	xmm4, xmm7
+	pand	xmm4, [esp+432-320]
+	movdqa	xmm6, xmm0
+	movdqa	xmm0, [esp+432-16]
+	paddw	xmm0, [esp+432-304]
+	movdqa	[esp+432-272], xmm4	; slot 432-272 = delta for R(-1)/R(0), high columns
+	movdqa	xmm4, [esp+432-368]
+	paddw	xmm4, xmm4
+	psubw	xmm0, xmm4
+
+	movdqa	xmm4, [esp+432-64]
+	psraw	xmm0, 1
+	pmaxsw	xmm6, xmm0
+	movdqa	xmm0, [esp+432-400]
+	movdqa	xmm7, xmm1
+	pminsw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-320]
+	pand	xmm7, xmm6
+	pand	xmm7, [esp+432-288]
+	paddw	xmm5, xmm7
+	packuswb xmm2, xmm5	; xmm2 = new R(-2), saturated back to bytes
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm0, xmm5
+	paddw	xmm3, xmm4
+	packuswb xmm3, xmm0	; xmm3 = new R(-1) = R(-1) + delta
+
+	movdqa	xmm0, [esp+432-32]
+	psubw	xmm0, xmm4
+	movdqa	xmm4, [esp+432-80]
+	psubw	xmm4, xmm5
+
+	movdqa	xmm5, [esp+432-240]
+	paddw	xmm5, [esp+432-48]	; xmm5 = filtered R(+1), low columns
+	packuswb xmm0, xmm4	; xmm0 = new R(0) = R(0) - delta
+	movdqa	xmm4, [esp+432-384]
+	paddw	xmm4, [esp+432-304]
+	movdqa	[esp+480-208], xmm0
+	movdqa	xmm0, [esp+432-352]
+	movdqa	xmm7, xmm0
+	paddw	xmm0, xmm0
+
+	mov	ecx, dword [esp+432-408]	; ecx = address of R(-1)
+
+	mov	edx, dword [esp+432-404]	; edx = address of R(+1)
+	psubw	xmm4, xmm0
+	movdqa	xmm0, [esp+432-336]
+	movdqa	[edi], xmm2	; store new R(-2)
+	psraw	xmm4, 1
+	pmaxsw	xmm0, xmm4
+	pminsw	xmm1, xmm0
+	movdqa	xmm0, [esp+480-208]
+
+	pop	edi
+	pand	xmm1, xmm6
+	pand	xmm1, [esp+428-256]	; same slot as 432-256 above; offset is 428 because edi was just popped
+	movdqa	[ecx], xmm3	; store new R(-1)
+	paddw	xmm7, xmm1
+	pop	esi
+	packuswb xmm5, xmm7
+	movdqa	[eax], xmm0	; store new R(0)
+	movdqa	[edx], xmm5	; store new R(+1)
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+
+
+;*******************************************************************************
+;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
+;                                 int32_t iBeta)
+;*******************************************************************************
+
+WELS_EXTERN  DeblockLumaEq4V_sse2
+; Reads rows R(-4)..R(+3) around pPix (R(k) = 16-byte row at pPix + k*iStride) and rewrites R(-3)..R(+2) in place.
+ALIGN  16
+; NOTE(review): pabsw used below is an SSSE3 instruction although this symbol is suffixed _sse2 - confirm callers gate on SSSE3.
+DeblockLumaEq4V_sse2:
+; All row loads/stores use movdqa, so pPix and iStride must keep every row 16-byte aligned.
+	push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H
+	sub	esp, 628				; 00000274H
+	mov	eax, dword [ebp+8]	; eax = pPix
+	mov	ecx, dword [ebp+12]	; ecx = iStride
+	push	ebx
+	push	esi
+
+	lea	edx, [ecx*4]
+	pxor	xmm0, xmm0
+	movdqa	xmm2, xmm0	; xmm2 = zero vector for byte->word unpacking
+
+	movdqa	xmm0, [ecx+eax]	; R(+1)
+	mov	esi, eax
+	sub	esi, edx
+	movdqa	xmm3, [esi]	; xmm3 = R(-4)
+	movdqa	xmm5, [eax]	; xmm5 = R(0)
+	push	edi
+	lea	edi, [ecx+ecx]
+	lea	ebx, [ecx+ecx*2]	; ebx = 3*stride
+	mov	dword [esp+640-600], edi	; save 2*stride
+	mov	esi, eax
+	sub	esi, edi	; esi = address of R(-2), kept for the final store
+	movdqa	xmm1, [esi]
+	movdqa	 [esp+720-272], xmm0	; slot 720-272 = R(+1)
+	mov	edi, eax
+	sub	edi, ecx	; edi = address of R(-1), kept for the final store
+	movdqa	xmm4, [edi]
+	add	ecx, eax
+	mov	dword [esp+640-596], ecx	; save address of R(+1) for the final store
+
+	mov	ecx, dword [esp+640-600]
+	movdqa	xmm0, [ecx+eax]
+	movdqa	 [esp+736-272], xmm0	; slot 736-272 = R(+2)
+
+	movdqa	xmm0, [eax+ebx]	; R(+3)
+	mov	edx, eax
+	sub	edx, ebx	; edx = address of R(-3), kept for the final store
+
+	movsx	ebx, word [ebp+16]	; iAlpha (low 16 bits, sign-extended)
+	movdqa	xmm6, [edx]
+	add	ecx, eax	; ecx = address of R(+2), kept for the final store
+	movdqa	 [esp+752-272], xmm0	; slot 752-272 = R(+3)
+	movd	xmm0, ebx
+
+	movsx	ebx, word [ebp+20]	; iBeta (low 16 bits, sign-extended)
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0
+	movdqa	 [esp+640-320], xmm0	; slot 640-320 = iAlpha broadcast to 8 words
+	movd	xmm0, ebx
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0
+
+	movdqa	xmm7, [esp+736-272]
+	punpcklbw xmm7, xmm2
+	movdqa	 [esp+640-416], xmm7	; slot 640-416 = R(+2) low words
+	movdqa	 [esp+640-512], xmm0	; slot 640-512 = iBeta broadcast to 8 words
+	movdqa	xmm0, xmm1
+	movdqa	 [esp+672-272], xmm1	; slot 672-272 = R(-2)
+	movdqa	xmm1, xmm4
+	movdqa	 [esp+704-272], xmm5	; slot 704-272 = R(0)
+	punpcklbw xmm5, xmm2	; xmm5 = R(0) low words
+	punpcklbw xmm1, xmm2	; xmm1 = R(-1) low words
+
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-560], xmm7	; slot 640-560 = |R(0) - R(-1)|
+	punpcklbw xmm0, xmm2
+	movdqa	 [esp+688-272], xmm4	; slot 688-272 = R(-1)
+	movdqa	xmm4, [esp+720-272]
+	movdqa	 [esp+640-480], xmm0	; slot 640-480 = R(-2) low words
+
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm0
+
+	movdqa	xmm0, [esp+640-512]
+	pabsw	xmm7, xmm7
+	punpcklbw xmm4, xmm2
+	pcmpgtw	xmm0, xmm7	; iBeta > |R(-1) - R(-2)|
+	movdqa	 [esp+640-384], xmm4	; slot 640-384 = R(+1) low words
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm4
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+656-272], xmm6	; slot 656-272 = R(-3)
+	punpcklbw xmm6, xmm2
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-48], xmm2	; slot 640-48 = zero vector
+	movdqa	 [esp+640-368], xmm6	; slot 640-368 = R(-3) low words
+	movdqa	 [esp+640-144], xmm1	; slot 640-144 = R(-1) low words
+	movdqa	 [esp+640-400], xmm5	; slot 640-400 = R(0) low words
+	pcmpgtw	xmm4, xmm7	; iBeta > |R(0) - R(+1)|
+	pand	xmm0, xmm4
+	movdqa	xmm4, [esp+640-320]
+	pcmpgtw	xmm4, [esp+640-560]	; iAlpha > |R(0) - R(-1)|
+	pand	xmm0, xmm4	; xmm0 = filter-enable mask for the low 8 columns (kept in xmm0 to the end)
+
+	mov	ebx, 2
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, [esp+640-320]
+	psraw	xmm4, 2
+	pshufd	xmm7, xmm7, 0
+	paddw	xmm4, xmm7
+	movdqa	 [esp+640-576], xmm4	; slot 640-576 = (iAlpha >> 2) + 2
+	pcmpgtw	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4	; mask: (iAlpha >> 2) + 2 > |R(0) - R(-1)|
+
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+640-624], xmm7	; slot 640-624 = constant 2 in every word
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm6
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7	; iBeta > |R(-1) - R(-3)|
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-544], xmm4	; slot 640-544 = mask selecting the longer filter on the R(-k) side
+	movdqa	xmm4, [esp+640-512]
+	movdqa	xmm7, xmm5
+	psubw	xmm7, [esp+640-416]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7	; iBeta > |R(0) - R(+2)|
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4	; slot 640-560 = mask selecting the longer filter on the R(+k) side
+
+	movdqa	xmm4, [esp+640-544]
+	pandn	xmm4, xmm6
+	movdqa	 [esp+640-16], xmm4	; original R(-3) where the 640-544 mask is clear
+	mov	ebx, 4
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, xmm3
+	punpcklbw xmm4, xmm2
+	psllw	xmm4, 1
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6
+	paddw	xmm4, [esp+640-480]
+
+	movdqa	xmm6, [esp+640-560]
+	pshufd	xmm7, xmm7, 0
+	paddw	xmm4, xmm1
+	movdqa	 [esp+640-592], xmm7	; slot 640-592 = constant 4 in every word
+	paddw	xmm4, xmm5
+	paddw	xmm4, xmm7	; xmm4 = 2*R(-4) + 3*R(-3) + R(-2) + R(-1) + R(0) + 4 (shifted by 3 further down)
+	movdqa	xmm7, [esp+640-416]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-80], xmm6	; original R(+2) where the 640-560 mask is clear
+	movdqa	xmm6, [esp+752-272]
+	punpcklbw xmm6, xmm2
+	psllw	xmm6, 1
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-384]
+
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm6, xmm5
+	paddw	xmm6, xmm1
+	paddw	xmm6, [esp+640-592]
+	psraw	xmm6, 3	; (2*R(+3) + 3*R(+2) + R(+1) + R(0) + R(-1) + 4) >> 3
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-112], xmm6	; slot 640-112 = filtered R(+2) under the 640-560 mask
+	movdqa	xmm6, [esp+640-544]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-336], xmm6	; original R(-2) where the 640-544 mask is clear
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-528], xmm6
+	movdqa	xmm6, [esp+640-368]
+	paddw	xmm6, xmm7
+	movdqa	xmm7, xmm1
+	psraw	xmm4, 3
+	pand	xmm4, [esp+640-544]	; xmm4 = filtered R(-3) under the 640-544 mask
+	paddw	xmm7, xmm5
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+
+	paddw	xmm5, xmm1	; xmm5 = R(0) + R(-1)
+	psraw	xmm6, 2	; (R(-3) + R(-2) + R(-1) + R(0) + 2) >> 2
+	pand	xmm7, xmm6
+
+	movdqa	xmm6, [esp+640-384]
+	movdqa	 [esp+640-64], xmm7	; slot 640-64 = filtered R(-2) under the 640-544 mask
+	movdqa	xmm7, [esp+640-560]
+	pandn	xmm7, xmm6
+	movdqa	 [esp+640-304], xmm7	; original R(+1) where the 640-560 mask is clear
+	movdqa	xmm7, [esp+640-560]
+	movdqa	 [esp+640-528], xmm7
+	movdqa	xmm7, [esp+640-416]
+	paddw	xmm7, xmm6
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2	; (R(+2) + R(+1) + R(0) + R(-1) + 2) >> 2
+	pand	xmm5, xmm7
+	movdqa	 [esp+640-32], xmm5	; slot 640-32 = filtered R(+1) under the 640-560 mask
+
+	movdqa	xmm5, [esp+640-544]
+	movdqa	 [esp+640-528], xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	xmm7, xmm5
+	paddw	xmm7, xmm5
+	movdqa	xmm5, xmm1
+	paddw	xmm5, xmm6
+	paddw	xmm6, [esp+640-592]	; xmm6 = R(+1) + 4
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2	; (2*R(-2) + R(-1) + R(+1) + 2) >> 2
+	pandn	xmm5, xmm7
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm7, xmm1
+	paddw	xmm7, [esp+640-400]
+	movdqa	xmm1, [esp+640-544]
+	movdqa	 [esp+640-352], xmm5	; slot 640-352 = short-filter R(-1) where the 640-544 mask is clear
+	movdqa	xmm5, [esp+640-368]
+	psllw	xmm7, 1
+	paddw	xmm7, xmm6
+	paddw	xmm5, xmm7
+
+	movdqa	xmm7, [esp+640-400]
+	psraw	xmm5, 3	; (R(-3) + 2*(R(-2) + R(-1) + R(0)) + R(+1) + 4) >> 3
+	pand	xmm1, xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	 [esp+640-96], xmm1	; slot 640-96 = long-filter R(-1) under the 640-544 mask
+	movdqa	xmm1, [esp+640-560]
+	movdqa	 [esp+640-528], xmm1
+	movdqa	xmm1, [esp+640-384]
+	movdqa	xmm6, xmm1
+	paddw	xmm6, xmm1
+	paddw	xmm1, [esp+640-400]
+	paddw	xmm1, [esp+640-144]
+	paddw	xmm7, xmm5
+	paddw	xmm5, [esp+640-592]
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+	psraw	xmm6, 2	; (2*R(+1) + R(0) + R(-2) + 2) >> 2
+	psllw	xmm1, 1
+	paddw	xmm1, xmm5
+
+	movdqa	xmm5, [esp+656-272]
+	pandn	xmm7, xmm6
+	movdqa	xmm6, [esp+640-416]
+	paddw	xmm6, xmm1
+	movdqa	xmm1, [esp+640-560]
+	psraw	xmm6, 3	; (R(+2) + 2*(R(+1) + R(0) + R(-1)) + R(-2) + 4) >> 3
+	pand	xmm1, xmm6
+
+	movdqa	xmm6, [esp+704-272]
+	movdqa	 [esp+640-128], xmm1	; slot 640-128 = long-filter R(0) under the 640-560 mask
+	movdqa	xmm1, [esp+672-272]
+	punpckhbw xmm1, xmm2
+	movdqa	 [esp+640-448], xmm1	; high 8 columns from here on: slot 640-448 = R(-2) high words
+	movdqa	xmm1, [esp+688-272]
+	punpckhbw xmm1, xmm2
+	punpckhbw xmm6, xmm2
+	movdqa	 [esp+640-288], xmm7	; slot 640-288 = short-filter R(0) where the 640-560 mask is clear (low columns)
+	punpckhbw xmm5, xmm2	; xmm5 = R(-3) high words
+	movdqa	 [esp+640-496], xmm1	; slot 640-496 = R(-1) high words
+	movdqa	 [esp+640-432], xmm6	; slot 640-432 = R(0) high words
+
+	movdqa	xmm7, [esp+720-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-464], xmm7	; slot 640-464 = R(+1) high words
+
+	movdqa	xmm7, [esp+736-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-528], xmm7	; slot 640-528 = R(+2) high words
+
+	movdqa	xmm7, xmm6
+
+	psubw	xmm6, [esp+640-464]
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-560], xmm7	; |R(0) - R(-1)| (high columns)
+	por	xmm4, [esp+640-16]	; xmm4 = R(-3) low words after the inner mask select
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm1
+	psubw	xmm7, [esp+640-448]
+
+	movdqa	xmm1, [esp+640-512]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm1, xmm7
+	movdqa	xmm7, [esp+640-512]
+	pcmpgtw	xmm7, xmm6
+	movdqa	xmm6, [esp+640-320]
+	pand	xmm1, xmm7
+	movdqa	xmm7, [esp+640-560]
+	pcmpgtw	xmm6, xmm7
+	pand	xmm1, xmm6	; xmm1 = filter-enable mask for the high 8 columns (kept in xmm1 to the end)
+
+	movdqa	xmm6, [esp+640-576]
+	pcmpgtw	xmm6, xmm7
+
+	movdqa	xmm7, [esp+640-496]
+	punpckhbw xmm3, xmm2	; xmm3 = R(-4) high words
+	movdqa	 [esp+640-560], xmm6	; mask: (iAlpha >> 2) + 2 > |R(0) - R(-1)| (high columns)
+	movdqa	xmm6, [esp+640-512]
+	psubw	xmm7, xmm5
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7
+
+	pand	xmm6, [esp+640-560]
+	movdqa	xmm7, [esp+640-432]
+	psubw	xmm7, [esp+640-528]
+
+	psllw	xmm3, 1
+	movdqa	 [esp+640-544], xmm6	; slot 640-544 = R(-k)-side longer-filter mask (high columns)
+	movdqa	xmm6, [esp+640-512]
+
+	movdqa	xmm2, [esp+640-544]	; xmm2 no longer holds zero; the zero vector lives in slot 640-48
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, [esp+640-448]
+	paddw	xmm3, [esp+640-496]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-560], xmm6	; slot 640-560 = R(+k)-side longer-filter mask (high columns)
+
+	movdqa	xmm6, xmm0
+	pand	xmm6, xmm4
+	movdqa	xmm4, xmm0
+	pandn	xmm4, [esp+640-368]
+	por	xmm6, xmm4	; xmm6 = final R(-3) low words (original where the enable mask is clear)
+	movdqa	xmm4, [esp+640-432]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm3, 3
+	pand	xmm3, xmm2
+	pandn	xmm2, xmm5
+	por	xmm3, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm3
+	movdqa	xmm3, [esp+640-64]
+	por	xmm3, [esp+640-336]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm5
+	por	xmm7, xmm2	; xmm7 = final R(-3) high words
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-480]
+	por	xmm2, xmm3
+	packuswb xmm6, xmm7
+	movdqa	 [esp+640-336], xmm2	; final R(-2) low words
+	movdqa	 [esp+656-272], xmm6	; slot 656-272 = new R(-3) row
+	movdqa	xmm6, [esp+640-544]
+	movdqa	xmm2, xmm5
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm3, xmm1
+	movdqa	xmm7, [esp+640-496]
+	paddw	xmm7, xmm4
+	paddw	xmm2, xmm7
+	paddw	xmm2, [esp+640-624]
+	movdqa	xmm7, [esp+640-544]
+	psraw	xmm2, 2
+	pand	xmm6, xmm2
+	movdqa	xmm2, [esp+640-448]
+	pandn	xmm7, xmm2
+	por	xmm6, xmm7
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm1
+	pandn	xmm6, xmm2
+	paddw	xmm2, [esp+640-496]
+	paddw	xmm2, xmm4
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-336]
+	packuswb xmm6, xmm3
+	psllw	xmm2, 1
+	movdqa	 [esp+672-272], xmm6	; slot 672-272 = new R(-2) row
+	movdqa	xmm6, [esp+640-96]
+	por	xmm6, [esp+640-352]
+
+	movdqa	xmm3, xmm0
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm0
+	pandn	xmm6, [esp+640-144]
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-352], xmm3	; final R(-1) low words
+	movdqa	xmm3, [esp+640-464]
+	paddw	xmm3, [esp+640-592]
+	paddw	xmm2, xmm3
+	movdqa	xmm3, [esp+640-448]
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-496]
+	psraw	xmm5, 3
+	pand	xmm6, xmm5
+	movdqa	xmm5, [esp+640-464]
+	paddw	xmm2, xmm5
+	paddw	xmm5, [esp+640-432]
+	movdqa	xmm4, xmm3
+	paddw	xmm4, xmm3
+	paddw	xmm4, xmm2
+	paddw	xmm4, [esp+640-624]
+	movdqa	xmm2, [esp+640-544]
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm4, 2
+	pandn	xmm2, xmm4
+	por	xmm6, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-496]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm6
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-352]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+688-272], xmm2	; slot 688-272 = new R(-1) row
+	movdqa	xmm2, [esp+640-128]
+	por	xmm2, [esp+640-288]
+
+	movdqa	xmm4, xmm0
+	pand	xmm4, xmm2
+	paddw	xmm5, xmm6
+	movdqa	xmm2, xmm0
+	pandn	xmm2, [esp+640-400]
+	por	xmm4, xmm2
+	movdqa	xmm2, [esp+640-528]
+	psllw	xmm5, 1
+	paddw	xmm5, xmm3
+	movdqa	xmm3, [esp+640-560]
+	paddw	xmm2, xmm5
+	psraw	xmm2, 3
+	movdqa	 [esp+640-288], xmm4	; final R(0) low words
+	movdqa	xmm4, [esp+640-560]
+	pand	xmm4, xmm2
+	movdqa	xmm2, [esp+640-464]
+	movdqa	xmm5, xmm2
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-432]
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm7, xmm1
+	paddw	xmm5, xmm2
+	paddw	xmm5, [esp+640-624]
+	movdqa	xmm6, [esp+640-560]
+	psraw	xmm5, 2
+	pandn	xmm3, xmm5
+	por	xmm4, xmm3
+	movdqa	xmm3, [esp+640-32]
+	por	xmm3, [esp+640-304]
+	pand	xmm7, xmm4
+	movdqa	xmm4, [esp+640-432]
+	movdqa	xmm5, [esp+640-464]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm4
+	paddw	xmm4, [esp+640-496]
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-288]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+704-272], xmm2	; slot 704-272 = new R(0) row
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-384]
+	por	xmm2, xmm3
+	movdqa	 [esp+640-304], xmm2	; final R(+1) low words
+	movdqa	xmm2, [esp+640-528]
+	movdqa	xmm3, xmm2
+	paddw	xmm3, [esp+640-464]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-624]
+	psraw	xmm3, 2
+	pand	xmm6, xmm3
+	movdqa	xmm3, [esp+640-560]
+	movdqa	xmm4, xmm3
+	pandn	xmm4, xmm5
+	por	xmm6, xmm4
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-304]
+	movdqa	xmm4, xmm1
+	pandn	xmm4, xmm5
+	por	xmm7, xmm4
+
+	movdqa	xmm4, xmm0
+	pandn	xmm0, [esp+640-416]
+	packuswb xmm6, xmm7	; xmm6 = new R(+1) row
+	movdqa	xmm7, [esp+640-112]
+	por	xmm7, [esp+640-80]
+	pand	xmm4, xmm7
+	por	xmm4, xmm0	; xmm4 = final R(+2) low words
+	movdqa	xmm0, [esp+752-272]
+	punpckhbw xmm0, [esp+640-48]	; R(+3) high words (unpacked against the saved zero vector)
+	psllw	xmm0, 1
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm5
+	paddw	xmm0, [esp+640-432]
+	paddw	xmm0, [esp+640-496]
+	paddw	xmm0, [esp+640-592]
+	psraw	xmm0, 3
+	pand	xmm0, xmm3
+	movdqa	xmm7, xmm1
+	pandn	xmm3, xmm2
+	por	xmm0, xmm3
+	pand	xmm7, xmm0
+
+	movdqa	xmm0, [esp+656-272]
+	movdqa	 [edx], xmm0	; store new R(-3)
+
+	movdqa	xmm0, [esp+672-272]
+
+	mov	edx, dword [esp+640-596]	; edx = address of R(+1)
+	movdqa	 [esi], xmm0	; store new R(-2)
+	movdqa	xmm0, [esp+688-272]
+	movdqa	 [edi], xmm0	; store new R(-1)
+	movdqa	xmm0, [esp+704-272]
+
+	pop	edi
+	pandn	xmm1, xmm2
+	movdqa	 [eax], xmm0	; store new R(0)
+	por	xmm7, xmm1
+	pop	esi
+	packuswb xmm4, xmm7
+	movdqa	 [edx], xmm6	; store new R(+1)
+	movdqa	 [ecx], xmm4	; store new R(+2)
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+  
+    
+;********************************************************************************
+;
+;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);     
+;
+;********************************************************************************
+
+WELS_EXTERN  DeblockLumaTransposeH2V_sse2
+; Gathers 16 rows of 8 bytes from pPixY (iStride apart), transposes them, and writes 8 rows of 16 bytes to pDst.
+ALIGN  16
+; pDst is written with movdqa and therefore must be 16-byte aligned; the source rows are read with movq (no alignment needed).
+DeblockLumaTransposeH2V_sse2:
+    push    ebp
+    push    ebx
+    mov     ebp,   esp                 ; two registers were pushed first, so arguments start at [ebp+0Ch]
+    and     esp,0FFFFFFF0h
+    sub     esp,   10h                 ; 16-byte aligned scratch slot at [esp]
+    
+    mov     eax,   [ebp + 0Ch]         ; eax = pPixY
+    mov     ecx,   [ebp + 10h]         ; ecx = iStride
+    lea     edx,   [eax + ecx * 8]     ; edx = row 8
+    lea     ebx,   [ecx*3]             ; ebx = 3*iStride
+    
+    movq    xmm0,  [eax]               ; each xmmN = row N in the low qword, row N+8 in the high qword
+    movq    xmm7,  [edx]
+    punpcklqdq   xmm0,  xmm7  
+    movq    xmm1,  [eax + ecx]
+    movq    xmm7,  [edx + ecx]
+    punpcklqdq   xmm1,  xmm7
+    movq    xmm2,  [eax + ecx*2] 
+    movq    xmm7,  [edx + ecx*2]
+    punpcklqdq   xmm2,  xmm7
+    movq    xmm3,  [eax + ebx]
+    movq    xmm7,  [edx + ebx]
+    punpcklqdq   xmm3,  xmm7
+    
+    lea     eax,   [eax + ecx * 4]     ; advance both pointers by 4 rows (rows 4..7 and 12..15)
+    lea     edx,   [edx + ecx * 4]
+    movq    xmm4,  [eax] 
+    movq    xmm7,  [edx]
+    punpcklqdq   xmm4,  xmm7  
+    movq    xmm5,  [eax + ecx]
+    movq    xmm7,  [edx + ecx]
+    punpcklqdq   xmm5,  xmm7
+    movq    xmm6,  [eax + ecx*2] 
+    movq    xmm7,  [edx + ecx*2]
+    punpcklqdq   xmm6,  xmm7
+    
+    movdqa  [esp],   xmm0              ; spill xmm0: it is borrowed as the temporary for row 15
+    movq    xmm7,  [eax + ebx]
+    movq    xmm0,  [edx + ebx]
+    punpcklqdq   xmm7,  xmm0
+    movdqa  xmm0,   [esp]
+    
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 (1-based register numbers; i.e. xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0 as stored below)
+    
+    mov    eax,   [ebp + 14h]          ; eax = pDst
+    movdqa  [eax],    xmm4 
+    movdqa  [eax + 10h],  xmm2
+    movdqa  [eax + 20h],  xmm3
+    movdqa  [eax + 30h],  xmm7
+    movdqa  [eax + 40h],  xmm5
+    movdqa  [eax + 50h],  xmm1
+    movdqa  [eax + 60h],  xmm6
+    movdqa  [eax + 70h],  xmm0   
+    
+    mov     esp,   ebp
+    pop     ebx
+    pop     ebp
+    ret
+    
+    
+    
+;*******************************************************************************************
+;
+;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
+;
+;*******************************************************************************************
+
+WELS_EXTERN   DeblockLumaTransposeV2H_sse2
+; Reads 8 rows of 16 bytes from pSrc, transposes them, and scatters 16 rows of 8 bytes to pPixY (iStride apart).
+ALIGN  16
+; pSrc is read with movdqa and therefore must be 16-byte aligned; the destination rows are written with movq.
+DeblockLumaTransposeV2H_sse2:
+    push     ebp
+    mov      ebp,   esp
+    
+    and     esp,  0FFFFFFF0h
+    sub     esp,   10h                 ; 16-byte aligned scratch slot passed to the transpose macro
+    
+    mov      eax,   [ebp + 10h]        ; eax = pSrc
+    mov      ecx,   [ebp + 0Ch]        ; ecx = iStride
+    mov      edx,   [ebp + 08h]        ; edx = pPixY
+      
+    movdqa   xmm0,  [eax]
+    movdqa   xmm1,  [eax + 10h]
+    movdqa   xmm2,  [eax + 20h]
+    movdqa   xmm3,	[eax + 30h]
+    movdqa   xmm4,	[eax + 40h]
+    movdqa   xmm5,	[eax + 50h]
+    movdqa   xmm6,	[eax + 60h]
+    movdqa   xmm7,	[eax + 70h]
+    
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 (1-based register numbers; i.e. xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0 as stored below)
+    
+    lea      eax,   [ecx * 3]          ; eax = 3*iStride
+    
+    movq     [edx],  xmm4              ; rows 0..3 from the low qwords
+    movq     [edx + ecx],  xmm2
+    movq     [edx + ecx*2],  xmm3
+    movq     [edx + eax],  xmm7
+    
+    lea      edx,   [edx + ecx*4]
+    movq     [edx],  xmm5              ; rows 4..7 from the low qwords
+    movq     [edx + ecx],  xmm1
+    movq     [edx + ecx*2],  xmm6
+    movq     [edx + eax],  xmm0    
+    
+    psrldq    xmm4,   8                ; move every high qword down so movq can store it
+    psrldq    xmm2,   8
+    psrldq    xmm3,   8
+    psrldq    xmm7,   8
+    psrldq    xmm5,   8
+    psrldq    xmm1,   8
+    psrldq    xmm6,   8
+    psrldq    xmm0,   8
+    
+    lea       edx,  [edx + ecx*4]
+    movq     [edx],  xmm4              ; rows 8..11 from the former high qwords
+    movq     [edx + ecx],  xmm2
+    movq     [edx + ecx*2],  xmm3
+    movq     [edx + eax],  xmm7
+    
+    lea      edx,   [edx + ecx*4]
+    movq     [edx],  xmm5              ; rows 12..15 from the former high qwords
+    movq     [edx + ecx],  xmm1
+    movq     [edx + ecx*2],  xmm6
+    movq     [edx + eax],  xmm0   
+    
+    
+    mov      esp,   ebp
+    pop      ebp
+    ret
\ No newline at end of file
--- /dev/null
+++ b/codec/decoder/core/asm/expand_picture.asm
@@ -1,0 +1,655 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  expand_picture.asm
+;*
+;*  Abstract
+;*      MMX/SSE2 routines that expand (pad) the borders of a picture
+;*
+;*  History
+;*      09/25/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+;SECTION .rodata pData align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+;%define PADDING_SIZE_ASM 	32 	; PADDING_LENGTH
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+
+
+SECTION .text
+
+;WELS_EXTERN expand_picture_luma_mmx
+;WELS_EXTERN expand_picture_chroma_mmx
+WELS_EXTERN ExpandPictureLuma_sse2
+WELS_EXTERN ExpandPictureChromaAlign_sse2	; for chroma alignment
+WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
+
+;;;;;;;expanding result;;;;;;;
+
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;----------------------------
+;aaaa|attttttttttttttttb|bbbb
+;llll|l                r|rrrr
+;llll|l                r|rrrr
+;llll|l                r|rrrr
+;llll|l                r|rrrr
+;llll|l                r|rrrr
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;----------------------------
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+
+%macro mov_line_8x4_mmx		3	; dst, stride, mm?  -- store the 8-byte value %3 to four consecutive rows, then advance dst by 4*stride
+	movq [%1], %3		; row 0
+	movq [%1+%2], %3	; row 1
+	lea %1, [%1+2*%2]
+	movq [%1], %3		; row 2
+	movq [%1+%2], %3	; row 3
+	lea %1, [%1+2*%2]	; dst now points at the row after the four just written
+%endmacro
+
+%macro mov_line_end8x4_mmx		3	; dst, stride, mm?  -- like mov_line_8x4_mmx, but leaves dst ON the last row written
+	movq [%1], %3		; row 0
+	movq [%1+%2], %3	; row 1
+	lea %1, [%1+2*%2]
+	movq [%1], %3		; row 2
+	movq [%1+%2], %3	; row 3
+	lea %1, [%1+%2]		; advance only one stride: dst = row 3 (total advance is 3*stride)
+%endmacro
+
+%macro mov_line_16x4_sse2	4	; dst, stride, xmm?, u/a  -- store 16 bytes to four rows; %4 selects movdqu (u) or movdqa (a)
+	movdq%4 [%1], %3 		; top(bottom)_0
+	movdq%4 [%1+%2], %3		; top(bottom)_1
+	lea %1, [%1+2*%2]
+	movdq%4 [%1], %3 		; top(bottom)_2
+	movdq%4 [%1+%2], %3		; top(bottom)_3
+	lea %1, [%1+2*%2]	; dst now points at the row after the four just written
+%endmacro
+
+%macro mov_line_end16x4_sse2	4	; dst, stride, xmm?, u/a  -- like mov_line_16x4_sse2, but leaves dst ON the last row written
+	movdq%4 [%1], %3 		; top(bottom)_0
+	movdq%4 [%1+%2], %3		; top(bottom)_1
+	lea %1, [%1+2*%2]
+	movdq%4 [%1], %3 		; top(bottom)_2
+	movdq%4 [%1+%2], %3		; top(bottom)_3
+	lea %1, [%1+%2]		; advance only one stride: dst = row 3 (total advance is 3*stride)
+%endmacro
+
+%macro mov_line_32x4_sse2	3	; dst, stride, xmm?  -- store %3 twice per row (32 bytes) to four rows with aligned stores, then advance dst by 4*stride
+	movdqa [%1], %3 		; top(bottom)_0
+	movdqa [%1+16], %3 		; top(bottom)_0
+	movdqa [%1+%2], %3		; top(bottom)_1
+	movdqa [%1+%2+16], %3		; top(bottom)_1
+	lea %1, [%1+2*%2]
+	movdqa [%1], %3 		; top(bottom)_2
+	movdqa [%1+16], %3 		; top(bottom)_2
+	movdqa [%1+%2], %3		; top(bottom)_3
+	movdqa [%1+%2+16], %3		; top(bottom)_3
+	lea %1, [%1+2*%2]	; dst now points at the row after the four just written
+%endmacro
+
+%macro mov_line_end32x4_sse2	3	; dst, stride, xmm?  -- like mov_line_32x4_sse2, but leaves dst ON the last row written
+	movdqa [%1], %3 		; top(bottom)_0
+	movdqa [%1+16], %3 		; top(bottom)_0
+	movdqa [%1+%2], %3		; top(bottom)_1
+	movdqa [%1+%2+16], %3		; top(bottom)_1
+	lea %1, [%1+2*%2]
+	movdqa [%1], %3 		; top(bottom)_2
+	movdqa [%1+16], %3 		; top(bottom)_2
+	movdqa [%1+%2], %3		; top(bottom)_3
+	movdqa [%1+%2+16], %3		; top(bottom)_3
+	lea %1, [%1+%2]		; advance only one stride: dst = row 3 (total advance is 3*stride)
+%endmacro
+
+%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]		
+	; ebx [width/16(8)]
+	; esi [pSrc+0], edi [pSrc-1], ecx [-stride], 32(16)		; top
+	; eax [pSrc+(h-1)*stride], ebp [pSrc+(h+31)*stride], 32(16)	; bottom
+	; Replicates the first picture row into the rows written through edi and the last picture row into the rows written through ebp, one 16-byte column per loop iteration.
+%if %1 == 32		; for luma
+	sar ebx, 04h 	; width / 16(8) pixels
+.top_bottom_loops:
+	; top
+	movdqa xmm0, [esi]		; first line of picture pData
+	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 edi, ecx, xmm0, a
+	mov_line_16x4_sse2 edi, ecx, xmm0, a
+	mov_line_16x4_sse2 edi, ecx, xmm0, a
+	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 edi, ecx, xmm0, a
+	mov_line_16x4_sse2 edi, ecx, xmm0, a
+	mov_line_end16x4_sse2 edi, ecx, xmm0, a	; 8 x 4 = 32 rows; edi is left on the last row written
+	
+	; bottom
+	movdqa xmm1, [eax] 		; last line of picture pData
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a
+	mov_line_end16x4_sse2 ebp, ecx, xmm1, a	; 32 rows; ebp is left on the last row written
+		
+	lea esi, [esi+16]		; top pSrc
+	lea edi, [edi+16]		; top dst
+	lea eax, [eax+16]		; bottom pSrc
+	lea ebp, [ebp+16]		; bottom dst
+	neg ecx 			; flip the stride sign: the next column is filled in the opposite vertical direction, starting from the row where this one ended
+	
+	dec ebx
+	jnz near .top_bottom_loops		
+%elif %1 == 16	; for chroma ??
+	mov edx, ebx		; keep the full width to test for an 8-byte remainder after the loop
+	sar ebx, 04h 	; (width / 16) pixels
+.top_bottom_loops:	; NOTE(review): if ebx is 0 here (width < 16) the body still runs and dec/jnz wraps instead of exiting - confirm callers guarantee width >= 16
+	; top
+	movdqa xmm0, [esi]		; first line of picture pData
+	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 edi, ecx, xmm0, a
+	mov_line_16x4_sse2 edi, ecx, xmm0, a
+	mov_line_end16x4_sse2 edi, ecx, xmm0, a	
+	
+	; bottom
+	movdqa xmm1, [eax] 		; last line of picture pData
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a
+	mov_line_end16x4_sse2 ebp, ecx, xmm1, a	
+		
+	lea esi, [esi+16]		; top pSrc
+	lea edi, [edi+16]		; top dst
+	lea eax, [eax+16]		; bottom pSrc
+	lea ebp, [ebp+16]		; bottom dst
+	neg ecx 			; flip the stride sign: the next column is filled in the opposite vertical direction, starting from the row where this one ended
+	
+	dec ebx
+	jnz near .top_bottom_loops
+
+	; for remaining 8 bytes
+	and edx, 0fh		; any 8 bytes left?
+	test edx, edx
+	jz near .to_be_continued	; no left to exit here
+
+	; top
+	movq mm0, [esi]		; remained 8 byte
+	mov_line_8x4_mmx edi, ecx, mm0	; dst, stride, mm?
+	mov_line_8x4_mmx edi, ecx, mm0	; dst, stride, mm?
+	mov_line_8x4_mmx edi, ecx, mm0	; dst, stride, mm?
+	mov_line_end8x4_mmx edi, ecx, mm0	; dst, stride, mm?
+	; bottom
+	movq mm1, [eax]
+	mov_line_8x4_mmx ebp, ecx, mm1	; dst, stride, mm?
+	mov_line_8x4_mmx ebp, ecx, mm1	; dst, stride, mm?
+	mov_line_8x4_mmx ebp, ecx, mm1	; dst, stride, mm?
+	mov_line_end8x4_mmx ebp, ecx, mm1	; dst, stride, mm?
+	WELSEMMS		; presumably an emms wrapper that releases the MMX state after the mm0/mm1 use above - defined outside this chunk
+
+.to_be_continued:
+%endif
+%endmacro
+
+%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a	
+	; ecx [height]
+	; esi [pSrc+0], 	   edi [pSrc-32], edx [stride], 32(16)	; left
+	; ebx [pSrc+(w-1)], ebp [pSrc+w], 32(16)			; right
+;	xor eax, eax 	; for pixel pData (uint8_t)		; make sure eax=0 at least high 24 bits of eax = 0
+	; Per row: takes the byte at [esi] and the byte at [ebx], expands each with butterfly_1to16_sse (presumably a broadcast to all 16 bytes - macro defined outside this chunk), and stores them at edi (left) and ebp (right).
+%if %1 == 32		; for luma	
+.left_right_loops:
+	; left
+	mov al, byte [esi]		; pixel pData for left border
+	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
+	movdqa [edi], xmm0		; 32 bytes of left padding per row
+	movdqa [edi+16], xmm0
+	
+	; right
+	mov al, byte [ebx]
+	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
+	movdqa [ebp], xmm1		; 32 bytes of right padding per row
+	movdqa [ebp+16], xmm1
+	
+	lea esi, [esi+edx]		; left pSrc
+	lea edi, [edi+edx]		; left dst
+	lea ebx, [ebx+edx]		; right pSrc
+	lea ebp, [ebp+edx]		; right dst	
+	
+	dec ecx				; one row done
+	jnz near .left_right_loops		
+%elif %1 == 16	; for chroma ??	
+.left_right_loops:
+	; left
+	mov al, byte [esi]		; pixel pData for left border
+	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
+	movdqa [edi], xmm0		; 16 bytes of left padding per row (always an aligned store)
+	
+	; right
+	mov al, byte [ebx]
+	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
+	movdq%2 [ebp], xmm1								; might not be aligned 16 bytes in case chroma planes
+	
+	lea esi, [esi+edx]		; left pSrc
+	lea edi, [edi+edx]		; left dst
+	lea ebx, [ebx+edx]		; right pSrc
+	lea ebp, [ebp+edx]		; right dst	
+	
+	dec ecx				; one row done
+	jnz near .left_right_loops
+%endif
+%endmacro
+
+%macro exp_cross_sse2	2	; %1 = iPaddingSize: 32 (luma) or 16 (chroma); %2 = a/u, forwarded only to the chroma TR/BR stores
+	; fills the four corner areas; replicated corner pixels: top-left (x)mm3, top-right (x)mm4, bottom-left (x)mm5, bottom-right (x)mm6
+	; in: edi = TL dst, ebp = TR dst, eax = BL dst, ebx = BR dst, ecx = -stride (mov_line_* are defined outside this excerpt; presumably each writes 4 rows and steps dst by ecx - TODO confirm)
+%if %1 == 32		; luma: 8 invocations per corner (7 + the _end variant)
+	; TL
+	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
+	mov_line_end32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?; closing variant for the last group of rows
+
+	; TR
+	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
+	mov_line_end32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
+
+	; BL
+	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
+	mov_line_end32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
+
+	; BR
+	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
+	mov_line_end32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
+%elif %1 == 16	; chroma: 4 invocations per corner (3 + the _end variant)
+	; TL
+	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?, a/u; the left-side corners are always stored with the aligned variant
+	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
+	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?	
+	mov_line_end16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
+
+	; TR
+	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?, a/u; the right-side corners follow the caller's alignment choice
+	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
+	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?	
+	mov_line_end16x4_sse2 ebp, ecx, xmm4, %2	; dst, stride, xmm?
+
+	; BL
+	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
+	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
+	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?	
+	mov_line_end16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
+
+	; BR
+	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
+	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
+	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?	
+	mov_line_end16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
+%endif
+%endmacro
+
+ALIGN 16
+;***********************************************************************----------------
+; void ExpandPictureLuma_sse2(	uint8_t *pDst,
+;									const int32_t kiStride,
+;									const int32_t kiWidth,
+;									const int32_t kiHeight	);
+;***********************************************************************----------------
+ExpandPictureLuma_sse2:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp					; 5 pushes + return address = 24 bytes, so the arguments start at [esp+24]
+	; pads the plane at pDst in place in three passes: top/bottom rows, left/right columns (32 bytes each), then the four corners
+	; for both top and bottom border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; pDst
+	mov edx, [esp+28]						; kiStride
+	mov ebx, [esp+32]						; kiWidth
+	mov eax, [esp+36]						; kiHeight
+	; also prepare for cross border pData top-left: xmm3
+;	xor ecx, ecx							; disabled: only cl is loaded before the broadcast
+	mov cl, byte [esi]						; top-left picture pixel
+	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
+	; load top border
+	mov ecx, edx							; kiStride	
+	neg ecx 								; -kiStride
+	lea edi, [esi+ecx]						; last line of top border
+	; load bottom border 
+	dec eax									; h-1
+	imul eax, edx 							; (h-1)*kiStride
+	lea eax, [esi+eax]						; last line of picture pData
+	sal edx, 05h							; 32*kiStride
+	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 32 * stride
+	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
+	dec ebx									; kiWidth-1
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+;	xor edx, edx							; disabled: only dl is loaded before the broadcasts
+	mov dl, byte [eax]						; bottom-left
+	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	mov dl, byte [ebx]						; bottom-right
+	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	; for top & bottom expanding	
+	mov ebx, [esp+32]						; kiWidth
+	exp_top_bottom_sse2	32	
+	; exp_top_bottom_sse2 is defined outside this excerpt; every register is reloaded from the stack below, so none is assumed to survive it
+	; for both left and right border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; p_dst: left border pSrc
+	mov edx, [esp+28]						; kiStride
+	mov ebx, [esp+32]						; kiWidth
+	mov ecx, [esp+36]						; kiHeight
+	; load left border
+	mov eax, -32 							; luma=-32, chroma=-16
+	lea edi, [esi+eax]						; left border dst
+	dec ebx
+	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
+	lea ebp, [ebx+1]						; right border dst	
+	; prepare for cross border pData: top-right with xmm4
+;	xor eax, eax							; disabled: only al is loaded before the broadcast
+	mov al, byte [ebx]						; top-right
+	butterfly_1to16_sse xmm4, xmm0, a		; pDst, tmp, pSrc [generic register name: a/b/c/d]
+	; for left & right border expanding
+	exp_left_right_sse2	32, a
+	
+	; for cross border [top-left, top-right, bottom-left, bottom-right]
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; pDst
+	mov ecx, [esp+28]						; kiStride
+	mov ebx, [esp+32]						; kiWidth
+	mov edx, [esp+36]						; kiHeight
+	; xmm3..xmm6 were filled with the replicated corner pixels above; only the four start addresses are computed here
+	mov eax, -32							; luma=-32, chroma=-16
+	neg ecx										; -stride
+	lea edi, [esi+eax]						
+	lea edi, [edi+ecx]				; last line of top-left border
+	lea ebp, [esi+ebx]
+	lea ebp, [ebp+ecx]				; last line of top-right border
+	add edx, 32								; height+32(16), luma=32, chroma=16
+	mov ecx, [esp+28]					; kiStride
+	imul edx, ecx							; (height+32(16)) * stride
+	lea eax, [edi+edx]						; last line of bottom-left border
+	lea ebx, [ebp+edx]						; last line of bottom-right border	
+	neg ecx										; -kiStride
+	; for cross (corner) border expanding
+	exp_cross_sse2		32, a	
+	
+;	sfence									; commit cache write back memory
+	
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	
+	ret
+
+ALIGN 16
+;***********************************************************************----------------
+; void ExpandPictureChromaAlign_sse2(	uint8_t *pDst,
+;										const int32_t kiStride,
+;										const int32_t kiWidth,
+;										const int32_t kiHeight	);
+;***********************************************************************----------------
+ExpandPictureChromaAlign_sse2:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp					; 5 pushes + return address = 24 bytes, so the arguments start at [esp+24]
+	; pads the plane at pDst in place with a 16-wide border; right-side stores use the aligned variant (macro argument a)
+	; for both top and bottom border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; pDst
+	mov edx, [esp+28]						; kiStride
+	mov ebx, [esp+32]						; kiWidth
+	mov eax, [esp+36]						; kiHeight
+	; also prepare for cross border pData top-left: xmm3
+;	xor ecx, ecx							; disabled: only cl is loaded before the broadcast
+	mov cl, byte [esi]						; top-left picture pixel
+	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
+	; load top border
+	mov ecx, edx							; kiStride	
+	neg ecx 								; -kiStride
+	lea edi, [esi+ecx]						; last line of top border
+	; load bottom border 
+	dec eax									; h-1
+	imul eax, edx 							; (h-1)*kiStride
+	lea eax, [esi+eax]						; last line of picture pData
+	sal edx, 04h							; 16*kiStride
+	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*kiStride + 16 * kiStride
+	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
+	dec ebx									; kiWidth-1
+	lea ebx, [eax+ebx]						; pDst[w-1][h-1]	
+;	xor edx, edx							; disabled: only dl is loaded before the broadcasts
+	mov dl, byte [eax]						; bottom-left
+	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	mov dl, byte [ebx]						; bottom-right
+	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	; for top & bottom expanding	
+	mov ebx, [esp+32]						; kiWidth
+	exp_top_bottom_sse2	16	
+	; exp_top_bottom_sse2 is defined outside this excerpt; every register is reloaded from the stack below, so none is assumed to survive it
+	; for both left and right border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; pDst: left border pSrc
+	mov edx, [esp+28]						; kiStride
+	mov ebx, [esp+32]						; kiWidth
+	mov ecx, [esp+36]						; kiHeight
+	; load left border
+	mov eax, -16 							; luma=-32, chroma=-16
+	lea edi, [esi+eax]						; left border dst
+	dec ebx
+	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
+	lea ebp, [ebx+1]						; right border dst	
+	; prepare for cross border pData: top-right with xmm4
+;	xor eax, eax							; disabled: only al is loaded before the broadcast
+	mov al, byte [ebx]						; top-right
+	butterfly_1to16_sse xmm4, xmm0, a		; pDst, tmp, pSrc [generic register name: a/b/c/d]
+	; for left & right border expanding
+	exp_left_right_sse2	16, a
+	
+	; for cross border [top-left, top-right, bottom-left, bottom-right]
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; pDst
+	mov ecx, [esp+28]						; kiStride
+	mov ebx, [esp+32]						; kiWidth
+	mov edx, [esp+36]						; kiHeight
+	; xmm3..xmm6 were filled with the replicated corner pixels above; only the four start addresses are computed here
+	mov eax, -16							; chroma=-16
+	neg ecx										; -stride
+	lea edi, [esi+eax]						
+	lea edi, [edi+ecx]				; last line of top-left border
+	lea ebp, [esi+ebx]				
+	lea ebp, [ebp+ecx]				; last line of top-right border
+	mov ecx, [esp+28]						; kiStride
+	add edx, 16							; height+16, luma=32, chroma=16
+	imul edx, ecx							; (kiHeight+16) * kiStride
+	lea eax, [edi+edx]						; last line of bottom-left border
+	lea ebx, [ebp+edx]						; last line of bottom-right border
+	neg ecx										; -kiStride
+	; for cross (corner) border expanding
+	exp_cross_sse2		16, a
+	
+;	sfence									; commit cache write back memory
+	
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	
+	ret
+
+ALIGN 16
+;***********************************************************************----------------
+; void ExpandPictureChromaUnalign_sse2(	uint8_t *pDst,
+;										const int32_t kiStride,
+;										const int32_t kiWidth,
+;										const int32_t kiHeight	);
+;***********************************************************************----------------
+ExpandPictureChromaUnalign_sse2:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp					; 5 pushes + return address = 24 bytes, so the arguments start at [esp+24]
+	; pads the plane at pDst in place with a 16-wide border; right-side stores use the unaligned variant (macro argument u)
+	; for both top and bottom border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; pDst
+	mov edx, [esp+28]						; kiStride
+	mov ebx, [esp+32]						; kiWidth
+	mov eax, [esp+36]						; kiHeight
+	; also prepare for cross border pData top-left: xmm3
+;	xor ecx, ecx							; disabled: only cl is loaded before the broadcast
+	mov cl, byte [esi]						; top-left picture pixel
+	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
+	; load top border
+	mov ecx, edx							; kiStride	
+	neg ecx 								; -kiStride
+	lea edi, [esi+ecx]						; last line of top border
+	; load bottom border 
+	dec eax									; h-1
+	imul eax, edx 							; (h-1)*kiStride
+	lea eax, [esi+eax]						; last line of picture pData
+	sal edx, 04h							; 16*kiStride
+	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*kiStride + 16 * kiStride
+	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
+	dec ebx									; kiWidth-1
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+;	xor edx, edx							; disabled: only dl is loaded before the broadcasts
+	mov dl, byte [eax]						; bottom-left
+	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	mov dl, byte [ebx]						; bottom-right
+	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	; for top & bottom expanding	
+	mov ebx, [esp+32]						; kiWidth
+	exp_top_bottom_sse2	16	
+	; exp_top_bottom_sse2 is defined outside this excerpt; every register is reloaded from the stack below, so none is assumed to survive it
+	; for both left and right border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; p_dst: left border pSrc
+	mov edx, [esp+28]						; kiStride
+	mov ebx, [esp+32]						; kiWidth
+	mov ecx, [esp+36]						; kiHeight
+	; load left border
+	mov eax, -16 							; luma=-32, chroma=-16
+	lea edi, [esi+eax]						; left border dst
+	dec ebx
+	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
+	lea ebp, [ebx+1]						; right border dst	
+	; prepare for cross border pData: top-right with xmm4
+;	xor eax, eax							; disabled: only al is loaded before the broadcast
+	mov al, byte [ebx]						; top-right
+	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	; for left & right border expanding
+	exp_left_right_sse2	16, u
+	
+	; for cross border [top-left, top-right, bottom-left, bottom-right]
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; p_dst
+	mov ecx, [esp+28]						; kiStride
+	mov ebx, [esp+32]						; kiWidth
+	mov edx, [esp+36]						; kiHeight
+	; xmm3..xmm6 were filled with the replicated corner pixels above; only the four start addresses are computed here
+	neg ecx									; -kiStride
+	mov eax, -16							; chroma=-16
+	lea edi, [esi+eax]						
+	lea edi, [edi+ecx]				; last line of top-left border
+	lea ebp, [esi+ebx]						
+	lea ebp, [ebp+ecx]				; last line of top-right border
+	mov ecx, [esp+28]						; kiStride
+	add edx, 16							; kiHeight+16, luma=32, chroma=16
+	imul edx, ecx							; (kiHeight+16) * kiStride
+	lea eax, [edi+edx]						; last line of bottom-left border
+	lea ebx, [ebp+edx]						; last line of bottom-right border
+	neg ecx									; -kiStride
+	; for cross (corner) border expanding
+	exp_cross_sse2		16, u
+	
+;	sfence									; commit cache write back memory
+	
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	
+	ret
+
--- /dev/null
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -1,0 +1,1498 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  intra_pred.asm
+;*
+;*  Abstract
+;*      sse2 and mmx function for intra predict operations(decoder)
+;*
+;*  History
+;*      18/09/2009 Created
+;*		19/11/2010 Added
+;*					WelsI16x16LumaPredDcTop_sse2, WelsI16x16LumaPredDcNA_sse2,
+;*					WelsIChromaPredDcLeft_mmx, WelsIChromaPredDcTop_sse2 
+;*					and WelsIChromaPredDcNA_mmx
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+BITS 32
+;*******************************************************************************
+; Local Data (Read Only)
+;*******************************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata data
+%else
+SECTION .rodata align=16
+%endif
+%if 1
+	%define WELSEMMS	emms
+%else
+	%define WELSEMMS
+%endif
+; WELSEMMS: the condition above is hard-wired to 1, so it always expands to emms; the routines below that touch mm registers invoke it before ret
+align 16
+sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0	; x offsets of the left 8 pixels of a 16-wide plane-predicted row
+align 16
+sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8				; weights (i+1) for the H/V sums; also x offsets of the right 8 pixels
+align 16
+sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1				; mirrored weights for the subtracted neighbours
+
+; for chroma plane mode (4-word tables, read with movq, so they carry no 16-byte alignment of their own)
+sse2_plane_inc_c dw 1, 2, 3, 4
+sse2_plane_dec_c dw 4, 3, 2, 1
+align 16
+sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4		; x offsets of an 8-wide chroma row
+
+align 16
+mmx_01bytes:		times 16	db 1						; 0x01 in every byte: multiplier that replicates a byte, and low-bit mask in the DDR rounding fix
+
+align 16
+mmx_0x02: dw 0x02, 0x00, 0x00, 0x00						; not referenced in the part of the file shown here
+
+align 16
+sse2_dc_0x80: times 16 db 0x80							; not referenced in the part of the file shown here
+align 16
+sse2_wd_0x02: times 8 dw 0x02							; not referenced in the part of the file shown here
+
+;*******************************************************************************
+; macros
+;*******************************************************************************
+; %1 = result xmm, %2/%3 = scratch xmm, %4 = row pointer, %5 = stride (typical arguments: xmm0, xmm1, xmm2, eax, ecx)
+; out: low 64 bits of %1 = left pixel of row %4 repeated 4 times, then left pixel of row %4+%5 repeated 4 times
+%macro SSE2_PRED_H_4X4_TWO_LINE 5
+    movd		%1,	[%4-1]			; low byte = pixel left of the first row
+	movdqa		%3,	%1
+	punpcklbw	%1,	%3				; every byte doubled
+	movdqa		%3,	%1
+	punpcklbw	%1,	%3				; every byte repeated 4 times: low dword = left pixel x4
+	
+	;add			%4,	%5				; disabled: the second row is addressed as %4+%5 instead
+	movd		%2,	[%4+%5-1]		; low byte = pixel left of the second row
+	movdqa		%3,	%2
+	punpcklbw	%2,	%3
+	movdqa		%3,	%2
+	punpcklbw	%2,	%3				; low dword = second left pixel x4
+	punpckldq	%1,	%2				; dword 0 = first row fill, dword 1 = second row fill
+%endmacro
+
+
+%macro	LOAD_COLUMN 6					; %1 = result xmm, %2-%4 = scratch xmm, %5 = pointer (advanced by 8*%6 on exit), %6 = stride
+		movd	%1,	[%5]				; 4 bytes of row 0; the wanted column is byte 3 of each load
+		movd	%2,	[%5+%6]				; row 1
+		punpcklbw %1,	%2				; rows 0/1 interleaved byte by byte
+		lea		%5,	[%5+2*%6]
+		movd	%3,	[%5]				; row 2
+		movd	%2,	[%5+%6]				; row 3
+		punpcklbw %3,	%2
+		punpcklwd %1,	%3				; highest dword of %1 = byte 3 of rows 0..3
+		lea		%5,	[%5+2*%6]	
+		movd	%4,	[%5]				; row 4
+		movd	%2,	[%5+%6]				; row 5
+		punpcklbw %4,	%2
+		lea		%5,	[%5+2*%6]	
+		movd	%3,	[%5]				; row 6
+		movd	%2,	[%5+%6]				; row 7
+		lea		%5,	[%5+2*%6]			; %5 now points 8 rows below its entry value
+		punpcklbw %3,	%2
+		punpcklwd %4,	%3				; highest dword of %4 = byte 3 of rows 4..7
+		punpckhdq %1,	%4				; high 8 bytes of %1 = byte 3 of rows 0..7, in row order
+%endmacro	
+
+%macro  SUMW_HORIZON 3					; %1 = 8 words to sum (result lands in its low dword), %2 = scratch, %3 = upper halves for the word->dword widening; only the low 16 bits of the result are valid unless %3 is zero
+	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
+	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04 (each word widened to a dword, high half taken from %3)
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26 
+	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
+	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357 (low two dwords swapped)
+	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
+%endmacro
+
+%macro  COPY_16_TIMES 2					; %1 = row pointer, %2 = xmm that receives byte [%1-1] in all 16 lanes
+		movdqa		%2,	[%1-16]			; aligned load of the 16 bytes left of the row (faults unless %1-16 is 16-byte aligned)
+		psrldq		%2,	15				; keep only the last of them, i.e. byte [%1-1], in the low byte
+		pmuludq		%2,	[mmx_01bytes]	; x 0x01010101: low dword = that byte repeated 4 times
+		pshufd		%2,	%2, 0			; broadcast the low dword to all four dwords
+%endmacro
+
+%macro  COPY_16_TIMESS 3				; same as COPY_16_TIMES but for the row at %1+%3: %1 = row pointer, %2 = result xmm, %3 = stride
+		movdqa		%2,	[%1+%3-16]		; aligned load of the 16 bytes left of the next row
+		psrldq		%2,	15				; keep only byte [%1+%3-1] in the low byte
+		pmuludq		%2,	[mmx_01bytes]	; x 0x01010101: low dword = that byte repeated 4 times
+		pshufd		%2,	%2, 0			; broadcast the low dword to all four dwords
+%endmacro
+
+%macro	LOAD_COLUMN_C 6					; 4-row variant of LOAD_COLUMN: %1 = result, %2/%3 = scratch, %4 = unused, %5 = pointer (advanced by 4*%6), %6 = stride
+		movd	%1,	[%5]				; 4 bytes of row 0; the wanted column is byte 3 of each load
+		movd	%2,	[%5+%6]				; row 1
+		punpcklbw %1,%2					; rows 0/1 interleaved byte by byte
+		lea		%5,	[%5+2*%6]
+		movd	%3,	[%5]				; row 2
+		movd	%2,	[%5+%6]				; row 3
+		punpcklbw %3,	%2
+		punpckhwd %1,	%3				; with the mm registers the caller passes: high dword of %1 = byte 3 of rows 0..3
+		lea		%5,	[%5+2*%6]			; %5 now points 4 rows below its entry value
+%endmacro
+
+%macro LOAD_2_LEFT_AND_ADD 0			; fixed registers: eax = row pointer, ecx = stride, ebx = running sum, edx = scratch
+        lea         eax, [eax+2*ecx]	; step down two rows
+        movzx		edx, byte [eax-0x01]	; pixel left of the new row
+        add			ebx, edx
+        movzx		edx, byte [eax+ecx-0x01]	; pixel left of the row after it
+        add			ebx, edx
+%endmacro
+
+;*******************************************************************************
+; Code: 32-bit routines taking (pPred, kiStride) on the stack; WELS_EXTERN comes from asm_inc.asm and presumably makes the symbol global (TODO confirm)
+;*******************************************************************************
+
+SECTION .text
+WELS_EXTERN WelsI4x4LumaPredH_sse2
+WELS_EXTERN WelsI4x4LumaPredDDR_mmx
+WELS_EXTERN WelsI16x16LumaPredPlane_sse2
+WELS_EXTERN WelsI4x4LumaPredDc_sse2
+
+ALIGN 16
+;*******************************************************************************
+;   void_t __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
+;   horizontal prediction: each of the 4 rows is filled with the pixel immediately left of it
+;	pPred must align to 16
+;*******************************************************************************
+WelsI4x4LumaPredH_sse2:
+	mov			eax,	[esp+4]			;pPred
+	mov			ecx,	[esp+8]			;kiStride
+
+	movzx		edx,	byte [eax-1]	; left pixel of row 0
+	movd		xmm0,	edx
+	pmuludq		xmm0,	[mmx_01bytes]	; x 0x01010101: low dword = pixel repeated 4 times
+	
+	movzx		edx,	byte [eax+ecx-1]	; left pixel of row 1
+	movd		xmm1,	edx
+	pmuludq		xmm1,	[mmx_01bytes]
+
+	lea			eax,	[eax+ecx]		; eax = row 1
+	movzx		edx,	byte [eax+ecx-1]	; left pixel of row 2
+	movd		xmm2,	edx
+	pmuludq		xmm2,	[mmx_01bytes]
+	
+	movzx		edx,	byte [eax+2*ecx-1]	; left pixel of row 3
+	movd		xmm3,	edx	
+	pmuludq		xmm3,	[mmx_01bytes]
+	
+	sub         eax,    ecx				; back to row 0; all four left pixels were read before any store
+	movd        [eax], xmm0				; row 0
+	movd        [eax+ecx], xmm1			; row 1
+	lea         eax, [eax+2*ecx]
+	movd        [eax], xmm2				; row 2
+	movd        [eax+ecx], xmm3			; row 3
+	
+	ret
+	
+;*******************************************************************************
+; void_t WelsI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
+;*******************************************************************************
+WelsI16x16LumaPredPlane_sse2:
+%define pushsize	4
+		push	esi
+		mov		esi,	[esp + pushsize + 4]	; pPred
+		mov		ecx,	[esp + pushsize + 8]	; kiStride
+		sub		esi,	1
+		sub		esi,	ecx					; esi = top-left neighbour, pPred[-1 - kiStride]
+		
+		;for H
+		pxor	xmm7,	xmm7	
+		movq	xmm0,	[esi]				; top[-1..6]
+		movdqa	xmm5,	[sse2_plane_dec]	; weights 8..1
+		punpcklbw xmm0,	xmm7
+		pmullw	xmm0,	xmm5
+		movq	xmm1,	[esi + 9]			; top[8..15]
+		movdqa	xmm6,	[sse2_plane_inc]	; weights 1..8
+		punpcklbw xmm1,	xmm7
+		pmullw	xmm1,	xmm6
+		psubw	xmm1,	xmm0
+		
+		SUMW_HORIZON	xmm1,xmm0,xmm2		; xmm2 is not cleared here; harmless because only the low 16 bits of the sum are consumed below
+		movd    eax,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
+		movsx	eax,	ax			; keep the low word only, sign-extended
+		imul	eax,	5
+		add		eax,	32
+		sar		eax,	6			; b = (5 * H + 32) >> 6;
+		SSE2_Copy8Times	xmm1, eax	; xmm1 = b,b,b,b,b,b,b,b
+		
+		movzx	edx,	BYTE [esi+16]		; top[15]
+		sub	esi, 3						; LOAD_COLUMN extracts byte 3 of each dword it loads, so back up 3 to hit the left column
+		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, esi, ecx	; high 8 bytes of xmm0 = left[-1..6]; esi advances 8 rows
+			
+		add		esi,	3
+		movzx	eax,	BYTE [esi+8*ecx]	; left[15]
+		add		edx,	eax
+		shl		edx,	4			;	a = (left[15*kiStride] + top[15]) << 4;
+		
+		sub	esi, 3
+		add		esi,	ecx					; skip row 7, which does not enter V
+		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, esi, ecx	; high 8 bytes of xmm7 = left[8..15]
+		pxor	xmm4,	xmm4	
+		punpckhbw xmm0,	xmm4
+		pmullw	xmm0,	xmm5				; left[-1..6] x 8..1
+		punpckhbw xmm7,	xmm4
+		pmullw	xmm7,	xmm6				; left[8..15] x 1..8
+		psubw	xmm7,	xmm0
+		
+		SUMW_HORIZON   xmm7,xmm0,xmm2		; xmm2 holds LOAD_COLUMN scratch data here; again only the low word is used
+		movd    eax,   xmm7			; V
+		movsx	eax,	ax
+
+		imul	eax,	5
+		add		eax,	32
+		sar		eax,	6				; c = (5 * V + 32) >> 6;
+		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c		
+		
+		mov		esi,	[esp + pushsize + 4]	; back to pPred for the store loop
+		add		edx,	16
+		imul	eax,	-7
+		add		edx,	eax				; s = a + 16 + (-7)*c		
+		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s		
+		
+		xor		eax,	eax				; row counter
+		movdqa	xmm5,	[sse2_plane_inc_minus]	; x offsets -7..0 (xmm6 still holds 1..8)
+		
+get_i16x16_luma_pred_plane_sse2_1:
+		movdqa	xmm2,	xmm1
+		pmullw	xmm2,	xmm5			; b * (-7..0)
+		paddw	xmm2,	xmm0
+		psraw	xmm2,	5				; left 8 pixels of the row
+		movdqa	xmm3,	xmm1
+		pmullw	xmm3,	xmm6			; b * (1..8)
+		paddw	xmm3,	xmm0
+		psraw	xmm3,	5				; right 8 pixels of the row
+		packuswb xmm2,	xmm3			; saturate to 0..255 and pack 16 bytes
+		movdqa	[esi],	xmm2			; aligned store: pPred rows must be 16-byte aligned
+		paddw	xmm0,	xmm4			; s += c for the next row
+		add		esi,	ecx
+		inc		eax
+		cmp		eax,	16
+		jnz get_i16x16_luma_pred_plane_sse2_1					
+		
+		pop		esi
+		ret
+		
+		
+		
+;*******************************************************************************
+; void_t WelsI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride);
+;*******************************************************************************
+; helper for the function above: advances eax two rows, then fills those two rows
+%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 0
+    lea     eax,	[eax+ecx*2]
+    ; fixed registers: eax = row pointer, ecx = stride, xmm0 = scratch
+    COPY_16_TIMES eax,	xmm0
+    movdqa  [eax],	xmm0
+    COPY_16_TIMESS eax,	xmm0,	ecx
+    movdqa  [eax+ecx],	xmm0
+%endmacro
+
+WELS_EXTERN WelsI16x16LumaPredH_sse2
+WelsI16x16LumaPredH_sse2:
+    mov     eax, [esp+4]    ; pPred
+    mov     ecx, [esp+8]    ; kiStride
+    ; horizontal prediction: each of the 16 rows is filled with the pixel immediately left of it
+    COPY_16_TIMES eax,	xmm0
+    movdqa  [eax],		xmm0
+    COPY_16_TIMESS eax,	xmm0,	ecx
+    movdqa  [eax+ecx],	xmm0
+    ; rows 0 and 1 are done above; the 7 expansions below cover rows 2..15
+	SSE2_PRED_H_16X16_TWO_LINE_DEC 
+	SSE2_PRED_H_16X16_TWO_LINE_DEC
+	SSE2_PRED_H_16X16_TWO_LINE_DEC
+	SSE2_PRED_H_16X16_TWO_LINE_DEC
+	SSE2_PRED_H_16X16_TWO_LINE_DEC
+	SSE2_PRED_H_16X16_TWO_LINE_DEC
+	SSE2_PRED_H_16X16_TWO_LINE_DEC
+   
+    ret
+    
+;*******************************************************************************
+; void_t WelsI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
+;*******************************************************************************
+WELS_EXTERN WelsI16x16LumaPredV_sse2
+WelsI16x16LumaPredV_sse2:
+    mov     edx, [esp+4]    ; pPred
+    mov     ecx, [esp+8]    ; kiStride
+    ; vertical prediction: the 16 bytes above the block are copied into each of its 16 rows
+    sub     edx, ecx
+    movdqa  xmm0, [edx]          ; row above the block (aligned load)
+    
+    movdqa  [edx+ecx], xmm0      ; row 0
+    lea     edx, [edx+2*ecx]
+    movdqa  [edx],     xmm0      ; row 1
+    movdqa  [edx+ecx], xmm0      ; row 2
+    lea     edx, [edx+2*ecx]
+    movdqa  [edx],     xmm0      ; row 3
+    movdqa  [edx+ecx], xmm0      ; row 4
+    lea     edx, [edx+2*ecx]
+    movdqa  [edx],     xmm0      ; row 5
+    movdqa  [edx+ecx], xmm0      ; row 6
+    lea     edx, [edx+2*ecx]
+    movdqa  [edx],     xmm0      ; row 7
+    movdqa  [edx+ecx], xmm0      ; row 8
+    lea     edx, [edx+2*ecx]
+    movdqa  [edx],     xmm0      ; row 9
+    movdqa  [edx+ecx], xmm0      ; row 10
+    lea     edx, [edx+2*ecx]
+    movdqa  [edx],     xmm0      ; row 11
+    movdqa  [edx+ecx], xmm0      ; row 12
+    lea     edx, [edx+2*ecx]
+    movdqa  [edx],     xmm0      ; row 13
+    movdqa  [edx+ecx], xmm0      ; row 14
+    lea     edx, [edx+2*ecx]
+    movdqa  [edx],     xmm0      ; row 15
+        
+    ret
+    
+;*******************************************************************************
+; void_t WelsIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
+;*******************************************************************************
+WELS_EXTERN WelsIChromaPredPlane_sse2
+WelsIChromaPredPlane_sse2:
+%define pushsize	4
+		push	esi
+		mov		esi,	[esp + pushsize + 4]	;pPred
+		mov		ecx,	[esp + pushsize + 8]	;kiStride
+		sub		esi,	1
+		sub		esi,	ecx					; esi = top-left neighbour, pPred[-1 - kiStride]
+		; for H (computed in MMX registers, then moved to xmm for the horizontal sum)
+		pxor	mm7,	mm7	
+		movq	mm0,	[esi]				; low 4 bytes = top[-1..2]
+		movq	mm5,	[sse2_plane_dec_c]	; weights 4..1
+		punpcklbw mm0,	mm7
+		pmullw	mm0,	mm5
+		movq	mm1,	[esi + 5]			; low 4 bytes = top[4..7]
+		movq	mm6,	[sse2_plane_inc_c]	; weights 1..4
+		punpcklbw mm1,	mm7
+		pmullw	mm1,	mm6
+		psubw	mm1,	mm0					; (i+1) * top[4+i] - (4-i) * top[i-1], i = 0..3
+		
+		movq2dq xmm1,   mm1
+		pxor    xmm2,   xmm2
+		SUMW_HORIZON	xmm1,xmm0,xmm2
+		movd    eax,	xmm1			; H
+		movsx	eax,	ax
+		imul	eax,	17
+		add		eax,	16
+		sar		eax,	5			; b = (17 * H + 16) >> 5;
+		SSE2_Copy8Times	xmm1, eax	; xmm1 = b,b,b,b,b,b,b,b
+		
+		movzx	edx,	BYTE [esi+8]		; top[7]
+		sub	esi, 3						; LOAD_COLUMN_C extracts byte 3 of each dword it loads, so back up 3 to hit the left column
+		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, esi, ecx	; high dword of mm0 = left[-1..2]; esi advances 4 rows
+
+		add		esi,	3
+		movzx	eax,	BYTE [esi+4*ecx]	; left[7]
+		add		edx,	eax
+		shl		edx,	4			; a = (left[7*kiStride] + top[7]) << 4;
+		
+		sub	esi, 3
+		add		esi,	ecx					; skip row 3, which does not enter V
+		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, esi, ecx	; high dword of mm7 = left[4..7]
+		pxor	mm4,	mm4	
+		punpckhbw mm0,	mm4
+		pmullw	mm0,	mm5				; left[-1..2] x 4..1
+		punpckhbw mm7,	mm4
+		pmullw	mm7,	mm6				; left[4..7] x 1..4
+		psubw	mm7,	mm0
+		
+		movq2dq xmm7,   mm7
+		pxor    xmm2,   xmm2
+		SUMW_HORIZON	xmm7,xmm0,xmm2
+		movd    eax,    xmm7			; V
+		movsx	eax,	ax
+
+		imul	eax,	17
+		add		eax,	16
+		sar		eax,	5				; c = (17 * V + 16) >> 5;
+		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c		
+		
+		mov		esi,	[esp + pushsize + 4]	; back to pPred for the store loop
+		add		edx,	16
+		imul	eax,	-3
+		add		edx,	eax				; s = a + 16 + (-3)*c		
+		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s		
+		
+		xor		eax,	eax				; row counter
+		movdqa	xmm5,	[sse2_plane_mul_b_c]	; x offsets -3..4
+		
+get_i_chroma_pred_plane_sse2_1:
+		movdqa	xmm2,	xmm1
+		pmullw	xmm2,	xmm5			; b * (-3..4)
+		paddw	xmm2,	xmm0
+		psraw	xmm2,	5
+		packuswb xmm2,	xmm2			; saturate to 0..255; low 8 bytes = the row
+		movq	[esi],	xmm2
+		paddw	xmm0,	xmm4			; s += c for the next row
+		add		esi,	ecx
+		inc		eax
+		cmp		eax,	8
+		jnz get_i_chroma_pred_plane_sse2_1					
+		
+		pop		esi
+		WELSEMMS						; mm registers were used above
+		ret	
+		
+ALIGN 16
+;*******************************************************************************
+;	0 |1 |2 |3 |4 |
+;	6 |7 |8 |9 |10|
+;	11|12|13|14|15|
+;	16|17|18|19|20|
+;	21|22|23|24|25|
+;	7 is the start pixel of current 4x4 block
+;	pPred[7] = ([6]+[0]*2+[1]+2)/4
+;	(in the comments below, mmN[k] is the k-th byte of mmN counted from 1 at the least significant end)
+;   void_t __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
+;   
+;*******************************************************************************
+WelsI4x4LumaPredDDR_mmx:	
+	mov			edx,[esp+4]			;pPred
+	mov         eax,edx
+	mov			ecx,[esp+8]		;kiStride
+	
+	movq        mm1,[eax+ecx-8]		;load the 8 bytes ending at [11] (left neighbour of row 1): mm1[8] = [11]
+	movq        mm2,[eax-8]			;get value of 6 mm2[8] = 6
+	sub			eax, ecx			;move eax to the line above the current block (position of 1)
+	punpckhbw   mm2,[eax-8]			;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
+	movd        mm3,[eax]			;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+	punpckhwd   mm1,mm2				;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
+	psllq       mm3,18h				;mm3[4]=[1],mm3[5]=[2],mm3[6]=[3]
+	psrlq       mm1,28h				;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+	por         mm3,mm1				;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
+	movq        mm1,mm3				;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+	lea			eax,[eax+ecx*2-8h]		;eax = row 1 minus 8 bytes, so the loads below end on the left neighbours
+	movq        mm4,[eax+ecx]		;get value of 16, mm4[8]=[16]
+	psllq       mm3,8				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
+	psrlq       mm4,38h				;mm4[1]=[16]
+	por         mm3,mm4				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
+	movq        mm2,mm3				;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
+	movq        mm4,[eax+ecx*2]		;mm4[8]=[21]
+	psllq       mm3,8				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
+	psrlq       mm4,38h				;mm4[1]=[21]
+	por         mm3,mm4				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
+	movq        mm4,mm3				;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
+	pavgb       mm3,mm1				;mm3=([11]+[21]+1)/2
+	pxor        mm1,mm4				;low bit of each byte is set where the two averaged values differ in parity
+	pand        mm1,[mmx_01bytes]	;keep only that low bit
+	psubusb     mm3,mm1				;undo the round-up of pavgb for those bytes
+	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2
+	; mm2 now holds the filtered diagonal; each row above starts one byte further along it
+	lea         edx,[edx+ecx]
+	movd        [edx+2*ecx],mm2 	; row 3
+	sub         edx,ecx
+	psrlq       mm2,8 
+	movd        [edx+2*ecx],mm2 	; row 2
+	psrlq       mm2,8 
+	movd        [edx+ecx],mm2 		; row 1
+	psrlq       mm2,8 
+	movd        [edx],mm2			; row 0
+	WELSEMMS
+	ret
+	
+ALIGN 16
+;*******************************************************************************
+;	0 |1 |2 |3 |4 |
+;	5 |6 |7 |8 |9 |
+;	10|11|12|13|14|
+;	15|16|17|18|19|
+;	20|21|22|23|24|
+;	6 is the start pixel of current 4x4 block
+;	pPred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
+;	every pixel of the 4x4 block receives this same value
+;   void_t __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
+;   
+;*******************************************************************************
+WelsI4x4LumaPredDc_sse2:	
+	mov         eax,[esp+4]			;pPred
+	mov			ecx,[esp+8]			;kiStride
+	push		ebx					; ebx accumulates the sum; from here on the arguments sit 4 bytes higher on the stack
+		
+	movzx		edx,	byte [eax-1h]	; left neighbour of row 0
+	
+	sub			eax,	ecx				; eax = row above the block
+	movd		xmm0,	[eax]			; 4 top neighbours
+	pxor		xmm1,	xmm1
+	psadbw		xmm0,	xmm1			; sum of absolute differences against zero = sum of the 4 top bytes
+	
+	movd		ebx,	xmm0
+	add			ebx,	edx
+	
+	movzx		edx,	byte [eax+ecx*2-1h]	; left neighbour of row 1
+	add			ebx,	edx
+	
+	lea			eax,	[eax+ecx*2-1]	; eax = left neighbour of row 1
+	movzx		edx,	byte [eax+ecx]	; left neighbour of row 2
+	add			ebx,	edx
+	
+	movzx		edx,	byte [eax+ecx*2]	; left neighbour of row 3
+	add			ebx,	edx
+	add			ebx,	4
+	sar			ebx,	3				; (sum of 8 neighbours + 4) / 8
+	imul		ebx,	0x01010101		; replicate the DC byte into all 4 bytes
+	
+	mov			edx,	[esp+8]			;pPred ([esp+8] rather than [esp+4] because of the push above)
+	mov         [edx],       ebx		; row 0
+	mov         [edx+ecx],   ebx		; row 1
+	mov         [edx+2*ecx], ebx		; row 2
+	lea         edx, [edx+2*ecx]
+	mov         [edx+ecx],   ebx		; row 3
+
+	pop ebx
+	ret	
+	
+ALIGN 16
+;*******************************************************************************
+;	void_t __cdecl WelsIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
+;   copy 8 pixel of 8 line from left
+;*******************************************************************************
+%macro MMX_PRED_H_8X8_ONE_LINE 4		; %1 = mm work register, %2 = unused, %3 = source row pointer, %4 = destination address expression
+	movq		%1,		[%3-8]			; 8 bytes ending at the pixel left of row %3
+	psrlq		%1,		38h				; keep only that pixel, in the low byte
+	
+	pmullw		%1,		[mmx_01bytes]	; x 0x0101: low word = pixel repeated twice
+	pshufw		%1,		%1,	0			; broadcast the low word to all four words (pshufw is an SSE-era MMX extension)
+	movq		[%4],	%1
+%endmacro
+
+%macro MMX_PRED_H_8X8_ONE_LINEE 4		; as MMX_PRED_H_8X8_ONE_LINE, but the source is the row after %3; the stride is hard-wired to ecx
+	movq		%1,		[%3+ecx-8]		; 8 bytes ending at the pixel left of row %3+ecx
+	psrlq		%1,		38h				; keep only that pixel, in the low byte
+	
+	pmullw		%1,		[mmx_01bytes]	; x 0x0101: low word = pixel repeated twice
+	pshufw		%1,		%1,	0			; broadcast the low word to all four words
+	movq		[%4],	%1
+%endmacro
+
+WELS_EXTERN WelsIChromaPredH_mmx
+WelsIChromaPredH_mmx:
+	mov			edx,	[esp+4]			;pPred
+	mov         eax,	edx
+	mov			ecx,	[esp+8]			;kiStride
+	; eax walks the source rows and edx the destination rows, both two rows at a time
+	movq		mm0,	[eax-8]			; row 0, same steps as MMX_PRED_H_8X8_ONE_LINE
+	psrlq		mm0,	38h
+	
+	pmullw		mm0,		[mmx_01bytes]
+	pshufw		mm0,	mm0,	0
+	movq		[edx],	mm0
+	
+	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
+	; rows 0 and 1 done
+	lea			eax, [eax+ecx*2]
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax, edx+2*ecx
+	
+	lea         edx, [edx+2*ecx]
+	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
+	; rows 2 and 3 done
+	lea			eax, [eax+ecx*2]
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax, edx+2*ecx
+	
+	lea         edx, [edx+2*ecx]
+	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
+	; rows 4 and 5 done
+	lea			eax, [eax+ecx*2]
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax, edx+2*ecx
+
+    lea         edx, [edx+2*ecx]
+	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
+		
+	WELSEMMS
+	ret	
+	
+ALIGN 16
+;*******************************************************************************
+;	void_t __cdecl get_i4x4_luma_pred_v_asm(uint8_t *pPred, const int32_t kiStride)
+;   copies the 4 pixels above the block into each of its 4 rows
+;*******************************************************************************
+WELS_EXTERN get_i4x4_luma_pred_v_asm
+get_i4x4_luma_pred_v_asm:
+	mov			eax,	[esp+4]        ;pPred
+	mov			ecx,	[esp+8]        ;kiStride
+	
+	sub			eax,	ecx				; eax = row above the block
+	mov         edx,    [eax]			; its 4 pixels
+	mov		    [eax+ecx],	 edx		; row 0
+	mov			[eax+2*ecx], edx		; row 1
+	lea         eax, [eax+2*ecx]
+	mov			[eax+ecx],	 edx		; row 2
+	mov			[eax+2*ecx], edx		; row 3
+	
+	ret	
+
+ALIGN 16
+;*******************************************************************************
+;	void_t __cdecl WelsIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
+;   copy 8 pixels from top 8 pixels
+;*******************************************************************************
+WELS_EXTERN WelsIChromaPredV_mmx
+WelsIChromaPredV_mmx:
+; Vertical 8x8 prediction: the 8 bytes of the row directly above pPred are
+; replicated into rows 0..7 of the block.
+; cdecl stack arguments: [esp+4] = pPred, [esp+8] = kiStride.
+	mov			ecx, [esp+8]			; kiStride
+	mov			eax, [esp+4]			; pPred, i.e. row 0
+	mov			edx, eax
+	sub			edx, ecx				; edx -> row above the block
+	movq		mm0, [edx]				; the eight top-neighbour pixels
+	lea			edx, [ecx+2*ecx]		; 3*kiStride
+
+	movq		[eax],       mm0		; row 0
+	movq		[eax+ecx],   mm0		; row 1
+	movq		[eax+2*ecx], mm0		; row 2
+	movq		[eax+edx],   mm0		; row 3
+	lea			eax, [eax+4*ecx]		; step to row 4
+	movq		[eax],       mm0		; row 4
+	movq		[eax+ecx],   mm0		; row 5
+	movq		[eax+2*ecx], mm0		; row 6
+	movq		[eax+edx],   mm0		; row 7
+
+	WELSEMMS
+	ret
+	
+	
+	ALIGN 16
+;*******************************************************************************
+;	lt|t0|t1|t2|t3|
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	t3 is never used
+;   destination:
+;	|a |b |c |d |
+;	|e |f |a |b |
+;	|g |h |e |f |
+;	|i |j |g |h |
+
+;   a = (1 + lt + l0)>>1
+;   e = (1 + l0 + l1)>>1
+;   g = (1 + l1 + l2)>>1
+;   i = (1 + l2 + l3)>>1
+
+;   d = (2 + t0 + (t1<<1) + t2)>>2
+;   c = (2 + lt + (t0<<1) + t1)>>2
+;   b = (2 + l0 + (lt<<1) + t0)>>2
+
+;   f = (2 + l1 + (l0<<1) + lt)>>2
+;   h = (2 + l2 + (l1<<1) + l0)>>2
+;   j = (2 + l3 + (l2<<1) + l1)>>2   
+;   [b a f e h g j i] + [d c b a] --> mov to memory
+;   
+;   void_t WelsI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsI4x4LumaPredHD_mmx
+WelsI4x4LumaPredHD_mmx:	
+; 4x4 prediction built from the top row (lt, t0..t2) and the left column
+; (l0..l3). All neighbours are packed into one 64-bit register, the 2-tap and
+; 3-tap filtered values are produced with byte averages, and the four rows are
+; written with 32-bit stores.
+; cdecl stack arguments: [esp+4] = pPred, [esp+8] = kiStride.
+; Register diagrams in the comments list bytes from most to least significant.
+	mov			edx, [esp+4]			; pPred
+	mov         eax, edx
+	mov			ecx, [esp+8]            ; kiStride
+	sub         eax, ecx                ; eax -> row above the block
+	movd        mm0, [eax-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
+	
+	movd        mm1, [eax+2*ecx-4]        
+	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1	
+	lea         eax, [eax+2*ecx]
+	movd        mm2, [eax+2*ecx-4]        
+	punpcklbw   mm2, [eax+ecx-4]        ; mm2[7] = l2, mm2[6] = l3
+	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
+	psrlq       mm2, 20h
+	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
+	
+	movq        mm1, mm0
+	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
+	movq        mm2, mm0
+	psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
+	movq        mm3, mm2
+	movq        mm4, mm1
+	pavgb       mm1, mm0                ; rounded-up average of the two outer taps
+	
+	; pavgb rounds up; removing the rounding bit here makes the second pavgb
+	; below produce exactly (x + 2*y + z + 2) >> 2
+	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
+	pand        mm4, [mmx_01bytes]	    ; set the odd bit
+	psubusb     mm1, mm4				; decrease 1 from odd bytes
+	
+	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
+	
+	movq        mm4, mm0
+	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
+	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
+	
+	psrlq       mm2, 20h
+	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
+	movq        mm4, mm3
+	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  g]
+	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
+	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
+	
+	movd        [edx], mm2              ; row 0 = a b c d
+	lea         edx, [edx+ecx]
+	movd        [edx+2*ecx], mm3        ; row 3 = i j g h
+	sub         edx, ecx
+	psrlq       mm3, 10h
+	movd        [edx+2*ecx], mm3        ; row 2 = g h e f
+	psrlq       mm3, 10h
+	movd        [edx+ecx], mm3          ; row 1 = e f a b
+	WELSEMMS
+	ret
+	
+	
+	
+ALIGN 16
+;*******************************************************************************
+;	lt|t0|t1|t2|t3|
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	lt and t0..t3 are never used; only l0..l3 feed this mode
+;   destination:
+;	|a |b |c |d |
+;	|c |d |e |f |
+;	|e |f |g |g |
+;	|g |g |g |g |
+
+;   a = (1 + l0 + l1)>>1
+;   c = (1 + l1 + l2)>>1
+;   e = (1 + l2 + l3)>>1
+;   g = l3
+
+;   b = (2 + l0 + (l1<<1) + l2)>>2
+;   d = (2 + l1 + (l2<<1) + l3)>>2
+;   f = (2 + l2 + (l3<<1) + l3)>>2
+ 
+;   [g g f e d c b a] + [g g g g] --> mov to memory
+;   
+;   void_t WelsI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsI4x4LumaPredHU_mmx
+WelsI4x4LumaPredHU_mmx:	
+; 4x4 prediction built from the left column l0..l3 only. The column is packed
+; into the high bytes of one register (with l3 repeated), 2-tap and 3-tap
+; filtered values are produced with byte averages, and the four rows are
+; written with 32-bit stores.
+; cdecl stack arguments: [esp+4] = pPred, [esp+8] = kiStride.
+; Register diagrams in the comments list bytes from most to least significant.
+	mov			edx, [esp+4]			; pPred
+	mov         eax, edx
+	mov			ecx, [esp+8]            ; kiStride
+	
+	movd        mm0, [eax-4]            ; mm0[3] = l0
+	punpcklbw   mm0, [eax+ecx-4]        ; mm0[7] = l1, mm0[6] = l0
+	lea         eax, [eax+2*ecx]
+	movd        mm2, [eax-4]            ; mm2[3] = l2
+	movd        mm4, [eax+ecx-4]        ; mm4[3] = l3
+	punpcklbw   mm2, mm4
+	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
+	
+	psrlq       mm4, 18h
+	psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
+	psrlq       mm0, 8h
+	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
+	
+	movq        mm1, mm0
+	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
+	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
+	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
+	
+	movq        mm2, mm0
+	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
+	movq        mm5, mm2
+	pavgb       mm2, mm0                ; rounded-up average of the two outer taps
+	
+	; pavgb rounds up; removing the rounding bit here makes the second pavgb
+	; below produce exactly (x + 2*y + z + 2) >> 2
+	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
+	pand        mm5, [mmx_01bytes]	    ; set the odd bit
+	psubusb     mm2, mm5				; decrease 1 from odd bytes
+	
+	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
+	
+	psrlq       mm2, 8h
+	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
+	
+	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
+	punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
+	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
+	
+	psrlq       mm4, 20h
+	lea         edx, [edx+ecx]
+	movd        [edx+2*ecx], mm4        ; row 3 = g g g g
+	
+	sub         edx, ecx
+	movd        [edx], mm1              ; row 0 = a b c d
+	psrlq       mm1, 10h
+	movd        [edx+ecx], mm1          ; row 1 = c d e f
+	psrlq       mm1, 10h
+	movd        [edx+2*ecx], mm1        ; row 2 = e f g g
+	WELSEMMS
+	ret
+	
+	
+	
+ALIGN 16
+;*******************************************************************************
+;	lt|t0|t1|t2|t3|
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	l3 is never used
+;   destination:
+;	|a |b |c |d |
+;	|e |f |g |h |
+;	|i |a |b |c |
+;	|j |e |f |g |
+
+;   a = (1 + lt + t0)>>1
+;   b = (1 + t0 + t1)>>1
+;   c = (1 + t1 + t2)>>1
+;   d = (1 + t2 + t3)>>1
+
+;   e = (2 + l0 + (lt<<1) + t0)>>2
+;   f = (2 + lt + (t0<<1) + t1)>>2
+;   g = (2 + t0 + (t1<<1) + t2)>>2
+
+;   h = (2 + t1 + (t2<<1) + t3)>>2
+;   i = (2 + lt + (l0<<1) + l1)>>2
+;   j = (2 + l0 + (l1<<1) + l2)>>2   
+;   
+;   void_t WelsI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsI4x4LumaPredVR_mmx
+WelsI4x4LumaPredVR_mmx:	
+; 4x4 prediction built from the top row (lt, t0..t3) and the left pixels
+; l0..l2. All neighbours are packed into one 64-bit register, 2-tap and 3-tap
+; filtered values are produced with byte averages, and the four rows are
+; written with 32-bit stores.
+; cdecl stack arguments: [esp+4] = pPred, [esp+8] = kiStride.
+; Register diagrams in the comments list bytes from most to least significant.
+	mov			edx, [esp+4]			; pPred
+	mov         eax, edx
+	mov			ecx, [esp+8]            ; kiStride
+	sub         eax, ecx                ; eax -> row above the block
+	movq        mm0, [eax-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
+	
+	movd        mm1, [eax+2*ecx-4]        
+	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1	
+	lea         eax, [eax+2*ecx]
+	movq        mm2, [eax+ecx-8]        ; mm2[7] = l2
+	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
+	psrlq       mm2, 28h
+	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
+	
+	movq        mm1, mm0
+	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
+	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
+	
+	movq        mm2, mm0
+	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
+	movq        mm3, mm2
+	pavgb       mm2, mm0                ; rounded-up average of the two outer taps
+	
+	; pavgb rounds up; removing the rounding bit here makes the second pavgb
+	; below produce exactly (x + 2*y + z + 2) >> 2
+	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
+	pand        mm3, [mmx_01bytes]	    ; set the odd bit
+	psubusb     mm2, mm3				; decrease 1 from odd bytes
+	
+	movq        mm3, mm0
+	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
+	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
+	movq        mm2, mm3
+	
+	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
+	movd        [edx], mm1              ; row 0 = a b c d
+	
+	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
+	movd        [edx+ecx], mm2          ; row 1 = e f g h
+	
+	movq        mm4, mm3
+	psllq       mm4, 20h
+	psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
+	
+	movq        mm5, mm3
+	psllq       mm5, 28h
+	psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
+	
+	psllq       mm1, 8h
+	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
+	movd        [edx+2*ecx], mm4        ; row 2 = i a b c
+	
+	psllq       mm2, 8h
+	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
+	lea         edx, [edx+2*ecx]
+	movd        [edx+ecx], mm5          ; row 3 = j e f g
+	WELSEMMS
+	ret
+	
+ALIGN 16
+;*******************************************************************************
+;	lt|t0|t1|t2|t3|t4|t5|t6|t7
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	lt and l0..l3 are never used; only t0..t7 feed this mode
+;   destination:
+;	|a |b |c |d |
+;	|b |c |d |e |
+;	|c |d |e |f |
+;	|d |e |f |g |
+
+;   a = (2 + t0 + t2 + (t1<<1))>>2
+;   b = (2 + t1 + t3 + (t2<<1))>>2
+;   c = (2 + t2 + t4 + (t3<<1))>>2
+;   d = (2 + t3 + t5 + (t4<<1))>>2
+
+;   e = (2 + t4 + t6 + (t5<<1))>>2
+;   f = (2 + t5 + t7 + (t6<<1))>>2
+;   g = (2 + t6 + t7 + (t7<<1))>>2
+ 
+;   [g f e d c b a] --> mov to memory
+;   
+;   void_t WelsI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsI4x4LumaPredDDL_mmx
+WelsI4x4LumaPredDDL_mmx:	
+; 4x4 prediction built from the eight pixels t0..t7 of the row above pPred.
+; A 3-tap filter (x + 2*y + z + 2) >> 2 is applied along that row, with t7
+; repeated at the end, and each output row is the filtered vector shifted by
+; one more pixel.
+; cdecl stack arguments: [esp+4] = pPred, [esp+8] = kiStride.
+; Register diagrams in the comments list bytes from most to least significant.
+	mov			edx, [esp+4]			; pPred
+	mov         eax, edx
+	mov			ecx, [esp+8]            ; kiStride
+	sub         eax, ecx                ; eax -> row above the block
+	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+	movq        mm1, mm0
+	movq        mm2, mm0
+	
+	movq        mm3, mm0
+	psrlq       mm3, 38h
+	psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
+	
+	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
+	psrlq       mm2, 8h
+	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
+
+	movq        mm3, mm1
+	pavgb       mm1, mm2                ; rounded-up average of the two outer taps
+	; pavgb rounds up; removing the rounding bit here makes the second pavgb
+	; below produce exactly (x + 2*y + z + 2) >> 2
+	pxor        mm3, mm2				; find odd value in the lowest bit of each byte
+	pand        mm3, [mmx_01bytes]	    ; set the odd bit
+	psubusb     mm1, mm3				; decrease 1 from odd bytes
+	
+	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
+	
+	psrlq       mm0, 8h
+	movd        [edx], mm0              ; row 0 = a b c d
+	psrlq       mm0, 8h
+	movd        [edx+ecx], mm0          ; row 1 = b c d e
+	psrlq       mm0, 8h
+	movd        [edx+2*ecx], mm0        ; row 2 = c d e f
+	psrlq       mm0, 8h
+	lea         edx, [edx+2*ecx]
+	movd        [edx+ecx], mm0          ; row 3 = d e f g
+	WELSEMMS
+	ret
+	
+	
+ALIGN 16
+;*******************************************************************************
+;	lt|t0|t1|t2|t3|t4|t5|t6|t7
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	lt, l0..l3 and t7 are never used; only t0..t6 feed this mode
+;   destination:
+;	|a |b |c |d |
+;	|e |f |g |h |
+;	|b |c |d |i |
+;	|f |g |h |j |
+
+;   a = (1 + t0 + t1)>>1
+;   b = (1 + t1 + t2)>>1
+;   c = (1 + t2 + t3)>>1
+;   d = (1 + t3 + t4)>>1
+;   i = (1 + t4 + t5)>>1
+
+;   e = (2 + t0 + (t1<<1) + t2)>>2
+;   f = (2 + t1 + (t2<<1) + t3)>>2
+;   g = (2 + t2 + (t3<<1) + t4)>>2
+;   h = (2 + t3 + (t4<<1) + t5)>>2
+;   j = (2 + t4 + (t5<<1) + t6)>>2
+ 
+;   [i d c b a] + [j h g f e] --> mov to memory
+;   
+;   void_t WelsI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsI4x4LumaPredVL_mmx
+WelsI4x4LumaPredVL_mmx:	
+; 4x4 prediction built from the pixels t0..t6 of the row above pPred.
+; Rows 0 and 2 come from the 2-tap averages, rows 1 and 3 from the 3-tap
+; filtered values; rows 2 and 3 are the same vectors shifted by one pixel.
+; cdecl stack arguments: [esp+4] = pPred, [esp+8] = kiStride.
+; Register diagrams in the comments list bytes from most to least significant.
+	mov			edx, [esp+4]			; pPred
+	mov         eax, edx
+	mov			ecx, [esp+8]            ; kiStride
+	
+	sub         eax, ecx                ; eax -> row above the block
+	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+	movq        mm1, mm0
+	movq        mm2, mm0
+	
+	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
+	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
+
+	movq        mm3, mm1
+	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
+	
+	movq        mm4, mm2
+	pavgb       mm2, mm0	            ; rounded-up average of the two outer taps
+	; pavgb rounds up; removing the rounding bit here makes the second pavgb
+	; below produce exactly (x + 2*y + z + 2) >> 2
+	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
+	pand        mm4, [mmx_01bytes]	    ; set the odd bit
+	psubusb     mm2, mm4				; decrease 1 from odd bytes
+	
+	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
+	
+	movd        [edx], mm3              ; row 0 = a b c d
+	psrlq       mm3, 8h
+	movd        [edx+2*ecx], mm3        ; row 2 = b c d i
+	
+	movd        [edx+ecx], mm2          ; row 1 = e f g h
+	psrlq       mm2, 8h
+	lea         edx, [edx+2*ecx]
+	movd        [edx+ecx], mm2          ; row 3 = f g h j
+	WELSEMMS
+	ret
+	
+ALIGN 16
+;*******************************************************************************
+;
+;   void_t WelsIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsIChromaPredDc_sse2
+WelsIChromaPredDc_sse2:	
+; DC prediction of an 8x8 block, computed separately for its four 4x4
+; quadrants from the 8 pixels above and the 8 pixels to the left:
+;   top-left     = (top[0..3] + left[0..3] + 4) >> 3
+;   top-right    = (top[4..7] + 2) >> 2
+;   bottom-left  = (left[4..7] + 2) >> 2
+;   bottom-right = (top[4..7] + left[4..7] + 4) >> 3
+; cdecl stack arguments, offsets valid after the push below:
+;   [esp+8] = pPred, [esp+12] = kiStride.
+	push        ebx
+	mov         eax, [esp+8]			; pPred
+	mov			ecx, [esp+12]           ; kiStride
+	
+	sub         eax, ecx                ; eax -> row above the block
+	movq        mm0, [eax]              ; the 8 top-neighbour pixels
+
+	; sum the left neighbours of rows 0..3 in ebx
+	movzx		ebx, byte [eax+ecx-0x01] ; l1
+	lea         eax, [eax+2*ecx]
+	movzx		edx, byte [eax-0x01]     ; l2
+	add			ebx, edx
+	movzx		edx, byte [eax+ecx-0x01] ; l3
+	add			ebx, edx
+	lea         eax, [eax+2*ecx]
+	movzx		edx, byte [eax-0x01]     ; l4
+	add			ebx, edx
+	movd        mm1, ebx                 ; mm1 = l1+l2+l3+l4
+	
+	; sum the left neighbours of rows 4..7 in ebx
+	movzx		ebx, byte [eax+ecx-0x01] ; l5
+	lea         eax, [eax+2*ecx]
+	movzx		edx, byte [eax-0x01]     ; l6
+	add			ebx, edx
+	movzx		edx, byte [eax+ecx-0x01] ; l7
+	add			ebx, edx
+	lea         eax, [eax+2*ecx]
+	movzx		edx, byte [eax-0x01]     ; l8
+	add			ebx, edx
+	movd        mm2, ebx                 ; mm2 = l5+l6+l7+l8
+	
+	; split the top row into its two halves and sum each with psadbw against zero
+	movq        mm3, mm0
+	psrlq       mm0, 0x20                ; mm0 = top[4..7]
+	psllq       mm3, 0x20
+	psrlq       mm3, 0x20                ; mm3 = top[0..3]
+	pxor		mm4, mm4
+	psadbw		mm0, mm4
+	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2	
+	
+	paddq       mm3, mm1
+	movq        mm1, mm2
+	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
+	
+	movq        mm4, [mmx_0x02]          ; rounding term; presumably the value 2 -- TODO confirm against the constant's definition
+	
+	paddq       mm0, mm4
+	psrlq       mm0, 0x02                ; top-right DC
+	
+	paddq       mm2, mm4
+	psrlq       mm2, 0x02                ; bottom-left DC
+	
+	paddq       mm3, mm4
+	paddq       mm3, mm4
+	psrlq       mm3, 0x03                ; top-left DC (rounding term added twice)
+	
+	paddq       mm1, mm4
+	paddq       mm1, mm4
+	psrlq       mm1, 0x03                ; bottom-right DC (rounding term added twice)
+	
+	; replicate each DC byte across 4 bytes and pair the quadrants into 8-byte rows;
+	; presumably mmx_01bytes is 0x01 in every byte -- TODO confirm
+	pmuludq     mm0, [mmx_01bytes]
+	pmuludq     mm3, [mmx_01bytes]
+	psllq       mm0, 0x20
+	pxor        mm0, mm3                 ; mm0 = m_up
+	
+	pmuludq     mm2, [mmx_01bytes]
+	pmuludq     mm1, [mmx_01bytes]
+	psllq       mm1, 0x20
+	pxor        mm1, mm2                 ; mm1 = m_down
+	
+	mov         edx, [esp+8]			 ; pPred
+	
+	; rows 0..3 take m_up
+	movq        [edx],       mm0
+	movq        [edx+ecx],   mm0
+	movq        [edx+2*ecx], mm0
+	lea         edx, [edx+2*ecx]
+	movq        [edx+ecx],   mm0
+	
+	; rows 4..7 take m_down
+	movq        [edx+2*ecx], mm1
+	lea         edx, [edx+2*ecx]
+	movq        [edx+ecx],   mm1
+	movq        [edx+2*ecx], mm1
+	lea         edx, [edx+2*ecx]
+	movq        [edx+ecx],   mm1
+	
+	pop         ebx
+	WELSEMMS
+	ret
+	
+	
+	
+ALIGN 16
+;*******************************************************************************
+;
+;   void_t WelsI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsI16x16LumaPredDc_sse2
+WelsI16x16LumaPredDc_sse2:	
+; DC prediction of a 16x16 block: every pixel is set to
+; (sum of the 16 top neighbours + sum of the left neighbours + 16) >> 5.
+; cdecl stack arguments, offsets valid after the push below:
+;   [esp+8] = pPred, [esp+12] = kiStride.
+; All row accesses use movdqa, so pPred and kiStride must keep every row
+; (including the one above the block) 16-byte aligned.
+	push        ebx
+	mov         eax, [esp+8]			; pPred
+	mov			ecx, [esp+12]           ; kiStride
+	
+	sub         eax, ecx                ; eax -> row above the block
+	movdqa      xmm0, [eax]             ; read one row
+	pxor		xmm1, xmm1
+	psadbw		xmm0, xmm1              ; two partial sums, one per 8-byte half
+	movdqa      xmm1, xmm0
+	psrldq      xmm1, 0x08
+	pslldq      xmm0, 0x08
+	psrldq      xmm0, 0x08
+	paddw       xmm0, xmm1              ; low word = sum of all 16 top pixels
+	
+	movzx		ebx, byte [eax+ecx-0x01]    ; left neighbour of row 0
+	movzx		edx, byte [eax+2*ecx-0x01]  ; left neighbour of row 1
+	add			ebx, edx
+	lea         eax, [eax+ecx]
+	; NOTE(review): LOAD_2_LEFT_AND_ADD is defined outside this excerpt;
+	; presumably each use advances eax by two rows and adds two more left
+	; neighbours to ebx, covering rows 2..15 -- confirm against the macro
+	LOAD_2_LEFT_AND_ADD
+	LOAD_2_LEFT_AND_ADD
+	LOAD_2_LEFT_AND_ADD
+	LOAD_2_LEFT_AND_ADD
+	LOAD_2_LEFT_AND_ADD
+	LOAD_2_LEFT_AND_ADD
+	LOAD_2_LEFT_AND_ADD
+	add         ebx, 0x10               ; rounding term
+	movd        xmm1, ebx
+	paddw       xmm0, xmm1
+	psrld       xmm0, 0x05              ; divide by 32
+	pmuludq     xmm0, [mmx_01bytes]     ; replicate the DC byte; presumably 0x01 in every byte, and the 128-bit operand needs a 16-byte aligned constant -- TODO confirm
+	pshufd      xmm0, xmm0, 0           ; broadcast dword 0 to the whole register
+	
+	mov         edx, [esp+8]			; pPred
+	
+	; store the 16 rows, advancing edx two rows at a time
+	movdqa      [edx],       xmm0
+	movdqa      [edx+ecx],   xmm0
+	movdqa      [edx+2*ecx], xmm0
+	lea         edx,         [edx+2*ecx]
+	
+	movdqa      [edx+ecx],   xmm0
+	movdqa      [edx+2*ecx], xmm0
+	lea         edx,         [edx+2*ecx]
+	
+	movdqa      [edx+ecx],   xmm0
+	movdqa      [edx+2*ecx], xmm0
+	lea         edx,         [edx+2*ecx]
+	
+	movdqa      [edx+ecx],   xmm0
+	movdqa      [edx+2*ecx], xmm0
+	lea         edx,         [edx+2*ecx]
+	
+	movdqa      [edx+ecx],   xmm0
+	movdqa      [edx+2*ecx], xmm0
+	lea         edx,         [edx+2*ecx]
+	
+	movdqa      [edx+ecx],   xmm0
+	movdqa      [edx+2*ecx], xmm0
+	lea         edx,         [edx+2*ecx]
+	
+	movdqa      [edx+ecx],   xmm0
+	movdqa      [edx+2*ecx], xmm0
+	lea         edx,         [edx+2*ecx]
+	
+	movdqa      [edx+ecx],   xmm0
+
+	pop         ebx
+
+	ret
+	
+;*******************************************************************************
+; for intra prediction as follows, 11/19/2010
+;*******************************************************************************
+
+ALIGN 16
+;*******************************************************************************
+;	void_t WelsI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsI16x16LumaPredDcTop_sse2
+WelsI16x16LumaPredDcTop_sse2:
+; DC prediction of a 16x16 block from the top neighbours only: every pixel is
+; set to (sum of the 16 pixels above the block + 8) >> 4.
+; All row accesses use movdqa, so every row (including the one above the
+; block) must be 16-byte aligned.
+	push ebx
+	
+	%define PUSH_SIZE 4
+	
+	mov eax, [esp+PUSH_SIZE+4]	; pPred
+	mov ebx, [esp+PUSH_SIZE+8]	; kiStride
+	
+	mov ecx, ebx
+	neg ecx
+	movdqa xmm0, [eax+ecx]		; pPred-kiStride, top line
+	pxor xmm7, xmm7
+	movdqa xmm1, xmm0
+	punpcklbw xmm0, xmm7		; low 8 pixels widened to words
+	punpckhbw xmm1, xmm7		; high 8 pixels widened to words
+
+	paddw xmm0, xmm1			; (ub.max(ff) << 4) will not exceed the range of uw, so the sum can be done in unsigned words
+	pshufd xmm1, xmm0, 04eh		; 01001110, w3w2w1w0,w7w6w5w4
+	paddw xmm0, xmm1			; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
+	pshufd xmm1, xmm0, 0b1h		; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
+	paddw xmm0, xmm1			; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
+	pshuflw xmm1, xmm0, 0b1h	; 10110001
+	paddw xmm0, xmm1			; sum in word unit (x8)	
+	movd edx, xmm0
+	and edx, 0ffffh				; keep the total from word 0
+	
+	add edx, 08h				; rounding term
+	sar edx, 04h				; divide by 16
+	mov dh, dl					; replicate the DC byte into all 4 bytes of edx
+	mov ecx, edx
+	shl ecx, 010h
+	or edx, ecx
+	movd xmm1, edx	
+	pshufd xmm0, xmm1, 00h		; broadcast to all 16 bytes
+	movdqa xmm1, xmm0
+	
+	lea ecx, [2*ebx+ebx]		; 3*kiStride
+	
+	; rows 0..3
+	movdqa [eax], xmm0
+	movdqa [eax+ebx], xmm1
+	movdqa [eax+2*ebx], xmm0
+	movdqa [eax+ecx], xmm1
+	
+	; rows 4..7
+	lea eax, [eax+4*ebx]
+	movdqa [eax], xmm0
+	movdqa [eax+ebx], xmm1
+	movdqa [eax+2*ebx], xmm0
+	movdqa [eax+ecx], xmm1
+	
+	; rows 8..11
+	lea eax, [eax+4*ebx]
+	movdqa [eax], xmm0
+	movdqa [eax+ebx], xmm1
+	movdqa [eax+2*ebx], xmm0
+	movdqa [eax+ecx], xmm1
+	
+	; rows 12..15
+	lea eax, [eax+4*ebx]
+	movdqa [eax], xmm0
+	movdqa [eax+ebx], xmm1
+	movdqa [eax+2*ebx], xmm0
+	movdqa [eax+ecx], xmm1
+	
+	%undef PUSH_SIZE
+	pop ebx
+	ret
+
+ALIGN 16
+;*******************************************************************************
+;	void_t WelsI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsI16x16LumaPredDcNA_sse2
+WelsI16x16LumaPredDcNA_sse2:
+; DC prediction of a 16x16 block when no neighbours are available: all 16
+; rows are filled with the 16-byte constant sse2_dc_0x80 (presumably 0x80 in
+; every byte -- TODO confirm against its definition).
+; cdecl stack arguments: [esp+4] = pPred, [esp+8] = kiStride.
+; Rows are written with movdqa, so each row must be 16-byte aligned.
+	mov eax, [esp+4]			; pPred
+	mov ecx, [esp+8]			; kiStride
+	lea edx, [ecx+2*ecx]		; 3*kiStride
+	movdqa xmm0, [sse2_dc_0x80]
+
+	; rows 0..11: three groups of four rows, stepping eax after each group
+	%rep 3
+	movdqa [eax], xmm0
+	movdqa [eax+ecx], xmm0
+	movdqa [eax+2*ecx], xmm0
+	movdqa [eax+edx], xmm0
+	lea eax, [eax+4*ecx]
+	%endrep
+
+	; rows 12..15
+	movdqa [eax], xmm0
+	movdqa [eax+ecx], xmm0
+	movdqa [eax+2*ecx], xmm0
+	movdqa [eax+edx], xmm0
+
+	ret
+	
+ALIGN 16
+;*******************************************************************************
+;	void_t WelsIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsIChromaPredDcLeft_mmx
+WelsIChromaPredDcLeft_mmx:
+; DC prediction of an 8x8 block from the left neighbours only:
+;   rows 0..3 are filled with (left[0..3] + 2) >> 2,
+;   rows 4..7 are filled with (left[4..7] + 2) >> 2.
+	push ebx
+	push esi	
+	%define PUSH_SIZE 8
+	mov esi, [esp+PUSH_SIZE+4]	; pPred
+	mov ecx, [esp+PUSH_SIZE+8]	; kiStride
+	mov eax, esi
+	; for left	
+	dec eax						; eax -> left neighbour of row 0
+	xor ebx, ebx
+	xor edx, edx				; upper bytes of edx stay zero for the byte loads below
+	mov bl, [eax]
+	mov dl, [eax+ecx]
+	add ebx, edx
+	lea eax, [eax+2*ecx]
+	mov dl, [eax]
+	add ebx, edx	
+	mov dl, [eax+ecx]
+	add ebx, edx				; ebx = left[0]+left[1]+left[2]+left[3]
+	add ebx, 02h
+	sar ebx, 02h
+	mov bh, bl					; duplicate the DC byte into both bytes of the low word
+	movd mm1, ebx
+	pshufw mm0, mm1, 00h	; up64
+	movq mm1, mm0
+	xor ebx, ebx
+	lea eax, [eax+2*ecx]		; eax -> left neighbour of row 4
+	mov bl, [eax]
+	mov dl, [eax+ecx]
+	add ebx, edx
+	lea eax, [eax+2*ecx]
+	mov dl, [eax]
+	add ebx, edx
+	mov dl, [eax+ecx]
+	add ebx, edx				; ebx = left[4]+left[5]+left[6]+left[7]
+	add ebx, 02h
+	sar ebx, 02h
+	mov bh, bl
+	movd mm3, ebx
+	pshufw mm2, mm3, 00h	; down64
+	movq mm3, mm2
+	lea ebx, [2*ecx+ecx]		; 3*kiStride
+	; rows 0..3
+	movq [esi], mm0
+	movq [esi+ecx], mm1
+	movq [esi+2*ecx], mm0
+	movq [esi+ebx], mm1
+	; rows 4..7
+	lea esi, [esi+4*ecx]
+	movq [esi], mm2
+	movq [esi+ecx], mm3
+	movq [esi+2*ecx], mm2
+	movq [esi+ebx], mm3
+	pop esi
+	pop ebx
+	emms
+	ret
+
+ALIGN 16
+;*******************************************************************************
+;	void_t WelsIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsIChromaPredDcTop_sse2
+WelsIChromaPredDcTop_sse2:
+; DC prediction of an 8x8 block from the top neighbours only: in every row the
+; left 4 pixels are set to (top[0..3] + 2) >> 2 and the right 4 pixels to
+; (top[4..7] + 2) >> 2.
+	push ebx
+	%define PUSH_SIZE 4
+	mov eax, [esp+PUSH_SIZE+4]	; pPred
+	mov ecx, [esp+PUSH_SIZE+8]	; kiStride
+	mov ebx, ecx
+	neg ebx
+	movq xmm0, [eax+ebx]		; top: 8x1 pixels
+	pxor xmm7, xmm7
+	punpcklbw xmm0, xmm7		; ext 8x2 words
+	pshufd xmm1, xmm0, 0B1h		; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2
+	paddw xmm0, xmm1			; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2
+	movdqa xmm1, xmm0
+	pshuflw xmm2, xmm0, 0B1h	; 10110001 B, .. w0+2 w1+3 w0+2 w1+3
+	pshufhw xmm3, xmm1, 0B1h	; 10110001 B, w4+6 w5+7 w4+6 w5+7 ..
+	paddw xmm0, xmm2			; .. w0+..+3 w0+..+3 w0+..+3 w0+..+3
+	paddw xmm1, xmm3			; w4+..+7 w4+..+7 w4+..+7 w4+..+7 ..
+	punpckhqdq xmm1, xmm7		; move the right-half sums down to the low quadword
+	punpcklqdq xmm0, xmm1		; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0
+	movdqa xmm6, [sse2_wd_0x02]	; rounding term; presumably the word value 2 in every lane -- TODO confirm
+	paddw xmm0, xmm6
+	psraw xmm0, 02h
+	packuswb xmm0, xmm7			; back to bytes: low 8 bytes hold one predicted row
+	lea ebx, [2*ecx+ecx]		; ebx is reused as 3*kiStride
+	; rows 0..3
+	movq [eax], xmm0
+	movq [eax+ecx], xmm0
+	movq [eax+2*ecx], xmm0
+	movq [eax+ebx], xmm0
+	; rows 4..7
+	lea eax, [eax+4*ecx]
+	movq [eax], xmm0
+	movq [eax+ecx], xmm0
+	movq [eax+2*ecx], xmm0
+	movq [eax+ebx], xmm0
+	%undef PUSH_SIZE
+	pop ebx	
+	ret
+
+	
+ALIGN 16
+;*******************************************************************************
+;	void_t WelsIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
+;*******************************************************************************
+WELS_EXTERN WelsIChromaPredDcNA_mmx
+WelsIChromaPredDcNA_mmx:
+; DC prediction of an 8x8 block when no neighbours are available: all 8 rows
+; are filled with the first 8 bytes of sse2_dc_0x80 (presumably 0x80 in every
+; byte -- TODO confirm against its definition).
+; cdecl stack arguments: [esp+4] = pPred, [esp+8] = kiStride.
+	mov eax, [esp+4]			; pPred
+	mov ecx, [esp+8]			; kiStride
+	lea edx, [ecx+2*ecx]		; 3*kiStride
+	movq mm0, [sse2_dc_0x80]
+
+	; rows 0..3
+	movq [eax], mm0
+	movq [eax+ecx], mm0
+	movq [eax+2*ecx], mm0
+	movq [eax+edx], mm0
+	; rows 4..7
+	lea eax, [eax+4*ecx]
+	movq [eax], mm0
+	movq [eax+ecx], mm0
+	movq [eax+2*ecx], mm0
+	movq [eax+edx], mm0
+
+	emms
+	ret
+
+
+	
--- /dev/null
+++ b/codec/decoder/core/asm/mb_copy.asm
@@ -1,0 +1,330 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mb_copy.asm
+;*
+;*  Abstract
+;*      mb_copy and mb_copy1
+;*
+;*  History
+;*      15/09/2009 Created
+;*		12/28/2009 Modified with larger throughput
+;*		12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2, 
+;*				   WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
+;*
+;*
+;*********************************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+;*******************************************************************************
+; Local Data (Read Only)
+;*******************************************************************************
+
+;SECTION .rodata data align=16
+
+;*******************************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;*******************************************************************************
+
+ALIGN 16
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+; Symbols defined in this file. WELS_EXTERN is not defined in this file;
+; presumably it comes from asm_inc.asm and makes each label globally visible
+; -- confirm against its definition.
+WELS_EXTERN PixelAvgWidthEq4_mmx
+WELS_EXTERN PixelAvgWidthEq8_mmx
+WELS_EXTERN PixelAvgWidthEq16_sse2
+
+WELS_EXTERN McCopyWidthEq4_mmx
+WELS_EXTERN McCopyWidthEq8_mmx
+WELS_EXTERN McCopyWidthEq16_sse2
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t PixelAvgWidthEq4_mmx( uint8_t *pDst,  int iDstStride,
+;                           uint8_t *pSrcA, int iSrcAStride,
+;                           uint8_t *pSrcB, int iSrcBStride,
+;                           int iHeight );
+;*******************************************************************************
+PixelAvgWidthEq4_mmx:
+; For each of iHeight rows, writes to pDst the byte-wise rounded-up average
+; (a + b + 1) >> 1 of 4 pixels from pSrcA and 4 pixels from pSrcB.
+; iHeight must be at least 1: the loop body runs before the counter is tested.
+; cdecl stack arguments, offsets valid after the four pushes below.
+
+    push        esi
+    push        edi
+    push        ebp
+    push        ebx
+
+    mov         edi, [esp+20]       ; pDst
+    mov         eax, [esp+24]       ; iDstStride
+    mov         esi, [esp+28]       ; pSrcA
+    mov         ecx, [esp+32]       ; iSrcAStride
+    mov         ebp, [esp+36]       ; pSrcB
+    mov         edx, [esp+40]       ; iSrcBStride
+    mov         ebx, [esp+44]       ; iHeight
+ALIGN 4
+.height_loop:
+    movd        mm0, [ebp]          ; 4 pixels of pSrcB
+    ; Load pSrcA with an explicit 32-bit move: a memory operand to pavgb on an
+    ; MMX register is 64 bits wide and would read 4 bytes beyond the row.
+    movd        mm1, [esi]          ; 4 pixels of pSrcA
+    pavgb       mm0, mm1
+    movd        [edi], mm0
+
+    dec         ebx                 ; sets ZF for the jne below; lea leaves flags untouched
+    lea         edi, [edi+eax]
+    lea         esi, [esi+ecx]
+    lea         ebp, [ebp+edx]
+    jne         .height_loop
+
+	WELSEMMS
+    pop         ebx
+    pop         ebp
+    pop         edi
+    pop         esi
+    ret
+                          
+ALIGN 16
+;*******************************************************************************
+; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst,  int iDstStride,
+;                           uint8_t *pSrcA, int iSrcAStride,
+;                           uint8_t *pSrcB, int iSrcBStride,
+;                           int iHeight );
+;*******************************************************************************
+PixelAvgWidthEq8_mmx:
+; For each of iHeight rows, writes to pDst the byte-wise rounded-up average of
+; 8 pixels from pSrcA and 8 pixels from pSrcB. Two rows are handled per
+; iteration, so iHeight must be a positive multiple of 2.
+; cdecl stack arguments, offsets valid after the four pushes below.
+
+    push        esi
+    push        edi
+    push        ebp
+    push        ebx
+
+    mov         ebx, [esp+44]       ; iHeight (rows still to do)
+    mov         edx, [esp+40]       ; iSrcBStride
+    mov         ebp, [esp+36]       ; pSrcB
+    mov         ecx, [esp+32]       ; iSrcAStride
+    mov         esi, [esp+28]       ; pSrcA
+    mov         eax, [esp+24]       ; iDstStride
+    mov         edi, [esp+20]       ; pDst
+ALIGN 4
+.row_pair:
+    ; first row of the pair
+    movq        mm0, [esi]
+    pavgb       mm0, [ebp]
+    movq        [edi], mm0
+    ; second row of the pair
+    movq        mm1, [esi+ecx]
+    pavgb       mm1, [ebp+edx]
+    movq        [edi+eax], mm1
+
+    ; advance all three pointers by two rows
+    lea         edi, [edi+2*eax]
+    lea         esi, [esi+2*ecx]
+    lea         ebp, [ebp+2*edx]
+
+    sub         ebx, 2
+    jnz         .row_pair
+
+    WELSEMMS
+    pop         ebx
+    pop         ebp
+    pop         edi
+    pop         esi
+    ret
+
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t PixelAvgWidthEq16_sse2( uint8_t *pDst,  int iDstStride,
+;                          uint8_t *pSrcA, int iSrcAStride,
+;                          uint8_t *pSrcB, int iSrcBStride,
+;                          int iHeight );
+;*******************************************************************************
+PixelAvgWidthEq16_sse2:
+; For each of iHeight rows, writes to pDst the byte-wise rounded-up average of
+; 16 pixels from pSrcA and 16 pixels from pSrcB. Four rows are handled per
+; iteration, so iHeight must be a positive multiple of 4.
+; None of the three pointers needs any particular alignment: both sources are
+; loaded with movdqu. (A 128-bit memory operand to pavgb would fault unless
+; pSrcB were 16-byte aligned.)
+; cdecl stack arguments, offsets valid after the four pushes below.
+    push        esi
+    push        edi
+    push        ebp
+    push        ebx
+
+    mov         edi, [esp+20]       ; pDst
+    mov         eax, [esp+24]       ; iDstStride
+    mov         esi, [esp+28]       ; pSrcA
+    mov         ecx, [esp+32]       ; iSrcAStride
+    mov         ebp, [esp+36]       ; pSrcB
+    mov         edx, [esp+40]       ; iSrcBStride
+    mov         ebx, [esp+44]       ; iHeight
+ALIGN 4
+.height_loop:
+    ; two passes of two rows each; every row is loaded, averaged and stored
+    ; before the next one is touched
+    %rep 2
+    movdqu      xmm0, [esi]
+    movdqu      xmm1, [ebp]
+    pavgb       xmm0, xmm1
+    movdqu      [edi], xmm0
+
+    movdqu      xmm0, [esi+ecx]
+    movdqu      xmm1, [ebp+edx]
+    pavgb       xmm0, xmm1
+    movdqu      [edi+eax], xmm0
+
+    lea         esi, [esi+2*ecx]
+    lea         ebp, [ebp+2*edx]
+    lea         edi, [edi+2*eax]
+    %endrep
+
+    sub         ebx, 4
+    jne         .height_loop
+
+    ; no MMX register is used above; the macro is kept so the routine leaves
+    ; the same state as before for its callers
+    WELSEMMS
+    pop         ebx
+    pop         ebp
+    pop         edi
+    pop         esi
+
+    ret
+
+
+ALIGN 16
+;*******************************************************************************
+;  void_t McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
+;                          uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+McCopyWidthEq4_mmx:
+; Copies iHeight rows of 4 bytes from pSrc to pDst, one 32-bit move per row.
+; iHeight must be at least 1: the loop body runs before the counter is tested.
+; cdecl stack arguments, offsets valid after the three pushes below.
+    push    esi
+    push    edi
+    push    ebx
+
+    mov     edx, [esp+32]       ; iHeight (rows still to do)
+    mov     ecx, [esp+28]       ; iDstStride
+    mov     edi, [esp+24]       ; pDst
+    mov     eax, [esp+20]       ; iSrcStride
+    mov     esi, [esp+16]       ; pSrc
+ALIGN 4
+.copy_row:
+    mov     ebx, [esi]
+    mov     [edi], ebx
+
+    lea     esi, [esi+eax]      ; next source row
+    lea     edi, [edi+ecx]      ; next destination row
+    dec     edx
+    jnz     .copy_row
+
+    WELSEMMS
+    pop     ebx
+    pop     edi
+    pop     esi
+    ret
+
+ALIGN 16
+;*******************************************************************************
+;   void_t McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
+;                           uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+McCopyWidthEq8_mmx:
+; Copies iHeight rows of 8 bytes from pSrc to pDst, one 64-bit MMX move per
+; row. iHeight must be at least 1: the loop body runs before the counter is
+; tested.
+; cdecl stack arguments, offsets valid after the two pushes below.
+    push    esi
+    push    edi
+
+    mov     edx, [esp+28]       ; iHeight (rows still to do)
+    mov     ecx, [esp+24]       ; iDstStride
+    mov     edi, [esp+20]       ; pDst
+    mov     eax, [esp+16]       ; iSrcStride
+    mov     esi, [esp+12]       ; pSrc
+
+ALIGN 4
+.copy_row:
+    movq    mm0, [esi]
+    movq    [edi], mm0
+    lea     esi, [esi+eax]      ; next source row
+    lea     edi, [edi+ecx]      ; next destination row
+    dec     edx
+    jnz     .copy_row
+
+    WELSEMMS
+    pop     edi
+    pop     esi
+    ret
+	
+
+
+
+
+
+
+
+ALIGN 16
+;*******************************************************************************
+;   void_t McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+;read unaligned memory
+; SSE_READ_UNA  xmm_dst, addr
+;   Loads the 16 bytes at [addr] into xmm_dst as two 8-byte halves
+;   (movq for the low half, movhps for the high half).
+%macro SSE_READ_UNA 2
+	movq	%1, [%2]
+	movhps	%1,	[%2+8]
+%endmacro
+
+;write unaligned memory
+; SSE_WRITE_UNA  addr, xmm_src
+;   Stores the 16 bytes of xmm_src to [addr] as two 8-byte halves.
+%macro SSE_WRITE_UNA 2
+	movq	[%1],	%2
+	movhps	[%1+8], %2
+%endmacro
+McCopyWidthEq16_sse2:
+; Copies iHeight rows of 16 bytes from pSrc to pDst. Two rows are handled per
+; iteration, so iHeight must be a positive multiple of 2. Each row is moved
+; in 8-byte halves through the SSE_READ_UNA / SSE_WRITE_UNA macros.
+; cdecl stack arguments, offsets valid after the two pushes below.
+    push    esi
+    push    edi
+
+    mov     ecx, [esp+28]       ; iHeight (rows still to do)
+    mov     edx, [esp+24]       ; iDstStride
+    mov     edi, [esp+20]       ; pDst
+    mov     eax, [esp+16]       ; iSrcStride
+    mov     esi, [esp+12]       ; pSrc
+
+ALIGN 4
+.copy_row_pair:
+    ; both rows are read before either is written
+    SSE_READ_UNA	xmm0, esi
+    SSE_READ_UNA	xmm1, esi+eax
+    SSE_WRITE_UNA	edi, xmm0
+    SSE_WRITE_UNA	edi+edx, xmm1
+
+    lea     esi, [esi+eax*2]    ; advance both pointers by two rows
+    lea     edi, [edi+edx*2]
+    sub     ecx, 2
+    jnz     .copy_row_pair
+
+    pop     edi
+    pop     esi
+    ret
--- /dev/null
+++ b/codec/decoder/core/asm/mc_chroma.asm
@@ -1,0 +1,317 @@
+;*!
+;* \copy
+;*     Copyright (c)  2004-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_chroma.asm
+;*
+;*  Abstract
+;*      MMX/SSE2/SSSE3 motion compensation for chroma
+;*
+;*  History
+;*      10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:		; rounding term 32 (half of 1<<6) in 8 words; added before the >>6 in the 8-wide chroma kernels
+	dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:			; same rounding term in 4 words, for the 64-bit MMX chroma kernel
+	dw 32,32,32,32
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *pSrc, 
+;							int32_t iSrcStride, 
+;							uint8_t *pDst, 
+;							int32_t iDstStride, 
+;							uint8_t *pABCD, 
+;							int32_t iHeight );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq4_mmx
+McChromaWidthEq4_mmx:	; 4-wide weighted 2x2 filter: dst[x] = (A*s[x] + B*s[x+1] + C*s'[x] + D*s'[x+1] + 32) >> 6, s' = row below
+	push esi
+	push edi
+	push ebx
+	
+	mov eax, [esp +12 + 20]	; pABCD (the +12 skips the three saved registers)
+	movd mm3, [eax]		; the four weight bytes A,B,C,D
+	WELS_Zero mm7		; external macro; presumably clears mm7, which is used as the zero operand when unpacking
+	punpcklbw mm3, mm3	; bytes: A A B B C C D D
+	movq      mm4, mm3
+	punpcklwd mm3, mm3       ; bytes: A A A A B B B B
+	punpckhwd mm4, mm4		 ; bytes: C C C C D D D D
+	
+	movq	  mm5, mm3
+	punpcklbw mm3, mm7	; mm3 = four words of A
+	punpckhbw mm5, mm7	; mm5 = four words of B
+	
+	movq	  mm6, mm4
+	punpcklbw mm4, mm7	; mm4 = four words of C
+	punpckhbw mm6, mm7	; mm6 = four words of D
+	
+	mov esi, [esp +12+ 4]   ; pSrc
+	mov eax, [esp + 12 + 8]   ; iSrcStride
+	mov edi, [esp + 12 + 12]  ; pDst
+	mov edx, [esp + 12 + 16]  ; iDstStride
+    mov ecx, [esp + 12 + 24]   ; iHeight (row counter; zero is not handled, the loop tests after the body)
+		
+	lea ebx, [esi + eax]	; ebx -> source row below the current one
+	movd mm0, [esi]		; first row, pixels x..x+3
+	movd mm1, [esi+1]	; first row, pixels x+1..x+4
+	punpcklbw mm0, mm7	; widen to words
+	punpcklbw mm1, mm7
+.xloop:				; on entry mm0/mm1 = current row at x and x+1, as words
+	
+	pmullw mm0, mm3		; A * cur[x]
+	pmullw mm1, mm5		; B * cur[x+1]
+	paddw  mm0, mm1
+	
+	movd  mm1, [ebx]
+	punpcklbw mm1, mm7
+	movq mm2, mm1		; keep unscaled next[x]; it is cur[x] of the following iteration
+	pmullw mm1, mm4		; C * next[x]
+	paddw mm0, mm1
+	
+	movd mm1, [ebx+1]
+	punpcklbw mm1, mm7
+	movq mm7, mm1		; stash unscaled next[x+1] in mm7 (mm7 is temporarily not zero)
+	pmullw mm1,mm6		; D * next[x+1]
+	paddw mm0, mm1
+	movq mm1,mm7		; mm1 = unscaled next[x+1] for the following iteration
+
+	paddw mm0, [h264_d0x20_mmx]	; + 32 rounding
+	psrlw mm0, 6
+	
+	WELS_Zero mm7		; make mm7 the zero register again
+	packuswb mm0, mm7	; words -> bytes with unsigned saturation
+	movd [edi], mm0		; write 4 output pixels
+
+	movq mm0, mm2		; the row below becomes the current row
+	
+	lea edi, [edi +edx  ]
+	lea ebx, [ebx + eax]
+
+	dec ecx
+	jnz near .xloop
+	WELSEMMS		; external macro; presumably EMMS to leave MMX state -- confirm
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc, 
+;						int32_t iSrcStride, 
+;						uint8_t *pDst, 
+;						int32_t iDstStride, 
+;						uint8_t *pABCD, 
+;						int32_t iHeight );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2
+McChromaWidthEq8_sse2:	; 8-wide weighted 2x2 filter: dst[x] = (A*s[x] + B*s[x+1] + C*s'[x] + D*s'[x+1] + 32) >> 6, s' = row below
+	push esi
+	push edi
+	push ebx
+	
+	mov eax, [esp +12 + 20]	; pABCD (the +12 skips the three saved registers)
+	movd xmm3, [eax]	; the four weight bytes A,B,C,D
+	WELS_Zero xmm7		; external macro; presumably clears xmm7, used as the zero operand when unpacking
+	punpcklbw  xmm3, xmm3	; low bytes: A A B B C C D D
+	punpcklwd  xmm3, xmm3	; bytes: A x4, B x4, C x4, D x4
+	
+	movdqa	   xmm4, xmm3
+	punpckldq  xmm3, xmm3	; bytes: A x8, B x8
+	punpckhdq  xmm4, xmm4	; bytes: C x8, D x8
+	movdqa     xmm5, xmm3
+	movdqa	   xmm6, xmm4
+	
+	punpcklbw  xmm3, xmm7	; xmm3 = eight words of A
+	punpckhbw  xmm5, xmm7	; xmm5 = eight words of B
+	punpcklbw  xmm4, xmm7	; xmm4 = eight words of C
+	punpckhbw  xmm6, xmm7	; xmm6 = eight words of D
+	
+	mov esi, [esp +12+ 4]   ; pSrc
+	mov eax, [esp + 12 + 8]   ; iSrcStride
+	mov edi, [esp + 12 + 12]  ; pDst
+	mov edx, [esp + 12 + 16]  ; iDstStride
+    mov ecx, [esp + 12 + 24]   ; iHeight (row counter; zero is not handled, the loop tests after the body)
+		
+	lea ebx, [esi + eax]	; ebx -> source row below the current one
+	movq xmm0, [esi]	; first row, pixels x..x+7
+	movq xmm1, [esi+1]	; first row, pixels x+1..x+8
+	punpcklbw xmm0, xmm7	; widen to words
+	punpcklbw xmm1, xmm7
+.xloop:				; on entry xmm0/xmm1 = current row at x and x+1, as words
+	
+	pmullw xmm0, xmm3	; A * cur[x]
+	pmullw xmm1, xmm5	; B * cur[x+1]
+	paddw  xmm0, xmm1
+	
+	movq  xmm1, [ebx]
+	punpcklbw xmm1, xmm7
+	movdqa xmm2, xmm1	; keep unscaled next[x]; it is cur[x] of the following iteration
+	pmullw xmm1, xmm4	; C * next[x]
+	paddw xmm0, xmm1
+	
+	movq xmm1, [ebx+1]
+	punpcklbw xmm1, xmm7
+	movdqa xmm7, xmm1	; stash unscaled next[x+1] in xmm7 (xmm7 is temporarily not zero)
+	pmullw xmm1, xmm6	; D * next[x+1]
+	paddw xmm0, xmm1
+	movdqa xmm1,xmm7	; xmm1 = unscaled next[x+1] for the following iteration
+
+	paddw xmm0, [h264_d0x20_sse2]	; + 32 rounding
+	psrlw xmm0, 6
+	
+	WELS_Zero xmm7		; make xmm7 the zero register again
+	packuswb xmm0, xmm7	; words -> bytes with unsigned saturation
+	movq [edi], xmm0	; write 8 output pixels
+
+	movdqa xmm0, xmm2	; the row below becomes the current row
+	
+	lea edi, [edi +edx  ]
+	lea ebx, [ebx + eax]
+
+	dec ecx
+	jnz near .xloop
+	
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
+;						 int32_t iSrcStride, 
+;                        uint8_t *pDst,  
+;                        int32_t iDstStride,
+;                        uint8_t *pABCD,
+;					     int32_t iHeight);
+;***********************************************************************
+WELS_EXTERN McChromaWidthEq8_ssse3
+McChromaWidthEq8_ssse3:	; 8-wide weighted 2x2 chroma filter built on pmaddubsw; produces two output rows per iteration
+	push ebx		; saved/restored, although ebx is not otherwise referenced in this routine
+	push esi
+	push edi
+		
+	mov eax, [esp + 12 + 20]	; pABCD (the +12 skips the three saved registers)
+
+    pxor      xmm7, xmm7	; overwritten by the rounding constant below before xmm7 is ever read
+    movd   xmm5, [eax]   	; the four weight bytes A,B,C,D
+    punpcklwd xmm5, xmm5  	; words: AB AB CD CD
+    punpckldq xmm5, xmm5 	; low qword = AB x4, high qword = CD x4
+    movdqa    xmm6, xmm5
+    punpcklqdq xmm5, xmm5	; xmm5 = byte pairs (A,B) x8: weights for the upper row
+    punpckhqdq xmm6, xmm6    	; xmm6 = byte pairs (C,D) x8: weights for the lower row
+    
+	mov eax, [esp + 12 + 4]   ; pSrc
+	mov edx, [esp + 12 + 8]   ; iSrcStride
+	mov esi, [esp + 12 + 12]  ; pDst
+	mov edi, [esp + 12 + 16]  ; iDstStride
+    mov ecx, [esp + 12 + 24]   ; iHeight
+    
+    sub esi, edi		; pre-bias pDst by -2 rows: the loop adds 2*iDstStride before its first store
+    sub esi, edi
+	movdqa xmm7, [h264_d0x20_sse2]	; xmm7 = rounding term 32 in every word
+
+	movdqu xmm0, [eax]	; unaligned 16-byte load of row 0; only bytes 0..8 influence the output
+	movdqa xmm1, xmm0
+	psrldq xmm1, 1		; same row shifted right by one pixel
+	punpcklbw xmm0, xmm1	; interleave -> byte pairs (s[x], s[x+1]) for x = 0..7
+	
+.hloop_chroma:	; on entry xmm0 = interleaved pixel pairs of the current upper row
+	lea	esi, [esi+2*edi]	; destination for this pair of output rows
+	
+	movdqu xmm2, [eax+edx]	; next source row
+	movdqa xmm3, xmm2
+	psrldq xmm3, 1
+	punpcklbw xmm2, xmm3
+	movdqa      xmm4, xmm2	; keep a copy: it is the upper row of the second output line
+	
+    pmaddubsw  xmm0, xmm5	; A*s[x] + B*s[x+1]
+    pmaddubsw  xmm2, xmm6	; C*s'[x] + D*s'[x+1]
+    paddw      xmm0, xmm2
+    paddw      xmm0, xmm7	; + 32 rounding
+	psrlw      xmm0, 6
+    packuswb   xmm0, xmm0
+    movq       [esi],xmm0	; first output row of this iteration
+    
+    lea eax, [eax+2*edx]	; advance source by two rows
+    movdqu xmm2, [eax]
+    movdqa xmm3, xmm2
+    psrldq xmm3, 1
+    punpcklbw xmm2, xmm3
+    movdqa      xmm0, xmm2	; becomes the upper row of the next iteration
+    
+    pmaddubsw  xmm4, xmm5
+    pmaddubsw  xmm2, xmm6
+    paddw      xmm4, xmm2
+    paddw      xmm4, xmm7
+	psrlw      xmm4, 6
+    packuswb   xmm4, xmm4
+    movq       [esi+edi],xmm4	; second output row of this iteration
+	
+	sub ecx, 2		; two rows done; the exact-zero test below needs an even, non-zero iHeight
+	jnz .hloop_chroma
+	pop edi
+	pop esi
+	pop ebx
+
+	ret
+
+
--- /dev/null
+++ b/codec/decoder/core/asm/mc_luma.asm
@@ -1,0 +1,615 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_luma.asm
+;*
+;*  Abstract
+;*      MMX/SSE2 motion compensation for luma
+;*
+;*  History
+;*      17/08/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+BITS 32
+
+;*******************************************************************************
+; Local Data (Read Only)
+;*******************************************************************************
+
+SECTION .rodata align=16
+
+;*******************************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;*******************************************************************************
+
+ALIGN 16
+h264_w0x10:			; rounding term 16 (half of 1<<5) in 4 words; loaded by McHorVer20WidthEq4_mmx before its >>5
+	dw 16, 16, 16, 16
+
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+WELS_EXTERN McHorVer20WidthEq4_mmx
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc, 
+;                       int iSrcStride, 
+;						uint8_t *pDst, 
+;						int iDstStride, 
+;						int iHeight)
+;*******************************************************************************
+McHorVer20WidthEq4_mmx:	; horizontal 6-tap filter (1,-5,20,20,-5,1), +16, >>5, for 4 pixels per row
+	push esi
+	push edi
+	
+	mov  esi, [esp+12]	; pSrc (two pushes + return address precede the arguments)
+	mov eax, [esp+16]	; iSrcStride
+	mov edi, [esp+20]	; pDst
+	mov ecx, [esp+24]	; iDstStride
+	mov edx, [esp+28]	; iHeight (row counter; zero is not handled, the loop tests after the body)
+	sub esi, 2		; the filter window starts two pixels left of the output position
+	WELS_Zero mm7		; external macro; presumably clears mm7, used as the zero operand when unpacking
+	movq mm6, [h264_w0x10]	; mm6 = rounding term 16 in every word
+.height_loop:
+	movd mm0, [esi]		; tap a (offset 0)
+	punpcklbw mm0, mm7
+	movd mm1, [esi+5]	; tap f (offset 5)
+	punpcklbw mm1, mm7
+	movd mm2, [esi+1]	; tap b
+	punpcklbw mm2, mm7
+	movd mm3, [esi+4]	; tap e
+	punpcklbw mm3, mm7
+	movd mm4, [esi+2]	; tap c
+	punpcklbw mm4, mm7
+	movd mm5, [esi+3]	; tap d
+	punpcklbw mm5, mm7
+	
+	paddw mm2, mm3		; b + e
+	paddw mm4, mm5		; c + d
+	psllw mm4, 2		; 4(c+d)
+	psubw mm4, mm2		; 4(c+d) - (b+e)
+	paddw mm0, mm1		; a + f
+	paddw mm0, mm4
+	psllw mm4, 2		; 16(c+d) - 4(b+e)
+	paddw mm0, mm4		; a + f - 5(b+e) + 20(c+d)
+	paddw mm0, mm6		; + 16 rounding
+	psraw mm0, 5
+	packuswb mm0, mm7	; clamp to 0..255 and narrow to bytes
+	movd [edi], mm0		; write 4 output pixels
+	
+	add esi, eax
+	add edi, ecx
+	dec edx
+	jnz .height_loop
+	
+	WELSEMMS		; external macro; presumably EMMS to leave MMX state -- confirm
+	pop edi
+	pop esi
+	ret
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+; SSE_LOAD_8P dst, zero, mem: load 8 bytes from mem and widen them to 8 words; 'zero' must hold all-zero bits
+%macro SSE_LOAD_8P 3
+	movq %1, %3
+	punpcklbw %1, %2
+%endmacro
+; FILTER_HV_W8 a,b,c,d,e,f, tmp1,tmp2, dst: 6-tap filter over six 8-word rows -> 8 bytes at dst; clobbers a, tmp1, tmp2
+%macro FILTER_HV_W8 9
+	paddw	%1, %6			; a + f
+	movdqa	%8, %3
+	movdqa	%7, %2
+	paddw	%1, [h264_w0x10_1]	; + 16 rounding
+	paddw	%8, %4			; c + d
+	paddw	%7, %5			; b + e
+	psllw	%8, 2			; 4(c+d)
+	psubw	%8, %7			; 4(c+d) - (b+e)
+	paddw	%1, %8
+	psllw	%8, 2			; 16(c+d) - 4(b+e)
+	paddw	%1, %8			; a + f - 5(b+e) + 20(c+d) + 16
+	psraw   %1, 5
+	WELS_Zero %8			; %8 is left zeroed; McHorVer02WidthEq8_sse2 passes it as the zero operand of its next SSE_LOAD_8P
+	packuswb %1, %8			; clamp to 0..255 and narrow to bytes
+	movq    %9, %1
+%endmacro
+
+;*******************************************************************************
+; Local Data (Read Only)
+;*******************************************************************************
+
+SECTION .rodata align=16
+
+;*******************************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;*******************************************************************************
+
+ALIGN 16
+h264_w0x10_1:			; rounding term 16 in 8 words; added before the >>5 in the SSE2 luma filters
+	dw 16, 16, 16, 16, 16, 16, 16, 16
+ALIGN 16
+h264_mc_hc_32:			; rounding term 32 in 8 words; added before the final >>6 in FILTER_VER
+dw 32, 32, 32, 32, 32, 32, 32, 32
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+WELS_EXTERN McHorVer22Width8HorFirst_sse2
+WELS_EXTERN McHorVer22VerLast_sse2
+WELS_EXTERN McHorVer02WidthEq8_sse2
+WELS_EXTERN McHorVer20WidthEq8_sse2
+WELS_EXTERN McHorVer20WidthEq16_sse2
+
+ALIGN 16
+;***********************************************************************
+; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc, 
+;                       int16_t iSrcStride, 
+;						uint8_t *pDst,
+;						int32_t iDstStride,
+;						int32_t iHeight
+;                       )
+;***********************************************************************
+McHorVer22Width8HorFirst_sse2:	; horizontal 6-tap pass storing unrounded 16-bit sums (8 words per row) for a later vertical pass
+	push esi
+	push edi
+	push ebx
+	mov esi, [esp+16]     ;pSrc; NOTE(review): read as bytes below although the header comment declares int16_t*
+	mov eax, [esp+20]	;iSrcStride
+	mov edi, [esp+24]		;pDst; receives 16 bytes (8 words) per row via movdqa, so it must be 16-byte aligned
+	mov edx, [esp+28]	;iDstStride (byte step between output rows; must keep edi 16-byte aligned)
+	mov ebx, [esp+32]	;iHeight (row counter; zero is not handled, the loop tests after the body)
+	pxor xmm7, xmm7		; zero operand for byte->word unpacking
+	
+	sub esi, eax				; start two rows above pSrc; original note: "need more 5 lines" (presumably iHeight includes 5 extra rows -- confirm)
+	sub esi, eax
+		
+.yloop_width_8:		; NOTE(review): no 'sub esi, 2' here -- presumably the caller already passes pSrc offset by -2 columns
+	movq xmm0, [esi]	; tap a (offset 0)
+	punpcklbw xmm0, xmm7
+	movq xmm1, [esi+5]	; tap f (offset 5)
+	punpcklbw xmm1, xmm7
+	movq xmm2, [esi+1]	; tap b
+	punpcklbw xmm2, xmm7
+	movq xmm3, [esi+4]	; tap e
+	punpcklbw xmm3, xmm7
+	movq xmm4, [esi+2]	; tap c
+	punpcklbw xmm4, xmm7
+	movq xmm5, [esi+3]	; tap d
+	punpcklbw xmm5, xmm7
+	
+	paddw xmm2, xmm3	; b + e
+	paddw xmm4, xmm5	; c + d
+	psllw xmm4, 2		; 4(c+d)
+	psubw xmm4, xmm2	; 4(c+d) - (b+e)
+	paddw xmm0, xmm1	; a + f
+	paddw xmm0, xmm4
+	psllw xmm4, 2		; 16(c+d) - 4(b+e)
+	paddw xmm0, xmm4	; a + f - 5(b+e) + 20(c+d), no rounding or shift at this stage
+	movdqa [edi], xmm0	; aligned store of the 8 intermediate words
+		
+	add esi, eax
+	add edi, edx
+	dec ebx
+	jnz .yloop_width_8
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+ALIGN 16
+;***********************************************************************
+;void_t McHorVer22VerLast_sse2(
+;											uint8_t *pSrc, 
+;											int32_t iSrcStride, 
+;											uint8_t * pDst,
+;											int32_t iDstStride,
+;											int32_t iWidth,
+;											int32_t iHeight);
+;***********************************************************************
+; FILTER_VER a,b,c,d,e,f, tmp1,tmp2, dst: staged-shift 6-tap filter over six 8-word rows -> 8 bytes at dst; clobbers a, tmp1, tmp2
+%macro FILTER_VER 9
+	paddw  %1, %6		; a + f
+	movdqa %7, %2
+	movdqa %8, %3
+	
+	
+	paddw %7, %5		; b + e
+	paddw %8, %4		; c + d
+	
+	psubw  %1, %7   	; (a+f) - (b+e)
+	psraw   %1, 2	  	; shifting in stages rather than once at the end; presumably to keep sums within 16 bits -- confirm
+	paddw  %1, %8   	; + (c+d)
+	psubw  %1, %7 		; - (b+e)
+	psraw   %1, 2	
+	paddw  %8, %1   	; + (c+d) again
+	paddw  %8, [h264_mc_hc_32]	; + 32 rounding
+	psraw   %8, 6
+	packuswb %8, %8		; clamp to 0..255 and narrow to bytes
+	movq %9, %8		; store 8 output pixels
+%endmacro
+
+McHorVer22VerLast_sse2:	; vertical 6-tap pass over rows of 16-bit words -> 8-bit pixels, one 8-column strip at a time
+	push esi
+	push edi
+	push ebx
+	push ebp		; saved/restored, although ebp is not otherwise referenced in this routine
+	
+	mov esi, [esp+20]	; pSrc (four pushes + return address precede the arguments); rows are read as 8 words with movdqa
+	mov eax, [esp+24]	; source stride, used as a byte offset between rows
+	mov edi, [esp+28]	; pDst
+	mov edx, [esp+32]	; iDstStride
+	mov ebx, [esp+36]	; iWidth
+	mov ecx, [esp+40]	; iHeight
+	shr ebx, 3		; number of 8-column strips
+	
+.width_loop:			; per strip: prime six rows; movdqa requires every row address to be 16-byte aligned
+	movdqa xmm0, [esi]	; row 0
+	movdqa xmm1, [esi+eax]	; row 1
+	lea esi, [esi+2*eax]
+	movdqa xmm2, [esi]	; row 2
+	movdqa xmm3, [esi+eax]	; row 3
+	lea esi, [esi+2*eax]
+	movdqa xmm4, [esi]	; row 4
+	movdqa xmm5, [esi+eax]	; row 5
+	
+	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]	; output row 0
+	dec ecx			; no zero test here, so iHeight must be at least 2
+	lea esi, [esi+2*eax]
+	movdqa xmm6, [esi]	; row 6
+	
+	movdqa xmm0, xmm1	; slide the window down one row so that .start sees rows 1..6 in xmm0..xmm5
+	movdqa xmm1, xmm2
+	movdqa xmm2, xmm3
+	movdqa xmm3, xmm4
+	movdqa xmm4, xmm5
+	movdqa xmm5, xmm6
+	
+	add edi, edx
+	sub esi, eax		; esi -> row 5, the addressing base the unrolled loop expects
+	
+.start:				; 8x unrolled: register roles rotate each step instead of copying rows
+	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea esi, [esi+2*eax]
+	movdqa xmm6, [esi]	; next row into the register just freed
+	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea edi, [edi+2*edx]
+	movdqa xmm7, [esi+eax]
+	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea esi, [esi+2*eax]
+	movdqa xmm0, [esi]
+	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea edi, [edi+2*edx]
+	movdqa xmm1, [esi+eax]
+	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea esi, [esi+2*eax]
+	movdqa xmm2, [esi]
+	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea edi, [edi+2*edx]
+	movdqa xmm3, [esi+eax]
+	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea esi, [esi+2*eax]
+	movdqa xmm4, [esi]
+	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea edi, [edi+2*edx]
+	movdqa xmm5, [esi+eax]	; rotation is back in its initial layout (rows in xmm0..xmm5)
+	jmp near .start
+	
+.x_loop_dec:			; current strip finished
+	dec ebx
+	jz near .exit
+	mov esi, [esp+20]	; reload the ORIGINAL pSrc / pDst / iHeight from the stack
+	mov edi, [esp+28]
+	mov ecx, [esp+40]
+	add esi, 16		; +8 words; NOTE(review): offset is not cumulative, so a 3rd strip (iWidth > 16) would repeat the 2nd
+	add edi, 8		; +8 pixels; presumably only iWidth of 8 or 16 is ever passed -- confirm against callers
+	jmp .width_loop
+	
+	
+	
+.exit:
+	pop ebp
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t McHorVer20WidthEq8_sse2(  uint8_t *pSrc, 
+;                       int iSrcStride, 
+;												uint8_t *pDst, 
+;												int iDstStride, 
+;												int iHeight
+;                      );
+;*******************************************************************************
+McHorVer20WidthEq8_sse2:	; horizontal 6-tap filter (1,-5,20,20,-5,1), +16, >>5, for 8 pixels per row
+	push	esi
+	push	edi
+	
+	mov esi, [esp + 12]         ;pSrc
+	mov eax, [esp + 16]         ;iSrcStride
+	mov edi, [esp + 20]         ;pDst
+	mov ecx, [esp + 28]         ;iHeight (row counter; zero is not handled, the loop tests after the body)
+	mov edx, [esp + 24]			;iDstStride
+	
+	lea esi, [esi-2]            ;pSrc -= 2; the filter window starts two pixels left of the output position
+	
+	pxor xmm7, xmm7		; zero operand for byte->word unpacking
+	movdqa xmm6, [h264_w0x10_1]	; xmm6 = rounding term 16 in every word
+.y_loop:	
+	movq xmm0, [esi]	; tap a (offset 0)
+	punpcklbw xmm0, xmm7
+	movq xmm1, [esi+5]	; tap f (offset 5)
+	punpcklbw xmm1, xmm7
+	movq xmm2, [esi+1]	; tap b
+	punpcklbw xmm2, xmm7
+	movq xmm3, [esi+4]	; tap e
+	punpcklbw xmm3, xmm7
+	movq xmm4, [esi+2]	; tap c
+	punpcklbw xmm4, xmm7
+	movq xmm5, [esi+3]	; tap d
+	punpcklbw xmm5, xmm7
+	
+	paddw xmm2, xmm3	; b + e
+	paddw xmm4, xmm5	; c + d
+	psllw xmm4, 2		; 4(c+d)
+	psubw xmm4, xmm2	; 4(c+d) - (b+e)
+	paddw xmm0, xmm1	; a + f
+	paddw xmm0, xmm4
+	psllw xmm4, 2		; 16(c+d) - 4(b+e)
+	paddw xmm0, xmm4	; a + f - 5(b+e) + 20(c+d)
+	paddw xmm0, xmm6	; + 16 rounding
+	psraw xmm0, 5
+	
+	packuswb xmm0, xmm7	; clamp to 0..255 and narrow to bytes
+	movq [edi], xmm0	; write 8 output pixels
+
+	lea edi, [edi+edx]
+	lea esi, [esi+eax]
+	dec ecx
+	jnz near .y_loop
+	
+	pop edi
+	pop esi
+	ret
+	
+ALIGN 16
+;*******************************************************************************
+; void_t McHorVer20WidthEq16_sse2(  uint8_t *pSrc, 
+;                       int iSrcStride, 
+;												uint8_t *pDst, 
+;												int iDstStride, 
+;												int iHeight
+;                      );
+;*******************************************************************************
+McHorVer20WidthEq16_sse2:	; horizontal 6-tap filter (1,-5,20,20,-5,1), +16, >>5, for 16 pixels per row as two 8-pixel halves
+	push	esi
+	push	edi
+	
+
+	mov esi, [esp + 12]         ;pSrc
+	mov eax, [esp + 16]         ;iSrcStride
+	mov edi, [esp + 20]         ;pDst
+	mov ecx, [esp + 28]         ;iHeight (row counter; zero is not handled, the loop tests after the body)
+	mov edx, [esp + 24]			;iDstStride
+	
+	lea esi, [esi-2]            ;pSrc -= 2; the filter window starts two pixels left of the output position
+	
+	pxor xmm7, xmm7		; zero operand for byte->word unpacking
+	movdqa xmm6, [h264_w0x10_1]	; xmm6 = rounding term 16 in every word
+.y_loop:
+	; ---- left half: output pixels 0..7 ----
+	movq xmm0, [esi]	; tap a (offset 0)
+	punpcklbw xmm0, xmm7
+	movq xmm1, [esi+5]	; tap f (offset 5)
+	punpcklbw xmm1, xmm7
+	movq xmm2, [esi+1]	; tap b
+	punpcklbw xmm2, xmm7
+	movq xmm3, [esi+4]	; tap e
+	punpcklbw xmm3, xmm7
+	movq xmm4, [esi+2]	; tap c
+	punpcklbw xmm4, xmm7
+	movq xmm5, [esi+3]	; tap d
+	punpcklbw xmm5, xmm7
+	
+	paddw xmm2, xmm3	; b + e
+	paddw xmm4, xmm5	; c + d
+	psllw xmm4, 2		; 4(c+d)
+	psubw xmm4, xmm2	; 4(c+d) - (b+e)
+	paddw xmm0, xmm1	; a + f
+	paddw xmm0, xmm4
+	psllw xmm4, 2		; 16(c+d) - 4(b+e)
+	paddw xmm0, xmm4	; a + f - 5(b+e) + 20(c+d)
+	paddw xmm0, xmm6	; + 16 rounding
+	psraw xmm0, 5
+	packuswb xmm0, xmm7	; clamp to 0..255 and narrow to bytes
+	movq [edi], xmm0	; write output pixels 0..7
+
+	movq xmm0, [esi+8]	; ---- right half: same taps, 8 pixels further right ----
+	punpcklbw xmm0, xmm7
+	movq xmm1, [esi+5+8]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [esi+1+8]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [esi+4+8]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [esi+2+8]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [esi+3+8]
+	punpcklbw xmm5, xmm7
+	
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	paddw xmm0, xmm6
+	psraw xmm0, 5
+	packuswb xmm0, xmm7
+	movq [edi+8], xmm0	; write output pixels 8..15
+	
+	lea edi, [edi+edx]	
+	lea esi, [esi+eax]	
+	dec ecx
+	jnz near .y_loop
+	pop edi
+	pop esi
+	ret
+
+
+;*******************************************************************************
+; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc, 
+;                       int iSrcStride, 
+;                       uint8_t *pDst, 
+;                       int iDstStride, 
+;                       int iHeight )
+;*******************************************************************************
+ALIGN 16
+McHorVer02WidthEq8_sse2:	; vertical 6-tap filter (1,-5,20,20,-5,1), +16, >>5, for an 8-pixel-wide column
+	push esi
+	push edi
+	
+	mov esi, [esp + 12]           ;pSrc
+	mov edx, [esp + 16]	          ;iSrcStride
+	mov edi, [esp + 20]           ;pDst
+	mov eax, [esp + 24]           ;iDstStride
+	mov ecx, [esp + 28]           ;iHeight (row counter; zero is not handled, the first test follows the first filter)
+
+	sub esi, edx		; the filter window starts two rows above the output position
+	sub esi, edx
+
+	WELS_Zero xmm7		; external macro; presumably clears xmm7, the initial zero operand for SSE_LOAD_8P
+			
+	SSE_LOAD_8P xmm0, xmm7, [esi]		; row -2
+	SSE_LOAD_8P xmm1, xmm7, [esi+edx]	; row -1
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm2, xmm7, [esi]		; row 0
+	SSE_LOAD_8P xmm3, xmm7, [esi+edx]	; row 1
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm4, xmm7, [esi]		; row 2
+	SSE_LOAD_8P xmm5, xmm7, [esi+edx]	; row 3
+	
+.start:	; 8x unrolled; register roles rotate each step, and FILTER_HV_W8 leaves its 8th argument zeroed for the next load
+	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
+	dec ecx
+	jz near .xx_exit
+
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm6, xmm7, [esi]	; xmm7 was zeroed by the preceding FILTER_HV_W8
+	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
+	dec ecx
+	jz near .xx_exit
+	
+	lea edi, [edi+2*eax]
+	SSE_LOAD_8P xmm7, xmm0, [esi+edx]	; zero operand is now xmm0
+	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
+	dec ecx
+	jz near .xx_exit
+
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm0, xmm1, [esi]
+	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [edi+eax]
+	dec ecx
+	jz near .xx_exit
+
+	lea edi, [edi+2*eax]
+	SSE_LOAD_8P xmm1, xmm2, [esi+edx]
+	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [edi]
+	dec ecx
+	jz near .xx_exit
+
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm2, xmm3, [esi]
+	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [edi+eax]
+	dec ecx
+	jz near .xx_exit
+
+	lea edi, [edi+2*eax]
+	SSE_LOAD_8P xmm3, xmm4, [esi+edx]
+	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [edi]
+	dec ecx
+	jz near .xx_exit
+
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm4, xmm5, [esi]
+	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [edi+eax]
+	dec ecx
+	jz near .xx_exit
+
+	lea edi, [edi+2*eax]
+	SSE_LOAD_8P xmm5, xmm6, [esi+edx]	; rotation is back in its initial layout (rows in xmm0..xmm5)
+	jmp near .start
+
+.xx_exit:
+	pop edi
+	pop esi
+	ret
+
+
--- /dev/null
+++ b/codec/decoder/core/asm/memzero.asm
@@ -1,0 +1,135 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  memzero.asm
+;*
+;*  Abstract
+;*      x86 routines for zeroing memory (MMX/SSE2) and cache prefetch
+;*
+;*  History
+;*      9/16/2009 Created
+;*
+;*
+;*************************************************************************/
+
+BITS 32
+
+%include "asm_inc.asm"
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text			
+		
+ALIGN 16
+;***********************************************************************
+;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
+;***********************************************************************
+WELS_EXTERN WelsPrefetchZero_mmx
+WelsPrefetchZero_mmx:		; issue a non-temporal prefetch hint for the cache line containing _A; writes nothing
+	mov  eax,[esp+4]	; _A (only the return address precedes the argument)
+	prefetchnta [eax]	; note: prefetchnta is an SSE-era instruction despite the _mmx suffix of this routine
+	ret 			
+
+
+ALIGN 16
+;***********************************************************************
+;   void WelsSetMemZeroAligned64_sse2(void *dst, int32_t size)
+;***********************************************************************
+WELS_EXTERN WelsSetMemZeroAligned64_sse2
+WelsSetMemZeroAligned64_sse2:	; zero 'size' bytes at dst, 64 bytes per iteration, using aligned 16-byte stores
+		mov		eax,	[esp + 4]          ; dst; movdqa below requires 16-byte alignment
+		mov		ecx,	[esp + 8]          ; size
+		neg		ecx                        ; count upward from -size towards zero
+			
+		pxor	xmm0,		xmm0
+.memzeroa64_sse2_loops:		; ends only when the counter is exactly zero: size must be a positive multiple of 64
+		movdqa	[eax],		xmm0
+		movdqa	[eax+16],	xmm0
+		movdqa	[eax+32],	xmm0
+		movdqa	[eax+48],	xmm0
+		add		eax, 0x40
+		
+		add ecx, 0x40		; 64 bytes done
+		jnz near .memzeroa64_sse2_loops
+			
+		ret	
+
+ALIGN 16
+;***********************************************************************
+;   void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
+;***********************************************************************
+WELS_EXTERN WelsSetMemZeroSize64_mmx
+WelsSetMemZeroSize64_mmx:	; zero 'size' bytes at dst, 64 bytes per iteration, using eight MMX qword stores
+		mov		eax,	[esp + 4]          ; dst
+		mov		ecx,	[esp + 8]          ; size
+		neg		ecx                        ; count upward from -size towards zero
+			
+		pxor	mm0,		mm0
+.memzero64_mmx_loops:		; ends only when the counter is exactly zero: size must be a positive multiple of 64
+		movq	[eax],		mm0
+		movq	[eax+8],	mm0
+		movq	[eax+16],	mm0
+		movq	[eax+24],	mm0
+		movq	[eax+32],	mm0
+		movq	[eax+40],	mm0
+		movq	[eax+48],	mm0
+		movq	[eax+56],	mm0		
+		add		eax,		0x40
+		
+		add ecx, 0x40		; 64 bytes done
+		jnz near .memzero64_mmx_loops
+			
+		WELSEMMS	; external macro; presumably EMMS to leave MMX state -- confirm
+		ret	
+	
+ALIGN 16		
+;***********************************************************************
+;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
+;***********************************************************************
+WELS_EXTERN WelsSetMemZeroSize8_mmx
+WelsSetMemZeroSize8_mmx:	; zero 'size' bytes at dst, 8 bytes per iteration, using one MMX qword store
+		mov		eax,	[esp + 4]		; dst
+		mov		ecx,	[esp + 8]		; size
+		neg		ecx			; count upward from -size towards zero
+		pxor	mm0,		mm0
+		
+.memzero8_mmx_loops:		; ends only when the counter is exactly zero: size must be a positive multiple of 8
+		movq	[eax],		mm0
+		add		eax,		0x08
+	
+		add		ecx,		0x08	; 8 bytes done
+		jnz near .memzero8_mmx_loops
+		
+		WELSEMMS	; external macro; presumably EMMS to leave MMX state -- confirm
+		ret	
+
+							
--- /dev/null
+++ b/codec/decoder/core/inc/as264_common.h
@@ -1,0 +1,67 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	as264_common.h
+ *
+ * \brief	common flag definitions
+ *
+ * \date	7/6/2009 Created 
+ *************************************************************************************
+ */
+#ifndef WELS_AS264_COMMON_H__
+#define WELS_AS264_COMMON_H__
+#define  NO_WAITING_AU // slice-level decoding (presumably: decode without waiting for a complete access unit -- confirm)
+#define  LONG_TERM_REF // for app (presumably enables long-term reference support requested by the application -- confirm)
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#ifdef X86_ASM
+/* Prototypes only; no definitions appear in this header. Per-function notes below are inferred from the names -- confirm. */
+void MemZeroUnalign32Bytes(void *pSrc);  // presumably zeroes 32 bytes at a possibly unaligned pSrc
+void MemZeroAlign32Bytes(void *pSrc);    // presumably zeroes 32 bytes at an aligned pSrc
+void MemZeroUnalign16Bytes(void *pSrc);  // presumably zeroes 16 bytes at a possibly unaligned pSrc
+void MemZeroAlign16Bytes(void *pSrc);    // presumably zeroes 16 bytes at an aligned pSrc
+void MemZeroAlign128Bytes(void *pSrc);   // presumably zeroes 128 bytes at an aligned pSrc
+void MemZeroUnalign128Bytes(void *pSrc); // presumably zeroes 128 bytes at a possibly unaligned pSrc
+void MemZeroAlign256Bytes(void *pSrc);   // presumably zeroes 256 bytes at an aligned pSrc
+void MemZeroAlign240Bytes(void *pSrc);   // presumably zeroes 240 bytes at an aligned pSrc
+void MmPrefetch0(char const *kpA);       // presumably a cache prefetch hint for the address kpA
+
+#endif// X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+
+#endif // WELS_AS264_COMMON_H__
--- /dev/null
+++ b/codec/decoder/core/inc/au_parser.h
@@ -1,0 +1,158 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	au_parser.h
+ *
+ * \brief	Interfaces introduced in Access Unit level based parser
+ *
+ * \date	03/10/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_ACCESS_UNIT_PARSER_H__
+#define WELS_ACCESS_UNIT_PARSER_H__
+#include "typedefs.h"
+#include "wels_common_basis.h"
+#include "nal_prefix.h"
+#include "dec_frame.h"
+#include "bit_stream.h"
+#include "parameter_sets.h"
+#include "decoder_context.h"
+
+namespace WelsDec {
+
+/*!
+ *************************************************************************************
+ * \brief	Start Code Prefix (0x 00 00 00 01) detection
+ *
+ * \param 	kpBuf		bitstream payload buffer
+ * \param	pOffset		offset between the NAL rbsp and the original bitstream that
+ * 				the start code prefix is separated from.
+ * \param	iBufSize	size of the buffer
+ *
+ * \return	RBSP buffer, start code prefix excluded
+ *
+ * \note	N/A
+ *************************************************************************************
+ */
+uint8_t* DetectStartCodePrefix( const uint8_t *kpBuf, int32_t *pOffset, int32_t iBufSize );
+
+/*!
+ *************************************************************************************
+ * \brief	to parse network abstraction layer unit,
+ *			escape emulation_prevention_three_byte within it
+ *			(former name is parse_nal)
+ *
+ * \param	pCtx		    decoder context
+ * \param 	pNalUnitHeader	parsed result of NAL Unit Header to output
+ * \param   pSrcRbsp        bitstream buffer to input
+ * \param   iSrcRbspLen     length of the bitstream buffer payload
+ * \param	pSrcNal		    NOTE(review): not described in the original header -- TODO document
+ * \param	iSrcNalLen		NOTE(review): not described; presumably the length that goes with pSrcNal -- confirm
+ * \param	pConsumedBytes	consumed bytes during parsing
+ *
+ * \return	decoded bytes payload, might be (pSrcRbsp+1) if no escapes
+ *
+ * \note	N/A
+ *************************************************************************************
+ */
+uint8_t* ParseNalHeader( PWelsDecoderContext pCtx, SNalUnitHeader *pNalUnitHeader, uint8_t *pSrcRbsp, int32_t iSrcRbspLen, uint8_t *pSrcNal, int32_t iSrcNalLen, int32_t* pConsumedBytes );		 
+
+int32_t ParseNonVclNal( PWelsDecoderContext pCtx, uint8_t *pRbsp, const int32_t kiSrcLen );	// NOTE(review): no doc block here; return convention presumably matches ParseSps/ParsePps below -- confirm
+
+void_t ParseRefBasePicMarking ( PBitStringAux pBs, PRefBasePicMarking pRefBasePicMarking );	// NOTE(review): presumably reads the syntax from pBs into pRefBasePicMarking -- confirm
+
+void_t ParsePrefixNalUnit ( PWelsDecoderContext pCtx, PBitStringAux pBs );	// NOTE(review): undocumented in this header; see the implementation
+
+bool_t CheckAccessUnitBoundary( const PNalUnit kpCurNal, const PNalUnit kpLastNal, const PSps kpSps );	// argument order is (current, last); NOTE(review): presumably true when kpCurNal starts a new access unit -- confirm
+bool_t CheckAccessUnitBoundaryExt( PNalUnitHeaderExt pLastNalHdrExt, PNalUnitHeaderExt pCurNalHeaderExt, PSliceHeader pLastSliceHeader, PSliceHeader pCurSliceHeader );	// argument order is (last, current) for both pairs, i.e. reversed relative to CheckAccessUnitBoundary
+/*!
+ *************************************************************************************
+ * \brief	to parse Sequence Parameter Set (SPS)
+ *
+ * \param	pCtx		Decoder context
+ * \param	pBsAux		bitstream reader auxiliary
+ * \param	pPicWidth	picture width represented by the current SPS
+ * \param	pPicHeight	picture height represented by the current SPS
+ *
+ * \return	0 - succeeded
+ *		1 - failed
+ *
+ * \note	Call it in case eNalUnitType is SPS.
+ *************************************************************************************
+ */
+int32_t ParseSps( PWelsDecoderContext pCtx, PBitStringAux pBsAux, int32_t *pPicWidth, int32_t *pPicHeight );
+
+/*!
+ *************************************************************************************
+ * \brief	to parse Picture Parameter Set (PPS)
+ *
+ * \param	pCtx		Decoder context
+ * \param 	pPpsList	PPS list
+ * \param	pBsAux		bitstream reader auxiliary
+ *
+ * \return	0 - succeeded
+ *		1 - failed
+ *
+ * \note	Call it in case eNalUnitType is PPS.
+ *************************************************************************************
+ */
+int32_t ParsePps( PWelsDecoderContext pCtx, PPps pPpsList, PBitStringAux pBsAux );
+
+/*!
+ *************************************************************************************
+ * \brief	to parse SEI message payload
+ *
+ * \param 	pSei		SEI message output of the parse (untyped for now, see prototype)
+ * \param	pBsAux		bitstream reader auxiliary
+ *
+ * \return	0 - succeeded
+ *		1 - failed
+ *
+ * \note	Call it in case eNalUnitType is NAL_UNIT_SEI.
+ *************************************************************************************
+ */
+int32_t ParseSei( void_t *pSei, PBitStringAux pBsAux );	// void_t* is a placeholder: reserved for a Sei_Msg type
+
+/*!
+ *************************************************************************************
+ * \brief	reset the FMO list now that an SPS has been received
+ *
+ * \param	pCtx	decoder context
+ *
+ * \return	number of FMO context units that were reset
+ *************************************************************************************
+ */
+int32_t ResetFmoList( PWelsDecoderContext pCtx );
+
+} // namespace WelsDec
+
+#endif//WELS_ACCESS_UNIT_PARSER_H__
+
--- /dev/null
+++ b/codec/decoder/core/inc/bit_stream.h
@@ -1,0 +1,77 @@
+/*!
+ * \copy
+ *     Copyright (c)  2004-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+ //bit_stream.h	-	bit-stream reading / writing auxiliary data
+#ifndef WELS_BIT_STREAM_H__
+#define WELS_BIT_STREAM_H__
+
+#include "typedefs.h"
+
+namespace WelsDec {
+
+/*
+ *	Bit-stream reading / writing auxiliary state: buffer bounds, current position and bit bookkeeping
+ */
+typedef struct TagBitStringAux {
+	uint8_t		*pStartBuf;	// start position of the buffer
+	uint8_t		*pEndBuf;	// end of the buffer (buffer + length)
+	int32_t     iBits;       // total number of bits of the bitstream input
+
+	int32_t     iIndex;      // only used by CAVLC
+	uint8_t		*pCurBuf;	// current reading position	
+	uint32_t    uiCurBits;  // NOTE(review): undocumented; presumably the bits currently cached from pCurBuf -- confirm
+	int32_t		iLeftBits;	// number of available bits left, in [1, 8];
+	                        // when 0 bits are left, move to the start of the next byte and use 8 instead
+}SBitStringAux, *PBitStringAux;
+
+//#pragma pack()
+
+/*!
+ * \brief	input bits for decoder or initialize bitstream writing in encoder
+ *
+ * \param	pBitString	Bit string auxiliary pointer
+ * \param	kpBuf		bit-stream buffer
+ * \param	kiSize	    size in bits for decoder; size in bytes for encoder
+ *
+ * \return	size of buffer data in bytes; -1 on failure
+ */
+int32_t InitBits( PBitStringAux pBitString, const uint8_t *kpBuf, const int32_t kiSize );
+
+void_t InitReadBits( PBitStringAux pBitString );	// NOTE(review): undocumented; presumably primes the read state of pBitString -- confirm
+
+uint32_t EndianFix(uint32_t uiX);	// NOTE(review): undocumented; presumably a byte-order conversion of uiX -- confirm
+
+
+
+} // namespace WelsDec
+
+#endif//WELS_BIT_STREAM_H__
--- /dev/null
+++ b/codec/decoder/core/inc/cpu.h
@@ -1,0 +1,80 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	cpu.h
+ *
+ * \brief	CPU feature compatibility detection
+ *
+ * \date	04/29/2009 Created
+ *
+ *************************************************************************************
+ */
+#if !defined(WELS_CPU_DETECTION_H__)
+#define WELS_CPU_DETECTION_H__
+
+#include "typedefs.h"
+
+namespace WelsDec {
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined(X86_ASM)
+/*
+ *	cpuid support verification routine
+ *  returns 0 if cpuid is not supported by the cpu
+ */
+int32_t  WelsCPUIdVerify();
+
+void_t WelsCPUId( uint32_t uiIndex, uint32_t *pFeatureA, uint32_t *pFeatureB, uint32_t *pFeatureC, uint32_t *pFeatureD );	// NOTE(review): presumably runs CPUID for index uiIndex and stores the four result registers -- confirm
+
+int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx );	// NOTE(review): eax/ecx presumably come from a prior WelsCPUId call -- confirm
+int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx );
+
+void_t WelsEmms();
+
+uint32_t WelsCPUFeatureDetect( int32_t *pNumberOfLogicProcessors );	// NOTE(review): presumably returns a mask of the WELS_CPU_* flags from cpu_core.h -- confirm
+
+/*
+ *	clear FPU register state for potential float based calculation, if supported
+ */
+void     WelsCPURestore( const uint32_t kuiCPU );
+
+#endif // X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+} // namespace WelsDec
+
+#endif//WELS_CPU_DETECTION_H__
--- /dev/null
+++ b/codec/decoder/core/inc/cpu_core.h
@@ -1,0 +1,80 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	cpu_core.h
+ *
+ * \brief	cpu core feature detection
+ *
+ * \date	4/24/2009 Created
+ *
+ *************************************************************************************
+ */
+#if !defined(WELS_CPU_CORE_FEATURE_DETECTION_H__)
+#define WELS_CPU_CORE_FEATURE_DETECTION_H__
+
+/*
+ *	WELS CPU feature flags (one bit per feature, so they can be OR-ed into a single mask)
+ */ 
+#define WELS_CPU_MMX        0x00000001    /* mmx */
+#define WELS_CPU_MMXEXT     0x00000002    /* mmx-ext*/
+#define WELS_CPU_SSE        0x00000004    /* sse */
+#define WELS_CPU_SSE2       0x00000008    /* sse 2 */
+#define WELS_CPU_SSE3       0x00000010    /* sse 3 */
+#define WELS_CPU_SSE41      0x00000020    /* sse 4.1 */
+#define WELS_CPU_3DNOW      0x00000040    /* 3dnow! */
+#define WELS_CPU_3DNOWEXT   0x00000080    /* 3dnow! ext */
+#define WELS_CPU_ALTIVEC    0x00000100    /* altivec */
+#define WELS_CPU_SSSE3      0x00000200    /* ssse3 */
+#define WELS_CPU_SSE42      0x00000400    /* sse 4.2 */
+
+/* Extended CPU features for application use */
+#define WELS_CPU_AVX		0x00000800	/* Advanced Vector eXtensions */
+#define WELS_CPU_FPU		0x00001000	/* x87-FPU on chip */
+#define WELS_CPU_HTT		0x00002000	/* Hyper-Threading Technology (HTT), Multi-threading enabled feature: 
+										   physical processor package is capable of supporting more than one logical processor
+										*/
+#define WELS_CPU_CMOV		0x00004000	/* Conditional Move Instructions,
+										   also if x87-FPU is present as indicated by the CPUID.FPU feature bit, then FCOMI and FCMOV are supported
+										*/
+#define WELS_CPU_MOVBE		0x00008000	/* MOVBE instruction */
+#define WELS_CPU_AES		0x00010000	/* AES instruction extensions */
+#define WELS_CPU_FMA		0x00020000	/* AVX VEX FMA instruction sets */
+
+#define WELS_CPU_CACHELINE_16    0x10000000    /* CacheLine Size 16 */
+#define WELS_CPU_CACHELINE_32    0x20000000    /* CacheLine Size 32 */
+#define WELS_CPU_CACHELINE_64    0x40000000    /* CacheLine Size 64 */
+#define WELS_CPU_CACHELINE_128   0x80000000    /* CacheLine Size 128 */
+
+/*
+ *	Interfaces for CPU core feature detection as below
+ */
+
+#endif//WELS_CPU_CORE_FEATURE_DETECTION_H__
--- /dev/null
+++ b/codec/decoder/core/inc/crt_util_safe_x.h
@@ -1,0 +1,104 @@
+/*!
+ * \copy
+ *     Copyright (c)  2010-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	crt_util_safe_x.h
+ *
+ * \brief	Safe CRT-like utilities for cross-platform support
+ *
+ * \date	06/04/2010 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
+#define WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+
+#if defined(WIN32)
+#include <windows.h>
+#include <sys/types.h>
+#include <sys/timeb.h>
+#else
+#include <sys/timeb.h>
+#include <sys/time.h>
+#include "typedefs.h"
+#endif//WIN32
+
+#include "typedefs.h"
+
+namespace WelsDec {
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define     WELS_FILE_SEEK_SET           SEEK_SET	/* seek origins: aliases of the <stdio.h> SEEK_* constants */
+#define     WELS_FILE_SEEK_CUR           SEEK_CUR
+#define     WELS_FILE_SEEK_END           SEEK_END
+#define     WESL_FILE_SEEK_END           WELS_FILE_SEEK_END	/* misspelled legacy name, kept so existing callers still compile */
+typedef      FILE  WelsFileHandle;	// handle type taken by the WelsF* file functions declared below
+
+#ifdef WIN32
+typedef      struct _timeb     SWelsTime;	// WIN32 builds: struct _timeb
+#else 
+typedef      struct timeb      SWelsTime;	// other builds: struct timeb
+#endif
+
+int32_t   WelsSnprintf( str_t * buffer,  int32_t sizeOfBuffer,  const str_t * format, ... );	// string helpers; NOTE(review): presumably wrap the CRT functions of similar name, truncation/termination rules live in the implementation -- confirm
+str_t *  WelsStrncpy(str_t * dest, int32_t sizeInBytes, const str_t * src, int32_t count);
+str_t *  WelsStrcat(str_t * dest, int32_t sizeInBytes, str_t * src);	// note: src is not const-qualified here, unlike WelsStrncpy
+int32_t   WelsStrnlen(const str_t * str,  int32_t maxlen);
+int32_t   WelsVsprintf(str_t * buffer, int32_t sizeOfBuffer, const str_t * format, va_list argptr);
+/* File functions operating on WelsFileHandle (a typedef of FILE); sizes, counts and offsets are int32_t */
+WelsFileHandle      *  WelsFopen(const str_t * filename,  const str_t * mode);
+int32_t                WelsFclose(WelsFileHandle  * fp);
+int32_t                WelsFread(void * buffer, int32_t size, int32_t count, WelsFileHandle * fp);
+int32_t                WelsFwrite(const void * buffer, int32_t size, int32_t count, WelsFileHandle * fp);
+int32_t                WelsFseek(WelsFileHandle * fp, int32_t offset, int32_t origin);	// NOTE(review): origin is presumably one of the WELS_FILE_SEEK_* macros above -- confirm
+int32_t                WelsFflush(WelsFileHandle * fp);
+/* Time functions operating on SWelsTime */
+int32_t                WelsGetTimeOfDay(SWelsTime * tp);
+int32_t                WelsStrftime(str_t * buffer, int32_t size, const str_t * format, const SWelsTime * tp);
+uint16_t               WelsGetMillsecond(const SWelsTime * tp);	// NOTE(review): presumably the millisecond part of *tp -- confirm
+
+
+#ifdef __cplusplus
+}
+#endif
+
+} // namespace WelsDec
+
+#endif//WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
--- /dev/null
+++ b/codec/decoder/core/inc/deblocking.h
@@ -1,0 +1,128 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	deblocking.h
+ *
+ * \brief	Interfaces introduced in frame deblocking filtering
+ *
+ * \date	05/14/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELS_DEBLOCKING_H__
+#define WELS_DEBLOCKING_H__
+
+#include "decoder_context.h"
+
+//#pragma pack(1)
+
+namespace WelsDec {
+
+/*!
+ * \brief	deblocking module initialization
+ *
+ * \param	pDeblockingFunc	deblocking function set (NOTE(review): presumably filled in by this call -- confirm)
+ * \param	iCpu			cpu (NOTE(review): presumably the WELS_CPU_* feature mask -- confirm)
+ *
+ * \return	NONE
+ */
+
+void_t  DeblockingInit( PDeblockingFunc pDeblockingFunc,  int32_t iCpu );
+
+
+/*!
+ * \brief	deblocking filtering target slice
+ *
+ * \param	pCtx		Wels decoder context
+ * \param	pDeblockMb	macroblock deblocking routine (NOTE(review): presumably applied to each MB of the slice -- confirm)
+ * \return	NONE
+ */
+void_t WelsDeblockingFilterSlice( PWelsDecoderContext pCtx, PDeblockingFilterMbFunc pDeblockMb );
+
+/*!
+ * \brief	pixel deblocking filtering
+ * NOTE(review): the parameter names listed here match none of the prototypes that follow; this block looks stale -- confirm or remove.
+ * \param	filter			      deblocking filter
+ * \param	pix	                  pixel value
+ * \param	stride	              frame stride
+ * \param	bs	                  boundary strength
+ *
+ * \return	NONE
+ */
+
+uint32_t DeblockingBsMarginalMBAvcbase( PDqLayer pCurDqLayer, int32_t iEdge, int32_t iNeighMb, int32_t iMbXy);	// NOTE(review): presumably returns the packed boundary strengths for edge iEdge against neighbour MB iNeighMb -- confirm
+
+int32_t DeblockingAvailableNoInterlayer( PDqLayer pCurDqLayer, int32_t iFilterIdc );	// NOTE(review): the result presumably feeds the iBoundryFlag arguments below -- confirm
+
+void_t DeblockingIntraMb( PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag );
+void_t DeblockingInterMb( PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, uint8_t nBS[2][4][4], int32_t iBoundryFlag );	// same as the intra variant plus a caller-supplied nBS[2][4][4] table
+
+void_t WelsDeblockingMb( PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag );
+
+void_t DeblockLumaLt4V_c( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );	// NOTE(review): Lt4/Eq4 presumably mean boundary strength < 4 / == 4 and V/H the edge direction -- confirm; only the Lt4 variants take pTc
+void_t DeblockLumaEq4V_c( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+
+void_t DeblockLumaLt4H_c( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
+void_t DeblockLumaEq4H_c( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+
+void_t DeblockChromaLt4V_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );	// chroma variants take the Cb and Cr pointers together, sharing one stride
+void_t DeblockChromaEq4V_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+
+void_t DeblockChromaLt4H_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
+void_t DeblockChromaEq4H_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#ifdef  X86_ASM	// _sse2 variants, C linkage, X86_ASM builds only; parameter lists match the _c declarations earlier in this header
+void DeblockLumaLt4V_sse2( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
+void DeblockLumaEq4V_sse2( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);	// no _c counterpart in this header; NOTE(review): presumably transposes pixels from pPixY into pDst -- confirm
+void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);	// no _c counterpart in this header; NOTE(review): presumably the inverse, from pSrc back into pPixY -- confirm
+void DeblockLumaLt4H_sse2(uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc);
+void DeblockLumaEq4H_sse2(uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+#endif // X86_ASM
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+} // namespace WelsDec
+
+//#pragma pack()
+
+#endif //WELS_DEBLOCKING_H__
+
--- /dev/null
+++ b/codec/decoder/core/inc/dec_frame.h
@@ -1,0 +1,143 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//dec_frame.h
+#ifndef WELS_DEC_FRAME_H__
+#define WELS_DEC_FRAME_H__
+
+#include "typedefs.h"
+#include "wels_const.h"
+#include "wels_common_basis.h"
+#include "parameter_sets.h"
+#include "nal_prefix.h"
+#include "slice.h"
+#include "picture.h"
+#include "bit_stream.h"
+#include "fmo.h"
+
+namespace WelsDec {
+
+///////////////////////////////////DQ Layer level///////////////////////////////////
+typedef struct TagDqLayer	SDqLayer;	// forward declaration; struct TagDqLayer is defined further down in this header
+typedef SDqLayer*			PDqLayer;
+typedef struct TagLayerInfo{	// per-layer header extension, slice and parameter-set pointers
+	SNalUnitHeaderExt		sNalHeaderExt;
+	SSlice					sSliceInLayer;	// here a slice is conceptually identical to a frame	
+	PSubsetSps				pSubsetSps;	// subset SPS currently in use; memory is allocated externally
+	PSps					pSps;		// AVC-based SPS currently in use; memory is allocated externally
+	PPps					pPps;		// PPS currently in use
+} SLayerInfo, *PLayerInfo;
+/* Layer representation: decoding state of one DQ layer (the struct behind the SDqLayer / PDqLayer typedefs) */
+
+struct TagDqLayer{
+	SLayerInfo			sLayerInfo;	
+		
+	uint8_t				*pCsData[3];	// pointers to reconstructed picture data
+	int32_t				iCsStride[3];	// strides matching pCsData
+	PBitStringAux		pBitStringAux;	// pointer to SBitStringAux
+	PFmo				pFmo;		// FMO context currently in use
+	int8_t  *pMbType;
+	int32_t *pSliceIdc;				// int32_t is used for slice_idc
+	int16_t	(*pMv[LIST_A])[MB_BLOCK4x4_NUM][MV_A];
+	int8_t	(*pRefIndex[LIST_A])[MB_BLOCK4x4_NUM]; 
+	int8_t  *pLumaQp;
+	int8_t  *pChromaQp;
+	int8_t  *pCbp;
+	int8_t  (*pNzc)[24];
+	int8_t  (*pNzcRs)[24];
+	int8_t  *pResidualPredFlag;
+	int8_t  *pInterPredictionDoneFlag;
+	int16_t (*pScaledTCoeff)[MB_COEFF_LIST_SIZE];
+	int8_t  (*pIntraPredMode)[8]; // indices 0~3: top 4x4; 4~6: left 4x4; 7: intra16x16
+	int8_t  (*pIntra4x4FinalMode)[MB_BLOCK4x4_NUM];
+	int8_t  *pChromaPredMode;
+	//uint8_t (*motion_pred_flag[LIST_A])[MB_PARTITION_SIZE]; // 8x8
+	int8_t  (*pSubMbType)[MB_SUB_PARTITION_SIZE];
+	int32_t iLumaStride;
+	int32_t iChromaStride;
+	uint8_t *pPred[3];
+	int32_t iMbX;	// NOTE(review): iMbX / iMbY / iMbXyIndex presumably track the macroblock currently being decoded -- confirm
+	int32_t iMbY;
+	int32_t iMbXyIndex;
+	int32_t	iMbWidth;		// MB width of this picture, equal to sSps.iMbWidth
+	int32_t	iMbHeight;		// MB height of this picture, equal to sSps.iMbHeight;
+
+	/* Common syntax elements across all slices of a DQLayer */
+	int32_t					iSliceIdcBackup;
+	uint32_t				uiSpsId;
+	uint32_t				uiPpsId;
+	uint32_t				uiDisableInterLayerDeblockingFilterIdc;
+	int32_t					iInterLayerSliceAlphaC0Offset;
+	int32_t					iInterLayerSliceBetaOffset;	
+	//SPosOffset			sScaledRefLayer;
+	int32_t					iSliceGroupChangeCycle;
+	PRefPicListReorderSyn	pRefPicListReordering;
+	PRefPicMarking          pRefPicMarking; // decoded reference picture marking syntax elements
+	PRefBasePicMarking	    pRefPicBaseMarking;
+
+	PPicture				pRef;			// reference picture pointer
+	PPicture				pDec;			// reconstruction picture pointer for layer
+
+	bool_t					bStoreRefBasePicFlag;				// iCurTid == 0 && iCurQid == 0 && bEncodeKeyPic == 1
+	bool_t					bTCoeffLevelPredFlag;
+	bool_t					bConstrainedIntraResamplingFlag;
+	uint8_t					uiRefLayerDqId;
+	uint8_t					uiRefLayerChromaPhaseXPlus1Flag;
+	uint8_t					uiRefLayerChromaPhaseYPlus1;
+	uint8_t					uiLayerDqId;			// dq_id of current layer
+	bool_t					bUseRefBasePicFlag;	// whether the reference base picture (rather than the reference picture) is referred to
+};
+
+typedef struct TagGpuAvcLayer{	// reduced layer state: every member here has a same-named member in struct TagDqLayer
+	SLayerInfo				sLayerInfo;	
+	PBitStringAux			pBitStringAux;	// pointer to SBitStringAux
+
+	int8_t					*pMbType;
+	int32_t					*pSliceIdc;	// int32_t is used for slice_idc
+	int8_t					*pLumaQp;
+	int8_t					*pCbp;
+	int8_t					(*pNzc)[24];
+	int8_t					(*pIntraPredMode)[8]; // indices 0~3: top 4x4; 4~6: left 4x4; 7: intra16x16
+
+	int32_t					iMbX;
+	int32_t					iMbY;
+	int32_t					iMbXyIndex;
+	int32_t					iMbWidth;		// MB width of this picture, equal to sSps.iMbWidth
+	int32_t					iMbHeight;		// MB height of this picture, equal to sSps.iMbHeight;
+
+}SGpuAvcDqLayer, *PGpuAvcDqLayer;	// note: the typedef names say "DqLayer" while the struct tag does not
+
+///////////////////////////////////////////////////////////////////////
+
+} // namespace WelsDec
+
+#endif//WELS_DEC_FRAME_H__
--- /dev/null
+++ b/codec/decoder/core/inc/dec_golomb.h
@@ -1,0 +1,243 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	dec_golomb.h
+ *
+ * \brief	Exponential Golomb entropy coding/decoding routine
+ *
+ * \date	03/13/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_EXPONENTIAL_GOLOMB_ENTROPY_CODING_H__
+#define WELS_EXPONENTIAL_GOLOMB_ENTROPY_CODING_H__
+
+#include "typedefs.h"
+#include "bit_stream.h"
+#include "macros.h"
+//#include <assert.h>
+#include "ls_defines.h"
+
+namespace WelsDec {
+
+/*
+ *	Bit-reader cache helpers. The cache (iCurBits) is treated as 32 bits wide
+ *	and is consumed from its most significant end.
+ *
+ *	GET_WORD : OR the next two bytes of pBufPtr (big-endian) into the cache at
+ *	           bit offset iLeftBits, subtract 16 from iLeftBits and advance
+ *	           pBufPtr by two bytes. The 16-bit word is widened to uint32_t
+ *	           before shifting so that a shift by 16 with the top bit set does
+ *	           not overflow a signed int.
+ *	NEED_BITS: refill through GET_WORD only when iLeftBits is positive.
+ *	UBITS    : top iNumBits bits of the cache; iNumBits must be in 1..32
+ *	           because a shift by 32 is undefined for a 32-bit operand.
+ *	DUMP_BITS: drop iNumBits bits from the cache, add them to iLeftBits and
+ *	           refill if required.
+ *
+ *	Every macro evaluates its arguments more than once, so pass plain lvalues.
+ *	NOTE(review): no end-of-buffer check is made before reading pBufPtr[0..1];
+ *	callers presumably guarantee readable padding behind the payload -- confirm.
+ */
+#define GET_WORD(iCurBits, pBufPtr, iLeftBits) { \
+	(iCurBits) |= ((uint32_t)(((pBufPtr)[0] << 8) | (pBufPtr)[1])) << (iLeftBits); \
+	(iLeftBits) -= 16; \
+	(pBufPtr) += 2; \
+} 
+#define NEED_BITS(iCurBits, pBufPtr, iLeftBits) { \
+	if( (iLeftBits) > 0 ) { \
+	GET_WORD(iCurBits, pBufPtr, iLeftBits); \
+	} \
+} 
+#define UBITS(iCurBits, iNumBits) ((iCurBits)>>(32-(iNumBits)))  
+#define DUMP_BITS(iCurBits, pBufPtr, iLeftBits, iNumBits) { \
+	(iCurBits) <<= (iNumBits); \
+	(iLeftBits) += (iNumBits); \
+	NEED_BITS(iCurBits, pBufPtr, iLeftBits); \
+}  
+
+/*
+ *	Peek at the next iNumBits bits without consuming them.
+ *	The value comes from the top of pBs->uiCurBits via UBITS, so iNumBits must
+ *	not be 0 (UBITS would then shift by the full 32-bit width).
+ */
+static inline int32_t ShowBits( PBitStringAux pBs, int32_t iNumBits )
+{
+	const int32_t kiPeeked = UBITS( pBs->uiCurBits, iNumBits );
+
+	return kiPeeked;
+} 
+/*
+ *	Consume iNumBits bits: shift them out of the cache, account for them in
+ *	iLeftBits and let NEED_BITS refill the cache when it runs low.
+ */
+static inline void_t FlushBits( PBitStringAux pBs, int32_t iNumBits )
+{
+	// same three steps DUMP_BITS performs, written out
+	pBs->uiCurBits <<= (iNumBits);
+	pBs->iLeftBits += (iNumBits);
+	NEED_BITS( pBs->uiCurBits, pBs->pCurBuf, pBs->iLeftBits );
+} 
+/*
+ *	Read iNumBits bits: peek at them, then consume them.
+ *	Returns the bits as an integer; iNumBits must not be 0 (see UBITS).
+ */
+static inline int32_t BsGetBits( PBitStringAux pBs, int32_t iNumBits )
+{
+	const int32_t kiValue = ShowBits( pBs, iNumBits );
+
+	FlushBits( pBs, iNumBits );
+	return kiValue;
+}   
+
+/*
+ *	Exponential Golomb codes decoding routines
+ */
+
+// Lookup tables declared here and defined elsewhere, so that they are shared across
+// modules and the generated binary stays small (12/10/2009).
+// NOTE(review): the two 48-entry CBP tables are presumably indexed by a decoded code number -- confirm at the use site.
+extern const uint8_t g_kuiIntra4x4CbpTable[48];
+extern const uint8_t g_kuiInterCbpTable[48];
+
+// Indexed by an 8-bit value; GetLeadingZeroBits() returns the entry as the leading-zero count of that byte.
+extern const uint8_t g_kuiLeadingZeroTable[256];
+
+/*
+ *	Index of the highest set bit of a 4-bit value (entries 0 and 1 are both 0);
+ *	last step of GetPrefixBits().
+ */
+static const uint32_t g_kuiPrefix8BitsTable[16] =
+{
+	0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3
+};
+
+
+/*
+ *	Returns 32 minus the index of the highest set bit of uiValue,
+ *	i.e. 1 for 0x80000000 and 32 for 1; an input of 0 also yields 32.
+ */
+static inline uint32_t GetPrefixBits(uint32_t uiValue)
+{
+	uint32_t uiHighBit = 0;
+	uint32_t uiShift;
+
+	// narrow the search window by halves of 16, 8 and 4 bits
+	for (uiShift = 16; uiShift >= 4; uiShift >>= 1)
+	{
+		if (uiValue >> uiShift)
+		{
+			uiValue >>= uiShift;
+			uiHighBit += uiShift;
+		}
+	}
+
+	// uiValue now fits in 4 bits, the table resolves the remainder
+	uiHighBit += g_kuiPrefix8BitsTable[uiValue];
+
+	return (32-uiHighBit);
+}
+
+/*
+ *	Read a single bit from the bit stream and consume it.
+ *	Returns 0 or 1.
+ */
+static inline uint32_t BsGetOneBit(PBitStringAux pBs)
+{
+	const int32_t kiBit = UBITS( pBs->uiCurBits, 1 );
+
+	DUMP_BITS( pBs->uiCurBits, pBs->pCurBuf, pBs->iLeftBits, 1 );
+	return kiBit;
+}
+
+/*
+ *	Count the leading zero bits in the top 16 bits of the cache word iCurBits.
+ *	The top byte is looked up first; when it is zero the top 16 bits (which then
+ *	fit in one byte) are looked up and 8 is added.
+ *	Returns -1 when all 16 top bits are zero, which BsGetUe() treats as a
+ *	bit-stream error.
+ */
+static inline int32_t GetLeadingZeroBits( uint32_t iCurBits ) //<=16 bits 
+{
+	const int32_t kiTopByte = UBITS( iCurBits, 8 );
+	if( kiTopByte != 0 )
+	{
+		return g_kuiLeadingZeroTable[kiTopByte];
+	}
+
+	const int32_t kiTopWord = UBITS( iCurBits, 16 );
+
+	// no set bit within 16 bits: should not happen for a valid stream
+	return ( kiTopWord != 0 ) ? (g_kuiLeadingZeroTable[kiTopWord] + 8) : -1;
+}
+
+/*
+ *	Read an unsigned Exp-Golomb code ue(v).
+ *	The prefix (leading zeros plus the terminating 1 bit) is consumed first,
+ *	then as many suffix bits as there were leading zeros.
+ *	Returns 2^zeros - 1 + suffix, or 0xffffffff when GetLeadingZeroBits()
+ *	reports a bit-stream error.
+ */
+static inline uint32_t BsGetUe( PBitStringAux pBs )
+{
+	const int32_t kiZeros = GetLeadingZeroBits( pBs->uiCurBits );
+	uint32_t uiSuffix = 0;
+
+	if ( kiZeros == -1 ) //bitstream error
+	{
+		return 0xffffffff;//-1
+	}
+
+	// skip the zero prefix together with its terminating 1 bit
+	FlushBits( pBs, kiZeros + 1 );
+
+	// a zero-length suffix must not be read: UBITS cannot extract 0 bits
+	if( kiZeros != 0 )
+	{
+		uiSuffix = BsGetBits( pBs, kiZeros );
+	}
+
+	return ((1<<kiZeros) - 1 + uiSuffix);
+}
+
+
+/*
+ *	Read a signed Exp-Golomb code se(v).
+ *	Odd code numbers give (codeNum + 1) / 2; even code numbers give
+ *	NEG_NUM(codeNum / 2) (NEG_NUM comes from macros.h and presumably negates).
+ *	NOTE(review): the BsGetUe() error value 0xffffffff is odd and wraps to
+ *	(0xffffffff + 1) >> 1 == 0 here, so a bit-stream error is returned as 0.
+ */
+static inline int32_t BsGetSe(PBitStringAux pBs)
+{
+	const uint32_t kuiCodeNum = BsGetUe( pBs );
+
+	if( !(kuiCodeNum&0x01) )
+	{
+		return NEG_NUM( (int32_t)(kuiCodeNum>>1) );
+	}
+
+	return (int32_t)((kuiCodeNum+1)>>1);
+}
+
+/*
+ *	Read a truncated Exp-Golomb code te(v).
+ *	With uiRange == 1 a single bit is read and inverted; any other range is
+ *	decoded as ue(v) (including its 0xffffffff error value).
+ */
+static inline uint32_t BsGetTe(PBitStringAux pBs, uint8_t uiRange)
+{
+	return ( 1 == uiRange ) ? ( BsGetOneBit(pBs)^1 ) : BsGetUe(pBs);
+}
+
+/*
+ *	Read an unsigned truncated Exp-Golomb code for a value range of iRange.
+ *	iRange == 1: nothing is read, the result is 0.
+ *	iRange == 2: one bit is read and inverted.
+ *	otherwise  : decoded as ue(v); its 0xffffffff error value becomes -1 here.
+ */
+static inline int32_t BsGetTe0(PBitStringAux pBs, int32_t iRange)
+{
+	switch( iRange )
+	{
+	case 1:
+		return 0;
+	case 2:
+		return BsGetOneBit(pBs)^1;
+	default:
+		return BsGetUe(pBs);
+	}
+}
+
+/*
+ *	Get number of trailing bits in the byte at pBuf.
+ *	Returns the 1-based position (1..8) of the lowest set bit, counted from the
+ *	least significant bit, or 0 when the byte is zero.
+ */
+static inline int32_t BsGetTrailingBits( uint8_t *pBuf )
+{
+	// TODO
+	const uint32_t kuiByte = *pBuf;
+	int32_t iPos;
+
+	for( iPos = 1; iPos < 9; ++ iPos )
+	{
+		if( (kuiByte >> (iPos-1)) & 1 )
+		{
+			return iPos;
+		}
+	}
+
+	return 0;
+}
+
+} // namespace WelsDec
+
+#endif//WELS_EXPONENTIAL_GOLOMB_ENTROPY_CODING_H__
--- /dev/null
+++ b/codec/decoder/core/inc/decode_mb_aux.h
@@ -1,0 +1,61 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef WELS_DECODE_MB_AUX_H__
+#define WELS_DECODE_MB_AUX_H__
+
+#include "typedefs.h"
+#include "macros.h"
+
+namespace WelsDec {
+
+// One-time set-up of the clip table used by the IDCT code (going by the name; the body lives elsewhere).
+void_t InitDctClipTable(void_t);
+
+// C version of "IDCT residual + add to prediction"; same signature as PIdctResAddPredFunc.
+// pPred: prediction samples, kiStride: stride of pPred, pRs: residual coefficients.
+void_t IdctResAddPred_c(uint8_t *pPred, const int32_t kiStride, int16_t *pRs);
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined(X86_ASM)
+    // MMX variant with the same signature, declared only for X86_ASM builds; C linkage.
+    void_t IdctResAddPred_mmx(uint8_t *pPred, const int32_t kiStride, int16_t *pRs);
+#endif//X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+// Fills pBlockOffset with block address offsets computed from the luma (kiYStride) and chroma (kiUVStride) strides.
+// NOTE(review): the required length of pBlockOffset is not visible here -- confirm against the implementation.
+void_t GetI4LumaIChromaAddrTable(int32_t *pBlockOffset, const int32_t kiYStride, const int32_t kiUVStride);
+
+} // namespace WelsDec
+
+#endif//WELS_DECODE_MB_AUX_H__
--- /dev/null
+++ b/codec/decoder/core/inc/decode_slice.h
@@ -1,0 +1,90 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef WELS_DECODE_SLICE_H__
+#define WELS_DECODE_SLICE_H__
+
+#include "decoder_context.h"
+
+namespace WelsDec {
+
+// Initialise an iWidth x iHeight block of int16_t with row stride iStride, using uiVal.
+void_t WelsBlockInit(int16_t* pBlock, int32_t iWidth, int32_t iHeight, int32_t iStride, uint8_t uiVal);
+
+// CAVLC macroblock decoding for I slices; the two-argument form matches PWelsDecMbCavlcFunc below.
+int32_t WelsActualDecodeMbCavlcISlice  (PWelsDecoderContext pCtx);
+int32_t WelsDecodeMbCavlcISlice        (PWelsDecoderContext pCtx, PNalUnit pNalCur);
+
+// CAVLC macroblock decoding for P slices; the two-argument form matches PWelsDecMbCavlcFunc below.
+int32_t WelsActualDecodeMbCavlcPSlice  (PWelsDecoderContext pCtx);
+int32_t WelsDecodeMbCavlcPSlice        (PWelsDecoderContext pCtx, PNalUnit pNalCur);
+// Common pointer type for the per-slice-type CAVLC MB decoders above.
+typedef int32_t (*PWelsDecMbCavlcFunc) (PWelsDecoderContext pCtx, PNalUnit pNalCur);
+
+int32_t WelsTargetSliceConstruction(PWelsDecoderContext pCtx); //construction based on slice
+
+// Decode one slice of the NAL unit pNalCur; bFirstSliceInLayer flags the first slice of a layer.
+int32_t WelsDecodeSlice(PWelsDecoderContext pCtx, bool_t bFirstSliceInLayer, PNalUnit pNalCur);
+
+
+// Construction (reconstruction) of the current macroblock.
+int32_t WelsTargetMbConstruction(PWelsDecoderContext pCtx);
+
+// Intra / inter macroblock construction helpers operating on the layer pCurLayer.
+int32_t WelsMbIntraPredictionConstruction(PWelsDecoderContext pCtx, PDqLayer pCurLayer, bool_t bOutput);
+int32_t WelsMbInterSampleConstruction( PWelsDecoderContext pCtx, PDqLayer pCurLayer, 
+											  uint8_t* pDstY, uint8_t* pDstU, uint8_t* pDstV, int32_t iStrideL, int32_t iStrideC );
+int32_t WelsMbInterConstruction(PWelsDecoderContext pCtx, PDqLayer pCurLayer);
+// Luma DC dequantisation + IDCT on pBlock for quantiser iQp (going by the name).
+void_t WelsLumaDcDequantIdct(int16_t *pBlock, int32_t iQp);
+int32_t WelsMbInterPrediction  (PWelsDecoderContext pCtx, PDqLayer pCurLayer);
+// Copy a block from pSrc to pDst; note the parameter order: height first, then width.
+void_t WelsMbCopy( uint8_t *pDst, int32_t iStrideDst, uint8_t *pSrc, int32_t iStrideSrc, 
+				 int32_t iHeight, int32_t iWidth );
+
+// Chroma DC inverse transform on pBlock (going by the name).
+void_t WelsChromaDcIdct( int16_t *pBlock );
+
+#ifdef __cplusplus
+extern "C" {
+#endif//__cplusplus
+
+#ifdef  X86_ASM
+// SSE2 block-zeroing routines with C linkage, declared only for X86_ASM builds; same signature as PWelsBlockZeroFunc.
+void_t WelsResBlockZero16x16_sse2 (int16_t* pBlock, int32_t iStride);
+void_t WelsResBlockZero8x8_sse2   (int16_t* pBlock, int32_t iStride);
+#endif
+
+#ifdef __cplusplus
+}
+#endif//__cplusplus
+
+// C versions: zero a 16x16 / 8x8 block (same signature as PWelsBlockZeroFunc) and
+// compute non-zero counts (same signature as PWelsNonZeroCountFunc).
+void_t WelsBlockZero16x16_c(int16_t * pBlock, int32_t iStride);
+void_t WelsBlockZero8x8_c  (int16_t * pBlock, int32_t iStride);
+void_t SetNonZeroCount_c   (int16_t * pBlock, int8_t * pNonZeroCount);
+
+// Fill the SBlockFunc table *pFunc; iCpu presumably carries the detected CPU feature flags -- TODO confirm.
+void_t WelsBlockFuncInit(SBlockFunc *pFunc,  int32_t iCpu);
+
+} // namespace WelsDec
+
+#endif //WELS_DECODE_SLICE_H__
+
+
--- /dev/null
+++ b/codec/decoder/core/inc/decoder.h
@@ -1,0 +1,151 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	decoder.h
+ *
+ * \brief	Interfaces introduced in decoder system architecture
+ *
+ * \date	03/10/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_DECODER_SYSTEM_ARCHITECTURE_H__
+#define WELS_DECODER_SYSTEM_ARCHITECTURE_H__
+
+#include "typedefs.h"
+#include "decoder_context.h"
+
+namespace WelsDec {
+
+#ifdef __cplusplus
+extern "C" {
+#endif//__cplusplus
+
+/*!
+ * \brief	configure decoder parameters
+ *
+ * \param	pCtx	decoder context to be configured
+ * \param	kpParam	untyped pointer to the parameter block (presumably an SDecodingParam -- confirm against the implementation)
+ */
+int32_t DecoderConfigParam ( PWelsDecoderContext pCtx, const void_t* kpParam );
+
+/*! 
+ *************************************************************************************
+ * \brief	Initialize Wels decoder parameters and memory
+ *
+ * \param 	pCtx	        input context to be initialized at first stage 
+ * \param   pTraceHandle    handle for trace
+ * \param   pLog            log callback function pointer
+ *
+ * \return	0 - succeeded
+ * \return	1 - failed
+ *
+ * \note	N/A
+ *************************************************************************************
+ */
+int32_t WelsInitDecoder( PWelsDecoderContext pCtx,  void_t * pTraceHandle, PWelsLogCallbackFunc pLog );
+
+/*! 
+ *************************************************************************************
+ * \brief	Uninitialize Wels decoder parameters and memory
+ *
+ * \param 	pCtx	input context to be uninitialized at release stage 
+ *
+ * \return	NONE
+ *
+ * \note	Counterpart of WelsInitDecoder().
+ *************************************************************************************
+ */
+void_t WelsEndDecoder( PWelsDecoderContext pCtx );
+
+/*! 
+ *************************************************************************************
+ * \brief	First entrance to decoding core interface.
+ *
+ * \param 	pCtx	        decoder context
+ * \param	kpBsBuf	        bit streaming buffer
+ * \param	kiBsLen	        length in bytes of the input bit streaming buffer
+ * \param	ppDst	        picture payload data to be output
+ * \param	pDstBufInfo	    buffer information of output data
+ *
+ * \return	0 - succeeded
+ * \return	1 - failed
+ *
+ * \note	N/A
+ *************************************************************************************
+ */
+
+int32_t WelsDecodeBs( PWelsDecoderContext pCtx, const uint8_t *kpBsBuf, const int32_t kiBsLen, 
+					   uint8_t **ppDst, SBufferInfo* pDstBufInfo);
+
+/*
+ *	request memory blocks for decoder avc part
+ *	kiMbWidth / kiMbHeight: picture size in macroblocks the memory is requested for
+ */
+int32_t WelsRequestMem( PWelsDecoderContext pCtx, const int32_t kiMbWidth, const int32_t kiMbHeight );
+
+
+/*
+ *	free memory blocks in avc (counterpart of WelsRequestMem)
+ */
+void_t WelsFreeMem( PWelsDecoderContext pCtx );
+
+/*
+ * set colorspace format in decoder
+ * kiColorFormat: requested color format (presumably stored in pCtx->iOutputColorFormat -- confirm)
+ */
+int32_t DecoderSetCsp(PWelsDecoderContext pCtx, const int32_t kiColorFormat);
+
+/*!
+ * \brief	make sure the picture resolution (taken from the slice header) is synchronized among the different parts
+ *			of the decoder internals (i.e. memory related data and so on)
+ * ( MB coordinate and parts of data within decoder context structure )
+ * \param	pCtx		Wels decoder context
+ * \param	kiMbWidth	MB width
+ * \param	kiMbHeight	MB height 
+ * \return	0 - successful; non-zero - something wrong
+ */
+int32_t SyncPictureResolutionExt( PWelsDecoderContext pCtx, const int32_t kiMbWidth, const int32_t kiMbHeight );
+
+/*!
+ * \brief	update maximal picture width and height if applicable when receiving a SPS NAL
+ * \param	pCtx		Wels decoder context
+ * \param	kiCurWidth	width carried by the SPS just received
+ * \param	kiCurHeight	height carried by the SPS just received
+ */
+void_t UpdateMaxPictureResolution( PWelsDecoderContext pCtx, const int32_t kiCurWidth, const int32_t kiCurHeight );
+
+// Assign the function pointers used for reconstruction (going by the name; see the function pointer members of the context).
+void_t AssignFuncPointerForRec( PWelsDecoderContext pCtx );
+
+// Reset the state kept about parameter sets (presumably the SPS/PPS availability flags in the context -- confirm).
+void_t ResetParameterSetsState( PWelsDecoderContext pCtx );
+
+void_t GetVclNalTemporalId( PWelsDecoderContext pCtx );//get the info whether or not there is a VCL NAL in the current AU,
+                                                            //and if YES, get the temporal ID
+
+#ifdef __cplusplus
+}
+#endif//__cplusplus
+
+} // namespace WelsDec
+
+#endif//WELS_DECODER_SYSTEM_ARCHITECTURE_H__
--- /dev/null
+++ b/codec/decoder/core/inc/decoder_context.h
@@ -1,0 +1,340 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	decoder_context.h
+ *
+ * \brief	mainly interface introduced in Wels decoder side
+ *
+ * \date	3/4/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_DECODER_FRAMEWORK_H__
+#define WELS_DECODER_FRAMEWORK_H__
+#include "typedefs.h"
+#include "utils.h"
+#include "wels_const.h"
+#include "wels_common_basis.h"
+#include "codec_app_def.h"
+#include "parameter_sets.h"
+#include "nalu.h"
+#include "dec_frame.h"
+#include "pic_queue.h"
+#include "vlc_decoder.h"
+#include "fmo.h"
+#include "as264_common.h" // for LONG_TERM_REF macro,can be delete if not need this macro
+#include "crt_util_safe_x.h"
+#include "mb_cache.h"
+
+namespace WelsDec {
+
+#ifndef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+//#define MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+
+/*
+ *	Byte buffer described by four pointers; used for the raw input data (sRawData) of the decoder context.
+ *	NOTE(review): the exact meaning of each pointer is inferred from its name only -- confirm against the users.
+ */
+typedef struct TagDataBuffer
+{
+	uint8_t* pHead;		// presumably start of the allocated buffer
+	uint8_t* pEnd;		// presumably end of the allocated buffer
+
+	uint8_t* pStartPos;	// presumably start of the data currently held
+	uint8_t* pCurPos;	// presumably current read/write position
+}SDataBuffer;
+
+//#ifdef __cplusplus
+//extern "C" {
+//#endif//__cplusplus
+
+//#pragma pack(1)
+
+/*
+ *	TODO: move the function pointer types and structures below into a separate module/file later
+ */
+
+//typedef int32_t (*rec_mb) (Mb *cur_mb, PWelsDecoderContext pCtx);
+
+/* function pointer types for intra prediction, IDCT residual add and picture border expansion */
+// intra predictor: works on pPred with luma stride kiLumaStride
+typedef void_t (*PGetIntraPredFunc)(uint8_t *pPred, const int32_t kiLumaStride);
+// IDCT of residual pRs added onto prediction pPred (stride kiStride); IdctResAddPred_c has this signature
+typedef void_t (*PIdctResAddPredFunc)(uint8_t *pPred, const int32_t kiStride, int16_t *pRs);
+// expand (pad) the picture plane pDst of kiPicWidth x kiPicHeight with stride kiStride
+typedef void_t (*PExpandPictureFunc)( uint8_t *pDst, const int32_t kiStride, const int32_t kiPicWidth, const int32_t kiPicHeight );
+
+/*
+ *	Reference picture bookkeeping: the final, short-term and long-term reference
+ *	lists (one set per list index below LIST_A) and their element counts.
+ */
+typedef struct TagRefPic {
+	PPicture			pRefList[LIST_A][MAX_REF_PIC_COUNT];	// reference picture marking plus FIFO scheme
+	PPicture			pShortRefList[LIST_A][MAX_SHORT_REF_COUNT];
+	PPicture			pLongRefList[LIST_A][MAX_LONG_REF_COUNT];
+	uint8_t				uiRefCount[LIST_A]; 
+	uint8_t				uiShortRefCount[LIST_A];
+	uint8_t				uiLongRefCount[LIST_A];	// depends on ref pic module
+	int32_t				iMaxLongTermFrameIdx;
+} SRefPic, *PRefPic;
+
+// Motion compensation routine: predicts an iWidth x iHeight block into pDst from pSrc using motion vector (iMvX, iMvY).
+// NOTE(review): the unit of the motion vector components is not visible here -- confirm against the implementation.
+typedef void_t (*PWelsMcFunc) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+						      int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight);
+// Pair of motion compensation routines, one for luma and one for chroma.
+typedef struct TagMcFunc{
+	PWelsMcFunc pMcLumaFunc;
+	PWelsMcFunc pMcChromaFunc;
+}SMcFunc;
+
+//deblock module definition
+struct TagDeblockingFunc;	// forward declaration, the full type is defined further down in this header
+
+// State handed to the deblocking filter for the picture/slice being filtered.
+typedef struct tagDeblockingFilter {
+	uint8_t	*pCsData[3];	// pointer to reconstructed picture data
+	int32_t	iCsStride[2];	// Cs stride
+	ESliceType  eSliceType;
+	int8_t	iSliceAlphaC0Offset;
+	int8_t	iSliceBetaOffset;
+	int8_t  iChromaQP;
+	int8_t  iLumaQP;
+	struct TagDeblockingFunc  *pLoopf;	// table of filter routines to call
+}SDeblockingFilter, *PDeblockingFilter;
+
+// Deblock one macroblock of pCurDqLayer; boundry_flag presumably selects which MB edges are filtered -- TODO confirm.
+typedef void_t (*PDeblockingFilterMbFunc)( PDqLayer pCurDqLayer, PDeblockingFilter  filter, int32_t boundry_flag );
+// Luma edge filters: the LT4 variant takes a per-edge iTc array, the EQ4 variant does not.
+typedef void_t (*PLumaDeblockingLT4Func)( uint8_t *iSampleY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *iTc );
+typedef void_t (*PLumaDeblockingEQ4Func)(  uint8_t *iSampleY, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+// Chroma edge filters: same split, operating on the Cb and Cr planes together.
+typedef void_t (*PChromaDeblockingLT4Func)( uint8_t *iSampleCb, uint8_t *iSampleCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *iTc );
+typedef void_t (*PChromaDeblockingEQ4Func)(  uint8_t *iSampleCb, uint8_t *iSampleCr, int32_t iStride, int32_t iAlpha, int32_t iBeta  );
+
+// Table of deblocking edge filters: luma and chroma, LT4 and EQ4 variants, "Ver" and "Hor" directions.
+typedef struct TagDeblockingFunc {
+	PLumaDeblockingLT4Func    pfLumaDeblockingLT4Ver;
+	PLumaDeblockingEQ4Func    pfLumaDeblockingEQ4Ver;
+	PLumaDeblockingLT4Func    pfLumaDeblockingLT4Hor;
+	PLumaDeblockingEQ4Func    pfLumaDeblockingEQ4Hor;
+
+	PChromaDeblockingLT4Func  pfChromaDeblockingLT4Ver;
+	PChromaDeblockingEQ4Func  pfChromaDeblockingEQ4Ver;
+	PChromaDeblockingLT4Func  pfChromaDeblockingLT4Hor;
+	PChromaDeblockingEQ4Func  pfChromaDeblockinEQ4Hor;	// NOTE(review): name lacks the 'g' of "Deblocking"; renaming needs all users updated
+} SDeblockingFunc, *PDeblockingFunc;
+
+// pDest = pPred + pRes for one block (going by the name), with separate strides for prediction and residual
+typedef void_t (*PWelsBlockAddStrideFunc)(uint8_t *pDest, uint8_t *pPred, int16_t *pRes, int32_t iPredStride, int32_t iResStride);
+// zero a coefficient block; WelsBlockZero16x16_c / WelsBlockZero8x8_c have this signature
+typedef void_t (*PWelsBlockZeroFunc) (int16_t* pBlock, int32_t iStride);
+// derive non-zero counts from a coefficient block; SetNonZeroCount_c has this signature
+typedef void_t (*PWelsNonZeroCountFunc) (int16_t *pBlock, int8_t *pNonZeroCount);
+// simple 4x4 IDCT with add (going by the name)
+typedef void_t (*PWelsSimpleIdct4x4AddFunc) (int16_t *pDest, int16_t *pSrc, int32_t iStride);
+
+// Table of block helper routines; filled in by WelsBlockFuncInit().
+typedef  struct  TagBlockFunc {
+	PWelsBlockZeroFunc			pWelsBlockZero16x16Func;
+	PWelsBlockZeroFunc			pWelsBlockZero8x8Func;
+	PWelsNonZeroCountFunc		pWelsSetNonZeroCountFunc;  
+} SBlockFunc;
+
+// Fill the neighbour MB caches (non-zero counts and intra prediction modes) for an intra 4x4 MB of pCurLayer.
+typedef void_t ( *PWelsFillNeighborMbInfoIntra4x4Func )( PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode, PDqLayer pCurLayer );
+// Parse the intra 4x4 prediction modes from the bit stream pBs.
+typedef int32_t (*PWelsParseIntra4x4ModeFunc)          ( PNeighAvail pNeighAvail, int8_t* pIntraPredMode, PBitStringAux pBs, PDqLayer pCurDqLayer);
+// Parse the intra 16x16 prediction mode from the bit stream pBs.
+typedef int32_t (*PWelsParseIntra16x16ModeFunc)        ( PNeighAvail pNeighAvail, PBitStringAux pBs, PDqLayer pCurDqLayer);
+
+// Picture border expansion routines: one for luma and two for chroma.
+// NOTE(review): what distinguishes the two chroma entries is not visible here -- confirm where the table is filled.
+typedef struct TagExpandPicFunc{
+	PExpandPictureFunc pExpandLumaPicture;
+	PExpandPictureFunc pExpandChromaPicture[2];
+}SExpandPicFunc;
+
+/*
+ *	SWelsDecoderContext: maintains the data of all modules across the decoder framework.
+ *	It groups the input and configuration, parameter sets, per-MB storage (sMb),
+ *	reference pictures, layer buffers, error-concealment flags and the function
+ *	pointer tables used while decoding.
+ */
+
+typedef struct TagWelsDecoderContext {	
+	// Input
+	void_t				*pArgDec;			// structured arguments for decoder, reserved here for extension in the future
+
+	SDataBuffer       	sRawData;
+
+	// Configuration
+	SDecodingParam	    *pParam;
+	uint32_t			uiCpuFlag;			// CPU compatibility detected
+	int32_t 	   		iDecoderMode;		// indicate decoder running mode
+	int32_t				iSetMode;			// indicate decoder mode set from upper layer, this is read-only for decoder internal
+	int32_t 			iDecoderOutputProperty; // indicate the output buffer property
+	int32_t				iModeSwitchType;	// 1: optimal decision; 2: forced switch to the other mode; 0: no switch
+	
+	int32_t				iOutputColorFormat;		// color space format to be output
+	VIDEO_BITSTREAM_TYPE eVideoType; //indicate the type of video to decide whether or not to do qp_delta error detection.
+	bool_t				bErrorResilienceFlag;		// error resilience flag
+	bool_t				bHaveGotMemory;	// global memory for decoder context related ever requested?	
+	
+	int32_t				iImgWidthInPixel;	// width of image in pixel reconstruction picture to be output
+	int32_t				iImgHeightInPixel;// height of image in pixel reconstruction picture to be output
+	int32_t				iMaxWidthInSps;	// maximal width of pixel in SPS sets
+	int32_t				iMaxHeightInSps;	// maximal height of pixel in SPS sets
+
+	// Derived common elements
+	SNalUnitHeader		sCurNalHead;
+	ESliceType			eSliceType;			// Slice type
+	int32_t				iFrameNum;
+	int32_t				iPrevFrameNum;		// frame number of previous frame well decoded for non-truncated mode yet
+    bool_t              bLastHasMmco5;      // presumably: the previous picture carried an MMCO 5 operation -- TODO confirm
+	int32_t				iErrorCode;			// error code return while decoding in case packets lost
+	SFmo				sFmoList[MAX_PPS_COUNT];	// list for FMO storage
+	PFmo				pFmo;				// current fmo context after parsed slice_header
+	int32_t				iActiveFmoNum;		// active count number of fmo context in list
+
+	/*needed info by decode slice level and mb level*/
+	int32_t				iDecBlockOffsetArray[24];	// address table for sub 4x4 blocks in intra4x4_mb, so there is no need to calculate the address every time.
+
+	// MB-level storage; every pointer array holds one entry per exchangeable layer (LAYER_NUM_EXCHANGEABLE)
+	struct
+	{
+		int8_t  *pMbType[LAYER_NUM_EXCHANGEABLE];                      /* mb type */
+		int16_t	(*pMv[LAYER_NUM_EXCHANGEABLE][LIST_A])[MB_BLOCK4x4_NUM][MV_A]; //[LAYER_NUM_EXCHANGEABLE   MB_BLOCK4x4_NUM*]
+		int8_t	(*pRefIndex[LAYER_NUM_EXCHANGEABLE][LIST_A])[MB_BLOCK4x4_NUM]; 
+		int8_t	*pLumaQp[LAYER_NUM_EXCHANGEABLE];	/*mb luma_qp*/
+		int8_t	*pChromaQp[LAYER_NUM_EXCHANGEABLE];					/*mb chroma_qp*/
+		int8_t	(*pNzc[LAYER_NUM_EXCHANGEABLE])[24];
+		int8_t	(*pNzcRs[LAYER_NUM_EXCHANGEABLE])[24];	
+		int16_t (*pScaledTCoeff[LAYER_NUM_EXCHANGEABLE])[MB_COEFF_LIST_SIZE]; /*need be aligned*/
+		int8_t	(*pIntraPredMode[LAYER_NUM_EXCHANGEABLE])[8]; //0~3 top4x4 ; 4~6 left 4x4; 7 intra16x16
+		int8_t  (*pIntra4x4FinalMode[LAYER_NUM_EXCHANGEABLE])[MB_BLOCK4x4_NUM];
+		int8_t  *pChromaPredMode[LAYER_NUM_EXCHANGEABLE];
+		int8_t  *pCbp[LAYER_NUM_EXCHANGEABLE];
+		uint8_t (*pMotionPredFlag[LAYER_NUM_EXCHANGEABLE][LIST_A])[MB_PARTITION_SIZE]; // 8x8
+		int8_t  (*pSubMbType[LAYER_NUM_EXCHANGEABLE])[MB_SUB_PARTITION_SIZE];
+		int32_t *pSliceIdc[LAYER_NUM_EXCHANGEABLE];		// using int32_t for slice_idc
+		int8_t  *pResidualPredFlag[LAYER_NUM_EXCHANGEABLE];	
+		int8_t  *pInterPredictionDoneFlag[LAYER_NUM_EXCHANGEABLE];
+		int16_t iMbWidth;
+		int16_t iMbHeight;
+	}sMb;
+
+
+	// reconstruction picture	
+	PPicture			pDec;			//pointer to current picture being reconstructed
+
+	// reference pictures
+	SRefPic				sRefPic;
+
+	SVlcTable			sVlcTable;		 // vlc table
+	
+	SBitStringAux		sBs;
+
+	/* Global memory external */
+
+	SPosOffset	sFrameCrop;
+
+#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+	int32_t             iSpsTotalNum;  //the number of SPS in current IDR interval
+	int32_t             iSubspsTotalNum; //the number of subsps in current IDR interval
+	int32_t             iPpsTotalNum; //the number of PPS in current IDR interval.
+#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID	
+
+	SSps				sSpsBuffer[MAX_SPS_COUNT];
+	SPps				sPpsBuffer[MAX_PPS_COUNT];
+	PSliceHeader		pSliceHeader;
+
+	PPicBuff	        pPicBuff[LIST_A];	// Initially allocated memory for pictures which are used in decoding.
+	int32_t				iPicQueueNumber;
+	
+	SSubsetSps			sSubsetSpsBuffer[MAX_SPS_COUNT];
+	SNalUnit            sPrefixNal;
+	
+	PAccessUnit			pAccessUnitList;	// current access unit list to be performed
+	PSps				pSps;	// used by current AU
+	PPps				pPps;	// used by current AU
+	// Memory for pAccessUnitList is dynamically held till decoder destruction.
+	PDqLayer			pCurDqLayer;		// current DQ layer representation, also carry reference base layer if applicable
+	PDqLayer			pDqLayersList[LAYER_NUM_EXCHANGEABLE];	// DQ layers list with memory allocated
+	uint8_t				*pCsListXchg[LAYER_NUM_EXCHANGEABLE][3];	// Constructed picture buffer: 0- cur layer, 1- ref layer;
+	int16_t				*pRsListXchg[LAYER_NUM_EXCHANGEABLE][3];// Residual picture buffer: 0- cur layer, 1- ref layer;
+
+	int32_t				iCsStride[3];		// strides for Cs
+	int32_t				iRsStride[3];		// strides for Rs
+
+	int32_t             iPicWidthReq;		// picture width have requested the memory
+	int32_t             iPicHeightReq;		// picture height have requested the memory
+
+	uint8_t				uiTargetDqId;		// maximal DQ ID in current access unit, meaning target layer ID	
+	bool_t				bAvcBasedFlag;		// For decoding bitstream:
+	bool_t				bEndOfStreamFlag;	// Flag on end of stream requested by external application layer
+	bool_t				bInitialDqLayersMem;	// dq layers related memory is available?
+
+	bool_t              bOnlyOneLayerInCurAuFlag; //only one layer in current AU: 1
+	
+	// for EC parameter sets
+	bool_t				bSpsExistAheadFlag;	// whether does SPS NAL exist ahead of sequence?
+	bool_t				bSubspsExistAheadFlag;// whether does Subset SPS NAL exist ahead of sequence?
+	bool_t				bPpsExistAheadFlag;	// whether does PPS NAL exist ahead of sequence?
+
+	bool_t				bSpsAvailFlags[MAX_SPS_COUNT];
+	bool_t				bSubspsAvailFlags[MAX_SPS_COUNT];
+	bool_t				bPpsAvailFlags[MAX_PPS_COUNT];
+	bool_t				bReferenceLostAtT0Flag;
+#ifdef LONG_TERM_REF
+	bool_t				bParamSetsLostFlag;	//sps or pps do not exist or not correct
+
+	bool_t              bCurAuContainLtrMarkSeFlag; //current AU has the LTR marking syntax element, mark the previous frame or self
+	int32_t             iFrameNumOfAuMarkedLtr; //if bCurAuContainLtrMarkSeFlag==true, SHOULD set this variable
+
+	uint16_t            uiCurIdrPicId;
+#endif
+
+	PGetIntraPredFunc 	pGetI16x16LumaPredFunc[7];		//h264_predict_copy_16x16;
+	PGetIntraPredFunc 	pGetI4x4LumaPredFunc[14];		// h264_predict_4x4_t
+	PGetIntraPredFunc 	pGetIChromaPredFunc[7];		// h264_predict_8x8_t
+	PIdctResAddPredFunc	pIdctResAddPredFunc;
+	SMcFunc				sMcFunc;
+	/* For Deblocking */
+	SDeblockingFunc     sDeblockingFunc;
+    SExpandPicFunc	    sExpandPicFunc;
+
+	/* For Block */
+	SBlockFunc          sBlockFunc;
+	/* For EC */
+	int32_t iCurSeqIntervalTargetDependId;
+	int32_t iCurSeqIntervalMaxPicWidth;
+	int32_t iCurSeqIntervalMaxPicHeight;
+	
+	PWelsFillNeighborMbInfoIntra4x4Func  pFillInfoCacheIntra4x4Func;
+	PWelsParseIntra4x4ModeFunc           pParseIntra4x4ModeFunc;
+	PWelsParseIntra16x16ModeFunc         pParseIntra16x16ModeFunc;
+
+	//feedback whether or not have VCL in current AU, and the temporal ID
+	int32_t iFeedbackVclNalInAu;
+	int32_t iFeedbackTidInAu;	
+
+	bool_t bAuReadyFlag;   // TRUE: one au is ready for decoding; FALSE: default value
+	
+	//trace handle
+	void_t   *   pTraceHandle;
+	
+#ifdef NO_WAITING_AU
+	//Save the last nal header info
+	SNalUnitHeaderExt sLastNalHdrExt;
+	SSliceHeader      sLastSliceHeader;
+#endif
+
+}SWelsDecoderContext, *PWelsDecoderContext;
+
+//#pragma pack()
+
+//#ifdef __cplusplus
+//}
+//#endif//__cplusplus
+
+} // namespace WelsDec
+
+#endif//WELS_DECODER_FRAMEWORK_H__
--- /dev/null
+++ b/codec/decoder/core/inc/decoder_core.h
@@ -1,0 +1,138 @@
+/*!
+ * \copy
+ *     Copyright (c)  2008-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ *  decoder_core.h
+ *
+ *  Abstract
+ *      Core decoder interfaces: memory setup, NAL/slice header parsing, access unit construction and decoding
+ *
+ *  History
+ *      07/10/2008 Created
+ *
+ *****************************************************************************/
+#ifndef WELS_DECODER_CORE_H__
+#define WELS_DECODER_CORE_H__
+
+#include "typedefs.h"
+#include "wels_common_basis.h"
+#include "decoder_context.h"
+
+#include "codec_def.h"
+
+namespace WelsDec {
+
+/*
+ * WelsInitMemory
+ * Request the memory needed by the decoder context, especially for:
+ * rbsp_au_buffer, cur_dq_layer_ptr and ref_dq_layer_ptr in MB info cache.
+ * return:
+ *	0 - success; otherwise an error number.
+ *	NOTE(review): this comment used to cite error_no.h; the decoder error codes
+ *	appear to be declared in error_code.h -- confirm which header is meant.
+*/
+int32_t WelsInitMemory( PWelsDecoderContext pCtx );
+
+/*
+ * WelsFreeMemory
+ * Free the memory requested by WelsInitMemory; intended to be called when the
+ * decoder is destroyed.
+ */
+void_t WelsFreeMemory( PWelsDecoderContext pCtx );
+
+/*!
+ * \brief	request memory once the maximal picture width and height are available
+ *
+ * \param	pCtx		decoder context
+ * \param	kiMaxWidth	maximal picture width
+ * \param	kiMaxHeight	maximal picture height
+ *
+ * \return	NOTE(review): return convention is not stated here; presumably 0 on success
+ *			like WelsInitMemory -- confirm against the implementation.
+ */
+int32_t InitialDqLayersContext ( PWelsDecoderContext pCtx, const int32_t kiMaxWidth, const int32_t kiMaxHeight );
+
+/*!
+ * \brief	free the memory related to the dq layer context
+ */
+void_t UninitialDqLayersContext ( PWelsDecoderContext pCtx );
+
+/*
+ *	DecodeNalHeaderExt
+ *	Trigger condition: NAL_UNIT_TYPE = NAL_UNIT_PREFIX or NAL_UNIT_CODED_SLICE_EXT
+ *	Parameter:
+ *	pNal:	target NAL unit pointer
+ *	pSrc:	NAL unit bitstream
+ */
+void_t DecodeNalHeaderExt( PNalUnit pNal, uint8_t* pSrc );
+
+/*
+ *	ParseSliceHeaderSyntaxs
+ *	Parse the slice header from the bitstream pBs.
+ *	NOTE(review): kbExtensionFlag presumably selects parsing of the slice header
+ *	extension syntax -- confirm against the implementation.
+ */
+int32_t ParseSliceHeaderSyntaxs ( PWelsDecoderContext pCtx, PBitStringAux pBs, const bool_t kbExtensionFlag );
+/*
+ *	PrefetchNalHeaderExtSyntax
+ *	Copy the related syntax elements of NALUnitHeaderExt, sRefPicBaseMarking and bStoreRefBasePicFlag
+ *	from a prefix NAL unit.
+ *	kpSrc:	prefix NAL that has already been decoded
+ *	kpDst:	the VCL NAL (AVC based, I/P slice) that follows it
+ */
+bool_t PrefetchNalHeaderExtSyntax ( PWelsDecoderContext pCtx, PNalUnit const kpDst, PNalUnit const kpSrc);
+
+
+/*
+ * ConstructAccessUnit
+ * Construct an access unit from the given input bitstream; the input may be a partial NAL unit,
+ * and one or more NAL units are joined into one collective access unit.
+ * parameters:
+ *	pCtx:		decoder context
+ *	ppDst:		NOTE(review): presumably the output picture buffer pointers -- confirm
+ *	pDstInfo:	NOTE(review): presumably the output buffer description -- confirm
+ * NOTE(review): this comment used to list buf, bit_len, buf_len and coded_au, none of which
+ * are parameters of this function.
+ * return:
+ *	0 - success; otherwise an error number (the original comment cites error_no.h)
+ */
+int32_t ConstructAccessUnit( PWelsDecoderContext pCtx, uint8_t** ppDst, SBufferInfo *pDstInfo);
+
+
+/*
+ * DecodeCurrentAccessUnit
+ * Decode the current access unit once the current AU is complete.
+ */
+int32_t DecodeCurrentAccessUnit( PWelsDecoderContext pCtx, uint8_t **ppDst, int32_t *iDstLen, int32_t *pWidth, int32_t *pHeight, SBufferInfo *pDstInfo );
+
+/*
+ *	Prepare the initialization of the current dq layer context.
+ */
+void_t WelsDqLayerDecodeStart ( PWelsDecoderContext pCtx, PNalUnit pCurNal, PSps pSps, PPps pPps );
+
+
+/* Access-unit level start/end hooks. NOTE(review): exact duties are not documented in this header. */
+int32_t WelsDecodeAccessUnitStart ( PWelsDecoderContext pCtx );
+void_t WelsDecodeAccessUnitEnd ( PWelsDecoderContext pCtx );
+
+/* Forced reset/clear helpers operating on an access unit. NOTE(review): confirm what state each one discards. */
+void_t ForceResetCurrentAccessUnit( PAccessUnit pAu );
+void_t ForceClearCurrentNal( PAccessUnit pAu );
+
+} // namespace WelsDec
+
+#endif//WELS_DECODER_CORE_H__
+
+
--- /dev/null
+++ b/codec/decoder/core/inc/error_code.h
@@ -1,0 +1,171 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	error_code.h
+ *
+ * \brief	Error codes used in Wels decoder side
+ *
+ * \date	3/4/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_ERROR_CODE_H__
+#define WELS_ERROR_CODE_H__
+
+namespace WelsDec {
+
+/* Coarse result codes; ERR_NONE (0) means success. */
+typedef enum TagWelsErr
+{
+	ERR_NONE				= 0,
+	ERR_INVALID_PARAMETERS	= 1,
+	ERR_MALLOC_FAILED		= 2,
+	ERR_API_FAILED			= 3,
+	
+	ERR_BOUND				= 31,	// largest value; NOTE(review): presumably an upper-bound sentinel -- confirm
+}EWelsErr;
+
+/*
+ * Specified error format:
+ * ERR_NO = (ERR_LEVEL_FROM (HIGH WORD) << 16) | (ERR_INFO_FROM (LOW WORD))
+ *
+ * iErrLevel:	one of the ERR_LEVEL_* values, placed in the high word
+ * iErrInfo:	one of the ERR_INFO_* values, masked to the low 16 bits
+ *
+ * Both arguments are parenthesized in the expansion so that compound
+ * expressions (e.g. "a | b") are shifted/masked as a whole.
+ */
+#define GENERATE_ERROR_NO(iErrLevel, iErrInfo)	(((iErrLevel) << 16) | ((iErrInfo) & 0xFFFF))
+
+
+/* ERR_LEVEL: the stage an error is reported from; used as the high word of GENERATE_ERROR_NO */
+//-----------------------------------------------------------------------------------------------------------
+enum{
+	ERR_LEVEL_ACCESS_UNIT = 1,	// numbering starts at 1; the following levels increase by one each
+	ERR_LEVEL_NAL_UNIT_HEADER,
+	ERR_LEVEL_PREFIX_NAL,
+	ERR_LEVEL_PARAM_SETS,
+	ERR_LEVEL_SLICE_HEADER,
+	ERR_LEVEL_SLICE_DATA,
+	ERR_LEVEL_MB_DATA,
+};
+
+//-----------------------------------------------------------------------------------------------------------
+
+/* More detailed error information (low word of GENERATE_ERROR_NO), maximal value is 65535 */
+//-----------------------------------------------------------------------------------------------------------
+#define ERR_INFO_COMMON_BASE		1	// first value of the common system level range
+#define ERR_INFO_SYNTAX_BASE		1001	// first value of the syntax parser range
+#define ERR_INFO_LOGIC_BASE		10001	// first value of the logic range
+enum{
+	/* Errors from common system level: 1-1000 */
+	ERR_INFO_OUT_OF_MEMORY		= ERR_INFO_COMMON_BASE,
+	ERR_INFO_INVALID_ACCESS,
+	ERR_INFO_INVALID_PTR,
+	ERR_INFO_INVALID_PARAM,
+	ERR_INFO_FILE_NO_FOUND,
+	ERR_INFO_PATH_NO_FOUND,
+	ERR_INFO_ACCESS_DENIED,
+	ERR_INFO_NOT_READY,
+	ERR_INFO_WRITE_FAULT,
+	ERR_INFO_READ_FAULT,	
+	/* Errors from the H.264 syntax elements parser: 1001-10000 */
+	ERR_INFO_NO_PREFIX_CODE		= ERR_INFO_SYNTAX_BASE,	// No start prefix code indication
+	ERR_INFO_NO_PARAM_SETS, 					// No SPS and/or PPS before sequence header
+	ERR_INFO_PARAM_SETS_NOT_INTEGRATED,			// Parameter sets (SPS/PPS) are not complete before decoding a VCL NAL
+	ERR_INFO_SPS_ID_OVERFLOW,
+	ERR_INFO_PPS_ID_OVERFLOW,
+	ERR_INFO_INVALID_PROFILE_IDC, 
+	ERR_INFO_UNMATCHED_LEVEL_IDC, 
+	ERR_INFO_INVALID_POC_TYPE,
+	ERR_INFO_REF_COUNT_OVERFLOW,
+	ERR_INFO_CROPPING_NO_SUPPORTED,
+	ERR_INFO_INVALID_SLICEGROUP,
+	ERR_INFO_INVALID_SLICEGROUP_MAP_TYPE,
+	ERR_INFO_INVALID_FRAME_NUM,
+	ERR_INFO_FMO_INIT_FAIL,
+	ERR_INFO_SLICE_TYPE_OVERFLOW,
+	ERR_INFO_INVALID_QP,
+	ERR_INFO_INVALID_DBLOCKING_IDC,
+	ERR_INFO_INVALID_MB_TYPE,
+	ERR_INFO_INVALID_SUB_MB_TYPE,
+	ERR_INFO_UNAVAILABLE_TOP_BLOCK_FOR_INTRA,
+	ERR_INFO_UNAVAILABLE_LEFT_BLOCK_FOR_INTRA,
+	ERR_INFO_INVALID_REF_INDEX,
+	ERR_INFO_INVALID_CBP,
+	ERR_INFO_DQUANT_OUT_OF_RANGE,
+	ERR_INFO_CAVLC_INVALID_PREFIX,
+	ERR_INFO_CAVLC_INVALID_TOTAL_COEFF,
+	ERR_INFO_CAVLC_INVALID_ZERO_LEFT,
+	ERR_INFO_MV_OUT_OF_RANGE,
+
+	ERR_INFO_INVALID_I4x4_PRED_MODE, 
+	ERR_INFO_INVALID_I16x16_PRED_MODE,
+	ERR_INFO_INVALID_I_CHROMA_PRED_MODE,
+
+	/* Bitstream features reported as unsupported */
+    ERR_INFO_UNSUPPORTED_NON_BASELINE,
+    ERR_INFO_UNSUPPORTED_FMOTYPE,
+    ERR_INFO_UNSUPPORTED_MBAFF,
+    ERR_INFO_UNSUPPORTED_ILP,
+    ERR_INFO_UNSUPPORTED_CABAC_EL,
+    ERR_INFO_UNSUPPORTED_SPSI,
+    ERR_INFO_UNSUPPORTED_MGS,
+    ERR_INFO_UNSUPPORTED_BIPRED,
+    ERR_INFO_UNSUPPORTED_WP,
+
+    ERR_INFO_FRAMES_LOST,
+	ERR_INFO_DEPENDENCY_SPATIAL_LAYER_LOST,
+	ERR_INFO_DEPENDENCY_QUALIT_LAYER_LOST,
+	ERR_INFO_REFERENCE_PIC_LOST,
+	ERR_INFO_INVALID_REORDERING,
+	ERR_INFO_INVALID_MARKING,
+
+	ERR_INFO_FMO_NOT_SUPPORTED_IN_BASE_LAYER,
+	ERR_INFO_INVALID_ESS,
+	ERR_INFO_INVALID_SLICE_TYPE,
+	ERR_INFO_INVALID_REF_MARKING,
+	ERR_INFO_INVALID_REF_REORDERING,
+	
+	/* Errors from corresponding logic, 10001-65535 */
+	ERR_INFO_NO_IDR_PIC		= ERR_INFO_LOGIC_BASE,	// No IDR picture available before sequence header
+	ERR_INFO_EC_NO_NEIGHBOUR_MBS,
+	ERR_INFO_EC_UNEXPECTED_MB_TYPE,
+	ERR_INFO_EC_NO_ENOUGH_NEIGHBOUR_MBS,
+	//for LTR
+	ERR_INFO_INVALID_MMCO_OPCODE_BASE,
+	ERR_INFO_INVALID_MMCO_SHORT2UNUSED,
+	EER_INFO_INVALID_MMCO_LONG2UNUSED,	// NOTE(review): "EER" is a misspelling of "ERR"; kept because the name is part of the public interface
+	ERR_INFO_INVALID_MMCO_SHOART2LONG,	// NOTE(review): "SHOART" is a misspelling of "SHORT"; kept for the same reason
+	ERR_INFO_INVALID_MMCO_REF_NUM_OVERFLOW,
+	ERR_INFO_INVALID_MMCO_REF_NUM_NOT_ENOUGH,
+	ERR_INFO_INVALID_MMCO_LONG_TERM_IDX_EXCEED_MAX,
+};
+//-----------------------------------------------------------------------------------------------------------
+
+} // namespace WelsDec
+
+#endif//WELS_ERROR_CODE_H__
+
+
--- /dev/null
+++ b/codec/decoder/core/inc/expand_pic.h
@@ -1,0 +1,78 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file		expand_pic.h
+ *
+ * \brief		Interface for expanding reconstructed picture to be used for reference
+ *
+ * \date		06/08/2009 Created
+ *************************************************************************************
+ */
+
+#ifndef WELS_EXPAND_PIC_H__
+#define WELS_EXPAND_PIC_H__
+
+#include "decoder_context.h"
+#include "picture.h"
+
+namespace WelsDec {
+
+/*
+ * Expand a reconstructed picture so that it can be used for reference.
+ *	pPic:					picture to expand
+ *	pExpandPictureLuma:		function used for the luma plane
+ *	pExpandPictureChroma:	two functions used for chroma
+ *							NOTE(review): how the two entries are selected is not visible here -- confirm
+ */
+void_t ExpandReferencingPicture(PPicture pPic, PExpandPictureFunc pExpandPictureLuma, PExpandPictureFunc pExpandPictureChroma[2]);
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined(X86_ASM)
+/* Externally implemented expand routines with C linkage, declared only when X86_ASM is defined. */
+void_t ExpandPictureLuma_sse2(	uint8_t *pDst,
+								const int32_t kiStride,
+								const int32_t kiPicWidth,
+								const int32_t kiPicHeight	);
+void_t ExpandPictureChromaAlign_sse2(	uint8_t *pDst,
+									const int32_t kiStride,
+									const int32_t kiPicWidth,
+									const int32_t kiPicHeight	);
+void_t ExpandPictureChromaUnalign_sse2(	uint8_t *pDst,
+									const int32_t kiStride,
+									const int32_t kiPicWidth,
+									const int32_t kiPicHeight	);
+#endif//X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+// Fill pExpandPicFunc; NOTE(review): presumably selects implementations according to kuiCpuFlags -- confirm
+void_t InitExpandPictureFunc( SExpandPicFunc *pExpandPicFunc, const uint32_t kuiCpuFlags );
+
+} // namespace WelsDec
+
+#endif//WELS_EXPAND_PIC_H__
--- /dev/null
+++ b/codec/decoder/core/inc/fmo.h
@@ -1,0 +1,113 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	fmo.h
+ *
+ * \brief	Flexible Macroblock Ordering implementation
+ *
+ * \date	2/4/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_FLEXIBLE_MACROBLOCK_ORDERING_H__
+#define WELS_FLEXIBLE_MACROBLOCK_ORDERING_H__
+
+#include "typedefs.h"
+#include "wels_const.h"
+#include "parameter_sets.h"
+
+namespace WelsDec {
+
+/* Integer type of a macroblock index (mb_xy) as taken and returned by FmoNextMb; overridable by predefining MB_XY_T */
+#ifndef MB_XY_T
+#define MB_XY_T	int16_t
+#endif//MB_XY_T
+
+/*! 
+ * \brief	Wels Flexible Macroblock Ordering (FMO) 
+ */
+typedef struct TagFmo{
+	uint8_t		*pMbAllocMap;		// NOTE(review): presumably the per-macroblock slice group map -- confirm
+	int32_t		iCountMbNum;
+	int32_t		iSliceGroupCount;
+	int32_t		iSliceGroupType;	
+	bool_t		bActiveFlag;
+	uint8_t		uiReserved[3];		// reserved padding bytes
+} SFmo, *PFmo;
+
+
+/*!
+ * \brief	Initialize Wels Flexible Macroblock Ordering (FMO)
+ *
+ * \param	pFmo		Wels fmo to be initialized
+ * \param	pPps		PPps
+ * \param	kiMbWidth	mb width
+ * \param	kiMbHeight	mb height
+ *
+ * \return	0 - successful; non-zero - failed;
+ */
+int32_t	InitFmo( PFmo pFmo, PPps pPps, const int32_t kiMbWidth, const int32_t kiMbHeight );
+
+/*!
+ * \brief	Uninitialize Wels Flexible Macroblock Ordering (FMO) list
+ *
+ * \param	pFmo		Wels base fmo ptr to be uninitialized
+ * \param	kiCnt		count number of PPS per list
+ * \param	kiAvail		count available number of PPS in list
+ *
+ * \return	NONE
+ */
+void_t UninitFmoList( PFmo pFmo, const int32_t kiCnt, const int32_t kiAvail );
+
+/*!
+ * \brief	update/insert FMO parameter unit
+ *
+ * \param	pFmo	FMO context
+ * \param	pSps	PSps
+ * \param	pPps	PPps
+ * \param	pActiveFmoNum	int32_t* [in/out]
+ *
+ * \return	true - update/insert successfully; false - failed;
+ */
+bool_t FmoParamUpdate( PFmo pFmo, PSps pSps, PPps pPps, int32_t *pActiveFmoNum );
+
+/*!
+ * \brief	Get the next mb to be processed after the given current mb_xy
+ *
+ * \param	pFmo			Wels fmo context
+ * \param	kiMbXy			current mb_xy
+ *
+ * \return	iNextMb - successful; -1 - failed;
+ */
+MB_XY_T FmoNextMb( PFmo pFmo, const MB_XY_T kiMbXy );
+
+} // namespace WelsDec
+
+#endif//WELS_FLEXIBLE_MACROBLOCK_ORDERING_H__
--- /dev/null
+++ b/codec/decoder/core/inc/get_intra_predictor.h
@@ -1,0 +1,121 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	get_intra_predictor.h
+ *
+ * \brief	interfaces for get intra predictor about 16x16, 4x4, chroma.
+ *
+ * \date	4/2/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELS_GET_INTRA_PREDICTOR_H__
+#define WELS_GET_INTRA_PREDICTOR_H__
+
+#include "typedefs.h"
+
+//#pragma pack(1)
+
+namespace WelsDec {
+
+/*
+ * Intra predictor interfaces. Every function below shares one signature:
+ *	pPred:		NOTE(review): presumably the block to be predicted, inside the picture buffer -- confirm
+ *	kiStride:	NOTE(review): presumably the row stride of that buffer -- confirm
+ */
+
+/* 4x4 luma predictors (14 entries) */
+void_t WelsI4x4LumaPredV_c     (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredH_c     (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDc_c    (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDcLeft_c(uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDcTop_c (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDcNA_c  (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDDL_c   (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDDLTop_c(uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDDR_c   (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredVL_c    (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredVLTop_c (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredVR_c    (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredHU_c    (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredHD_c    (uint8_t *pPred, const int32_t kiStride);
+
+/* chroma predictors (7 entries) */
+void_t WelsIChromaPredV_c      (uint8_t *pPred, const int32_t kiStride);
+void_t WelsIChromaPredH_c      (uint8_t *pPred, const int32_t kiStride);
+void_t WelsIChromaPredPlane_c  (uint8_t *pPred, const int32_t kiStride);
+void_t WelsIChromaPredDc_c     (uint8_t *pPred, const int32_t kiStride);
+void_t WelsIChromaPredDcLeft_c (uint8_t *pPred, const int32_t kiStride);
+void_t WelsIChromaPredDcTop_c  (uint8_t *pPred, const int32_t kiStride);
+void_t WelsIChromaPredDcNA_c   (uint8_t *pPred, const int32_t kiStride);
+
+/* 16x16 luma predictors (7 entries) */
+void_t WelsI16x16LumaPredV_c     (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredH_c     (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredPlane_c (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredDc_c    (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredDcTop_c (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredDcLeft_c(uint8_t *pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredDcNA_c  (uint8_t *pPred, const int32_t kiStride);
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined(X86_ASM)
+/* Variants with C linkage, declared only when X86_ASM is defined; not every predictor has one. */
+void_t WelsI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredH_sse2    (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredV_sse2    (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredDc_sse2   (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride);
+void_t WelsI16x16LumaPredDcNA_sse2 (uint8_t *pPred, const int32_t kiStride);
+
+void_t WelsIChromaPredDcTop_sse2   (uint8_t *pPred, const int32_t kiStride);
+void_t WelsIChromaPredPlane_sse2   (uint8_t *pPred, const int32_t kiStride);
+void_t WelsIChromaPredDc_sse2      (uint8_t *pPred, const int32_t kiStride);
+void_t WelsIChromaPredH_mmx        (uint8_t *pPred, const int32_t kiStride);
+void_t WelsIChromaPredV_mmx        (uint8_t *pPred, const int32_t kiStride);
+void_t WelsIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride);
+void_t WelsIChromaPredDcNA_mmx  (uint8_t *pPred, const int32_t kiStride);
+
+void_t WelsI4x4LumaPredH_sse2 (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredHD_mmx (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredHU_mmx (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredVR_mmx (uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride);
+void_t WelsI4x4LumaPredVL_mmx (uint8_t *pPred, const int32_t kiStride);
+#endif//X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+} // namespace WelsDec
+
+//#pragma pack()
+
+#endif //WELS_GET_INTRA_PREDICTOR_H__
+
+
--- /dev/null
+++ b/codec/decoder/core/inc/ls_defines.h
@@ -1,0 +1,86 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef ___LD_ST_MACROS___
+#define ___LD_ST_MACROS___
+
+#include "typedefs.h"
+
+/*
+ * Load/store helpers for 16/32/64-bit values at an arbitrary address.
+ *	LDnn(a):		read an unsigned nn-bit value from address a
+ *	STnn(a, b):		write b as an unsigned nn-bit value to address a
+ * Every expansion is fully parenthesized so it can be used inside a larger
+ * expression without precedence surprises.
+ */
+#ifdef __GNUC__
+
+	/* Packed one-member structs: going through ->l tells GCC the address may be unaligned. */
+	struct tagUnaligned_64 { uint64_t l; } __attribute__((packed));
+	struct tagUnaligned_32 { uint32_t l; } __attribute__((packed));
+	struct tagUnaligned_16 { uint16_t l; } __attribute__((packed));
+	
+	#define LD16(a) (((struct tagUnaligned_16 *) (a))->l)
+	#define LD32(a) (((struct tagUnaligned_32 *) (a))->l)
+	#define LD64(a) (((struct tagUnaligned_64 *) (a))->l)
+
+	#define ST16(a, b) ((((struct tagUnaligned_16 *) (a))->l) = (b))
+	#define ST32(a, b) ((((struct tagUnaligned_32 *) (a))->l) = (b))
+	#define ST64(a, b) ((((struct tagUnaligned_64 *) (a))->l) = (b))
+
+#else
+
+/* Other compilers: plain pointer casts; the access is only as safe as the target's
+ * handling of unaligned addresses. */
+#define LD16(a) (*((uint16_t*)(a)))
+#define LD32(a) (*((uint32_t*)(a)))
+#define LD64(a) (*((uint64_t*)(a)))
+
+#define ST16(a, b) (*((uint16_t*)(a)) = (b))
+#define ST32(a, b) (*((uint32_t*)(a)) = (b))
+#define ST64(a, b) (*((uint64_t*)(a)) = (b))
+
+#endif /* !__GNUC__ */
+
+/* INTDnn default to the matching LDnn unless defined earlier. */
+#ifndef INTD16
+#define INTD16	LD16
+#endif//INTD16
+
+#ifndef INTD32
+#define INTD32	LD32
+#endif//INTD32
+
+#ifndef INTD64
+#define INTD64	LD64
+#endif//INTD64
+
+#endif//___LD_ST_MACROS___
--- /dev/null
+++ b/codec/decoder/core/inc/macros.h
@@ -1,0 +1,306 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	macros.h
+ *
+ * \brief	MACRO based tool utilization
+ *
+ * \date	3/13/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_MACRO_UTILIZATIONS_H__
+#define WELS_MACRO_UTILIZATIONS_H__
+
+#include <assert.h>
+#include <math.h>
+#include <stddef.h>
+#include "typedefs.h"
+
+
+namespace WelsDec {
+
+/*
+* FORCE_STACK_ALIGN_1D: force 1 dimension local data aligned in stack
+* _tp: type
+* _nm: var name
+* _sz: size
+* _al: align bytes
+* auxiliary var: _nm ## _tEmP
+*
+* Declares an over-sized local array _nm##_tEmP and a pointer _nm into it whose
+* address is a multiple of _al. The address is inspected through size_t rather
+* than int32_t: converting a pointer to a narrower integer is ill-formed in C++
+* on 64-bit targets, and only the low bits are needed for the mask anyway.
+*/
+#define FORCE_STACK_ALIGN_1D(_tp, _nm, _sz, _al) \
+	_tp _nm ## _tEmP[(_sz)+(_al)-1]; \
+	_tp *_nm = _nm ## _tEmP + ((_al)-1) - (((size_t)(_nm ## _tEmP + ((_al)-1)) & ((_al)-1))/sizeof(_tp))
+
+
+/*
+* ENFORCE_STACK_ALIGN_2D: declare a local _cx by _cy array of _tp aligned to _al bytes
+* _tp: type
+* _nm: var name, usable afterwards as _nm[x][y]
+* _cx, _cy: dimensions
+* _al: align bytes, asserted to be a power of 2 and >= sizeof(_tp)
+* auxiliary vars: _nm ## _tEmP, _nm ## _tEmP_al
+*
+* The address is inspected through size_t rather than int32_t: converting a
+* pointer to a narrower integer is ill-formed in C++ on 64-bit targets.
+* The expansion ends with ';', so it is a sequence of statements/declarations.
+*/
+#define ENFORCE_STACK_ALIGN_2D(_tp, _nm, _cx, _cy, _al) \
+	assert( ((_al) && !((_al) & ((_al) - 1))) && ((_al) >= sizeof(_tp)) ); /*_al should be power-of-2 and >= sizeof(_tp)*/\
+	_tp _nm ## _tEmP[(_cx)*(_cy)+(_al)/sizeof(_tp)-1]; \
+	_tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)/sizeof(_tp)-1); \
+	_nm ## _tEmP_al -= (((size_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); \
+	_tp (*_nm)[(_cy)] = (_tp (*)[(_cy)])_nm ## _tEmP_al;
+
+
+///////////// from encoder
+// Compiler-specific spellings: inline, __FASTCALL (calling convention) and
+// __align16(t,v) (declare variable v of type t with 16-byte alignment).
+#if defined(_MSC_VER)
+	#define inline	__inline	// NOTE(review): redefines a C++ keyword for every file that includes this header
+    #define __FASTCALL   __fastcall
+//	#define __align8(t,v) __declspec(align(8)) t v
+	#define __align16(t,v) __declspec(align(16)) t v
+#elif defined(__GNUC__)
+#if !defined(MAC_POWERPC) && !defined(UNIX) && !defined(ANDROID_NDK) && !defined(APPLE_IOS)
+    #define __FASTCALL    __attribute__ ((fastcall))// usable on linux, centos, mac_x86
+#else
+	#define __FASTCALL	// expands to nothing for mac_ppc, solaris(sparc/x86)
+#endif//MAC_POWERPC
+//	#define __align8(t,v) t v __attribute__ ((aligned (8)))
+	#define __align16(t,v) t v __attribute__ ((aligned (16)))
+
+#if defined(APPLE_IOS)  
+    #define inline  // iOS platform: inline expands to nothing
+#endif
+
+#endif//_MSC_VER
+
+
+// 3 * cx * cy; NOTE(review): presumably the byte size of a cx-by-cy RGB24 image -- confirm
+#if !defined(SIZEOFRGB24)
+#define SIZEOFRGB24(cx, cy)	(3 * (cx) * (cy))
+#endif//SIZEOFRGB24
+
+// 4 * cx * cy; NOTE(review): presumably the byte size of a cx-by-cy RGB32 image -- confirm
+#if !defined(SIZEOFRGB32)
+#define SIZEOFRGB32(cx, cy)	(4 * (cx) * (cy))
+#endif//SIZEOFRGB32
+// "#if 1" selects the comparison based WELS_MAX/WELS_MIN; the "#else" branch
+// (shift-and-mask variants on a signed difference) is never compiled.
+// All of these macros evaluate their arguments more than once.
+#if 1
+// Round x up to a multiple of n; the mask arithmetic is only correct when n is a power of 2.
+#ifndef	WELS_ALIGN
+#define WELS_ALIGN(x, n)	(((x)+(n)-1)&~((n)-1))
+#endif//WELS_ALIGN
+
+#ifndef WELS_MAX
+#define WELS_MAX(x, y)	((x) > (y) ? (x) : (y))
+#endif//WELS_MAX
+
+#ifndef WELS_MIN
+#define WELS_MIN(x, y)	((x) < (y) ? (x) : (y))
+#endif//WELS_MIN
+#else
+
+#ifndef	WELS_ALIGN
+#define WELS_ALIGN(x, n)	(((x)+(n)-1)&~((n)-1))
+#endif//WELS_ALIGN
+
+// Disabled: these rely on ((x)-(y))>>31 being all ones for a negative difference.
+#ifndef WELS_MAX
+#define WELS_MAX(x, y)	((x) - (((x)-(y))&(((x)-(y))>>31)))
+#endif//WELS_MAX
+
+#ifndef WELS_MIN
+#define WELS_MIN(x, y)	((y) + (((x)-(y))&(((x)-(y))>>31)))
+#endif//WELS_MIN
+
+#endif
+
+#ifndef WELS_CEIL
+#define WELS_CEIL(x)	ceil(x)	// FIXME: replace the math library call with something cheaper
+#endif//WELS_CEIL
+
+#ifndef WELS_FLOOR
+#define WELS_FLOOR(x)	floor(x)	// FIXME: replace the math library call with something cheaper
+#endif//WELS_FLOOR
+
+// Adds 0.5 and converts to int32_t. The conversion truncates toward zero, so the
+// result is round-to-nearest only for non-negative x (e.g. -1.6 yields -1).
+#ifndef WELS_ROUND
+#define WELS_ROUND(x)	((int32_t)(0.5f+(x)))
+#endif//WELS_ROUND
+
+/*
+ * WELS_NON_ZERO_COUNT_AVERAGE: combine two neighbour counts nA and nB into nC,
+ * where -1 marks a missing neighbour:
+ *	both present:	nC = (nA + nB + 1) >> 1
+ *	one missing:	nC = the present one (the -1 cancels the +1)
+ *	both missing:	nC = 0
+ * nC must be an lvalue; nA and nB are evaluated several times, so pass
+ * expressions without side effects. Arguments are parenthesized so compound
+ * expressions keep their meaning. The expansion is a brace block, not an expression.
+ */
+#define WELS_NON_ZERO_COUNT_AVERAGE(nC,nA,nB) {		\
+    (nC) = (nA) + (nB) + 1;                      \
+	(nC) >>= (uint8_t)( (nA) != -1 && (nB) != -1);        \
+	(nC) += (uint8_t)((nA) == -1 && (nB) == -1);           \
+}
+
+/*
+ * CeilLog2: number of right shifts needed to bring (i - 1) down to zero,
+ * i.e. the smallest s with 2^s >= i for i >= 1; returns 0 when i <= 1.
+ */
+static __inline int32_t CeilLog2( int32_t i )
+{
+	int32_t iShiftCount = 0;
+
+	for ( i -= 1; i > 0; i >>= 1 )
+		++ iShiftCount;
+
+	return iShiftCount;
+}
+/*
+the second path will degrades the performance
+*/
+#if 1
+/*
+ * WelsMedian: median of three values, computed as the sum of all three
+ * minus the smallest and the largest.
+ */
+static inline int32_t WelsMedian(int32_t iX,  int32_t iY, int32_t iZ)
+{
+	// order iX and iY first
+	int32_t iLowest  = ( iY < iX ) ? iY : iX;
+	int32_t iHighest = ( iY < iX ) ? iX : iY;
+
+	// then let iZ widen the range on either side
+	if ( iZ < iLowest )
+		iLowest = iZ;
+	else if ( iZ > iHighest )
+		iHighest = iZ;
+
+	return (iX + iY + iZ) - (iLowest + iHighest);
+}
+#else
+// Alternative WelsMedian without branches; sits in the "#else" of "#if 1", so it is never compiled.
+// Relies on (d >> 31) being all ones for a negative 32-bit difference d.
+static inline int32_t WelsMedian(int32_t iX,  int32_t iY, int32_t iZ)
+{
+	int32_t iTmp = (iX-iY)&((iX-iY)>>31);	// iX - iY when iX < iY, otherwise 0
+	iX -= iTmp;								// iX = max of the first two
+	iY += iTmp;								// iY = min of the first two
+	iY -= (iY-iZ)&((iY-iZ)>>31);			// iY = max(iY, iZ)
+	iY += (iX-iY)&((iX-iY)>>31);			// iY = min(iX, iY)
+	return iY;
+}
+
+#endif
+
+// NEG_NUM: negate via 1 + ~iX (two's complement identity)
+#ifndef NEG_NUM
+//#define NEG_NUM( num ) (-num)
+#define NEG_NUM(iX) (1+(~(iX)))
+#endif// NEG_NUM
+
+// WELS_CLIP1: iX is returned unchanged when no bit above the low 8 is set.
+// Otherwise the result is (-(iX) >> 31): 0 for negative iX, and for iX > 255 a
+// right shift of a negative value (-1 with an arithmetic shift, i.e. 255 once
+// stored into a uint8_t). iX is evaluated more than once.
+#ifndef WELS_CLIP1
+//#define WELS_CLIP1(x) (x & ~255) ? (-x >> 31) : x
+#define WELS_CLIP1(iX) (((iX) & ~255) ? (-(iX) >> 31) : (iX)) //iX may be an expression, not only a value
+#endif//WELS_CLIP1
+
+
+// WELS_SIGN: (int32_t)iX >> 31, i.e. 0 for non-negative values and -1 for negative ones with an arithmetic shift
+#ifndef WELS_SIGN
+#define WELS_SIGN(iX) ((int32_t)(iX) >> 31)
+#endif //WELS_SIGN
+// WELS_ABS: absolute value built from WELS_SIGN as (sign ^ x) - sign
+#ifndef WELS_ABS
+#define WELS_ABS(iX) ((WELS_SIGN(iX) ^ (int32_t)(iX)) - WELS_SIGN(iX))
+#endif //WELS_ABS
+
+// WELS_CLIP3: clamp iX to the range [iY, iZ]
+#ifndef WELS_CLIP3
+#define WELS_CLIP3(iX, iY, iZ) ((iX) < (iY) ? (iY) : ((iX) > (iZ) ? (iZ) : (iX)))
+#endif //WELS_CLIP3
+
+/*
+ * Description: check a condition and return the specified result when it holds
+ *	iResult:	value to be returned
+ *	bCaseIf:	failure condition to be verified
+ * Note: expands to a bare "if" statement, so a following "else" would bind to it.
+ */
+#ifndef WELS_VERIFY_RETURN_IF
+#define WELS_VERIFY_RETURN_IF(iResult, bCaseIf) \
+	if ( bCaseIf ){ \
+		return iResult; \
+	}
+#endif//#if WELS_VERIFY_RETURN_IF
+
+/*
+ *	Description: check a condition and, when it holds, run the given process
+ *		and then return the specified result.
+ *	 iResult:	value to be returned
+ *	 bCaseIf:	failure condition to be verified
+ *	 fProc:		process to perform before returning
+ */
+#ifndef WELS_VERIFY_RETURN_PROC_IF
+#define WELS_VERIFY_RETURN_PROC_IF(iResult, bCaseIf, fProc) \
+	if ( bCaseIf ){ \
+		fProc;	\
+		return iResult;	\
+	}
+#endif//#if WELS_VERIFY_RETURN_PROC_IF
+
+/*
+ * Description:	check a condition and return (from a void function) when it holds
+ *	bCaseIf:	failure condition to be verified
+ *	return:		NONE
+ */
+#ifndef WELS_VERIFY_IF
+#define WELS_VERIFY_IF(bCaseIf) \
+	if ( bCaseIf ){ \
+		return; \
+	}
+#endif//#if WELS_VERIFY_IF
+
+/*
+ * Description:	check a condition and, when it holds, run the given process and return.
+ *	bCaseIf:	failure condition to be verified
+ *	fProc:		process to perform before returning
+ *	return:		NONE
+ */
+#ifndef WELS_VERIFY_PROC_IF
+#define WELS_VERIFY_PROC_IF(bCaseIf, fProc) \
+	if ( bCaseIf ){ \
+		fProc; \
+		return; \
+	}
+#endif//#if WELS_VERIFY_PROC_IF
+
+/*
+ * Description: safely free a pointer with the given free function
+ *  pPtr:		pointer to be destroyed; must be an lvalue, it is set to NULL afterwards
+ *	fFreeFunc:	free function used; only called when pPtr is not NULL
+ * Note: the expansion already ends with ';' after "while( 0 )", so
+ * "if (c) WELS_SAFE_FREE_P(p, f); else ..." does not compile.
+ */
+#ifndef WELS_SAFE_FREE_P
+#define WELS_SAFE_FREE_P(pPtr, fFreeFunc) \
+	do{ \
+		if ( NULL != (pPtr) ){ \
+			fFreeFunc( (pPtr) ); \
+			(pPtr) = NULL; \
+		} \
+	}while( 0 );
+#endif//#if WELS_SAFE_FREE_P
+
+/*
+ * Description: safely free an array of pointers with the given free function
+ *	pArray:		pointer to an array, something like "**p"; must be an lvalue, it is set to NULL afterwards
+ *	iNum:		number of elements in the array (parenthesized in the expansion, so it may be an expression)
+ *  fFreeFunc:	free function; applied to every non-NULL element, then to the array itself
+ * Note: the expansion declares a local named iIdx, so the arguments must not refer
+ * to a caller variable of that name. The trailing ';' after "while( 0 )" is kept
+ * so that existing call sites written without their own ';' still compile.
+ */
+#ifndef WELS_SAFE_FREE_ARR
+#define WELS_SAFE_FREE_ARR(pArray, iNum, fFreeFunc) \
+	do{ \
+		if ( NULL != (pArray) ){ \
+			int32_t iIdx = 0; \
+			while( iIdx < (iNum) ){ \
+				if ( NULL != (pArray)[iIdx] ){ \
+					fFreeFunc( (pArray)[iIdx] ); \
+					(pArray)[iIdx] = NULL; \
+				} \
+				++ iIdx; \
+			} \
+			fFreeFunc((pArray)); \
+			(pArray) = NULL; \
+		} \
+	}while( 0 );
+#endif//#if WELS_SAFE_FREE_ARR
+
+} // namespace WelsDec
+
+#endif//WELS_MACRO_UTILIZATIONS_H__
--- /dev/null
+++ b/codec/decoder/core/inc/manage_dec_ref.h
@@ -1,0 +1,81 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ *  \file	manage_dec_ref.h
+ *
+ *  Abstract
+ *      Interface for managing reference picture
+ *
+ *  History
+ *      08/14/2009 Created
+ *
+ *****************************************************************************/
+#ifndef WELS_MANAGE_DEC_REF_H__
+#define WELS_MANAGE_DEC_REF_H__
+
+
+#include "typedefs.h"
+#include "decoder_context.h"
+
+namespace WelsDec {
+
+typedef enum TagRemoveFlag {	/* removal selector taken as eRemoveFlag by the WelsDel*FromList* helpers */
+	REMOVE_TARGET     = 0,
+	REMOVE_BASE       = 1,
+	REMOVE_BASE_FIRST = 2
+} ERemoveFlag;
+
+void_t  WelsResetRefPic   (PWelsDecoderContext pCtx);
+int32_t WelsInitRefList   (PWelsDecoderContext pCtx, int32_t iPoc);
+int32_t WelsReorderRefList(PWelsDecoderContext pCtx);
+int32_t WelsMarkAsRef     (PWelsDecoderContext pCtx, const bool_t kbRefBaseMarkingFlag);
+// NOTE(review): the prototypes below are `static` inside a header, so every including file gets internal-linkage declarations it would have to define itself -- presumably meant for the implementation file only; confirm.
+static PPicture WelsDelShortFromList        (PRefPic pRefPic, int32_t iFrameNum,           ERemoveFlag eRemoveFlag);
+static PPicture WelsDelLongFromList         (PRefPic pRefPic, uint32_t uiLongTermFrameIdx, ERemoveFlag eRemoveFlag);
+static PPicture WelsDelShortFromListSetUnref(PRefPic pRefPic, int32_t iFrameNum,           ERemoveFlag eRemoveFlag);
+static PPicture WelsDelLongFromListSetUnref (PRefPic pRefPic, uint32_t uiLongTermFrameIdx, ERemoveFlag eRemoveFlag);
+// Memory-management-control-operation (MMCO) and sliding-window marking routines -- roles presumed from the names; confirm against the implementation.
+static int32_t MMCOBase     (PWelsDecoderContext pCtx, PRefBasePicMarking pRefPicBaseMarking);
+static int32_t MMCO         (PWelsDecoderContext pCtx, PRefPicMarking pRefPicMarking);
+static int32_t MMCOProcess  (PWelsDecoderContext pCtx, uint32_t uiMmcoType, bool_t bRefBasePic,
+                               int32_t iShortFrameNum, uint32_t uiLongTermPicNum, int32_t iLongTermFrameIdx, int32_t iMaxLongTermFrameIdx);
+static int32_t SlidingWindow(PWelsDecoderContext pCtx);
+// Reference-list insertion and long-term index helpers; all return int32_t (the meaning of the value is not visible in this header).
+static int32_t AddShortTermToList(PRefPic pRefPic, PPicture pPic);
+static int32_t AddLongTermToList (PRefPic pRefPic, PPicture pPic, int32_t iLongTermFrameIdx);
+static int32_t AssignLongTermIdx (PRefPic pRefPic, int32_t iFrameNum, int32_t iLongTermFrameIdx);
+static int32_t MarkAsLongTerm    (PRefPic pRefPic, int32_t iFrameNum, int32_t iLongTermFrameIdx);
+
+} // namespace WelsDec
+
+#endif//WELS_MANAGE_DEC_REF_H__
+
+
--- /dev/null
+++ b/codec/decoder/core/inc/mb_cache.h
@@ -1,0 +1,84 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//mb_cache.h
+#ifndef WELS_MACROBLOCK_CACHE_H__
+#define WELS_MACROBLOCK_CACHE_H__
+
+#include "typedefs.h"
+
+namespace WelsDec {
+
+//#pragma pack(1)
+
+#define REF_NOT_AVAIL    (-2)  // parenthesized so the sign stays attached inside any surrounding expression
+#define REF_NOT_IN_LIST  (-1)  //intra
+
+/*
+ *	MB cache information; one such cache should be defined within each slice
+ */
+/*
+ * Cache for Luma				Cache for Chroma(Cb, Cr)
+ *	
+ *	TL T T T T					TL T T
+ *	 L - - - -					 L - -
+ *	 L - - - -					 L - - TR
+ *	 L - - - -
+ *   L - - - - TR
+ *
+ */
+
+////////////////////////mapping scan index////////////////////////
+
+// Lookup tables shared across modules: only declared here (defined elsewhere) to reduce the size of the generated binary
+extern const uint8_t g_kuiMbNonZeroCountIdx[24];
+extern const uint8_t g_kuiCache30ScanIdx[16];
+extern const uint8_t g_kuiCacheNzcScanIdx[24];
+
+extern const uint8_t g_kuiScan4[16];
+
+typedef struct TagNeighborAvail	// availability and type of the top / left / right-top / left-top neighbours
+{
+	int32_t iTopAvail;
+	int32_t iLeftAvail;
+	int32_t iRightTopAvail;
+	int32_t iLeftTopAvail;  // used to check whether an intra_pred_mode is available; 1: available, 0: unavailable (presumably the same encoding applies to all four *Avail fields -- confirm)
+
+	int32_t iLeftType;	// the four *Type fields presumably hold the MB type of the matching neighbour -- confirm
+	int32_t iTopType;
+	int32_t iLeftTopType;
+	int32_t iRightTopType; 
+}SNeighAvail, *PNeighAvail;
+
+} // namespace WelsDec
+
+#endif//WELS_MACROBLOCK_CACHE_H__
--- /dev/null
+++ b/codec/decoder/core/inc/mc.h
@@ -1,0 +1,79 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef WELS_MC_H__
+#define WELS_MC_H__
+
+#include "wels_const.h"
+#include "macros.h"
+#include "decoder_context.h"
+
+namespace WelsDec {
+
+void_t InitMcFunc(SMcFunc *pMcFunc, int32_t iCpu);	// NOTE(review): presumably fills *pMcFunc with the motion-compensation routines matching the CPU feature flags in iCpu -- confirm against the implementation
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+//***************************************************************************//
+//        MMX / MMXEXT declarations (visible only when X86_ASM is defined)    //
+//***************************************************************************//
+#if defined(X86_ASM)
+typedef void_t (*PMcChromaWidthExtFunc)( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, const uint8_t *kpABCD, int32_t iHeight );
+extern void_t McHorVer20WidthEq4_mmx (uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
+extern void_t McChromaWidthEq4_mmx   (uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, const uint8_t *kpABCD, int32_t iHeight );
+extern void_t McCopyWidthEq4_mmx     (uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
+extern void_t McCopyWidthEq8_mmx     (uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
+extern void_t PixelAvgWidthEq4_mmx   (uint8_t *pDst, int32_t iDstStride, uint8_t *pSrcA, int32_t iSrcAStride, uint8_t *pSrcB, int32_t iSrcBStride, int32_t iHeight);
+extern void_t PixelAvgWidthEq8_mmx   (uint8_t *pDst, int32_t iDstStride, uint8_t *pSrcA, int32_t iSrcAStride, uint8_t *pSrcB, int32_t iSrcBStride, int32_t iHeight);
+//***************************************************************************//
+//        SSE2 declarations (same X86_ASM guard; definitions are not in this header) //
+//***************************************************************************//
+extern void_t McChromaWidthEq8_sse2   (uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, const uint8_t* kpABCD, int32_t iHeight );
+extern void_t McCopyWidthEq16_sse2    (uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
+extern void_t McHorVer20WidthEq8_sse2 (uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
+extern void_t McHorVer20WidthEq16_sse2(uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
+extern void_t McHorVer02WidthEq8_sse2 (uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
+extern void_t McHorVer22Width8HorFirst_sse2(uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
+extern void_t McHorVer22VerLast_sse2(uint8_t * pTap, int32_t iTapStride, uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight);
+extern void_t PixelAvgWidthEq16_sse2  (uint8_t *pDst, int32_t iDstStride, uint8_t *pSrcA, int32_t iSrcAStride, uint8_t *pSrcB, int32_t iSrcBStride, int32_t iHeight);
+
+#endif //X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+} // namespace WelsDec
+
+#endif//WELS_MC_H__
--- /dev/null
+++ b/codec/decoder/core/inc/measure_time.h
@@ -1,0 +1,100 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	measure_time.h
+ *
+ * \brief	time cost measure utilization
+ *
+ * \date	04/28/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_TIME_COST_MEASURE_UTIL_H__
+#define WELS_TIME_COST_MEASURE_UTIL_H__
+
+#include <stdlib.h>
+
+#if !(defined(_MSC_VER) || defined(__MINGW32__))
+#include <sys/time.h>
+#else
+#include "typedefs.h"
+//#include <sys/types.h>
+#include <sys/timeb.h>
+#endif
+#include <time.h>
+#if defined(WIN32)
+#include <windows.h>
+#endif//#if WIN32
+
+#ifdef __cplusplus
+extern "C" {
+#endif//__cplusplus
+
+/*!
+ * \brief	time cost measure utilization
+ * \param	void_t
+ * \return	current timestamp in microseconds: gettimeofday() wall-clock time on non-MSVC/MinGW builds,
+ *		QueryPerformanceCounter() ticks scaled to microseconds on WIN32, _ftime() time otherwise
+ */
+static inline int64_t WelsTime( void_t )	// static inline: the body lives in a header, so it must not have external linkage
+{
+#if !(defined(_MSC_VER) || defined(__MINGW32__))
+	struct timeval tv_date;
+	
+	gettimeofday( &tv_date, NULL );
+	return( (int64_t) tv_date.tv_sec * 1000000 + (int64_t) tv_date.tv_usec );
+#else
+#if defined (WIN32)
+	static int64_t iMtimeFreq = 0;	// counter frequency, queried on first use and then cached
+	int64_t iMtimeCur = 0;
+	int64_t iResult = 0;
+	if ( !iMtimeFreq ){
+		QueryPerformanceFrequency((LARGE_INTEGER *)&iMtimeFreq);
+		if ( !iMtimeFreq )
+			iMtimeFreq = 1;	// avoid dividing by zero below
+	}
+	QueryPerformanceCounter((LARGE_INTEGER *)&iMtimeCur);
+	iResult = (int64_t)((double)iMtimeCur * 1e6 / (double)iMtimeFreq + 0.5);	// ticks -> microseconds, rounded to nearest
+	return iResult;
+#else
+	struct _timeb sTime;
+	
+	_ftime(&sTime);
+	return ((int64_t)sTime.time * (1000) + (int64_t)sTime.millitm) * (1000);	// millisecond resolution, scaled to microseconds
+#endif//#if WIN32
+#endif//!(defined(_MSC_VER) || defined(__MINGW32__))
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif//WELS_TIME_COST_MEASURE_UTIL_H__
--- /dev/null
+++ b/codec/decoder/core/inc/mem_align.h
@@ -1,0 +1,92 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *	memory alignment utilization
+ */
+
+#ifndef WELS_MEM_ALIGN_H__
+#define WELS_MEM_ALIGN_H__
+
+
+
+#include <stdlib.h>
+#include <string.h>
+#include "utils.h"
+
+namespace WelsDec {
+
+//#define CACHE_LINE 64
+
+#ifdef __cplusplus
+extern "C" {
+#endif//__cplusplus
+
+
+
+/*!
+*************************************************************************************
+* \brief	zero-filled malloc utility in Wels
+*
+* \param 	kuiSize	    size of memory block required
+* \param 	kpTag	    tag string passed along with the request (its use is not visible in this header)
+*
+* \return	pointer to the allocated memory; NULL in case of failure
+*
+*************************************************************************************
+*/
+void_t * WelsMalloc( const uint32_t kuiSize, const str_t *kpTag );
+
+/*!
+*************************************************************************************
+* \brief	free utility in Wels
+*
+* \param 	pPtr	data pointer to be freed; the prototype takes the pointer itself, not its address
+* \param 	kpTag	tag string passed along with the request (its use is not visible in this header)
+*
+* \return	NONE
+*
+* \note	pPtr is passed by value, so the caller's variable is not reset here; WELS_SAFE_FREE below resets it to NULL
+*************************************************************************************
+*/
+void_t WelsFree( void_t * pPtr, const str_t *kpTag );
+
+#define WELS_SAFE_FREE(pPtr, pTag)		if ( NULL != (pPtr) ) { WelsFree( (pPtr), (pTag) ); (pPtr) = NULL; }	/* arguments parenthesized; still a bare `if`, do not follow with `else` */
+
+/*
+ *	memory operation routines
+ */
+
+#ifdef __cplusplus
+}
+#endif//__cplusplus
+
+} // namespace WelsDec
+
+#endif //WELS_MEM_ALIGN_H__
--- /dev/null
+++ b/codec/decoder/core/inc/memmgr_nal_unit.h
@@ -1,0 +1,65 @@
+/*!
+ * \copy
+ *     Copyright (c)  2008-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ *  memmgr_nal_unit.h
+ *
+ *  Abstract
+ *      memory manager utils for NAL Unit list available
+ *
+ *  History
+ *      07/10/2008 Created
+ *
+ *****************************************************************************/
+#ifndef WELS_MEMORY_MANAGER_NAL_UNIT_H__
+#define WELS_MEMORY_MANAGER_NAL_UNIT_H__
+
+#include "typedefs.h"
+#include "wels_common_basis.h"
+#include "nalu.h"
+
+namespace WelsDec {
+
+int32_t MemInitNalList(PAccessUnit *ppAu, const uint32_t kuiSize);
+
+int32_t MemFreeNalList(PAccessUnit *ppAu);
+
+/*
+ *	MemGetNextNal
+ *	Get the next NAL unit available for use.
+ *	The NAL unit list needs to be expanded when the count of NAL units available within the access unit is exceeded.
+ */
+PNalUnit MemGetNextNal(PAccessUnit *ppAu);
+
+} // namespace WelsDec
+
+#endif//WELS_MEMORY_MANAGER_NAL_UNIT_H__
+
+
--- /dev/null
+++ b/codec/decoder/core/inc/mv_pred.h
@@ -1,0 +1,98 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	mv_pred.h
+ *
+ * \brief	Get MV predictor and update motion vector of mb cache
+ *
+ * \date	05/22/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELS_MV_PRED_H__
+#define WELS_MV_PRED_H__
+
+#include "dec_frame.h"
+
+namespace WelsDec {
+
+/*!
+* \brief	 update mv and ref_index cache for current MB, only for P_16x16 (SKIP inclusive)
+* \param	 pCurDqLayer	layer holding the current MB (presumed from the name -- confirm)
+* \param	 iRef, iMVs	reference index and motion vector (two components) to store (presumed -- confirm)
+*/
+void_t UpdateP16x16MotionInfo(PDqLayer pCurDqLayer, int8_t iRef, int16_t iMVs[2]);
+
+ /*!
+ * \brief   update mv and ref_index cache for current MB, only for P_16x8
+ * \param 	iMotionVector, iRefIndex	motion-vector and reference-index caches, indexed [LIST_A][30]
+ * \param 	iPartIdx, iRef, iMVs	partition index, reference index and motion vector -- roles presumed from the names; confirm
+ */
+void_t UpdateP16x8MotionInfo(PDqLayer pCurDqLayer, int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
+							  int32_t iPartIdx, int8_t iRef, int16_t iMVs[2]);
+
+
+ /*!
+  * \brief	 update mv and ref_index cache for current MB, only for P_8x16
+  * \param	 iMotionVector, iRefIndex	motion-vector and reference-index caches, indexed [LIST_A][30]
+  * \param	 iPartIdx, iRef, iMVs	partition index, reference index and motion vector -- roles presumed from the names; confirm
+  */
+void_t UpdateP8x16MotionInfo(PDqLayer pCurDqLayer, int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
+							  int32_t iPartIdx, int8_t iRef, int16_t iMVs[2]);
+ 
+/*!
+ * \brief   get the motion predictor for 4*4 or 8*8 or 16*16 block
+ * \param 	iMotionVector, iRefIndex, iPartIdx, iPartWidth, iRef	inputs: mv / ref_index caches and the block being predicted (roles presumed from the names -- confirm)
+ * \param 	iMVP	output mvp_x and mvp_y
+ */
+void_t PredMv(int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
+			 int32_t iPartIdx, int32_t iPartWidth, int8_t iRef, int16_t iMVP[2]);
+
+/*!
+ * \brief   get the motion predictor for inter16x8 MB
+ * \param 	iMotionVector, iRefIndex, iPartIdx, iRef	inputs: mv / ref_index caches and the partition being predicted (roles presumed from the names -- confirm)
+ * \param 	iMVP	output mvp_x and mvp_y
+ */
+void_t PredInter16x8Mv(int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
+						int32_t iPartIdx, int8_t iRef, int16_t iMVP[2]);
+
+/*!
+ * \brief   get the motion predictor for inter8x16 MB
+ * \param 	iMotionVector, iRefIndex, iPartIdx, iRef	inputs: mv / ref_index caches and the partition being predicted (roles presumed from the names -- confirm)
+ * \param 	iMVP	output mvp_x and mvp_y
+ */
+void_t PredInter8x16Mv(int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
+						int32_t iPartIdx, int8_t iRef, int16_t iMVP[2]);
+
+} // namespace WelsDec
+
+#endif//WELS_MV_PRED_H__
--- /dev/null
+++ b/codec/decoder/core/inc/nal_prefix.h
@@ -1,0 +1,89 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//nal_prefix.h	-	definitions for NAL Unit Header(/Ext) and PrefixNALUnit
+#ifndef WELS_NAL_UNIT_PREFIX_H__
+#define WELS_NAL_UNIT_PREFIX_H__
+
+#include "typedefs.h"
+#include "wels_common_basis.h"
+#include "slice.h"
+
+namespace WelsDec {
+
+//#pragma pack(1)
+
+///////////////////////////////////NAL Unit prefix/headers///////////////////////////////////
+
+/* NAL Unit Header in AVC, refer to Page 56 in JVT X201wcm */
+typedef struct TagNalUnitHeader{
+	uint8_t		    uiForbiddenZeroBit;
+	uint8_t		    uiNalRefIdc;
+	ENalUnitType    eNalUnitType;
+	uint8_t		    uiReservedOneByte;		// used only as padding
+}SNalUnitHeader, *PNalUnitHeader;
+
+/* NAL Unit Header in scalable extension syntax, refer to Page 390 in JVT X201wcm */
+typedef struct TagNalUnitHeaderExt{
+	SNalUnitHeader	sNalUnitHeader;	// plain NAL unit header, stored first
+	
+//	uint8_t		reserved_one_bit;	(field intentionally left out of the structure)
+	bool_t		bIdrFlag;
+	uint8_t		uiPriorityId;
+	int8_t		iNoInterLayerPredFlag;	// int8_t rather than bool_t so that 3 values can be represented, probably needed by the encoder
+	uint8_t		uiDependencyId;
+
+	uint8_t		uiQualityId;
+	uint8_t		uiTemporalId;
+	bool_t		bUseRefBasePicFlag;
+	bool_t		bDiscardableFlag;
+	
+	bool_t		bOutputFlag;
+	uint8_t		uiReservedThree2Bits;
+	// Derived variable(s): not read directly from the bitstream (how they are derived is not visible in this header)
+	uint8_t		uiLayerDqId;
+	bool_t		bNalExtFlag;
+}SNalUnitHeaderExt, *PNalUnitHeaderExt;
+
+/* Prefix NAL Unit syntax, refer to Page 392 in JVT X201wcm */
+typedef struct TagPrefixNalUnit{
+	SRefBasePicMarking	sRefPicBaseMarking;	// reference base picture marking data
+	bool_t		bStoreRefBasePicFlag;		
+	bool_t		bPrefixNalUnitAdditionalExtFlag;
+	bool_t		bPrefixNalUnitExtFlag;
+}SPrefixNalUnit, *PPrefixNalUnit;
+
+//#pragma pack()
+
+} // namespace WelsDec
+
+#endif//WELS_NAL_UNIT_PREFIX_H__
--- /dev/null
+++ b/codec/decoder/core/inc/nalu.h
@@ -1,0 +1,79 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//nalu.h:	NAL Unit definition
+#ifndef WELS_NAL_UNIT_H__
+#define WELS_NAL_UNIT_H__
+
+#include "typedefs.h"
+#include "wels_common_basis.h"
+#include "nal_prefix.h"
+#include "bit_stream.h"
+
+namespace WelsDec {
+
+///////////////////////////////////NAL UNIT level///////////////////////////////////
+
+/* NAL Unit structure: extended header followed by a payload that is either VCL (slice) data or prefix NAL data */
+typedef struct TagNalUnit{
+	SNalUnitHeaderExt	sNalHeaderExt;
+	
+	union{	// sVclNal and sPrefixNal share storage; which member is valid presumably follows the NAL unit type -- confirm
+		struct SVclNal{
+			SSliceHeaderExt	sSliceHeaderExt;
+			SBitStringAux	sSliceBitsRead;
+			uint8_t 		*pNalPos;	  // address of the slice NAL, saved for the GPU function
+			int32_t 		iNalLength;   // length of the NAL, saved for the GPU function
+			bool_t			bSliceHeaderExtFlag;
+		} sVclNal;
+		SPrefixNalUnit	sPrefixNal;
+	} sNalData;		
+	
+}SNalUnit, *PNalUnit;
+
+///////////////////////////////////ACCESS Unit level///////////////////////////////////
+
+/* Access Unit structure: bookkeeping for the list of NAL units that make up one access unit (AU) */
+typedef struct TagAccessUnits{
+	PNalUnit		*pNalUnitsList;	// list of NAL unit pointers in this AU
+	uint32_t		uiAvailUnitsNum;	// number of NAL units available in each AU list, based on the current bitstream
+	uint32_t		uiActualUnitsNum;	// actual number of NAL units belonging to the current AU
+	// When the available number exceeds the count size below, extra NAL units must be reallocated for the list.
+	uint32_t		uiCountUnitsNum;	// count (capacity) of malloced NAL units in each AU list
+	uint32_t		uiStartPos;
+	uint32_t		uiEndPos;
+	bool_t			bCompletedAuFlag;	// indicates whether this is a completed AU
+}SAccessUnit, *PAccessUnit;
+
+} // namespace WelsDec
+
+#endif//WELS_NAL_UNIT_H__
--- /dev/null
+++ b/codec/decoder/core/inc/parameter_sets.h
@@ -1,0 +1,173 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef WELS_PARAMETER_SETS_H__
+#define WELS_PARAMETER_SETS_H__
+
+#include "typedefs.h"
+#include "wels_const.h"
+#include "wels_common_basis.h"
+
+namespace WelsDec {
+
+//#pragma pack(1)
+
+/* Sequence Parameter Set (SPS): parsed sequence-level syntax elements, refer to Page 57 in JVT X201wcm */
+typedef struct TagSps{
+	int32_t	    iSpsId;	// id of this SPS
+	uint32_t	iMbWidth;	// NOTE(review): presumably the picture width in macroblocks -- confirm
+	uint32_t	iMbHeight;	// NOTE(review): presumably the picture height in macroblocks -- confirm
+	uint32_t	uiTotalMbCount;	// used in decode_slice_data()
+	
+	uint32_t	uiLog2MaxFrameNum;
+	uint32_t	uiPocType;	// picture order count (POC) type; selects which POC fields below apply
+	/* POC type 0 */
+	int32_t		iLog2MaxPocLsb;
+	/* POC type 1 */
+	int32_t		iOffsetForNonRefPic;
+
+	int32_t		iOffsetForTopToBottomField;
+	int32_t		iNumRefFramesInPocCycle;
+	int8_t		iOffsetForRefFrame[256];	// NOTE(review): int8_t is narrower than the neighbouring int32_t offsets -- confirm values cannot be truncated
+	int32_t		iNumRefFrames;
+	
+	SPosOffset	sFrameCrop;	// frame cropping offsets; see bFrameCroppingFlag
+	
+	ProfileIdc	uiProfileIdc;
+	uint8_t		uiLevelIdc;
+	uint8_t		uiChromaFormatIdc;
+	uint8_t		uiChromaArrayType;
+	
+	uint8_t		uiBitDepthLuma;
+	uint8_t		uiBitDepthChroma;
+	/* POC type 1, continued */
+	bool_t		bDeltaPicOrderAlwaysZeroFlag;	
+	bool_t		bGapsInFrameNumValueAllowedFlag;
+
+	bool_t		bFrameMbsOnlyFlag;
+	bool_t		bMbaffFlag;	// MB Adaptive Frame Field
+	bool_t		bDirect8x8InferenceFlag;
+	bool_t		bFrameCroppingFlag;
+
+	bool_t		bVuiParamPresentFlag;
+//	bool_t		bTimingInfoPresentFlag;
+//	bool_t		bFixedFrameRateFlag;
+	bool_t		bConstraintSet0Flag;
+	bool_t		bConstraintSet1Flag;
+	bool_t		bConstraintSet2Flag;
+	bool_t		bConstraintSet3Flag;
+	bool_t		bSeparateColorPlaneFlag;
+	bool_t		bQpPrimeYZeroTransfBypassFlag;
+	bool_t		bSeqScalingMatrixPresentFlag;
+	bool_t		bSeqScalingListPresentFlag[12];	// one flag per scaling list (12 entries)
+}SSps, *PSps;
+
+
+/* Sequence Parameter Set extension syntax, refer to Page 58 in JVT X201wcm */
+//typedef struct TagSpsExt{
+//	uint32_t	iSpsId;
+//	uint32_t	uiAuxFormatIdc;
+//	int32_t		iAlphaOpaqueValue;
+//	int32_t		iAlphaTransparentValue;
+	
+//	uint8_t		uiBitDepthAux;
+//	bool_t		bAlphaIncrFlag;
+//	bool_t		bAdditionalExtFlag;
+//}SSpsExt, *PSpsExt;
+
+/* SVC extension of the Sequence Parameter Set, refer to Page 391 in JVT X201wcm */
+typedef struct TagSpsSvcExt{
+	SPosOffset	sSeqScaledRefLayer;	// NOTE(review): presumably the sequence-level scaled reference layer offsets -- confirm
+	
+	uint8_t		uiExtendedSpatialScalability;	// ESS
+	uint8_t		uiChromaPhaseXPlus1Flag;
+	uint8_t		uiChromaPhaseYPlus1;
+	uint8_t		uiSeqRefLayerChromaPhaseXPlus1Flag;
+	uint8_t		uiSeqRefLayerChromaPhaseYPlus1;
+	bool_t		bInterLayerDeblockingFilterCtrlPresentFlag;
+	bool_t		bSeqTCoeffLevelPredFlag;
+	bool_t		bAdaptiveTCoeffLevelPredFlag;
+	bool_t		bSliceHeaderRestrictionFlag;	
+}SSpsSvcExt, *PSpsSvcExt;
+
+/* Subset SPS: a base SPS plus its SVC extension and extension flags, refer to Page 391 in JVT X201wcm */
+typedef struct TagSubsetSps{	
+	SSps		sSps;	// embedded base sequence parameter set
+	SSpsSvcExt	sSpsSvcExt;	// SVC extension fields
+	bool_t		bSvcVuiParamPresentFlag;	
+	bool_t		bAdditionalExtension2Flag;
+	bool_t		bAdditionalExtension2DataFlag;
+}SSubsetSps, *PSubsetSps;
+
+/* Picture Parameter Set (PPS): parsed picture-level syntax elements, refer to Page 59 in JVT X201wcm */
+typedef struct TagPps{
+	int32_t	iSpsId;	// id of the SPS this PPS refers to
+	int32_t	iPpsId;	// id of this PPS
+	
+	uint32_t	uiNumSliceGroups;
+	uint32_t	uiSliceGroupMapType;	// selects which of the slice-group fields below apply
+	/* slice_group_map_type = 0 */
+	uint32_t	uiRunLength[MAX_SLICEGROUP_IDS];
+	/* slice_group_map_type = 2 */
+	uint32_t	uiTopLeft[MAX_SLICEGROUP_IDS];
+	uint32_t	uiBottomRight[MAX_SLICEGROUP_IDS];
+	/* slice_group_map_type = 3, 4 or 5 */
+	uint32_t	uiSliceGroupChangeRate;
+	/* slice_group_map_type = 6 */
+	uint32_t	uiPicSizeInMapUnits;
+	uint32_t	uiSliceGroupId[MAX_SLICEGROUP_IDS];	// NOTE(review): capacity is MAX_SLICEGROUP_IDS, not uiPicSizeInMapUnits -- confirm the parser bounds its writes
+	
+	uint32_t	uiNumRefIdxL0Active;
+	uint32_t	uiNumRefIdxL1Active;
+	
+	int32_t		iPicInitQp;
+	int32_t		iPicInitQs;
+	int32_t		iChromaQpIndexOffset;	
+
+	bool_t		bEntropyCodingModeFlag;
+	bool_t		bPicOrderPresentFlag;
+	/* slice_group_map_type = 3, 4 or 5 */
+	bool_t		bSliceGroupChangeDirectionFlag;
+	bool_t		bDeblockingFilterControlPresentFlag;
+	
+	bool_t		bConstainedIntraPredFlag;	// sic: "Constained" is the identifier's spelling (constrained intra prediction)
+	bool_t		bRedundantPicCntPresentFlag;
+	bool_t		bWeightedPredFlag;
+	uint8_t		uiWeightedBipredIdc;
+	
+} SPps, *PPps;
+
+//#pragma pack()
+
+} // namespace WelsDec
+
+#endif //WELS_PARAMETER_SETS_H__
--- /dev/null
+++ b/codec/decoder/core/inc/parse_mb_syn_cavlc.h
@@ -1,0 +1,213 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	parse_mb_syn_cavlc.h
+ *
+ * \brief	Parsing all syntax elements of mb and decoding residual with cavlc
+ *
+ * \date	03/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+ 
+#ifndef WELS_PARSE_MB_SYN_CAVLC_H__
+#define WELS_PARSE_MB_SYN_CAVLC_H__
+
+#include "wels_common_basis.h"
+#include "decoder_context.h"
+#include "dec_frame.h"
+#include "slice.h"
+
+namespace WelsDec {
+
+#define I16_LUMA_DC  1	// residual block kinds; NOTE(review): presumably the values passed as iResidualProperty -- confirm
+#define I16_LUMA_AC  2 
+#define LUMA_DC_AC   3
+#define CHROMA_DC    4
+#define CHROMA_AC    5
+
+typedef struct TagReadBitsCache	// 32-bit read cache over a byte buffer, driven by SHIFT_BUFFER / POP_BUFFER
+{
+    uint32_t uiCache32Bit;	// cached bits; POP_BUFFER consumes them by shifting left
+    uint8_t  uiRemainBits;	// bit counter: +16 on SHIFT_BUFFER, -iCount on POP_BUFFER
+    uint8_t  *pBuf;	// read position in the byte buffer; SHIFT_BUFFER advances it by 2 bytes
+}SReadBitsCache;
+// Bit-cache helpers for SReadBitsCache*: SHIFT_BUFFER refills 16 bits from pBuf, POP_BUFFER drops iCount bits. Arguments are evaluated more than once, so pass side-effect-free expressions; the 16-bit value is widened to uint32_t before shifting so it never shifts into the sign bit of an int.
+#define SHIFT_BUFFER(pBitsCache)	{	(pBitsCache)->pBuf+=2; (pBitsCache)->uiRemainBits += 16; (pBitsCache)->uiCache32Bit |= ((uint32_t)(((pBitsCache)->pBuf[2] << 8) | (pBitsCache)->pBuf[3]) << (32 - (pBitsCache)->uiRemainBits));	}
+#define POP_BUFFER(pBitsCache, iCount)	{ (pBitsCache)->uiCache32Bit <<= (iCount);	(pBitsCache)->uiRemainBits -= (iCount);	}
+
+static const uint8_t g_kuiZigzagScan[16]={// zig-zag scan order of a 4x4 residual block: entry i is the raster index (row*4+col) of the i-th scanned coefficient
+	0,  1,  4,  8,	
+	5,  2,  3,  6,		
+	9, 12, 13, 10,	
+	7, 11, 14, 15,	
+};
+
+
+typedef struct TagI16PredInfo{	// one row of the intra 16x16 / chroma prediction-mode tables
+	int8_t iPredMode;	// prediction mode constant (e.g. I16_PRED_V, C_PRED_H)
+	int8_t iLeftAvail;	// 1 in table rows whose mode lists the left neighbour
+	int8_t iTopAvail;	// 1 in table rows whose mode lists the top neighbour
+	int8_t iLeftTopAvail;	// 1 in table rows whose mode lists the top-left neighbour
+} SI16PredInfo;
+static const SI16PredInfo g_ksI16PredInfo[4] = {	// columns: {mode, left, top, left-top}; NOTE(review): presumably indexed by the parsed intra16x16 mode -- confirm
+	{I16_PRED_V, 0, 1, 0},
+	{I16_PRED_H, 1, 0, 0},
+	{         0, 0, 0, 0},	// mode value 0, no neighbour listed; NOTE(review): presumably DC -- confirm
+	{I16_PRED_P, 1, 1, 1},
+};
+
+static const SI16PredInfo g_ksChromaPredInfo[4] = {	// columns: {mode, left, top, left-top}; NOTE(review): presumably indexed by the parsed chroma mode -- confirm
+	{       0, 0, 0, 0},	// mode value 0, no neighbour listed; NOTE(review): presumably DC -- confirm
+	{C_PRED_H, 1, 0, 0},
+	{C_PRED_V, 0, 1, 0},
+	{C_PRED_P, 1, 1, 1},
+};
+
+
+typedef struct TagI4PredInfo {	// one row of the intra 4x4 prediction-mode table
+	int8_t iPredMode;	// prediction mode constant (e.g. I4_PRED_V)
+	int8_t iLeftAvail;	// 1 in table rows whose mode lists the left neighbour
+	int8_t iTopAvail;	// 1 in table rows whose mode lists the top neighbour
+	int8_t iLeftTopAvail;	// 1 in table rows whose mode lists the top-left neighbour
+//	int8_t right_top_avail; // not stored: when right-top is unavailable but top is available, right-top can be padded with the rightmost pixel of top
+} SI4PredInfo;
+static const SI4PredInfo g_ksI4PredInfo[9] = {	// columns: {mode, left, top, left-top}; NOTE(review): presumably indexed by the parsed intra4x4 mode -- confirm
+	{  I4_PRED_V, 0, 1, 0},
+	{  I4_PRED_H, 1, 0, 0},
+	{          0, 0, 0, 0},	// mode value 0, no neighbour listed; NOTE(review): presumably DC -- confirm
+	{I4_PRED_DDL, 0, 1, 0},
+	{I4_PRED_DDR, 1, 1, 1},
+	{ I4_PRED_VR, 1, 1, 1},
+	{ I4_PRED_HD, 1, 1, 1},
+	{ I4_PRED_VL, 0, 1, 0},
+	{ I4_PRED_HU, 1, 0, 0},
+};
+
+static const uint8_t g_kuiI16CbpTable[6] = {0, 16, 32, 15, 31, 47}; // taken from the JM reference software; NOTE(review): presumably coded-block-pattern values for intra 16x16 MB types -- confirm
+
+
+typedef struct TagPartMbInfo{	// partition description for one inter MB / sub-MB type
+    MbType iType;	// MB or sub-MB type constant
+    int8_t iPartCount; // number of partitions: P_16*16, P_16*8, P_8*16, P_8*8 counted in 8*8 blocks; P_8*4, P_4*8, P_4*4 counted in 4*4 blocks
+	int8_t iPartWidth; // partition width, in units of 4*4 blocks
+} SPartMbInfo; 
+static const SPartMbInfo g_ksInterMbTypeInfo[5]={	// columns: {type, iPartCount, iPartWidth}
+{MB_TYPE_16x16,    1, 4},
+{MB_TYPE_16x8,     2, 4},
+{MB_TYPE_8x16,     2, 2},
+{MB_TYPE_8x8,      4, 4},
+{MB_TYPE_8x8_REF0, 4, 4}, // ref0: ref_idx is not present in the bit-stream and defaults to 0
+};
+static const SPartMbInfo g_ksInterSubMbTypeInfo[4]={	// columns: {sub-MB type, iPartCount, iPartWidth}
+{SUB_MB_TYPE_8x8, 1, 2},
+{SUB_MB_TYPE_8x4, 2, 2},
+{SUB_MB_TYPE_4x8, 2, 1},
+{SUB_MB_TYPE_4x4, 4, 1},
+};
+// Neighbour-availability and cache-fill helpers (declarations only). NOTE(review): Constrain0/Constrain1 presumably correspond to constrained intra prediction off/on -- confirm
+void_t GetNeighborAvailMbType         (PNeighAvail pNeighAvail, PDqLayer pCurLayer);
+void_t WelsFillCacheNonZeroCount      (PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, PDqLayer pCurLayer);
+void_t WelsFillCacheConstrain0Intra4x4(PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode, PDqLayer pCurLayer);
+void_t WelsFillCacheConstrain1Intra4x4(PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode, PDqLayer pCurLayer);
+void_t WelsFillCacheInter             (PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, 
+						              int16_t iMvArray[LIST_A][30][MV_A], int8_t iRefIdxArray[LIST_A][30], PDqLayer pCurLayer);
+// NOTE(review): presumably writes the predicted P-skip motion vector into iMvp[2] -- confirm
+void_t PredPSkipMvFromNeighbor       (PDqLayer pCurLayer, int16_t iMvp[2]);
+
+/*!
+ * \brief   check whether the intra16x16 prediction mode (iPredMode) in *pMode is eligible
+ * \param 	uiSampleAvail	availability of neighbouring samples (encoding not visible here); pMode: current iPredMode
+ * \return	0 indicating decoding correctly; -1 means an error occurred
+ */
+ int32_t CheckIntra16x16PredMode(uint8_t uiSampleAvail, int8_t* pMode);
+
+/*!
+ * \brief   check whether the intra4x4 prediction mode (iPredMode) in *pMode is eligible
+ * \param 	pSampleAvail	availability of neighbouring samples (layout not visible here); pMode: current iPredMode; iIndex: NOTE(review) presumably the 4x4 block index -- confirm
+ * \return	0 indicating decoding correctly; -1 means an error occurred
+ */
+ int32_t CheckIntra4x4PredMode(int32_t* pSampleAvail, int8_t* pMode, int32_t iIndex);
+
+/*!
+ * \brief   check whether the chroma prediction mode (iPredMode) in *pMode is eligible
+ * \param 	uiSampleAvail	availability of neighbouring samples (encoding not visible here); pMode: current iPredMode
+ * \return	0 indicating decoding correctly; -1 means an error occurred
+ */
+ int32_t CheckIntraChromaPredMode(uint8_t uiSampleAvail, int8_t* pMode);
+
+/*!
+ * \brief   predict the mode of an intra4x4 block
+ * \param 	pIntraPredMode	intra prediction mode cache (layout not visible here); iIdx4: current intra4x4 block index
+ * \return	mode index
+ */
+int32_t PredIntra4x4Mode(int8_t* pIntraPredMode, int32_t iIdx4);
+
+// NOTE(review): presumably bracket CAVLC parsing on the bit-stream pBs (start / end) -- confirm against the implementation
+void_t BsStartCavlc( PBitStringAux pBs );
+void_t BsEndCavlc( PBitStringAux pBs );
+// CAVLC residual block parser (declaration only); NOTE(review): iResidualProperty is presumably one of I16_LUMA_DC .. CHROMA_AC, and kpZigzagTable a scan table such as g_kuiZigzagScan -- confirm
+int32_t WelsResidualBlockCavlc(	SVlcTable* pVlcTable,
+										uint8_t* pNonZeroCountCache,
+										PBitStringAux pBs,
+										/*int16_t* coeff_level,*/
+										int32_t iIndex,
+										int32_t iMaxNumCoeff,
+										const uint8_t *kpZigzagTable,
+										int32_t iResidualProperty,
+										/*short *tCoeffLevel,*/
+										int16_t *pTCoeff,
+										int32_t iMbMode,
+										uint8_t uiQp,
+										PWelsDecoderContext pCtx);
+
+/*!
+ * \brief   parse the intra prediction mode(s) of the current MB from the bit-stream (intra4x4 and intra16x16 variants)
+ * \param 	pNeighAvail	neighbour availability; pIntraPredMode: intra mode cache (4x4 variants only); pBs: bit-stream; pCurDqLayer: current layer
+ * \return	0 indicating decoding correctly; -1 means error
+ */
+int32_t ParseIntra4x4ModeConstrain0  (PNeighAvail pNeighAvail, int8_t* pIntraPredMode, PBitStringAux pBs, PDqLayer pCurDqLayer);
+int32_t ParseIntra4x4ModeConstrain1  (PNeighAvail pNeighAvail, int8_t* pIntraPredMode, PBitStringAux pBs, PDqLayer pCurDqLayer);
+int32_t ParseIntra16x16ModeConstrain0(PNeighAvail pNeighAvail, PBitStringAux pBs, PDqLayer pCurDqLayer);
+int32_t ParseIntra16x16ModeConstrain1(PNeighAvail pNeighAvail, PBitStringAux pBs, PDqLayer pCurDqLayer);
+
+/*!
+ * \brief   parse inter info (including ref_index and mvd) of the current MB
+ * \param 	pCtx	decoding context; iMvArray / iRefIdxArray: motion-vector and reference-index caches; pBs: bit-stream
+ * \return	0 indicating decoding correctly; -1 means error
+ */
+int32_t ParseInterInfo(PWelsDecoderContext pCtx, int16_t iMvArray[LIST_A][30][MV_A], int8_t iRefIdxArray[LIST_A][30], PBitStringAux pBs);
+
+//#pragma pack()
+
+} // namespace WelsDec
+#endif//WELS_PARSE_MB_SYN_CAVLC_H__
--- /dev/null
+++ b/codec/decoder/core/inc/pic_queue.h
@@ -1,0 +1,63 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//pic_queue.h
+#ifndef WELS_PICTURE_QUEUE_H__
+#define WELS_PICTURE_QUEUE_H__
+
+
+#include "picture.h"
+
+//#pragma pack(1)
+
+namespace WelsDec {
+
+#define   PICTURE_RESOLUTION_ALIGNMENT      32	// NOTE(review): presumably picture dimensions are aligned to this many pixels -- confirm at use sites
+
+
+typedef struct TagPicBuff{	// picture queue: an array of picture pointers plus a cursor
+	PPicture*      ppPic;   // array of picture pointers
+	int32_t        iCapacity;  // capacity size of queue
+	int32_t        iCurrentIdx;	// NOTE(review): presumably the index of the current entry in ppPic -- confirm
+}SPicBuff, *PPicBuff;
+
+/*
+ *	Interfaces of the picture queue
+ */
+
+PPicture PrefetchPic( PPicBuff pPicBuff ); // get the currently applicable picture node from pPicBuff
+
+} // namespace WelsDec
+
+//#pragma pack()
+
+#endif//WELS_PICTURE_QUEUE_H__
--- /dev/null
+++ b/codec/decoder/core/inc/picture.h
@@ -1,0 +1,87 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//picture.h	-	reconstruction picture/ reference picture/ residual picture are declared here
+#ifndef WELS_PICTURE_H__
+#define WELS_PICTURE_H__
+
+#include "typedefs.h"
+
+//#pragma pack(1)
+
+namespace WelsDec {
+
+/*
+ *	Reconstructed Picture definition
+ *	It is used to express a reference picture, and also the subsequent reconstructed picture for output
+ */
+typedef struct TagPicture{
+	/************************************payload data*********************************/
+	uint8_t		*pBuffer[4];		// pointer to the first allocated byte, i.e. the base of each buffer
+	uint8_t		*pData[4];		// pointers to the picture planes respectively
+	int32_t		iLinesize[4];// linesize of the picture planes respectively, as currently used
+	int32_t		iPlanes;			// how many planes are introduced due to the color space format
+	// picture information
+	
+	/*******************************from other standard syntax****************************/
+	/*from sps*/
+	int32_t		iWidthInPixel;	// picture width in pixel
+	int32_t		iHeightInPixel;// picture height in pixel
+	/*from slice header*/
+	int32_t		iFramePoc;		// frame POC
+
+	/*******************************self-defined, for misc use****************************/
+	bool_t		bUsedAsRef;							//for ref pic management
+	bool_t		bIsLongRef;	// long term reference frame flag	//for ref pic management
+	uint8_t		uiRefCount;
+	bool_t		bAvailableFlag;	// indicates whether this picture memory block is available
+
+	/*******************************for future use****************************/
+	uint8_t		uiTemporalId;
+	uint8_t		uiSpatialId;
+	uint8_t		uiQualityId;
+	bool_t		bRefBaseFlag;
+	
+	int32_t		iFrameNum;		// frame number			//for ref pic management
+	int32_t		iLongTermFrameIdx;					//id for long term ref pic
+
+	int32_t     iTotalNumMbRec; // shows how many MBs have been reconstructed
+
+	int32_t     iSpsId; // guards against mosaic caused by cross-IDR interval reference
+	int32_t     iPpsId;
+}SPicture, *PPicture;	// a "Picture" declaration would conflict with the Mac system
+
+} // namespace WelsDec
+
+//#pragma pack()
+
+#endif//WELS_PICTURE_H__
--- /dev/null
+++ b/codec/decoder/core/inc/rec_mb.h
@@ -1,0 +1,76 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	rec_mb.h
+ *
+ * \brief	interfaces for all macroblock decoding process after mb syntax parsing and residual decoding with cavlc.
+ *
+ * \date	3/4/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELS_REC_MB_H__
+#define WELS_REC_MB_H__
+
+#include "typedefs.h"
+#include "wels_common_basis.h"
+#include "error_code.h"
+
+#include "decoder_context.h"
+
+//#pragma pack(1)
+
+namespace WelsDec {
+// Macroblock reconstruction entry points (declarations only). NOTE(review): iMBXY is presumably the macroblock index and pScoeffLevel its coefficient buffer -- confirm against the implementation
+void_t WelsFillRecNeededMbInfo(PWelsDecoderContext pCtx, bool_t bOutput, PDqLayer pCurLayer);
+
+int32_t RecI4x4Mb    (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer);
+
+int32_t RecI4x4Luma  (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer);
+
+int32_t RecI4x4Chroma(int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer);
+
+int32_t RecI16x16Mb  (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer);
+
+int32_t RecChroma    (int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer);
+// NOTE(review): presumably writes the inter prediction of the current MB into pPredY / pPredCb / pPredCr -- confirm
+void_t GetInterPred (uint8_t *pPredY, uint8_t *pPredCb, uint8_t *pPredCr, PWelsDecoderContext pCtx);
+// NOTE(review): presumably fills pBuf with an iBlockWidth x iBlockHeight block for motion compensation (MC) taken from pSrc -- confirm
+void_t FillBufForMc(uint8_t *pBuf, int32_t iBufStride, uint8_t *pSrc, int32_t iSrcStride, int32_t iSrcOffset, 
+					 int32_t iBlockWidth, int32_t iBlockHeight, int32_t iSrcX, int32_t iSrcY, int32_t iPicWidth, int32_t iPicHeight);
+
+} // namespace WelsDec
+
+//#pragma pack()
+
+#endif //WELS_REC_MB_H__
+
--- /dev/null
+++ b/codec/decoder/core/inc/slice.h
@@ -1,0 +1,207 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//slice.h
+#ifndef WELS_SLICE_H__
+#define WELS_SLICE_H__
+
+#include "typedefs.h"
+#include "wels_const.h"
+#include "wels_common_basis.h"
+#include "picture.h"
+#include "parameter_sets.h"
+
+//#pragma pack(1)
+
+namespace WelsDec {
+
+/*
+ *	Reference picture list reordering syntax (one entry array per list), refer to page 64 in JVT X201wcm
+ */
+typedef struct TagRefPicListReorderSyntax {
+	struct {
+		uint32_t    uiAbsDiffPicNumMinus1;
+		uint16_t    uiLongTermPicNum;
+		uint16_t    uiReorderingOfPicNumsIdc;
+	} sReorderingSyn[LIST_A][MAX_REF_PIC_COUNT];	// indexed [list][entry]
+	bool_t		bRefPicListReorderingFlag[LIST_A];	// one flag per list
+}SRefPicListReorderSyn, *PRefPicListReorderSyn;
+
+/*
+ *	Prediction weight table syntax (weights and offsets per list and reference picture), refer to page 65 in JVT X201wcm
+ */
+typedef struct TagPredWeightTabSyntax{
+	uint32_t	uiLumaLog2WeightDenom;
+	uint32_t	uiChromaLog2WeightDenom;
+	struct{
+		int32_t	iLumaWeight[MAX_REF_PIC_COUNT];
+		int32_t iLumaOffset[MAX_REF_PIC_COUNT];
+		int32_t	iChromaWeight[MAX_REF_PIC_COUNT][2];	// second index: the two chroma components
+		int32_t iChromaOffset[MAX_REF_PIC_COUNT][2];	// second index: the two chroma components
+		bool_t	bLumaWeightFlag;
+		bool_t	bChromaWeightFlag;		
+	}sPredList[LIST_A];	// one entry per list
+}SPredWeightTabSyn;
+
+/* Decoded reference picture marking syntax: up to MAX_MMCO_COUNT MMCO entries plus flags, refer to Page 66 in JVT X201wcm */
+typedef struct TagRefPicMarking {
+	struct {
+		uint32_t    uiMmcoType;	// type of this memory management control operation (MMCO)
+		int32_t     iShortFrameNum;
+		int32_t	    iDiffOfPicNum;
+		uint32_t    uiLongTermPicNum;
+		int32_t	    iLongTermFrameIdx;
+		int32_t	    iMaxLongTermFrameIdx;
+	} sMmcoRef[MAX_MMCO_COUNT];
+
+    bool_t		bNoOutputOfPriorPicsFlag;
+	bool_t		bLongTermRefFlag;
+	bool_t		bAdaptiveRefPicMarkingModeFlag;	
+} SRefPicMarking, *PRefPicMarking;
+
+/* Decoded reference base picture marking syntax, refer to Page 396 of JVT X201wcm */
+typedef struct TagRefBasePicMarkingSyn {
+	struct {
+		uint32_t	uiMmcoType;	// type of this memory management control operation (MMCO)
+		int32_t	    iShortFrameNum;
+		uint32_t	uiDiffOfPicNums;
+		uint32_t	uiLongTermPicNum; // uint32_t on purpose, to cover the larger range of iFrameNum
+	} mmco_base[MAX_MMCO_COUNT];	// MAX_REF_PIC for reference picture based on frame
+
+    bool_t		bAdaptiveRefBasePicMarkingModeFlag;
+} SRefBasePicMarking, *PRefBasePicMarking;
+
+/* Slice header: parsed slice-level syntax elements and values derived from them, refer to Page 63 in JVT X201wcm */
+typedef struct TagSliceHeaders{	
+	/*****************************slice header syntax and generated****************************/
+	int32_t		iFirstMbInSlice;		
+	int32_t		iFrameNum;
+	int32_t		iPicOrderCntLsb;
+	int32_t		iDeltaPicOrderCntBottom;
+	int32_t		iDeltaPicOrderCnt[2];
+	int32_t		iRedundantPicCnt;
+	int32_t		uiRefCount[LIST_A];	// one count per list; note: signed type despite the "ui" prefix
+	int32_t		iSliceQpDelta;	// unused, because iSliceQp is used directly
+	int32_t		iSliceQp;	
+	int32_t		iSliceQsDelta;	// For SP/SI slices
+	uint32_t	uiDisableDeblockingFilterIdc;
+	int32_t		iSliceAlphaC0Offset;
+	int32_t		iSliceBetaOffset;
+	int32_t		iSliceGroupChangeCycle;
+
+	PSps		pSps;	// pointer to an SPS; NOTE(review): presumably the one identified by iSpsId -- confirm
+	PPps		pPps;	// pointer to a PPS; NOTE(review): presumably the one identified by iPpsId -- confirm
+	int32_t	    iSpsId;
+	int32_t	    iPpsId;
+
+	/*********************got from other layer for efficiency if possible*********************/
+	SRefPicListReorderSyn	pRefPicListReordering;	// reference picture list reordering syntax (a struct by value, despite the "p" prefix)
+	SPredWeightTabSyn		sPredWeightTable;
+	int32_t		iCabacInitIdc;
+	int32_t		iMbWidth;	//from?
+	int32_t		iMbHeight; //from?
+	SRefPicMarking		sRefMarking;	// decoded reference picture marking syntax
+
+	uint16_t    uiIdrPicId;
+	ESliceType	eSliceType;
+	bool_t		bNumRefIdxActiveOverrideFlag;
+	bool_t		bFieldPicFlag;		//not supported in base profile
+	bool_t		bBottomFiledFlag;		// sic: "Filed" is the identifier's spelling (bottom field); not supported in base profile
+	uint8_t		uiPadding1Byte;
+	bool_t		bSpForSwitchFlag;			// For SP/SI slices
+	int16_t		iPadding2Bytes;
+}SSliceHeader, *PSliceHeader;
+
+
+/* Slice header in scalable extension: embeds the base slice header and adds SVC fields, refer to Page 394 in JVT X201wcm */
+typedef struct TagSliceHeaderExt{	
+	SSliceHeader	sSliceHeader;	// embedded base slice header
+	PSubsetSps	pSubsetSps;	// pointer to a subset SPS
+	
+	uint32_t	uiNumMbsInSlice;
+	uint32_t	uiDisableInterLayerDeblockingFilterIdc;
+	int32_t		iInterLayerSliceAlphaC0Offset;
+	int32_t		iInterLayerSliceBetaOffset;	
+	
+	//SPosOffset sScaledRefLayer;
+	int32_t		iScaledRefLayerPicWidthInSampleLuma;
+	int32_t		iScaledRefLayerPicHeightInSampleLuma;
+
+	SRefBasePicMarking	sRefBasePicMarking;	// decoded reference base picture marking syntax
+	bool_t		bBasePredWeightTableFlag;
+	bool_t		bStoreRefBasePicFlag;	
+	bool_t		bConstrainedIntraResamplingFlag;	
+	bool_t		bSliceSkipFlag;
+	
+	bool_t		bAdaptiveBaseModeFlag;
+	bool_t		bDefaultBaseModeFlag;
+	bool_t		bAdaptiveMotionPredFlag;
+	bool_t		bDefaultMotionPredFlag;
+	bool_t		bAdaptiveResidualPredFlag;
+	bool_t		bDefaultResidualPredFlag;
+	bool_t		bTCoeffLevelPredFlag;		
+	uint8_t		uiRefLayerChromaPhaseXPlus1Flag;
+	
+	uint8_t		uiRefLayerChromaPhaseYPlus1;
+	uint8_t		uiRefLayerDqId;
+	uint8_t		uiScanIdxStart;
+	uint8_t		uiScanIdxEnd;
+}SSliceHeaderExt, *PSliceHeaderExt;
+
+/* Slice: the (extended) slice header plus per-slice decoding state */
+typedef struct TagSlice{	
+	/*******************************slice_header****************************/
+	SSliceHeaderExt	sSliceHeaderExt;		
+	
+	/*******************************for future use****************************/
+	// for Macroblock coding within slice
+	int32_t		iLastMbQp;		// stored qp of the last coded mb, maybe more efficient for mb skip detection etc.
+
+	/*******************************slice_data****************************/
+	/*slice_data_ext()*/
+	int32_t		iMbSkipRun;
+	int32_t     iTotalMbInCurSlice; // records the total number of MBs in the current slice
+	
+	/*slice_data_ext() generate*/
+		
+	/*******************************misc use****************************/
+	bool_t		bSliceHeaderExtFlag; // indicates which slice header is used, avc or ext
+	/*************got from other layer for efficiency if possible***************/
+	/*from lower layer: slice header*/
+	uint8_t		eSliceType;	
+	uint8_t		uiPadding[2];	
+}SSlice, *PSlice;
+
+} // namespace WelsDec
+
+//#pragma pack()
+#endif//WELS_SLICE_H__
--- /dev/null
+++ b/codec/decoder/core/inc/typedefs.h
@@ -1,0 +1,91 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+// typedefs.h
+#ifndef WELS_TYPE_DEFINES_H__
+#define WELS_TYPE_DEFINES_H__
+
+#include <limits.h>
+
+////////////////////////////////////////////////////////////////////////////
+// NOTICE : ALL internal implementation MUST use ONLY the data types defined
+//          below; the sole exception is the interface file !!!!!
+////////////////////////////////////////////////////////////////////////////
+
+#ifndef  _MSC_VER
+// non-MSVC compilers: take the fixed-width integer types from <stdint.h>
+#include <stdint.h>
+
+#else
+// MSVC: <stdint.h> is not used here, so the fixed-width integer types are declared by hand
+// FIXME:     all signed types should be declared explicitly, for example, int8_t should be declared as signed char.
+typedef signed char      int8_t  ;
+typedef unsigned char    uint8_t ;
+typedef short            int16_t ;
+typedef unsigned short   uint16_t;
+typedef int              int32_t ;
+typedef unsigned int     uint32_t;
+typedef __int64          int64_t ;
+typedef unsigned __int64 uint64_t;
+
+#endif // _MSC_VER defined
+
+// FIXME:     all string types should be declared explicitly as char.
+typedef char      str_t;	// character type used for strings
+typedef float     real32_t;	// single-precision floating point
+
+#ifdef PESN	// drop any earlier definition so the value below always wins
+#undef PESN
+#endif//PESN
+#define PESN	  (0.000001f) // (1e-6)	// desired float precision
+
+#ifndef NULL	// fallback only when no header has defined NULL yet
+#define NULL 0
+#endif
+
+typedef bool bool_t;	// built-in bool; no <stdbool.h> is included, so this header expects a C++ compiler
+typedef int32_t BOOL_T;	// 32-bit integer boolean, same type as the TRUE / FALSE constants below
+
+#ifndef FALSE
+#define FALSE   ((int32_t)0)
+#endif//FALSE
+
+#ifndef TRUE
+#define TRUE    ((int32_t)1)
+#endif//TRUE
+
+#ifndef void_t	// void_t is a macro alias for void, not a typedef
+#define void_t void
+#endif
+
+#endif //WELS_TYPE_DEFINES_H__
+
--- /dev/null
+++ b/codec/decoder/core/inc/utils.h
@@ -1,0 +1,99 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	utils.h
+ *
+ * \brief	Tool kits for decoder
+ *		( malloc, realloc, free, log output and PSNR calculation and so on )
+ *
+ * \date	03/10/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_UTILS_H__
+#define WELS_UTILS_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include "typedefs.h"
+
+namespace WelsDec {
+
+#ifdef __cplusplus
+extern "C" {
+#endif//__cplusplus
+
+// cache line size (the variable is defined in another translation unit)
+extern uint32_t g_uiCacheLineSize;
+
+/*
+ *	Function pointer declaration for various tool sets
+ */
+// wels log output: callback taking a context pointer, a log level, a printf-style format and its va_list
+typedef void_t (*PWelsLogCallbackFunc)(void_t *pPtr, const int32_t kiLevel, const char *kpFmt, va_list pArgv);
+// global log callback pointer; presumably the sink WelsLog() forwards to - confirm in the implementation
+extern PWelsLogCallbackFunc	g_pLog;
+// variadic log entry point; with GCC the format attribute checks kpFmt (argument 3) against the varargs (from argument 4)
+#ifdef __GNUC__
+extern void_t WelsLog(void_t *pPtr, int32_t iLevel, const char *kpFmt, ...) __attribute__ ((__format__ (__printf__, 3, 4)));
+#else
+extern void_t WelsLog(void_t *pPtr, int32_t iLevel, const char *kpFmt, ...);
+#endif
+	
+#define DECODER_MODE_NAME(a) (((a) == SW_MODE)?"SW_MODE":(((a) == GPU_MODE)?"GPU_MODE":(((a) == AUTO_MODE)?"AUTO_MODE":"SWITCH_MODE")))	// mode id -> name; any value other than SW/GPU/AUTO reads "SWITCH_MODE"; (a) is parenthesized so expression arguments compare as a whole
+#define OUTPUT_PROPERTY_NAME(a) (((a) == 0)?"system_memory":"video_memory")	// 0 -> "system_memory", anything else -> "video_memory"
+#define BUFFER_STATUS_NAME(a) (((a) == 0)?"unvalid":"valid")	// 0 -> "unvalid" (runtime string, spelling kept), anything else -> "valid"
+
+
+/*
+ *	Log output levels: every level except QUIET is a single bit, so levels can be OR-ed into a mask
+ */
+
+typedef int32_t	WelsLogLevel;
+enum{
+	WELS_LOG_QUIET		= 0x00,		// Quiet mode
+	WELS_LOG_ERROR		= 1 << 0,	// Error log level
+	WELS_LOG_WARNING	= 1 << 1,	// Warning log level
+	WELS_LOG_INFO		= 1 << 2,	// Information log level
+	WELS_LOG_DEBUG		= 1 << 3,	// Debug log level
+	WELS_LOG_RESV		= 1 << 4,	// Reserved log level
+	WELS_LOG_LEVEL_COUNT= 5,		// number of single-bit levels above (ERROR .. RESV); a count, not a level bit
+	WELS_LOG_DEFAULT	= WELS_LOG_ERROR | WELS_LOG_WARNING | WELS_LOG_INFO | WELS_LOG_DEBUG	// Default log level in Wels codec
+};
+
+#ifdef __cplusplus
+}
+#endif//__cplusplus
+
+} // namespace WelsDec
+
+#endif//WELS_UTILS_H__
--- /dev/null
+++ b/codec/decoder/core/inc/vlc_decoder.h
@@ -1,0 +1,176 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef WELS_VLC_DECODER_H__
+#define WELS_VLC_DECODER_H__
+
+#include "bit_stream.h"
+#include "dec_golomb.h"
+
+namespace WelsDec {
+
+typedef struct TagVlcTable	// bundle of pointers to the global VLC lookup tables; every table row is 2 bytes wide
+{
+	const uint8_t (*kpCoeffTokenVlcTable[4][8])[2];	// InitVlcTable() sets [0..2][0..3] and [3][0..7]; the other slots are not assigned there
+	const uint8_t (*kpChromaCoeffTokenVlcTable)[2];	// InitVlcTable() points this at g_kuiVlcChromaTable
+	const uint8_t (*kpZeroTable[7])[2];	// InitVlcTable() points these at g_kuiZeroLeftTable0 .. 6
+	const uint8_t (*kpTotalZerosTable[2][15])[2];	// [0][0..14]: g_kuiTotalZerosTable0 .. 14, [1][0..2]: g_kuiTotalZerosChromaTable0 .. 2
+}SVlcTable;
+
+// Shared VLC tables: declared extern here and defined once elsewhere, so modules share one copy and the generated binary stays small
+extern const uint8_t g_kuiVlcChromaTable[256][2];
+extern const uint8_t g_kuiVlcTable_0[256][2];
+extern const uint8_t g_kuiVlcTable_0_0[256][2];
+extern const uint8_t g_kuiVlcTable_0_1[4][2];
+extern const uint8_t g_kuiVlcTable_0_2[2][2];
+extern const uint8_t g_kuiVlcTable_0_3[2][2];
+extern const uint8_t g_kuiVlcTable_1[256][2];
+extern const uint8_t g_kuiVlcTable_1_0[64][2];
+extern const uint8_t g_kuiVlcTable_1_1[8][2];
+extern const uint8_t g_kuiVlcTable_1_2[2][2];
+extern const uint8_t g_kuiVlcTable_1_3[2][2];
+extern const uint8_t g_kuiVlcTable_2[256][2];
+extern const uint8_t g_kuiVlcTable_2_0[4][2];
+extern const uint8_t g_kuiVlcTable_2_1[4][2];
+extern const uint8_t g_kuiVlcTable_2_2[4][2];
+extern const uint8_t g_kuiVlcTable_2_3[4][2];
+extern const uint8_t g_kuiVlcTable_2_4[2][2];
+extern const uint8_t g_kuiVlcTable_2_5[2][2];
+extern const uint8_t g_kuiVlcTable_2_6[2][2];
+extern const uint8_t g_kuiVlcTable_2_7[2][2];
+extern const uint8_t g_kuiVlcTable_3[64][2];
+extern const uint8_t g_kuiVlcTableNeedMoreBitsThread[3];
+extern const uint8_t g_kuiVlcTableMoreBitsCount0[4];
+extern const uint8_t g_kuiVlcTableMoreBitsCount1[4];
+extern const uint8_t g_kuiVlcTableMoreBitsCount2[8];
+extern const uint8_t g_kuiNcMapTable[17];
+extern const uint8_t g_kuiVlcTrailingOneTotalCoeffTable[62][2];
+extern const uint8_t g_kuiTotalZerosTable0[512][2];
+extern const uint8_t g_kuiTotalZerosTable1[64][2];
+extern const uint8_t g_kuiTotalZerosTable2[64][2];
+extern const uint8_t g_kuiTotalZerosTable3[32][2];
+extern const uint8_t g_kuiTotalZerosTable4[32][2];
+extern const uint8_t g_kuiTotalZerosTable5[64][2];
+extern const uint8_t g_kuiTotalZerosTable6[64][2];
+extern const uint8_t g_kuiTotalZerosTable7[64][2];
+extern const uint8_t g_kuiTotalZerosTable8[64][2];
+extern const uint8_t g_kuiTotalZerosTable9[32][2];
+extern const uint8_t g_kuiTotalZerosTable10[16][2];
+extern const uint8_t g_kuiTotalZerosTable11[16][2];
+extern const uint8_t g_kuiTotalZerosTable12[8][2];
+extern const uint8_t g_kuiTotalZerosTable13[4][2];
+extern const uint8_t g_kuiTotalZerosTable14[2][2];
+extern const uint8_t g_kuiTotalZerosBitNumMap[15];
+extern const uint8_t g_kuiTotalZerosChromaTable0[8][2];
+extern const uint8_t g_kuiTotalZerosChromaTable1[4][2];
+extern const uint8_t g_kuiTotalZerosChromaTable2[2][2];
+extern const uint8_t g_kuiTotalZerosBitNumChromaMap[3];
+extern const uint8_t g_kuiZeroLeftTable0[2][2];
+extern const uint8_t g_kuiZeroLeftTable1[4][2];
+extern const uint8_t g_kuiZeroLeftTable2[4][2];
+extern const uint8_t g_kuiZeroLeftTable3[8][2];
+extern const uint8_t g_kuiZeroLeftTable4[8][2];
+extern const uint8_t g_kuiZeroLeftTable5[8][2];
+extern const uint8_t g_kuiZeroLeftTable6[8][2];
+extern const uint8_t g_kuiZeroLeftBitNumMap[16];
+
+#ifdef WIN32
+//TODO need linux version. MSVC inline asm: outval = 32 - (bit index of the highest set bit of inval, found with BSR); eax is overwritten. NOTE(review): BSR leaves its result undefined for inval == 0 - confirm callers never pass 0
+#define WELS_GET_PREFIX_BITS(inval,outval){\
+	__asm xor		eax,	eax\
+	__asm bsr		eax,	inval\
+	__asm sub		eax,	32\
+	__asm neg		eax\
+	__asm mov		outval,	eax\
+}
+#endif // WIN32
+
+static inline void_t InitVlcTable(SVlcTable * pVlcTable)	// points the SVlcTable slots listed below at the global tables; other slots are left untouched
+{
+	const uint8_t (**ppSlot)[2] = pVlcTable->kpCoeffTokenVlcTable[0];	// cursor over one row of table pointers; starts on coeff-token row 0
+	pVlcTable->kpChromaCoeffTokenVlcTable = g_kuiVlcChromaTable;
+	ppSlot[0] = g_kuiVlcTable_0;
+	ppSlot[1] = g_kuiVlcTable_1;
+	ppSlot[2] = g_kuiVlcTable_2;
+	ppSlot[3] = g_kuiVlcTable_3;
+	ppSlot = pVlcTable->kpCoeffTokenVlcTable[1];	// coeff-token row 1: the g_kuiVlcTable_0_x tables
+	ppSlot[0] = g_kuiVlcTable_0_0;
+	ppSlot[1] = g_kuiVlcTable_0_1;
+	ppSlot[2] = g_kuiVlcTable_0_2;
+	ppSlot[3] = g_kuiVlcTable_0_3;
+	ppSlot = pVlcTable->kpCoeffTokenVlcTable[2];	// coeff-token row 2: the g_kuiVlcTable_1_x tables
+	ppSlot[0] = g_kuiVlcTable_1_0;
+	ppSlot[1] = g_kuiVlcTable_1_1;
+	ppSlot[2] = g_kuiVlcTable_1_2;
+	ppSlot[3] = g_kuiVlcTable_1_3;
+	ppSlot = pVlcTable->kpCoeffTokenVlcTable[3];	// coeff-token row 3: the g_kuiVlcTable_2_x tables, the only row with all 8 slots set
+	ppSlot[0] = g_kuiVlcTable_2_0;
+	ppSlot[1] = g_kuiVlcTable_2_1;
+	ppSlot[2] = g_kuiVlcTable_2_2;
+	ppSlot[3] = g_kuiVlcTable_2_3;
+	ppSlot[4] = g_kuiVlcTable_2_4;
+	ppSlot[5] = g_kuiVlcTable_2_5;
+	ppSlot[6] = g_kuiVlcTable_2_6;
+	ppSlot[7] = g_kuiVlcTable_2_7;
+	ppSlot = pVlcTable->kpZeroTable;	// zero-left tables
+	ppSlot[0] = g_kuiZeroLeftTable0;
+	ppSlot[1] = g_kuiZeroLeftTable1;
+	ppSlot[2] = g_kuiZeroLeftTable2;
+	ppSlot[3] = g_kuiZeroLeftTable3;
+	ppSlot[4] = g_kuiZeroLeftTable4;
+	ppSlot[5] = g_kuiZeroLeftTable5;
+	ppSlot[6] = g_kuiZeroLeftTable6;
+	ppSlot = pVlcTable->kpTotalZerosTable[0];	// total-zeros row 0: g_kuiTotalZerosTable0 .. 14
+	ppSlot[0] = g_kuiTotalZerosTable0;
+	ppSlot[1] = g_kuiTotalZerosTable1;
+	ppSlot[2] = g_kuiTotalZerosTable2;
+	ppSlot[3] = g_kuiTotalZerosTable3;
+	ppSlot[4] = g_kuiTotalZerosTable4;
+	ppSlot[5] = g_kuiTotalZerosTable5;
+	ppSlot[6] = g_kuiTotalZerosTable6;
+	ppSlot[7] = g_kuiTotalZerosTable7;
+	ppSlot[8] = g_kuiTotalZerosTable8;
+	ppSlot[9] = g_kuiTotalZerosTable9;
+	ppSlot[10] = g_kuiTotalZerosTable10;
+	ppSlot[11] = g_kuiTotalZerosTable11;
+	ppSlot[12] = g_kuiTotalZerosTable12;
+	ppSlot[13] = g_kuiTotalZerosTable13;
+	ppSlot[14] = g_kuiTotalZerosTable14;
+	ppSlot = pVlcTable->kpTotalZerosTable[1];	// total-zeros row 1: the three chroma tables
+	ppSlot[0] = g_kuiTotalZerosChromaTable0;
+	ppSlot[1] = g_kuiTotalZerosChromaTable1;
+	ppSlot[2] = g_kuiTotalZerosChromaTable2;
+}
+
+} // namespace WelsDec
+
+#endif//WELS_VLC_DECODER_H__
--- /dev/null
+++ b/codec/decoder/core/inc/wels_common_basis.h
@@ -1,0 +1,298 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//wels_common_basis.h
+#ifndef WELS_COMMON_BASIS_H__
+#define WELS_COMMON_BASIS_H__
+
+#include "typedefs.h"
+#include "macros.h"
+
+namespace WelsDec {
+
+// Shared constant tables: declared extern here and defined once elsewhere, to keep the generated binary small
+
+extern const uint8_t g_kuiChromaQp[52];
+
+/* commonly used tables */
+extern const uint8_t g_kuiScan8[24];
+extern const uint8_t g_kuiLumaDcZigzagScan[16];
+extern const uint8_t g_kuiChromaDcScan[4];
+extern __align16( const uint16_t, g_kuiDequantCoeff[52][8]);	// __align16 is not defined in this header; presumably a 16-byte alignment wrapper from macros.h - confirm
+/* Profile IDC: values are carried in a uint8_t (ProfileIdc); the anonymous enum below only names them */
+typedef uint8_t		ProfileIdc;
+enum{
+	PRO_BASELINE	= 66,
+	PRO_MAIN		= 77,
+	PRO_EXTENDED	= 88,
+	PRO_HIGH		= 100,
+	PRO_HIGH10		= 110,
+	PRO_HIGH422		= 122,
+	PRO_HIGH444		= 144,
+	PRO_CAVLC444	= 244,	// NOTE(review): H.264 Annex A appears to list 244 as High 4:4:4 Predictive and 44 as CAVLC 4:4:4 Intra - confirm
+	
+	PRO_SCALABLE_BASELINE	= 83,
+	PRO_SCALABLE_HIGH		= 86,
+};
+
+/*
+ *	NAL Unit Type (5 Bits): one enumerator for every value 0..31
+ */
+typedef enum TagNalUnitType
+{
+	NAL_UNIT_UNSPEC_0			= 0,
+	NAL_UNIT_CODED_SLICE		= 1,
+	NAL_UNIT_CODED_SLICE_DPA	= 2,
+	NAL_UNIT_CODED_SLICE_DPB	= 3,
+	NAL_UNIT_CODED_SLICE_DPC	= 4,
+	NAL_UNIT_CODED_SLICE_IDR	= 5,
+	NAL_UNIT_SEI				= 6,
+	NAL_UNIT_SPS				= 7,
+	NAL_UNIT_PPS				= 8,
+	NAL_UNIT_AU_DELIMITER		= 9,
+	NAL_UNIT_END_OF_SEQ			= 10,
+	NAL_UNIT_END_OF_STR			= 11,
+	NAL_UNIT_FILLER_DATA		= 12,
+	NAL_UNIT_SPS_EXT			= 13,
+	NAL_UNIT_PREFIX				= 14,
+	NAL_UNIT_SUBSET_SPS			= 15,
+	NAL_UNIT_RESV_16			= 16,
+	NAL_UNIT_RESV_17			= 17,
+	NAL_UNIT_RESV_18			= 18,
+	NAL_UNIT_AUX_CODED_SLICE	= 19,
+	NAL_UNIT_CODED_SLICE_EXT	= 20,
+	NAL_UNIT_RESV_21			= 21,
+	NAL_UNIT_RESV_22			= 22,
+	NAL_UNIT_RESV_23			= 23,
+	NAL_UNIT_UNSPEC_24			= 24,
+	NAL_UNIT_UNSPEC_25			= 25,
+	NAL_UNIT_UNSPEC_26			= 26,
+	NAL_UNIT_UNSPEC_27			= 27,
+	NAL_UNIT_UNSPEC_28			= 28,
+	NAL_UNIT_UNSPEC_29			= 29,
+	NAL_UNIT_UNSPEC_30			= 30,
+	NAL_UNIT_UNSPEC_31			= 31
+}ENalUnitType;
+
+static const uint8_t g_kuiEmulationPreventionThreeByte	= 0x03;	// static const in a header: each including source file gets its own copy
+
+/*
+ *	NAL Reference IDC (2 Bits): the four possible values 0..3, named by priority
+ */
+typedef uint8_t		NalRefIdc;
+enum{
+	NRI_PRI_LOWEST	= 0,
+	NRI_PRI_LOW		= 1,
+	NRI_PRI_HIGH	= 2,
+	NRI_PRI_HIGHEST	= 3
+};
+
+/*
+ * VCL TYPE: the classification values stored in g_kuiVclTypeMap
+ */
+typedef uint8_t		VclType;
+enum{
+	NON_VCL			= 0,
+	VCL				= 1,
+	NOT_APP			= 2	// presumably "not applicable" - confirm
+};
+
+/*
+ *	vcl type map: first index is the NAL unit type (0..31), second index selects the H264 type (called ext_idx by IS_VCL_NAL)
+ */
+extern const VclType g_kuiVclTypeMap[32][2];  
+
+#define IS_VCL_NAL(t, ext_idx)		( VCL == g_kuiVclTypeMap[t][ext_idx] )	// table lookup: is NAL type t a VCL NAL for column ext_idx
+#define IS_PARAM_SETS_NALS(t)		( NAL_UNIT_SPS == (t) || NAL_UNIT_PPS == (t) || NAL_UNIT_SUBSET_SPS == (t) )	// any parameter-set NAL
+#define IS_SPS_NAL(t)				( NAL_UNIT_SPS == (t) )
+#define IS_SUBSET_SPS_NAL(t)		( NAL_UNIT_SUBSET_SPS == (t) )
+#define IS_PPS_NAL(t)				( NAL_UNIT_PPS == (t) )
+#define IS_SEI_NAL(t)				( NAL_UNIT_SEI == (t) )
+#define IS_PREFIX_NAL(t)			( NAL_UNIT_PREFIX == (t) )
+#define IS_SUBSET_SPS_USED(t)		( NAL_UNIT_SUBSET_SPS == (t) || NAL_UNIT_CODED_SLICE_EXT == (t) )
+#define IS_VCL_NAL_AVC_BASE(t)		( NAL_UNIT_CODED_SLICE == (t) || NAL_UNIT_CODED_SLICE_IDR == (t) )	// the two plain coded-slice types
+#define IS_NEW_INTRODUCED_NAL(t)	( NAL_UNIT_PREFIX == (t) || NAL_UNIT_CODED_SLICE_EXT == (t) )	// types 14 and 20
+
+/* Base Slice Types
+ * A parsed eSliceType above 9 is invalid; values above 4 must be folded back onto
+ * the enumerators below (the original note gives the mapping as eSliceType - 4).
+ * NOTE(review): mapping 5..9 onto 0..4 needs (eSliceType - 5) - confirm which one the parser uses.
+ */
+typedef enum TagSliceType{
+	P_SLICE	= 0,
+	B_SLICE	= 1,
+	I_SLICE	= 2,
+	SP_SLICE= 3,
+	SI_SLICE= 4,
+	UNKNOWN_SLICE= 5
+}ESliceType;
+
+/* Slice Types in scalable extension; each comment lists the two raw values that name the type (presumably bitstream slice_type codes - confirm) */
+typedef uint8_t		SliceTypeExt;
+enum{
+	EP_SLICE = 0,	// EP_SLICE: 0, 5
+	EB_SLICE = 1,	// EB_SLICE: 1, 6
+	EI_SLICE = 2	// EI_SLICE: 2, 7
+};
+
+/* List Index: values are carried in a uint8_t (ListIndex) */
+typedef uint8_t		ListIndex;
+enum{
+	LIST_0	= 0,
+	LIST_1	= 1,
+	LIST_A	= 2	// one past the last list index; presumably the number of lists - confirm
+};
+
+/* Picture Size: width and height as signed 32-bit values (units are not stated in this header) */
+typedef struct TagPictureSize{
+	int32_t	iWidth;
+	int32_t iHeight;
+}SPictureSize;
+
+/* Motion Vector components: index of each component, carried in a uint8_t (MvComp) */
+typedef uint8_t		MvComp;
+enum{
+	MV_X	= 0,
+	MV_Y	= 1,
+	MV_A	= 2	// one past the last component; presumably the component count - confirm
+};
+
+/* Chroma Components: index of each chroma plane, carried in a uint8_t (ChromaComp) */
+typedef uint8_t		ChromaComp;
+enum{
+	CHROMA_CB	= 0,
+	CHROMA_CR	= 1,
+	CHROMA_A	= 2	// one past the last component; presumably the component count - confirm
+};
+
+/* Position Offset structure: one signed 32-bit offset per side (left, top, right, bottom) */
+typedef struct TagPosOffset{
+	int32_t	iLeftOffset;
+	int32_t	iTopOffset;
+	int32_t	iRightOffset;
+	int32_t	iBottomOffset;
+}SPosOffset;
+
+enum EMbPosition // one bit per macroblock position, so several positions can be OR-ed into one mask
+{
+    MB_LEFT     = 0x01,	// A
+    MB_TOP      = 0x02,	// B
+    MB_TOPRIGHT = 0x04,	// C
+	MB_TOPLEFT	= 0x08,	// D,
+    MB_PRIVATE  = 0x10,
+};
+/* MB Type & Sub-MB Type: both are plain int32_t codes taken from the list below */
+typedef int32_t MbType;
+typedef int32_t SubMbType;
+
+#define MB_TYPE_INTRA4x4       0x01	// 0x01 .. 0x04: the range IS_INTRA() accepts
+#define MB_TYPE_INTRA16x16     0x02
+#define MB_TYPE_INTRA8x8       0x03
+#define MB_TYPE_INTRA_PCM      0x04
+
+#define MB_TYPE_INTRA_BL       0x05	// I_BL new MB type; neither IS_INTRA() nor IS_INTER() accepts it
+
+#define MB_TYPE_16x16          0x06	// 0x06 .. 0x0f: the range IS_INTER() accepts
+#define MB_TYPE_16x8           0x07
+#define MB_TYPE_8x16           0x08
+#define MB_TYPE_8x8            0x09
+#define MB_TYPE_8x8_REF0       0x0a
+
+#define SUB_MB_TYPE_8x8        0x0b
+#define SUB_MB_TYPE_8x4        0x0c
+#define SUB_MB_TYPE_4x8        0x0d
+#define SUB_MB_TYPE_4x4        0x0e
+#define MB_TYPE_SKIP           0x0f
+#define MB_TYPE_DIRECT2        0x10
+#define not_available          0x20
+
+#define IS_INTRA4x4(type)	( (type) == MB_TYPE_INTRA4x4 )
+#define IS_INTRA16x16(type)	( (type) == MB_TYPE_INTRA16x16 )
+#define IS_INTRA(type)		( (type) >= MB_TYPE_INTRA4x4 && (type) <= MB_TYPE_INTRA_PCM )
+#define IS_INTER(type)		( (type) >= MB_TYPE_16x16 && (type) <= MB_TYPE_SKIP )
+
+#define IS_I_BL(type)		( MB_TYPE_INTRA_BL == (type) )
+#define IS_SUB8x8(type)		( (type) == MB_TYPE_8x8 || (type) == MB_TYPE_8x8_REF0 )
+
+/*
+ *	Memory Management Control Operation (MMCO) codes: consecutive values 0..6, with 0 ending a list of operations
+ */
+enum{
+	MMCO_END			=0,
+	MMCO_SHORT2UNUSED	=1,
+	MMCO_LONG2UNUSED	=2,
+	MMCO_SHORT2LONG		=3,
+	MMCO_SET_MAX_LONG	=4,
+	MMCO_RESET			=5,
+	MMCO_LONG			=6
+};
+
+/////////intra16x16  Luma prediction modes
+#define I16_PRED_V       0
+#define I16_PRED_H       1
+#define I16_PRED_DC      2
+#define I16_PRED_P       3
+// extra DC variants; presumably L = left only, T = top only, 128 = no neighbour available - confirm
+#define I16_PRED_DC_L    4
+#define I16_PRED_DC_T    5
+#define I16_PRED_DC_128  6
+//////////intra4x4   Luma prediction modes
+#define I4_PRED_V        0
+#define I4_PRED_H        1
+#define I4_PRED_DC       2
+#define I4_PRED_DDL      3 //diagonal_down_left
+#define I4_PRED_DDR      4 //diagonal_down_right
+#define I4_PRED_VR       5 //vertical_right
+#define I4_PRED_HD       6 //horizontal_down
+#define I4_PRED_VL       7 //vertical_left
+#define I4_PRED_HU       8 //horizontal_up
+// extra DC variants, numbered after the nine modes above
+#define I4_PRED_DC_L     9
+#define I4_PRED_DC_T     10
+#define I4_PRED_DC_128   11
+// variants of DDL and VL, presumably used when the top-right block is unavailable - confirm
+#define I4_PRED_DDL_TOP  12 //top-right samples replaced by padding with the rightmost pixel of top
+#define I4_PRED_VL_TOP   13 //top-right samples replaced by padding with the rightmost pixel of top
+
+//////////intra Chroma prediction modes (note: DC = 0 and V = 2 here, unlike the intra16x16 numbering)
+#define C_PRED_DC        0
+#define C_PRED_H         1
+#define C_PRED_V         2
+#define C_PRED_P         3
+// extra DC variants, same suffixes as for luma
+#define C_PRED_DC_L      4
+#define C_PRED_DC_T      5
+#define C_PRED_DC_128    6 
+
+} // namespace WelsDec
+
+#endif//WELS_COMMON_BASIS_H__
--- /dev/null
+++ b/codec/decoder/core/inc/wels_const.h
@@ -1,0 +1,104 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//wels_const.h
+#ifndef WELS_CONSTANCE_H__
+#define WELS_CONSTANCE_H__
+
+// Miscellaneous size limits (each may be overridden by defining it before this header is included)
+#ifndef MAX_FNAME_LEN
+#define MAX_FNAME_LEN		256	// maximal length of a file name, in chars
+#endif//MAX_FNAME_LEN
+
+#ifndef WELS_LOG_BUF_SIZE
+#define WELS_LOG_BUF_SIZE	4096
+#endif//WELS_LOG_BUF_SIZE
+
+#ifndef MAX_TRACE_LOG_SIZE
+#define MAX_TRACE_LOG_SIZE	(50 * (1<<20))	// max trace log size: 50 MB; the log is overwritten once the file exceeds this size
+#endif//MAX_TRACE_LOG_SIZE
+
+/* MB width in pixels for the I420 colorspace usually used in the codec (chroma is half of luma) */
+#define MB_WIDTH_LUMA		16
+#define MB_WIDTH_CHROMA		(MB_WIDTH_LUMA>>1)
+/* MB height in pixels for the I420 colorspace usually used in the codec (chroma is half of luma) */
+#define MB_HEIGHT_LUMA		16
+#define MB_HEIGHT_CHROMA	(MB_HEIGHT_LUMA>>1)
+
+/* Some list sizes: 256 + 2 * (8 * 8) = 384 entries */
+#define MB_COEFF_LIST_SIZE	(256+((MB_WIDTH_CHROMA*MB_HEIGHT_CHROMA)<<1))
+
+#define MB_PARTITION_SIZE		4	// Macroblock partition size in 8x8 sub-blocks
+#define MB_SUB_PARTITION_SIZE	4	// Sub partition size in a 8x8 sub-block
+#define MB_BLOCK4x4_NUM				16
+#define MB_BLOCK8x8_NUM				4
+
+#define NAL_UNIT_HEADER_EXT_SIZE	3	// Size of the NAL unit header extension, in bytes
+
+#define MAX_SPS_COUNT			32	// Maximal number of SPS
+#define MAX_PPS_COUNT 			256	// Maximal number of PPS
+
+#define MAX_FRAME_RATE			30	// maximal frame rate to support
+#define MIN_FRAME_RATE			1	// minimal frame rate to support
+
+#define MAX_REF_PIC_COUNT		16		// MAX Short + Long reference pictures
+#define MIN_REF_PIC_COUNT		1		// minimal count number of reference pictures, 1 short + 2 key reference based?
+#define MAX_SHORT_REF_COUNT		16		// maximal count number of short reference pictures
+#define MAX_LONG_REF_COUNT		16		// maximal count number of long reference pictures
+
+#define MAX_MMCO_COUNT			66
+
+#define MAX_SLICEGROUP_IDS		8	// Maximal number of Slice Groups
+
+#define ALIGN_RBSP_LEN_FIX		4
+
+#define PADDING_LENGTH			32 // reference extension
+
+#define BASE_QUALITY_ID			0
+//#define BASE_DEPENDENCY_ID		0
+#define BASE_DQ_ID				0
+#define MAX_DQ_ID				((uint8_t)-1)	// 255; uint8_t must be declared wherever this expands, this header includes nothing itself
+//#define MAX_LAYER_NUM			(MAX_DEPENDENCY_LAYER * MAX_TEMPORAL_LEVEL * MAX_QUALITY_LEVEL)	// Layer number of Three-tuple
+
+#define LAYER_NUM_EXCHANGEABLE	1
+
+#define MAX_NAL_UNIT_NUM_IN_AU	32	// predefined maximal number of NAL Units in an access unit
+#define MAX_ACCESS_UINT_CAPACITY	1048576	// Maximal AU capacity in bytes: (1<<20) = 1024 KB predefined ("UINT" in the name is the identifier's own spelling)
+
+enum {
+	BASE_MB = 0,
+	NON_AVC_REWRITE_ENHANCE_MB =1,
+	AVC_REWRITE_ENHANCE_MB = 2
+		
+};
+
+#endif//WELS_CONSTANCE_H__
--- /dev/null
+++ b/codec/decoder/core/src/au_parser.cpp
@@ -1,0 +1,1027 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	au_parser.c
+ *
+ * \brief	Interfaces introduced in Access Unit level based parser
+ *
+ * \date	03/10/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include "codec_def.h"
+#include "ls_defines.h"
+#include "macros.h"
+#include "au_parser.h"
+#include "decoder.h"
+#include "error_code.h"
+#include "dec_frame.h"
+#include "dec_golomb.h"
+#include "bit_stream.h"
+#include "utils.h"
+#include "codec_app_def.h"
+#include "memmgr_nal_unit.h"
+#include "decoder_core.h"
+#include "wels_common_basis.h"
+#include "decoder_core.h"
+#include "manage_dec_ref.h"
+#include "mem_align.h"
+
+namespace WelsDec {
+/*!
+ *************************************************************************************
+ * \brief	Start code prefix detection: two or more 0x00 bytes followed by 0x01
+ *		(so both 0x 00 00 01 and 0x 00 00 00 01 are matched)
+ * \param 	kpBuf		bitstream payload buffer to scan
+ * \param	pOffset		[out] byte offset from kpBuf to the first byte after the
+ * 				start code prefix; written only when a prefix is found
+ * \param	iBufSize	size of kpBuf in bytes
+ *
+ * \return	pointer to the byte following the start code prefix (may be one past the
+ *		end of the buffer when the prefix ends it), or NULL if none is found
+ * \note	NULL is also returned when kpBuf or pOffset is NULL
+ *************************************************************************************
+ */
+uint8_t* DetectStartCodePrefix( const uint8_t *kpBuf, int32_t *pOffset, int32_t iBufSize )
+{
+	uint8_t *pBits = (uint8_t *)kpBuf;	// the interface returns a non-const pointer into the caller's buffer
+	if( NULL == kpBuf || NULL == pOffset )  return NULL;
+
+	do {
+		int32_t iIdx = 0;	// bytes consumed in this round
+		// skip the run of zero bytes, never reading past the remaining buffer
+		while( (iIdx<iBufSize) && (!(*pBits)) ){
+			++ pBits;
+			++ iIdx;
+		}
+		if( iIdx >= iBufSize )  break;	// only zeros (or nothing) left: no prefix
+
+		++ iIdx;	// consume the non-zero byte that ended the run
+		++ pBits;
+
+		if( (iIdx>=3) && ((*(pBits-1)) == 0x1) ){	// at least two zeros followed by 0x01
+			*pOffset = (int32_t)(pBits - kpBuf);	// pointer difference: no pointer-to-32-bit-integer truncation
+			return pBits;
+		}
+		iBufSize -= iIdx;	// not a prefix: resume the search after the consumed bytes
+	}  while (1);
+	return NULL;
+}
+
+/*! 
+ *************************************************************************************
+ * \brief	to parse nal unit
+ *
+ * \param	pCtx		    decoder context
+ * \param 	pNalUnitHeader	parsed result of NAL Unit Header to output
+ * \param   pSrcRbsp        bitstream buffer to input
+ * \param   iSrcRbspLen     length size of bitstream buffer payload
+ * \param	pSrcNal		    
+ * \param	iSrcNalLen		
+ * \param	pConsumedBytes	consumed bytes during parsing
+ *
+ * \return	decoded bytes payload, might be (pSrcRbsp+1) if no escapes 
+ *
+ * \note	N/A
+ *************************************************************************************
+ */
+uint8_t* ParseNalHeader( PWelsDecoderContext pCtx, SNalUnitHeader *pNalUnitHeader, uint8_t *pSrcRbsp, int32_t iSrcRbspLen, uint8_t *pSrcNal, int32_t iSrcNalLen, int32_t* pConsumedBytes )
+{
+	PNalUnit pCurNal = NULL;
+	uint8_t* pNal     = pSrcRbsp;
+	int32_t iNalSize  = iSrcRbspLen;
+	PBitStringAux pBs = NULL;
+	bool_t bExtensionFlag = false;
+	int32_t iErr	= ERR_NONE;	
+	int32_t iBitSize = 0;
+	
+	pNalUnitHeader->eNalUnitType = NAL_UNIT_UNSPEC_0;//SHOULD init it. because pCtx->sCurNalHead is common variable.
+
+	//remove the consecutive ZERO at the end of current NAL in the reverse order.--2011.6.1
+	{
+		int32_t iIndex = iSrcRbspLen - 1;
+		uint8_t uiBsZero = 0; 
+		while ( iIndex >= 0 )
+		{
+			uiBsZero = pSrcRbsp[iIndex];
+			if ( 0 == uiBsZero )
+			{
+				--iNalSize;
+				--iIndex;
+			}
+			else
+			{
+				break;
+			}
+		}	
+	}
+	
+	pNalUnitHeader->uiForbiddenZeroBit	= (uint8_t)(pNal[0] >> 7);			// uiForbiddenZeroBit	
+	if ( pNalUnitHeader->uiForbiddenZeroBit )//2010.4.14
+	{
+		return NULL; //uiForbiddenZeroBit should always equal to 0
+	}
+
+	pNalUnitHeader->uiNalRefIdc		= (uint8_t)(pNal[0] >> 5);			// uiNalRefIdc
+	pNalUnitHeader->eNalUnitType		= (ENalUnitType)(pNal[0] & 0x1f);	// eNalUnitType	
+	
+	++pNal;
+	--iNalSize;
+	++(*pConsumedBytes);
+	
+#ifdef DEBUG_PARSE_INFO
+	WelsLog(pCtx, WELS_LOG_INFO, "nal type: %d \n", pNalUnitHeader->eNalUnitType);
+#endif
+	
+	if ( !(IS_SEI_NAL(pNalUnitHeader->eNalUnitType) || IS_SPS_NAL(pNalUnitHeader->eNalUnitType) || pCtx->bSpsExistAheadFlag) )
+	{
+		WelsLog( pCtx, WELS_LOG_WARNING, "parse_nal(), no exist Sequence Parameter Sets ahead of sequence when try to decode NAL(type:%d).\n", pNalUnitHeader->eNalUnitType);
+		pCtx->iErrorCode	= dsNoParamSets;
+		return NULL;
+	}
+	if ( !(IS_SEI_NAL(pNalUnitHeader->eNalUnitType) || IS_PARAM_SETS_NALS(pNalUnitHeader->eNalUnitType) || pCtx->bPpsExistAheadFlag) )
+	{
+		WelsLog( pCtx, WELS_LOG_WARNING, "parse_nal(), no exist Picture Parameter Sets ahead of sequence when try to decode NAL(type:%d).\n", pNalUnitHeader->eNalUnitType);
+		pCtx->iErrorCode	= dsNoParamSets;
+		return NULL;
+	}
+	if ( (IS_VCL_NAL_AVC_BASE(pNalUnitHeader->eNalUnitType) && !(pCtx->bSpsExistAheadFlag || pCtx->bPpsExistAheadFlag)) || 
+		(IS_NEW_INTRODUCED_NAL(pNalUnitHeader->eNalUnitType) && !(pCtx->bSpsExistAheadFlag || pCtx->bSubspsExistAheadFlag || pCtx->bPpsExistAheadFlag) ) )
+	{
+		WelsLog( pCtx, WELS_LOG_WARNING, "ParseNalHeader(), no exist Parameter Sets ahead of sequence when try to decode slice(type:%d).\n", pNalUnitHeader->eNalUnitType);
+		pCtx->iErrorCode	|= dsNoParamSets;
+		return NULL;
+	}
+	
+
+	switch(pNalUnitHeader->eNalUnitType){
+	case NAL_UNIT_SEI:
+			
+		if ( pCtx->pAccessUnitList->uiAvailUnitsNum > 0 )
+		{
+			pCtx->pAccessUnitList->uiEndPos = pCtx->pAccessUnitList->uiAvailUnitsNum - 1;
+			pCtx->bAuReadyFlag = true;
+		}
+
+		break;
+	
+	case NAL_UNIT_SPS:	
+		
+		if ( pCtx->pAccessUnitList->uiAvailUnitsNum > 0 )
+		{
+			pCtx->pAccessUnitList->uiEndPos = pCtx->pAccessUnitList->uiAvailUnitsNum - 1;
+			pCtx->bAuReadyFlag = true;				
+		}
+			
+		break;
+
+	case NAL_UNIT_PREFIX:
+		pCurNal = &pCtx->sPrefixNal;
+
+		if ( iNalSize < NAL_UNIT_HEADER_EXT_SIZE )
+		{
+			return NULL;
+		}
+
+		DecodeNalHeaderExt( pCurNal, pNal );
+		
+		pNal            += NAL_UNIT_HEADER_EXT_SIZE;
+		iNalSize        -= NAL_UNIT_HEADER_EXT_SIZE;
+		*pConsumedBytes += NAL_UNIT_HEADER_EXT_SIZE;
+
+		pCurNal->sNalHeaderExt.sNalUnitHeader.uiForbiddenZeroBit = pNalUnitHeader->uiForbiddenZeroBit;
+		pCurNal->sNalHeaderExt.sNalUnitHeader.uiNalRefIdc		  = pNalUnitHeader->uiNalRefIdc;
+		pCurNal->sNalHeaderExt.sNalUnitHeader.eNalUnitType	      = pNalUnitHeader->eNalUnitType;
+
+		pBs = &pCtx->sBs;
+		
+		iBitSize = (iNalSize<<3) - BsGetTrailingBits( pNal + iNalSize - 1 ); // convert into bit
+		
+		InitBits( pBs, pNal, iBitSize);
+
+		ParsePrefixNalUnit( pCtx, pBs );
+		
+		break;
+	case NAL_UNIT_CODED_SLICE_EXT:
+		bExtensionFlag = true;
+	case NAL_UNIT_CODED_SLICE:
+	case NAL_UNIT_CODED_SLICE_IDR:
+		{
+			PAccessUnit pCurAu		= NULL;
+			uint32_t uiAvailNalNum;
+			pCurNal = MemGetNextNal( &pCtx->pAccessUnitList );
+			if( NULL == pCurNal )
+			{
+				WelsLog( pCtx, WELS_LOG_WARNING, "MemGetNextNal() fail due out of memory.\n");
+				pCtx->iErrorCode	|= dsOutOfMemory;
+				return NULL;
+			}
+
+			pCurNal->sNalHeaderExt.sNalUnitHeader.uiForbiddenZeroBit = pNalUnitHeader->uiForbiddenZeroBit;
+			pCurNal->sNalHeaderExt.sNalUnitHeader.uiNalRefIdc		  = pNalUnitHeader->uiNalRefIdc;
+			pCurNal->sNalHeaderExt.sNalUnitHeader.eNalUnitType	  = pNalUnitHeader->eNalUnitType;
+			pCurAu	      = pCtx->pAccessUnitList;
+			uiAvailNalNum = pCurAu->uiAvailUnitsNum;
+			
+
+			if( pNalUnitHeader->eNalUnitType == NAL_UNIT_CODED_SLICE_EXT )
+			{	
+				if ( iNalSize < NAL_UNIT_HEADER_EXT_SIZE )
+				{
+					return NULL;
+				}
+
+				DecodeNalHeaderExt( pCurNal, pNal );
+                if( pCurNal->sNalHeaderExt.uiQualityId != 0 ||
+                    pCurNal->sNalHeaderExt.bUseRefBasePicFlag )
+                {
+                    if( pCurNal->sNalHeaderExt.uiQualityId != 0 )
+                        WelsLog(pCtx, WELS_LOG_WARNING, "ParseNalHeader():uiQualityId (%d) != 0, MGS not supported!\n", pCurNal->sNalHeaderExt.uiQualityId);
+                    if( pCurNal->sNalHeaderExt.bUseRefBasePicFlag != 0 )
+                        WelsLog(pCtx, WELS_LOG_WARNING, "ParseNalHeader():bUseRefBasePicFlag (%d) != 0, MGS not supported!\n", pCurNal->sNalHeaderExt.bUseRefBasePicFlag);
+
+                    pCtx->iErrorCode |= dsInvalidArgument;
+				    ForceClearCurrentNal( pCurAu );
+
+				    if ( uiAvailNalNum > 1 )
+				    {
+					    pCurAu->uiEndPos = uiAvailNalNum - 2;
+					    pCtx->bAuReadyFlag = true;
+				    }
+                    return NULL;
+                }
+				pNal            += NAL_UNIT_HEADER_EXT_SIZE;
+				iNalSize        -= NAL_UNIT_HEADER_EXT_SIZE;
+				*pConsumedBytes += NAL_UNIT_HEADER_EXT_SIZE;
+				
+			}
+			else
+			{	
+
+				
+				if ( NAL_UNIT_PREFIX == pCtx->sPrefixNal.sNalHeaderExt.sNalUnitHeader.eNalUnitType )
+				{
+					PrefetchNalHeaderExtSyntax( pCtx, pCurNal, &pCtx->sPrefixNal );
+				}	
+
+				pCurNal->sNalHeaderExt.bIdrFlag = ( NAL_UNIT_CODED_SLICE_IDR == pNalUnitHeader->eNalUnitType ) ? true : false; //SHOULD update this flag for AVC if no prefix NAL
+				pCurNal->sNalHeaderExt.iNoInterLayerPredFlag = 1;
+			}		
+						
+			pBs = &pCurAu->pNalUnitsList[uiAvailNalNum-1]->sNalData.sVclNal.sSliceBitsRead;
+			iBitSize = (iNalSize<<3) - BsGetTrailingBits( pNal+ iNalSize - 1 ); // convert into bit
+			InitBits( pBs, pNal, iBitSize);
+			iErr = ParseSliceHeaderSyntaxs( pCtx, pBs, bExtensionFlag );
+			if ( iErr != ERR_NONE )
+			{
+				//if current NAL occur error when parsing, should clean it from pNalUnitsList
+				//otherwise, when Next good NAL decoding, this corrupt NAL is considered as normal NAL and lead to decoder crash		
+				ForceClearCurrentNal( pCurAu );
+
+				if ( uiAvailNalNum > 1 )
+				{
+					pCurAu->uiEndPos = uiAvailNalNum - 2;
+					pCtx->bAuReadyFlag = true;
+					
+
+				}
+#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID			
+				if (  dsNoParamSets & pCtx->iErrorCode )
+				{
+					if ( uiAvailNalNum <= 1 ) //no any data to decode and SPS/PPS ID mismatch, SHOULD request IDR
+					{
+#ifdef LONG_TERM_REF
+						pCtx->bParamSetsLostFlag = true;
+#else
+						pCtx->bReferenceLostAtT0Flag = true;
+#endif
+						ResetParameterSetsState( pCtx );
+					}
+					return NULL;
+				}
+				else
+				{
+					return NULL;
+				}
+#else
+				return NULL;
+#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+			}
+
+			if ( (uiAvailNalNum > 1) &&
+                CheckAccessUnitBoundary(	pCurAu->pNalUnitsList[uiAvailNalNum-1], pCurAu->pNalUnitsList[uiAvailNalNum-2], 
+				pCurAu->pNalUnitsList[uiAvailNalNum-1]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.pSps) )
+			{
+				pCurAu->uiEndPos = uiAvailNalNum - 2;
+				pCtx->bAuReadyFlag = true;
+				
+
+			}	
+		}
+		break;
+	default:
+		break;
+	}  
+	
+	return pNal;
+}	
+
+
+/*
+ * Compare the previous and the current slice NAL (header extensions plus slice
+ * headers) and report whether the current one starts a new access unit.
+ * Returns TRUE when a boundary is detected, FALSE otherwise. The POC related
+ * checks use the SPS referenced by the current slice header.
+ */
+bool_t CheckAccessUnitBoundaryExt( PNalUnitHeaderExt pLastNalHdrExt, PNalUnitHeaderExt pCurNalHeaderExt, PSliceHeader pLastSliceHeader, PSliceHeader pCurSliceHeader )
+{
+	const PSps kpActiveSps = pCurSliceHeader->pSps;
+
+	// Sub-clause 7.1.4.1.1 temporal_id
+	if ( pLastNalHdrExt->uiTemporalId != pCurNalHeaderExt->uiTemporalId )
+		return TRUE;
+
+	// Subclause 7.4.1.2.5: a smaller redundant_pic_cnt than before is a boundary,
+	// a larger one is not
+	if ( pLastSliceHeader->iRedundantPicCnt != pCurSliceHeader->iRedundantPicCnt )
+		return ( pLastSliceHeader->iRedundantPicCnt > pCurSliceHeader->iRedundantPicCnt ) ? TRUE : FALSE;
+
+	// Subclause G7.4.1.2.4: same ordering rule for dependency_id, then quality_id
+	if ( pLastNalHdrExt->uiDependencyId != pCurNalHeaderExt->uiDependencyId )
+		return ( pLastNalHdrExt->uiDependencyId > pCurNalHeaderExt->uiDependencyId ) ? TRUE : FALSE;
+	if ( pLastNalHdrExt->uiQualityId != pCurNalHeaderExt->uiQualityId )
+		return ( pLastNalHdrExt->uiQualityId > pCurNalHeaderExt->uiQualityId ) ? TRUE : FALSE;
+
+	// Subclause 7.4.1.2.4: any mismatch in the fields below signals a boundary
+	if ( pLastSliceHeader->iFrameNum != pCurSliceHeader->iFrameNum
+		|| pLastSliceHeader->iPpsId != pCurSliceHeader->iPpsId
+		|| pLastSliceHeader->bFieldPicFlag != pCurSliceHeader->bFieldPicFlag
+		|| pLastSliceHeader->bBottomFiledFlag != pCurSliceHeader->bBottomFiledFlag
+		|| (pLastNalHdrExt->sNalUnitHeader.uiNalRefIdc != NRI_PRI_LOWEST) != (pCurNalHeaderExt->sNalUnitHeader.uiNalRefIdc != NRI_PRI_LOWEST)
+		|| pLastNalHdrExt->bIdrFlag != pCurNalHeaderExt->bIdrFlag )
+	{
+		return TRUE;
+	}
+
+	// idr_pic_id is only compared when the current NAL is an IDR one
+	if ( pCurNalHeaderExt->bIdrFlag && pLastSliceHeader->uiIdrPicId != pCurSliceHeader->uiIdrPicId )
+		return TRUE;
+
+	// picture order count fields, depending on pic_order_cnt_type
+	switch ( kpActiveSps->uiPocType )
+	{
+	case 0:
+		if ( pLastSliceHeader->iPicOrderCntLsb != pCurSliceHeader->iPicOrderCntLsb
+			|| pLastSliceHeader->iDeltaPicOrderCntBottom != pCurSliceHeader->iDeltaPicOrderCntBottom )
+			return TRUE;
+		break;
+	case 1:
+		if ( pLastSliceHeader->iDeltaPicOrderCnt[0] != pCurSliceHeader->iDeltaPicOrderCnt[0]
+			|| pLastSliceHeader->iDeltaPicOrderCnt[1] != pCurSliceHeader->iDeltaPicOrderCnt[1] )
+			return TRUE;
+		break;
+	default:
+		break;
+	}
+
+	return FALSE;
+}	 
+
+
+/*
+ * Report whether kpCurNal starts a new access unit with respect to kpLastNal.
+ * kpSps supplies the pic_order_cnt_type used for the POC comparisons.
+ * Returns TRUE on a detected boundary, FALSE otherwise.
+ */
+bool_t CheckAccessUnitBoundary( const PNalUnit kpCurNal, const PNalUnit kpLastNal, const PSps kpSps )
+{
+	const PNalUnitHeaderExt kpPrevExt = &kpLastNal->sNalHeaderExt;
+	const PNalUnitHeaderExt kpNextExt = &kpCurNal->sNalHeaderExt;
+	const SSliceHeader *kpPrevSh = &kpLastNal->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
+	const SSliceHeader *kpNextSh = &kpCurNal->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
+
+	// Sub-clause 7.1.4.1.1 temporal_id
+	if ( kpPrevExt->uiTemporalId != kpNextExt->uiTemporalId )
+		return TRUE;
+
+	// Subclause 7.4.1.2.5: redundant_pic_cnt decreased -> boundary, increased -> same AU
+	if ( kpPrevSh->iRedundantPicCnt != kpNextSh->iRedundantPicCnt )
+		return ( kpPrevSh->iRedundantPicCnt > kpNextSh->iRedundantPicCnt ) ? TRUE : FALSE;
+
+	// Subclause G7.4.1.2.4: dependency_id first, quality_id second, same ordering rule
+	if ( kpPrevExt->uiDependencyId != kpNextExt->uiDependencyId )
+		return ( kpPrevExt->uiDependencyId > kpNextExt->uiDependencyId ) ? TRUE : FALSE;
+	if ( kpPrevExt->uiQualityId != kpNextExt->uiQualityId )
+		return ( kpPrevExt->uiQualityId > kpNextExt->uiQualityId ) ? TRUE : FALSE;
+
+	// Subclause 7.4.1.2.4: a difference in any of these fields is a boundary
+	if ( kpPrevSh->iFrameNum != kpNextSh->iFrameNum
+		|| kpPrevSh->iPpsId != kpNextSh->iPpsId
+		|| kpPrevSh->bFieldPicFlag != kpNextSh->bFieldPicFlag
+		|| kpPrevSh->bBottomFiledFlag != kpNextSh->bBottomFiledFlag
+		|| (kpPrevExt->sNalUnitHeader.uiNalRefIdc != NRI_PRI_LOWEST) != (kpNextExt->sNalUnitHeader.uiNalRefIdc != NRI_PRI_LOWEST)
+		|| kpPrevExt->bIdrFlag != kpNextExt->bIdrFlag )
+	{
+		return TRUE;
+	}
+
+	// idr_pic_id only matters when the current NAL is flagged as IDR
+	if ( kpNextExt->bIdrFlag && kpPrevSh->uiIdrPicId != kpNextSh->uiIdrPicId )
+		return TRUE;
+
+	// POC fields that take part depend on pic_order_cnt_type
+	switch ( kpSps->uiPocType )
+	{
+	case 0:
+		if ( kpPrevSh->iPicOrderCntLsb != kpNextSh->iPicOrderCntLsb
+			|| kpPrevSh->iDeltaPicOrderCntBottom != kpNextSh->iDeltaPicOrderCntBottom )
+			return TRUE;
+		break;
+	case 1:
+		if ( kpPrevSh->iDeltaPicOrderCnt[0] != kpNextSh->iDeltaPicOrderCnt[0]
+			|| kpPrevSh->iDeltaPicOrderCnt[1] != kpNextSh->iDeltaPicOrderCnt[1] )
+			return TRUE;
+		break;
+	default:
+		break;
+	}
+
+	return FALSE;
+}	 
+
+/*! 
+ *************************************************************************************
+ * \brief	to parse NON VCL NAL Units
+ *
+ * \param 	pCtx		decoder context; the NAL type is taken from pCtx->sCurNalHead
+ * \param	pRbsp		rbsp buffer of NAL Unit
+ * \param	kiSrcLen	length of rbsp buffer in bytes
+ *
+ * \return	ERR_NONE - successed, also returned for NAL types that are ignored here
+ *	    	otherwise the error code reported by ParseSps() / ParsePps()
+ *
+ * \note	On SPS/PPS parsing failure dsNoParamSets is OR-ed into pCtx->iErrorCode.
+ *************************************************************************************
+ */
+int32_t ParseNonVclNal( PWelsDecoderContext pCtx, uint8_t *pRbsp, const int32_t kiSrcLen )
+{
+	PBitStringAux pBs			= &pCtx->sBs;	// SBitStringAux instance for non VCL NALs decoding
+	const ENalUnitType keNalType	= pCtx->sCurNalHead.eNalUnitType;
+	// size in bits: bytes * 8 minus what BsGetTrailingBits() reports for the last byte
+	const int32_t kiBitSize		= (kiSrcLen<<3) - BsGetTrailingBits( pRbsp + kiSrcLen - 1 );
+	int32_t iPicWidth			= 0;
+	int32_t iPicHeight			= 0;
+	int32_t iErr				= ERR_NONE;
+
+	switch( keNalType ) {
+		case NAL_UNIT_SPS:
+		case NAL_UNIT_SUBSET_SPS:
+			// pBs is only re-initialized for a non-empty payload; otherwise it keeps its previous state
+			if ( kiBitSize > 0 )
+				InitBits( pBs, pRbsp, kiBitSize );
+#ifdef DEBUG_PARSE_INFO
+			WelsLog(pCtx, WELS_LOG_INFO, "parsing nal: %d \n", keNalType);
+#endif
+			iErr = ParseSps( pCtx, pBs, &iPicWidth, &iPicHeight );
+			if ( ERR_NONE != iErr )	// modified for pSps/pSubsetSps invalid, 12/1/2009 
+			{
+				pCtx->iErrorCode |= dsNoParamSets;
+				return iErr;
+			}
+
+			// reaching this point means the SPS was parsed successfully
+			UpdateMaxPictureResolution( pCtx, iPicWidth, iPicHeight );
+			break;
+
+		case NAL_UNIT_PPS:
+			if ( kiBitSize > 0 )
+				InitBits( pBs, pRbsp, kiBitSize );
+#ifdef DEBUG_PARSE_INFO
+			WelsLog(pCtx, WELS_LOG_INFO, "parsing nal: %d \n", keNalType);
+#endif
+			iErr = ParsePps( pCtx, &pCtx->sPpsBuffer[0], pBs );
+			if ( ERR_NONE != iErr )	// modified for pps invalid, 12/1/2009 
+			{
+				pCtx->iErrorCode |= dsNoParamSets;
+				return iErr;
+			}
+
+			pCtx->bPpsExistAheadFlag	= true;
+			break;
+
+		// NAL types below are accepted but carry nothing that is parsed here
+		case NAL_UNIT_SEI:
+		case NAL_UNIT_PREFIX:
+		case NAL_UNIT_CODED_SLICE_DPA:
+		case NAL_UNIT_CODED_SLICE_DPB:
+		case NAL_UNIT_CODED_SLICE_DPC:
+		default:
+			break;
+	}
+
+	return iErr;
+}
+
+/*
+ * Read the reference base picture marking syntax from pBs into
+ * pRefBasePicMarking: the adaptive mode flag and, when it is set, a list of
+ * memory management operations that ends at MMCO_END or after
+ * MAX_MMCO_COUNT entries.
+ */
+void_t ParseRefBasePicMarking ( PBitStringAux pBs, PRefBasePicMarking pRefBasePicMarking )
+{
+	const bool_t kbAdaptiveMode = !!BsGetOneBit( pBs );
+	int32_t iSlot = 0;
+
+	pRefBasePicMarking->bAdaptiveRefBasePicMarkingModeFlag = kbAdaptiveMode;
+	if ( !kbAdaptiveMode )
+		return;
+
+	for ( ; ; )
+	{
+		const uint32_t kuiOperation = BsGetUe( pBs );
+
+		// the operation type is stored even for the terminating MMCO_END entry
+		pRefBasePicMarking->mmco_base[iSlot].uiMmcoType	= kuiOperation;
+		if ( MMCO_END == kuiOperation )
+			return;
+
+		if ( MMCO_SHORT2UNUSED == kuiOperation )
+		{
+			pRefBasePicMarking->mmco_base[iSlot].uiDiffOfPicNums	= 1 + BsGetUe( pBs );
+			pRefBasePicMarking->mmco_base[iSlot].iShortFrameNum	= 0;
+		}
+		else if ( MMCO_LONG2UNUSED == kuiOperation )
+		{
+			pRefBasePicMarking->mmco_base[iSlot].uiLongTermPicNum	= BsGetUe( pBs );
+		}
+
+		// stop once every slot of the operation list has been used
+		if ( ++iSlot >= MAX_MMCO_COUNT )
+			return;
+	}
+}
+
+/*
+ * Parse the payload of the prefix NAL unit kept in pCtx->sPrefixNal.
+ * Nothing is read from pBs when the prefix NAL has nal_ref_idc equal to 0.
+ */
+void_t ParsePrefixNalUnit ( PWelsDecoderContext pCtx, PBitStringAux pBs )
+{
+	PNalUnit pPrefix			= &pCtx->sPrefixNal;
+	PNalUnitHeaderExt pHeaderExt	= &pPrefix->sNalHeaderExt;
+	PPrefixNalUnit pPayload		= &pPrefix->sNalData.sPrefixNal;
+
+	if ( 0 == pHeaderExt->sNalUnitHeader.uiNalRefIdc )
+		return;
+
+	pPayload->bStoreRefBasePicFlag	= !!BsGetOneBit( pBs );
+
+	// base picture marking is only present for non-IDR NALs that use or store a reference base picture
+	if ( !pHeaderExt->bIdrFlag && ( pHeaderExt->bUseRefBasePicFlag || pPayload->bStoreRefBasePicFlag ) )
+	{
+		ParseRefBasePicMarking ( pBs, &pPayload->sRefPicBaseMarking );
+	}
+
+	pPayload->bPrefixNalUnitAdditionalExtFlag	= !!BsGetOneBit( pBs );
+	if ( pPayload->bPrefixNalUnitAdditionalExtFlag )
+	{
+		pPayload->bPrefixNalUnitExtFlag	= !!BsGetOneBit( pBs );
+	}
+}
+
+
+/*!
+ *************************************************************************************
+ * \brief	to parse the SVC extension of a subset SPS
+ *
+ * \param	pCtx		decoder context, used for logging only
+ * \param	pSpsExt		subset SPS whose sSpsSvcExt member is filled in
+ * \param	pBs		bitstream reader auxiliary
+ *
+ * \return	ERR_NONE - successed
+ *		GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_INVALID_ESS) if
+ *		extended_spatial_scalability is larger than 2
+ *************************************************************************************
+ */
+int32_t DecodeSpsSvcExt( PWelsDecoderContext pCtx, PSubsetSps pSpsExt, PBitStringAux pBs )
+{	
+	PSpsSvcExt pExt	= &pSpsExt->sSpsSvcExt;
+	
+	pExt->bInterLayerDeblockingFilterCtrlPresentFlag	= !!BsGetOneBit( pBs );
+	pExt->uiExtendedSpatialScalability						= BsGetBits( pBs, 2 );
+	if ( pExt->uiExtendedSpatialScalability > 2 )
+	{
+		// the 2-bit field can hold 3, which is the only value rejected here
+		WelsLog(pCtx, WELS_LOG_WARNING, "DecodeSpsSvcExt():extended_spatial_scalability (%d) > 2, value not supported!\n", pExt->uiExtendedSpatialScalability);
+		return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_INVALID_ESS);
+	}
+	
+	// The chroma phase fields are always read from the bitstream here, so no
+	// defaults are assigned beforehand.
+	// FIXME (kept from the original code): the former default for the X flag was 0,
+	// noted as incoherent with the JVT X201 standard (= 1) but matching JSVM (= 0).
+	pExt->uiChromaPhaseXPlus1Flag	= BsGetOneBit( pBs );
+	pExt->uiChromaPhaseYPlus1		= BsGetBits( pBs, 2 );
+	
+	// reference layer values default to the current layer ones and to zero offsets
+	pExt->uiSeqRefLayerChromaPhaseXPlus1Flag	= pExt->uiChromaPhaseXPlus1Flag;
+	pExt->uiSeqRefLayerChromaPhaseYPlus1		= pExt->uiChromaPhaseYPlus1;
+	memset(&pExt->sSeqScaledRefLayer, 0, sizeof(SPosOffset));
+
+	if ( pExt->uiExtendedSpatialScalability == 1 ){
+		SPosOffset* const kpPos = &pExt->sSeqScaledRefLayer;
+		pExt->uiSeqRefLayerChromaPhaseXPlus1Flag	= BsGetOneBit( pBs );
+		pExt->uiSeqRefLayerChromaPhaseYPlus1		= BsGetBits( pBs, 2 );
+
+		kpPos->iLeftOffset	= BsGetSe( pBs );
+		kpPos->iTopOffset	= BsGetSe( pBs );
+		kpPos->iRightOffset	= BsGetSe( pBs );
+		kpPos->iBottomOffset= BsGetSe( pBs );
+	}
+	
+	pExt->bSeqTCoeffLevelPredFlag	= !!BsGetOneBit( pBs );
+	pExt->bAdaptiveTCoeffLevelPredFlag	= false;
+	if ( pExt->bSeqTCoeffLevelPredFlag )
+		pExt->bAdaptiveTCoeffLevelPredFlag	= !!BsGetOneBit( pBs );
+	pExt->bSliceHeaderRestrictionFlag	= !!BsGetOneBit( pBs );
+
+	return ERR_NONE;
+}
+
+/*! 
+ *************************************************************************************
+ * \brief	to parse Sequence Parameter Set (SPS) or subset SPS
+ *
+ * \param	pCtx		Decoder context
+ * \param	pBsAux		bitstream reader auxiliary 
+ * \param	pPicWidth	picture width current Sps represented (iMbWidth * 16), written on success only
+ * \param	pPicHeight	picture height current Sps represented (iMbHeight * 16), written on success only
+ *
+ * \return	ERR_NONE - successed
+ *		otherwise an error code: out-of-range or unsupported syntax element,
+ *		or the error reported by DecodeSpsSvcExt()
+ *
+ * \note	Call it in case eNalUnitType is SPS or subset SPS (taken from pCtx->sCurNalHead).
+ *		The "exist ahead" flags and, for a plain SPS, the availability flags are
+ *		updated before any syntax element has been validated.
+ *************************************************************************************
+ */
+
+
+int32_t ParseSps( PWelsDecoderContext pCtx, PBitStringAux pBsAux, int32_t *pPicWidth, int32_t *pPicHeight  )
+{
+	PBitStringAux pBs		= pBsAux;
+	PSps pSps				= NULL;
+	PSubsetSps pSubsetSps	= NULL;
+	SNalUnitHeader *pNalHead= &pCtx->sCurNalHead;
+	ProfileIdc	uiProfileIdc;
+	uint8_t	uiLevelIdc;
+	int32_t iSpsId;
+	bool_t bConstraintSetFlags[6] = { false };
+	const bool_t kbUseSubsetFlag   = IS_SUBSET_SPS_NAL(pNalHead->eNalUnitType);
+
+	
+	if ( kbUseSubsetFlag )	// SubsetSps
+	{
+		pCtx->bSubspsExistAheadFlag	= true;
+	}
+	else	// Sps
+	{
+		pCtx->bSpsExistAheadFlag		= true;
+
+		// added for EC, 10/28/2009		
+		// for safe
+		memset( &pCtx->bSpsAvailFlags[0], 0, sizeof(pCtx->bSpsAvailFlags) );
+		memset( &pCtx->bSubspsAvailFlags[0], 0, sizeof(pCtx->bSubspsAvailFlags) );
+		memset( &pCtx->bPpsAvailFlags[0], 0, sizeof(pCtx->bPpsAvailFlags) );
+
+#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+		pCtx->iSpsTotalNum    = 0;
+		pCtx->iSubspsTotalNum = 0;
+		pCtx->iPpsTotalNum    = 0;
+#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID		
+	}
+
+	uiProfileIdc	= BsGetBits( pBs, 8 );
+	// the constraint flags are consumed from the bitstream but not used further in this function
+	bConstraintSetFlags[0]	= !!BsGetOneBit( pBs );	// constraint_set0_flag
+	bConstraintSetFlags[1]	= !!BsGetOneBit( pBs );	// constraint_set1_flag
+	bConstraintSetFlags[2]	= !!BsGetOneBit( pBs );	// constraint_set2_flag
+	bConstraintSetFlags[3]	= !!BsGetOneBit( pBs );	// constraint_set3_flag
+	bConstraintSetFlags[4]	= !!BsGetOneBit( pBs );	// constraint_set4_flag
+	bConstraintSetFlags[5]	= !!BsGetOneBit( pBs );	// constraint_set5_flag
+	BsGetBits( pBs, 2 );							// reserved_zero_2bits, equal to 0
+	uiLevelIdc	= BsGetBits( pBs, 8  );				// level_idc
+
+	iSpsId		= BsGetUe( pBs  );					// seq_parameter_set_id
+	
+		
+	if ( iSpsId >= MAX_SPS_COUNT || iSpsId < 0 )	// Modified to check invalid negative iSpsId, 12/1/2009
+	{
+		WelsLog( pCtx, WELS_LOG_WARNING, " iSpsId is out of range! \n");
+		return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_SPS_ID_OVERFLOW);
+	}
+
+	if ( kbUseSubsetFlag )
+	{
+#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+		// Storage is indexed by arrival order in this mode, so the slot index needs
+		// its own bound check; the id check above does not cover it.
+		if ( pCtx->iSubspsTotalNum >= MAX_SPS_COUNT )
+		{
+			WelsLog( pCtx, WELS_LOG_WARNING, "ParseSps(): no free subset SPS slot left.\n");
+			return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_SPS_ID_OVERFLOW);
+		}
+		pSubsetSps = &pCtx->sSubsetSpsBuffer[pCtx->iSubspsTotalNum];
+		pCtx->bSubspsAvailFlags[pCtx->iSubspsTotalNum] = true;
+		
+		pSubsetSps->sSps.iSpsId = iSpsId;
+		pSps = &pSubsetSps->sSps;
+		++pCtx->iSubspsTotalNum;
+#else
+		pSubsetSps	= &pCtx->sSubsetSpsBuffer[iSpsId];
+		pSps		= &pSubsetSps->sSps;		
+		pCtx->bSubspsAvailFlags[iSpsId]	= true; // added for EC, 10/28/2009
+#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID			
+	}
+	else
+	{
+#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+		if ( pCtx->iSpsTotalNum >= MAX_SPS_COUNT )
+		{
+			WelsLog( pCtx, WELS_LOG_WARNING, "ParseSps(): no free SPS slot left.\n");
+			return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_SPS_ID_OVERFLOW);
+		}
+		pSps = &pCtx->sSpsBuffer[pCtx->iSpsTotalNum];
+		pCtx->bSpsAvailFlags[pCtx->iSpsTotalNum] = true;
+		
+		pSps->iSpsId = iSpsId;
+		++pCtx->iSpsTotalNum;
+#else
+		pSps = &pCtx->sSpsBuffer[iSpsId];		
+		pCtx->bSpsAvailFlags[iSpsId] = true; // added for EC, 10/28/2009
+#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID		
+	}
+
+	// syntax elements in default
+	pSps->uiChromaFormatIdc	= 1;
+	pSps->uiBitDepthLuma		=
+	pSps->uiBitDepthChroma	= 8; 
+	
+	pSps->uiProfileIdc	= uiProfileIdc;
+	pSps->uiLevelIdc	= uiLevelIdc;
+	pSps->iSpsId		= iSpsId;
+
+	if ( PRO_SCALABLE_BASELINE == uiProfileIdc || PRO_SCALABLE_HIGH == uiProfileIdc ||
+		PRO_HIGH == uiProfileIdc || PRO_HIGH10 == uiProfileIdc ||
+		PRO_HIGH422 == uiProfileIdc || PRO_HIGH444 == uiProfileIdc ||
+		PRO_CAVLC444 == uiProfileIdc || 44 == uiProfileIdc ){
+		
+		pSps->uiChromaFormatIdc = BsGetUe( pBs );	
+        if( pSps->uiChromaFormatIdc != 1 )
+        {
+            WelsLog( pCtx, WELS_LOG_WARNING, "ParseSps(): chroma_format_idc (%d) = 1 supported.\n", pSps->uiChromaFormatIdc);
+            return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_NON_BASELINE);       
+        }
+        pSps->uiChromaArrayType = pSps->uiChromaFormatIdc;
+		pSps->uiBitDepthLuma		= 8 + BsGetUe( pBs );
+        if( pSps->uiBitDepthLuma != 8 )
+        {
+            WelsLog( pCtx, WELS_LOG_WARNING, "ParseSps(): bit_depth_luma (%d) Only 8 bit supported.\n", pSps->uiBitDepthLuma);
+            return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_NON_BASELINE);       
+        }
+		
+		pSps->uiBitDepthChroma	= 8 + BsGetUe( pBs );
+        if( pSps->uiBitDepthChroma != 8 )
+        {
+            WelsLog( pCtx, WELS_LOG_WARNING, "ParseSps(): bit_depth_chroma (%d). Only 8 bit supported.\n", pSps->uiBitDepthChroma);
+            return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_NON_BASELINE);       
+        }
+		pSps->bQpPrimeYZeroTransfBypassFlag	= !!BsGetOneBit( pBs );
+		pSps->bSeqScalingMatrixPresentFlag	= !!BsGetOneBit( pBs );
+		
+		if ( pSps->bSeqScalingMatrixPresentFlag ){	// For high profile, it is not used in current application. FIXME
+            WelsLog( pCtx, WELS_LOG_WARNING, "ParseSps(): seq_scaling_matrix_present_flag (%d). Feature not supported.\n", pSps->bSeqScalingMatrixPresentFlag);
+            return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_NON_BASELINE);       
+		}
+	}
+
+	pSps->uiLog2MaxFrameNum	= 4 + BsGetUe( pBs );	// log2_max_frame_num_minus4
+	pSps->uiPocType			= BsGetUe( pBs );		// pic_order_cnt_type
+	
+	if ( 0 == pSps->uiPocType )
+	{
+		pSps->iLog2MaxPocLsb	= 4 + BsGetUe( pBs );	// log2_max_pic_order_cnt_lsb_minus4
+		
+	}
+	else if ( 1 == pSps->uiPocType )
+	{
+		int32_t i;
+		pSps->bDeltaPicOrderAlwaysZeroFlag	= !!BsGetOneBit( pBs );	// bDeltaPicOrderAlwaysZeroFlag
+		pSps->iOffsetForNonRefPic			= BsGetSe( pBs );		// iOffsetForNonRefPic
+		pSps->iOffsetForTopToBottomField	= BsGetSe( pBs );		// iOffsetForTopToBottomField
+		pSps->iNumRefFramesInPocCycle		= BsGetUe( pBs );	// num_ref_frames_in_pic_order_cnt_cycle
+		// The count comes straight from the bitstream and drives the writes into
+		// iOffsetForRefFrame[] below, so it must fit the array.
+		// NOTE(review): the capacity is derived with sizeof, which assumes
+		// iOffsetForRefFrame is an array member of the SPS structure - confirm.
+		if ( pSps->iNumRefFramesInPocCycle < 0 ||
+			pSps->iNumRefFramesInPocCycle > (int32_t)( sizeof(pSps->iOffsetForRefFrame) / sizeof(pSps->iOffsetForRefFrame[0]) ) )
+		{
+			WelsLog( pCtx, WELS_LOG_WARNING, "ParseSps(): num_ref_frames_in_pic_order_cnt_cycle (%d) out of range.\n", pSps->iNumRefFramesInPocCycle );
+			return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_INVALID_POC_TYPE);
+		}
+		for( i = 0; i < pSps->iNumRefFramesInPocCycle; i++ )
+			pSps->iOffsetForRefFrame[ i ]	= BsGetSe( pBs );		// iOffsetForRefFrame[ i ]
+	}
+	if ( pSps->uiPocType > 2 )
+	{
+		WelsLog( pCtx, WELS_LOG_WARNING, " illegal pic_order_cnt_type: %d ! \n", pSps->uiPocType );
+		return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_INVALID_POC_TYPE);
+	}
+
+	pSps->iNumRefFrames	= BsGetUe( pBs );		// max_num_ref_frames
+	pSps->bGapsInFrameNumValueAllowedFlag	= !!BsGetOneBit( pBs );	// bGapsInFrameNumValueAllowedFlag
+	pSps->iMbWidth		= 1 + BsGetUe( pBs );		// pic_width_in_mbs_minus1
+	pSps->iMbHeight		= 1 + BsGetUe( pBs );		// pic_height_in_map_units_minus1
+	pSps->uiTotalMbCount	= pSps->iMbWidth * pSps->iMbHeight;
+	pSps->bFrameMbsOnlyFlag	= !!BsGetOneBit( pBs );	// frame_mbs_only_flag
+	
+	if ( !pSps->bFrameMbsOnlyFlag )
+    {
+        WelsLog( pCtx, WELS_LOG_WARNING, "ParseSps(): frame_mbs_only_flag (%d) not supported.\n", pSps->bFrameMbsOnlyFlag);
+		return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_MBAFF);
+    }
+	pSps->bDirect8x8InferenceFlag	= !!BsGetOneBit( pBs );	// direct_8x8_inference_flag
+	pSps->bFrameCroppingFlag		= !!BsGetOneBit( pBs );	// frame_cropping_flag
+	if ( pSps->bFrameCroppingFlag )
+	{
+		pSps->sFrameCrop.iLeftOffset	= BsGetUe( pBs );	// frame_crop_left_offset
+		pSps->sFrameCrop.iRightOffset	= BsGetUe( pBs );	// frame_crop_right_offset
+		pSps->sFrameCrop.iTopOffset		= BsGetUe( pBs );	// frame_crop_top_offset
+        pSps->sFrameCrop.iBottomOffset	= BsGetUe( pBs );	// frame_crop_bottom_offset
+	}
+	else
+	{
+		pSps->sFrameCrop.iLeftOffset	= 0;				// frame_crop_left_offset
+		pSps->sFrameCrop.iRightOffset	= 0;				// frame_crop_right_offset
+		pSps->sFrameCrop.iTopOffset		= 0;				// frame_crop_top_offset
+		pSps->sFrameCrop.iBottomOffset	= 0;				// frame_crop_bottom_offset
+	}
+	pSps->bVuiParamPresentFlag			= !!BsGetOneBit( pBs );	// vui_parameters_present_flag
+	
+	// Check if SPS SVC extension applicated
+	if ( kbUseSubsetFlag && ( PRO_SCALABLE_BASELINE == uiProfileIdc || PRO_SCALABLE_HIGH == uiProfileIdc ) )
+	{
+		// hand the extension's own error code to the caller instead of a bare -1
+		const int32_t kiSvcExtErr = DecodeSpsSvcExt( pCtx, pSubsetSps, pBs );
+		if ( kiSvcExtErr != ERR_NONE ){
+			return kiSvcExtErr;
+		}
+		
+		// the flag is stored, but SVC VUI parameters themselves are not parsed here
+		pSubsetSps->bSvcVuiParamPresentFlag = !!BsGetOneBit( pBs );
+	}
+
+
+	if ( PRO_SCALABLE_BASELINE == uiProfileIdc || PRO_SCALABLE_HIGH == uiProfileIdc )
+		pCtx->bAvcBasedFlag	= false;
+	else
+		pCtx->bAvcBasedFlag	= true;	// added for avc base pBs
+
+	// picture size derived from the macroblock counts (16 per macroblock); cropping is not applied
+	*pPicWidth	= pSps->iMbWidth << 4;
+	*pPicHeight	= pSps->iMbHeight << 4;
+	
+	return ERR_NONE;
+}
+
+/*! 
+ *************************************************************************************
+ * \brief	to parse Picture Parameter Set (PPS)
+ *
+ * \param	pCtx		Decoder context
+ * \param 	pPpsList	pps list; the entry is selected by pic_parameter_set_id, or by
+ *				pCtx->iPpsTotalNum when MOSAIC_AVOID_BASED_ON_SPS_PPS_ID is defined
+ * \param	pBsAux		bitstream reader auxiliary 
+ *
+ * \return	ERR_NONE - successed
+ *		otherwise an error code (id / count out of range or unsupported feature)
+ *
+ * \note	Call it in case eNalUnitType is PPS.
+ *		The selected entry is written while parsing, so it may be left partially
+ *		updated when an error is returned; its availability flag is only set on success.
+ *************************************************************************************
+ */
+int32_t ParsePps( PWelsDecoderContext pCtx, PPps pPpsList, PBitStringAux pBsAux )
+{
+
+	PPps pPps = NULL;	
+	uint32_t uiPpsId = 0;
+	uint32_t uiSpsId = 0;
+	uint32_t uiNumSliceGroups = 0;
+	uint32_t uiNumRefIdxL0 = 0;
+	uint32_t uiNumRefIdxL1 = 0;
+    uint32_t iTmp;
+
+	uiPpsId = BsGetUe(pBsAux);	
+	if ( uiPpsId >= MAX_PPS_COUNT )
+	{
+		return ERR_INFO_PPS_ID_OVERFLOW;
+	}
+
+#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+	// entries are stored in arrival order in this mode, so the slot index needs
+	// its own bound check; the id check above does not cover it
+	if ( pCtx->iPpsTotalNum >= MAX_PPS_COUNT )
+	{
+		return ERR_INFO_PPS_ID_OVERFLOW;
+	}
+	pPps = &pPpsList[pCtx->iPpsTotalNum];
+#else
+	pPps = &pPpsList[uiPpsId];
+#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID	
+	
+	
+	pPps->iPpsId = uiPpsId;
+
+	// Range checks are done on 32-bit unsigned locals before the values are stored,
+	// so a narrower or signed structure field cannot hide an out-of-range value.
+	uiSpsId = BsGetUe(pBsAux);
+	if (uiSpsId >= MAX_SPS_COUNT)
+	{
+		return ERR_INFO_SPS_ID_OVERFLOW;
+	}
+	pPps->iSpsId = uiSpsId;
+
+	pPps->bEntropyCodingModeFlag = !!BsGetOneBit(pBsAux);
+	pPps->bPicOrderPresentFlag   = !!BsGetOneBit(pBsAux);
+
+	uiNumSliceGroups = 1 + BsGetUe(pBsAux);
+
+	// 0 can only result from the "+ 1" wrapping around
+	if (0 == uiNumSliceGroups || uiNumSliceGroups > MAX_SLICEGROUP_IDS)
+	{
+		return ERR_INFO_INVALID_SLICEGROUP;
+	}
+	pPps->uiNumSliceGroups = uiNumSliceGroups;
+
+	if (uiNumSliceGroups > 1)
+	{
+		pPps->uiSliceGroupMapType = BsGetUe(pBsAux);
+        if( pPps->uiSliceGroupMapType > 1)
+        {
+            WelsLog( pCtx, WELS_LOG_WARNING, "ParsePps(): slice_group_map_type (%d): support only 0,1.\n", pPps->uiSliceGroupMapType);
+		    return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_FMOTYPE);
+        }
+		
+		// only map type 0 carries additional syntax (one run length per slice group)
+		switch(pPps->uiSliceGroupMapType)
+		{
+		case 0:
+			for (iTmp = 0; iTmp < uiNumSliceGroups; iTmp++)
+			{
+				pPps->uiRunLength[iTmp] = 1 + BsGetUe(pBsAux);
+			}
+			break;
+		default:
+			break;
+		}
+	}
+
+	uiNumRefIdxL0 = 1 + BsGetUe(pBsAux);
+	uiNumRefIdxL1 = 1 + BsGetUe(pBsAux);
+
+	if (0 == uiNumRefIdxL0 || uiNumRefIdxL0 > MAX_REF_PIC_COUNT ||
+		0 == uiNumRefIdxL1 || uiNumRefIdxL1 > MAX_REF_PIC_COUNT) 
+	{
+		return ERR_INFO_REF_COUNT_OVERFLOW;
+	}
+	pPps->uiNumRefIdxL0Active = uiNumRefIdxL0;
+	pPps->uiNumRefIdxL1Active = uiNumRefIdxL1;
+	
+	pPps->bWeightedPredFlag  = !!BsGetOneBit(pBsAux);
+	pPps->uiWeightedBipredIdc = BsGetBits(pBsAux, 2);
+    if( pPps->bWeightedPredFlag || pPps->uiWeightedBipredIdc != 0 )
+    {
+        WelsLog( pCtx, WELS_LOG_WARNING, "ParsePps(): weighted_pred_flag (%d) weighted_bipred_idc (%d) neither supported.\n", pPps->bWeightedPredFlag, pPps->uiWeightedBipredIdc);
+        return GENERATE_ERROR_NO(ERR_LEVEL_PARAM_SETS, ERR_INFO_UNSUPPORTED_WP);
+    }
+
+	pPps->iPicInitQp = 26 + BsGetSe(pBsAux);
+	pPps->iPicInitQs = 26 + BsGetSe(pBsAux);
+
+	pPps->iChromaQpIndexOffset                  = BsGetSe(pBsAux);
+	pPps->bDeblockingFilterControlPresentFlag   = !!BsGetOneBit(pBsAux);
+	pPps->bConstainedIntraPredFlag              = !!BsGetOneBit(pBsAux);
+	pPps->bRedundantPicCntPresentFlag           = !!BsGetOneBit(pBsAux);	
+
+
+	// the entry is marked available only after the whole PPS has been accepted
+#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+	pCtx->bPpsAvailFlags[pCtx->iPpsTotalNum] = true;
+	++pCtx->iPpsTotalNum;
+#else	
+	pCtx->bPpsAvailFlags[uiPpsId] = true; // added for EC, 10/28/2009
+#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+
+	return ERR_NONE;
+}
+
+/*! 
+ *************************************************************************************
+ * \brief	placeholder for SEI message payload parsing
+ *
+ * \param 	pSei		sei message to be parsed output (currently untouched)
+ * \param	pBsAux		bitstream reader auxiliary (currently not read)
+ *
+ * \return	ERR_NONE, always; no SEI payload is parsed at present
+ *
+ * \note	Call it in case eNalUnitType is NAL_UNIT_SEI.
+ *************************************************************************************
+ */
+int32_t ParseSei( void_t *pSei, PBitStringAux pBsAux )	// reserved Sei_Msg type
+{
+	// both arguments are reserved for a future implementation
+	(void)pSei;
+	(void)pBsAux;
+
+	return ERR_NONE;
+}
+
+/*!
+ *************************************************************************************
+ * \brief	release every fmo context unit and clear the active fmo counter
+ *
+ * \param	pCtx	decoder context, may be NULL
+ *
+ * \return	value of pCtx->iActiveFmoNum before it was cleared; 0 if pCtx is NULL
+ *************************************************************************************
+ */
+int32_t ResetFmoList( PWelsDecoderContext pCtx )
+{
+	int32_t iPreviousActiveNum;
+
+	if ( NULL == pCtx )
+		return 0;
+
+	// Fixed memory leak due to PPS_ID might not be continuous sometimes, 1/5/2010:
+	// the whole list (MAX_PPS_COUNT entries) is handed over, not only the active part
+	UninitFmoList( &pCtx->sFmoList[0], MAX_PPS_COUNT, pCtx->iActiveFmoNum );
+
+	iPreviousActiveNum	= pCtx->iActiveFmoNum;
+	pCtx->iActiveFmoNum	= 0;
+
+	return iPreviousActiveNum;
+}
+
+} // namespace WelsDec
\ No newline at end of file
--- /dev/null
+++ b/codec/decoder/core/src/bit_stream.cpp
@@ -1,0 +1,123 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	bit_stream.cpp
+ *
+ * \brief	Reading / writing bit-stream
+ *
+ * \date	03/10/2009 Created
+ *
+ *************************************************************************************
+ */
+#include "bit_stream.h"
+#include "macros.h"
+
+namespace WelsDec {
+/* EndianFix(uiX): returns uiX unchanged when WORDS_BIGENDIAN is defined and
+ * with its four bytes in reversed order otherwise. Exactly one of the three
+ * definitions below is compiled, selected by the preprocessor. */
+#ifdef WORDS_BIGENDIAN
+inline uint32_t EndianFix(uint32_t uiX)
+{
+	return uiX;	// big-endian build: nothing to swap
+}
+#else //WORDS_BIGENDIAN
+// non big-endian builds: byte reversal, implementation chosen per compiler / architecture
+#ifdef _MSC_VER
+inline uint32_t EndianFix(uint32_t uiX)	// MSVC variant using an inline __asm block
+{
+	__asm
+	{
+		mov   eax,  uiX
+		bswap   eax
+		mov   uiX,    eax
+	}
+	return uiX;	// NOTE(review): inline __asm is presumably limited to 32-bit x86 MSVC targets - confirm for x64 builds
+}
+#else  //_MSC_VER
+// other compilers: GNU-style inline asm where an architecture macro is set, portable C otherwise
+inline uint32_t EndianFix(uint32_t uiX)
+{
+#ifdef ARM_ARCHv7
+	__asm__ __volatile__("rev %0, %0":"+r"(uiX)); //Just for the ARMv7 
+#elif defined (X86_ARCH)
+	__asm__ __volatile__("bswap %0":"+r"(uiX));	// single-instruction byte swap on x86
+#else
+    uiX = ((uiX & 0xff000000)>> 24) | ((uiX & 0xff0000) >> 8) |	// portable fallback: move each byte
+        ((uiX & 0xff00) << 8) | ((uiX&0xff) << 24);	// to its mirrored position
+#endif	
+	return uiX;
+}
+#endif //_MSC_VER
+// end of the non big-endian variants
+#endif //WORDS_BIGENDIAN
+
+/*!
+ * \brief	assemble a 32-bit value from four consecutive bytes, first byte most significant
+ *
+ * \param	pDstNal		pointer to at least four readable bytes
+ *
+ * \return	(pDstNal[0] << 24) | (pDstNal[1] << 16) | (pDstNal[2] << 8) | pDstNal[3]
+ */
+inline uint32_t GetValue4Bytes( uint8_t* pDstNal )
+{
+	// Each byte is widened to uint32_t before shifting: a uint8_t operand is
+	// promoted to (signed) int, and "<< 24" on it overflows for values >= 0x80.
+	return ((uint32_t)pDstNal[0] << 24) | ((uint32_t)pDstNal[1] << 16) |
+		((uint32_t)pDstNal[2] << 8) | (uint32_t)pDstNal[3];
+}
+
+/*
+ * Prime the bit reader: load the first four bytes at pCurBuf into uiCurBits,
+ * advance pCurBuf past them and set iLeftBits to -16.
+ */
+void_t InitReadBits( PBitStringAux pBitString )
+{
+	uint8_t *pFirstBytes = pBitString->pCurBuf;
+
+	pBitString->uiCurBits	= GetValue4Bytes( pFirstBytes );
+	pBitString->pCurBuf	= pFirstBytes + 4;
+	pBitString->iLeftBits	= -16;
+}
+
+/*!
+ * \brief	attach a buffer to the bit string reader and prime it for reading
+ *
+ * \param	pBitString	Bit string auxiliary pointer
+ * \param	kpBuf		bit-stream buffer
+ * \param	kiSize	    size of the data in bits (rounded up to whole bytes for pEndBuf)
+ *
+ * \return	size of buffer data in byte; -1 when kpBuf is NULL
+ */
+int32_t InitBits( PBitStringAux pBitString, const uint8_t *kpBuf, const int32_t kiSize )
+{	
+	uint8_t *pData = (uint8_t *)kpBuf;
+	int32_t iByteCount;
+
+	if ( NULL == pData )
+		return -1;
+
+	iByteCount = (kiSize + 7) >> 3;	// bits -> bytes, rounding up
+
+	pBitString->iBits		= kiSize;				// count bits of overall bitstreaming inputindex;
+	pBitString->pStartBuf	= pData;				// buffer to start position
+	pBitString->pEndBuf		= pData + iByteCount;	// buffer + length
+	pBitString->pCurBuf		= pData;				// reading starts at the beginning
+
+	InitReadBits( pBitString );
+
+	return iByteCount;
+}
+
+} // namespace WelsDec
+
--- /dev/null
+++ b/codec/decoder/core/src/cpu.cpp
@@ -1,0 +1,211 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	cpu.cpp
+ *
+ * \brief	CPU compatibility detection
+ *
+ * \date	04/29/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+
+#include "cpu.h"
+#include "cpu_core.h"
+
+namespace WelsDec {
+
+// Vendor identification strings compared (with strcmp) against the 12-character
+// name that WelsCPUFeatureDetect() collects from CPUID leaf 0.
+#define    CPU_Vender_AMD    "AuthenticAMD"
+#define    CPU_Vender_INTEL  "GenuineIntel"
+#define    CPU_Vender_CYRIX  "CyrixInstead"
+
+#if defined(X86_ASM)
+
+/*!
+ * \brief	detect the instruction-set extensions of the running CPU through CPUID
+ *
+ * \param	pNumberOfLogicProcessors	optional output (may be NULL); once detection has got past
+ *			the MMX check it receives bits 23-16 of the second output word (EBX) of CPUID leaf 1
+ *
+ * \return	combination of WELS_CPU_* flags; 0 if CPUID is not usable, if leaf 0 reports a
+ *			maximum basic leaf of 0, or if MMX is not available
+ */
+uint32_t WelsCPUFeatureDetect( int32_t *pNumberOfLogicProcessors )
+{
+	uint32_t uiFlags = 0;
+	uint32_t uiRegA = 0, uiRegB = 0, uiRegC = 0, uiRegD = 0;
+	int32_t  iLineBytes = 0;
+	int32_t  iIsIntel = 0, iIsAmd = 0, iIsCyrix = 0;
+	int8_t   chVendor[16] = { 0 };
+
+	if( !WelsCPUIdVerify() ){
+		/* the cpuid instruction is not supported by this cpu */
+		return 0;
+	}
+
+	/* leaf 0: the three vendor-string words are written to offsets 0, 8 and 4 of chVendor */
+	WelsCPUId( 0, &uiRegA, (uint32_t*)&chVendor[0], (uint32_t*)&chVendor[8], (uint32_t*)&chVendor[4] );
+	if( 0 == uiRegA ){
+		/* maximum input value for basic cpuid information is 0 */
+		return 0;
+	}
+
+	WelsCPUId( 1, &uiRegA, &uiRegB, &uiRegC, &uiRegD );
+	if( 0 == (uiRegD & 0x00800000) ){
+		/* basic MMX technology is not supported; nothing else is useful to us, so stop here */
+		return 0;
+	}
+
+	/* the vendor string is not modified after leaf 0, so classify it once */
+	iIsIntel = !strcmp( (const str_t*)chVendor, CPU_Vender_INTEL );	// confirmed_safe_unsafe_usage
+	iIsAmd   = !strcmp( (const str_t*)chVendor, CPU_Vender_AMD );	// confirmed_safe_unsafe_usage
+	iIsCyrix = !strcmp( (const str_t*)chVendor, CPU_Vender_CYRIX );	// confirmed_safe_unsafe_usage
+
+	/* ---- flags taken from the fourth output word of leaf 1 ---- */
+	uiFlags = WELS_CPU_MMX;
+	if( uiRegD & 0x02000000 ){
+		/* SSE technology is identical to AMD MMX extensions */
+		uiFlags |= WELS_CPU_MMXEXT|WELS_CPU_SSE;
+	}
+	if( uiRegD & 0x04000000 ){
+		/* SSE2 */
+		uiFlags |= WELS_CPU_SSE2;
+	}
+	if( uiRegD & 0x00000001 ){
+		/* x87 FPU on-chip */
+		uiFlags |= WELS_CPU_FPU;
+	}
+	if( uiRegD & 0x00008000 ){
+		/* CMOV instruction */
+		uiFlags |= WELS_CPU_CMOV;
+	}
+	if( iIsIntel && (uiRegD & 0x10000000) ){
+		/* multi-threading: the package contains multiple logical processors (Intel only) */
+		uiFlags |= WELS_CPU_HTT;
+	}
+
+	/* ---- flags taken from the third output word of leaf 1 ---- */
+	if( uiRegC & 0x00000001 ){
+		/* SSE3 */
+		uiFlags |= WELS_CPU_SSE3;
+	}
+	if( uiRegC & 0x00000200 ){
+		/* SSSE3 */
+		uiFlags |= WELS_CPU_SSSE3;
+	}
+	if( uiRegC & 0x00080000 ){
+		/* SSE4.1, 45nm Penryn processor */
+		uiFlags |= WELS_CPU_SSE41;
+	}
+	if( uiRegC & 0x00100000 ){
+		/* SSE4.2, next generation Nehalem processor */
+		uiFlags |= WELS_CPU_SSE42;
+	}
+	if( WelsCPUSupportAVX( uiRegA, uiRegC ) ){
+		/* AVX */
+		uiFlags |= WELS_CPU_AVX;
+	}
+	if( WelsCPUSupportFMA( uiRegA, uiRegC ) ){
+		/* AVX FMA */
+		uiFlags |= WELS_CPU_FMA;
+	}
+	if( uiRegC & 0x02000000 ){
+		/* AES */
+		uiFlags |= WELS_CPU_AES;
+	}
+	if( uiRegC & 0x00400000 ){
+		/* MOVBE */
+		uiFlags |= WELS_CPU_MOVBE;
+	}
+
+	if( NULL != pNumberOfLogicProcessors ){
+		/* feature bits 23-16 of the returned EBX */
+		*pNumberOfLogicProcessors = (uiRegB & 0x00ff0000) >> 16;
+	}
+
+	/* ---- extended leaves: the first output word of 0x80000000 is compared against 0x80000001 ---- */
+	WelsCPUId( 0x80000000, &uiRegA, &uiRegB, &uiRegC, &uiRegD );
+
+	if( iIsAmd && (uiRegA >= 0x80000001) ){
+		WelsCPUId( 0x80000001, &uiRegA, &uiRegB, &uiRegC, &uiRegD );
+		if( uiRegD & 0x00400000 ){
+			uiFlags |= WELS_CPU_MMXEXT;
+		}
+		if( uiRegD & 0x80000000 ){
+			uiFlags |= WELS_CPU_3DNOW;
+		}
+	}
+
+	if( iIsIntel ){
+		int32_t iFamily, iModel;
+
+		WelsCPUId( 1, &uiRegA, &uiRegB, &uiRegC, &uiRegD );
+		iFamily = ((uiRegA>>8)&0xf) + ((uiRegA>>20)&0xff);
+		iModel  = ((uiRegA>>4)&0xf) + ((uiRegA>>12)&0xf0);
+
+		/* family 6, models 9/13/14: SSE2 and SSE3 are masked out of the result */
+		if( (6 == iFamily) && (9 == iModel || 13 == iModel || 14 == iModel) ){
+			uiFlags &= ~(WELS_CPU_SSE2|WELS_CPU_SSE3);
+		}
+	}
+
+	/* ---- cache line size (Intel and Cyrix only) ---- */
+	if( iIsIntel || iIsCyrix ){
+		WelsCPUId( 1, &uiRegA, &uiRegB, &uiRegC, &uiRegD );
+
+		/* ((clflush_line_size >> 8) << 3): CLFLUSH_line_size * 8 = cache line size in bytes */
+		iLineBytes = (uiRegB&0xff00)>>5;
+
+		switch( iLineBytes ){
+		case 128:
+			uiFlags |= WELS_CPU_CACHELINE_128;
+			break;
+		case 64:
+			uiFlags |= WELS_CPU_CACHELINE_64;
+			break;
+		case 32:
+			uiFlags |= WELS_CPU_CACHELINE_32;
+			break;
+		case 16:
+			uiFlags |= WELS_CPU_CACHELINE_16;
+			break;
+		default:
+			/* any other size: no cache-line flag is reported */
+			break;
+		}
+	}
+
+	return uiFlags;
+}
+
+
+/*!
+ * \brief	call WelsEmms() when the given feature mask contains any MMX-family flag
+ *
+ * \param	kuiCPU	WELS_CPU_* feature mask, e.g. as returned by WelsCPUFeatureDetect()
+ */
+void WelsCPURestore( const uint32_t kuiCPU )
+{
+	const uint32_t kuiMmxFamily = WELS_CPU_MMX|WELS_CPU_MMXEXT|WELS_CPU_3DNOW|WELS_CPU_3DNOWEXT;
+
+	if( 0 == (kuiCPU & kuiMmxFamily) ){
+		return;	// none of the MMX / 3DNow! flags is set: nothing to do
+	}
+
+	WelsEmms();
+}
+
+#endif
+
+} // namespace WelsDec
--- /dev/null
+++ b/codec/decoder/core/src/deblocking.cpp
@@ -1,0 +1,1013 @@
+/*!
+ * \copy
+ *     Copyright (c)  2010-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	deblocking.cpp
+ *
+ * \brief	Interfaces introduced in frame deblocking filtering
+ *
+ * \date	08/02/2010
+ *           
+ *************************************************************************************
+ */
+
+#include "deblocking.h"
+#include "cpu_core.h"
+#include "fmo.h"
+
+namespace WelsDec {
+
+#define NO_SUPPORTED_FILTER_IDX     (-1)
+// Bit positions / masks of the neighbour-availability word that
+// DeblockingAvailableNoInterlayer() returns and the filtering functions test.
+#define LEFT_FLAG_BIT 0
+#define TOP_FLAG_BIT 1
+#define LEFT_FLAG_MASK 0x01
+#define TOP_FLAG_MASK 0x02
+
+// When defined, SMB_EDGE_MV also compares reference indices (see its two variants below).
+#define SAME_MB_DIFF_REFIDX
+// Function-like accessors for the threshold tables: the tables carry 12 padding
+// entries in front, so an index in the range -12..63 maps to array slot index+12.
+#define g_kuiAlphaTable(x) g_kuiAlphaTable[(x)+12]
+#define g_kiBetaTable(x)  g_kiBetaTable[(x)+12]
+#define g_kiTc0Table(x)   g_kiTc0Table[(x)+12]
+
+// Compares block iIndex of macroblock iMbXy with block iNeighIndex of macroblock iMbBn.
+// Evaluates to 1 when their reference indices differ, or when WELS_ABS of the difference
+// of either motion vector component is >= 4; evaluates to 0 otherwise.
+// NOTE: the arguments are pasted unparenthesized and some are evaluated more than once,
+// so pass simple, side-effect-free expressions.
+#define MB_BS_MV(iRefIndex, iMotionVector, iMbXy, iMbBn, iIndex, iNeighIndex) \
+(\
+    ( iRefIndex[iMbXy][iIndex] - iRefIndex[iMbBn][iNeighIndex] )||\
+    ( WELS_ABS( iMotionVector[iMbXy][iIndex][0] - iMotionVector[iMbBn][iNeighIndex][0] ) >= 4 ) ||\
+    ( WELS_ABS( iMotionVector[iMbXy][iIndex][1] - iMotionVector[iMbBn][iNeighIndex][1] ) >= 4 )\
+)
+
+// Motion comparison of two blocks (iIndex, iNeighIndex) inside the same macroblock.
+// "& (~3)" keeps only the bits of value 4 and above, so for a non-negative WELS_ABS
+// result the masked term is non-zero exactly when the difference is >= 4.
+// Both variants evaluate to 0 or 1.
+#if defined(SAME_MB_DIFF_REFIDX)
+// Variant that also reports 1 when the two reference indices differ.
+#define SMB_EDGE_MV(iRefIndex, iMotionVector, iIndex, iNeighIndex) \
+(\
+    ( iRefIndex[iIndex] - iRefIndex[iNeighIndex] )||(\
+    ( WELS_ABS( iMotionVector[iIndex][0] - iMotionVector[iNeighIndex][0] ) &(~3) ) |\
+    ( WELS_ABS( iMotionVector[iIndex][1] - iMotionVector[iNeighIndex][1] ) &(~3) ))\
+)
+#else
+// Variant that ignores iRefIndex and looks at the motion vectors only.
+#define SMB_EDGE_MV(iRefIndex, iMotionVector, iIndex, iNeighIndex) \
+(\
+    !!(( WELS_ABS( iMotionVector[iIndex][0] - iMotionVector[iNeighIndex][0] ) &(~3) ) |( WELS_ABS( iMotionVector[iIndex][1] - iMotionVector[iNeighIndex][1] ) &(~3) ))\
+)
+#endif
+
+// Combines bsx1 with the SMB_EDGE_MV result for one inner edge:
+//   bsx1 == 0 : result is the SMB_EDGE_MV value (0 or 1)
+//   bsx1 != 0 : result is (bsx1 | SMB_EDGE_MV) shifted left by one, i.e. 2 for bsx1 == 1
+// NOTE: bsx1 is pasted unparenthesized (also after "!!"), so pass a plain variable or
+// array element, never a compound expression.
+#define BS_EDGE(bsx1, iRefIndex, iMotionVector, iIndex, iNeighIndex) \
+( (bsx1|SMB_EDGE_MV(iRefIndex, iMotionVector, iIndex, iNeighIndex))<<((uint8_t)(!!bsx1)))
+
+// Derives the filter thresholds for one edge:
+//   iIndex = iQp + iAlphaOffset
+//   iAlpha = alpha table entry at iIndex
+//   iBeta  = beta table entry at iQp + iBetaOffset
+// iIndex, iAlpha and iBeta are outputs and must be assignable variables.
+// The macro expands to a brace block, so use it as a stand-alone statement.
+#define GET_ALPHA_BETA_FROM_QP(iQp, iAlphaOffset, iBetaOffset, iIndex, iAlpha, iBeta) \
+{\
+	iIndex = (iQp + iAlphaOffset);\
+	iAlpha = g_kuiAlphaTable(iIndex);\
+	iBeta  = g_kiBetaTable((iQp + iBetaOffset));\
+}
+
+// Alpha threshold table; refers to Table 8-16 in the H.264/AVC standard.
+// Layout: 12 leading padding entries (0), 52 table entries, 12 trailing padding
+// entries (255).  Read it through the g_kuiAlphaTable(x) accessor, which adds the
+// bias of 12 to the index.
+static const uint8_t g_kuiAlphaTable[52+24] = { //this table refers to Table 8-16 in H.264/AVC standard
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
+	7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
+	25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
+	80, 90,101,113,127,144,162,182,203,226,
+	255, 255
+	,255, 255,255, 255,255, 255,255, 255,255, 255,255, 255
+};
+
+// Beta threshold table; refers to Table 8-16 in the H.264/AVC standard.
+// Layout: 12 leading padding entries (0), 52 table entries, 12 trailing padding
+// entries (18).  Read it through the g_kiBetaTable(x) accessor, which adds the
+// bias of 12 to the index.
+static const int8_t g_kiBetaTable[52+24] = { //this table refers to Table 8-16 in H.264/AVC standard
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0,  0,
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
+     3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
+     8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
+    13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
+    18, 18
+    ,18, 18,18, 18,18, 18,18, 18,18, 18,18, 18
+};
+
+// tc0 table; refers to Table 8-17 in the H.264/AVC standard.
+// One row per index (12 leading padding rows, 52 table rows, 12 trailing padding rows
+// that repeat the last table row); read it through the g_kiTc0Table(x) accessor, which
+// adds the bias of 12.  Each row is indexed by a boundary-strength value 0..3; column 0
+// is always -1.
+static const int8_t g_kiTc0Table[52+24][4] = { //this table refers Table 8-17 in H.264/AVC standard
+    { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 },
+    { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 },
+    { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 },
+    { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 },
+    { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 1 },
+    { -1, 0, 0, 1 }, { -1, 0, 0, 1 }, { -1, 0, 0, 1 }, { -1, 0, 1, 1 }, { -1, 0, 1, 1 }, { -1, 1, 1, 1 },
+    { -1, 1, 1, 1 }, { -1, 1, 1, 1 }, { -1, 1, 1, 1 }, { -1, 1, 1, 2 }, { -1, 1, 1, 2 }, { -1, 1, 1, 2 },
+    { -1, 1, 1, 2 }, { -1, 1, 2, 3 }, { -1, 1, 2, 3 }, { -1, 2, 2, 3 }, { -1, 2, 2, 4 }, { -1, 2, 3, 4 },
+    { -1, 2, 3, 4 }, { -1, 3, 3, 5 }, { -1, 3, 4, 6 }, { -1, 3, 4, 6 }, { -1, 4, 5, 7 }, { -1, 4, 5, 8 },
+    { -1, 4, 6, 9 }, { -1, 5, 7,10 }, { -1, 6, 8,11 }, { -1, 6, 8,13 }, { -1, 7,10,14 }, { -1, 8,11,16 },
+    { -1, 9,12,18 }, { -1, 10,13,20 }, {-1,11,15,23 }, { -1,13,17,25 }
+    ,{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 }
+	,{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 }
+};
+
+// Block index pairs used by DeblockingBsMarginalMBAvcbase() for a macroblock boundary.
+// The row is selected by iEdge (WelsDeblockingMb passes 0 for the left neighbour and
+// 1 for the top neighbour); entries 0..3 index blocks of the current macroblock and
+// entries 4..7 the matching blocks of the neighbouring macroblock.
+static const uint8_t g_kuiTableBIdx[2][8] =   
+{     
+	{0,  4,  8,  12, 
+	3,  7,  11, 15}, 
+
+	{0,  1,  2,  3 , 
+	12, 13, 14, 15}, 
+};
+
+// Fills tc[0..3]: tc[k] = tc0 table entry (row iIndexA, column pBS[k]) + bChroma.
+// The callers in this file pass bChroma = 0 for luma edges and 1 for chroma edges.
+// The macro expands to a brace block, so use it as a stand-alone statement.
+#define TC0_TBL_LOOKUP(tc, iIndexA, pBS, bChroma) \
+{\
+	tc[0] = g_kiTc0Table(iIndexA)[pBS[0]] + bChroma;\
+	tc[1] = g_kiTc0Table(iIndexA)[pBS[1]] + bChroma;\
+	tc[2] = g_kiTc0Table(iIndexA)[pBS[2]] + bChroma;\
+	tc[3] = g_kiTc0Table(iIndexA)[pBS[3]] + bChroma;\
+}
+
+/*!
+ * \brief	derive the boundary strengths of the inner edges of one macroblock from its
+ *			16-entry non-zero table only
+ *
+ * The table is treated as a 4x4 raster (entry = row * 4 + column).  For every inner edge
+ * the two entries on either side are OR-ed and shifted left by iLShiftFactor:
+ *   nBS[0][c][r] (c = 1..3) : entries (r, c-1) and (r, c)
+ *   nBS[1][r][c] (r = 1..3) : entries (r-1, c) and (r, c)
+ * nBS[0][0][*] and nBS[1][0][*] (the macroblock boundary edges) are not touched.
+ *
+ * The computation is done byte by byte, so the result does not depend on host byte
+ * order or on the alignment of pNnzTab / nBS, and a shifted value never spills into
+ * the neighbouring entry.
+ *
+ * \param	pNnzTab			16 non-zero entries of the macroblock
+ * \param	nBS				boundary strength array to fill
+ * \param	iLShiftFactor	left shift applied to every OR-ed pair
+ */
+void_t inline DeblockingBSInsideMBAvsbase( int8_t* pNnzTab, uint8_t nBS[2][4][4], int32_t iLShiftFactor )
+{
+	const uint8_t* kpNnz = (const uint8_t*)pNnzTab;
+	int32_t iRow, iCol;
+
+	for( iRow = 0; iRow < 4; iRow++ )
+	{
+		const uint8_t* kpCurRow = kpNnz + (iRow << 2);
+
+		// inner vertical edges of this row: pair every entry with its left neighbour
+		for( iCol = 1; iCol < 4; iCol++ )
+		{
+			nBS[0][iCol][iRow] = (uint8_t)( (kpCurRow[iCol - 1] | kpCurRow[iCol]) << iLShiftFactor );
+		}
+
+		// inner horizontal edge above this row: pair every entry with the one above it
+		if( iRow > 0 )
+		{
+			const uint8_t* kpUpRow = kpCurRow - 4;
+
+			for( iCol = 0; iCol < 4; iCol++ )
+			{
+				nBS[1][iRow][iCol] = (uint8_t)( (kpUpRow[iCol] | kpCurRow[iCol]) << iLShiftFactor );
+			}
+		}
+	}
+}
+
+/*!
+ * \brief	derive the boundary strengths of the inner edges of one macroblock from its
+ *			16-entry non-zero table and its LIST_0 reference indices / motion vectors
+ *
+ * Blocks are addressed as a 4x4 raster (index = row * 4 + column).  For every inner edge
+ * the non-zero entries of the two blocks on either side are OR-ed and combined with the
+ * motion comparison through BS_EDGE:
+ *   nBS[0][c][r] (c = 1..3) : block (r, c) against its left neighbour (r, c-1)
+ *   nBS[1][r][c] (r = 1..3) : block (r, c) against its upper neighbour (r-1, c)
+ * nBS[0][0][*] and nBS[1][0][*] (the macroblock boundary edges) are not touched.
+ *
+ * The non-zero table is read byte by byte, so the result does not depend on host byte
+ * order or on the alignment of pNnzTab.
+ *
+ * \param	pCurDqLayer	layer providing pRefIndex[LIST_0] and pMv[LIST_0]
+ * \param	nBS			boundary strength array to fill
+ * \param	pNnzTab		16 non-zero entries of the macroblock
+ * \param	iMbXy		index of the macroblock inside the layer
+ */
+void_t static inline DeblockingBSInsideMBNormal( PDqLayer pCurDqLayer, uint8_t nBS[2][4][4], int8_t* pNnzTab, int32_t iMbXy )
+{
+	const uint8_t* kpNnz = (const uint8_t*)pNnzTab;
+	int8_t* iRefIndex = pCurDqLayer->pRefIndex[LIST_0][iMbXy];
+	int32_t iRow, iCol;
+
+	// inner vertical edges: every block in columns 1..3 against its left neighbour
+	for( iRow = 0; iRow < 4; iRow++ )
+	{
+		for( iCol = 1; iCol < 4; iCol++ )
+		{
+			const int32_t kiCur   = (iRow << 2) + iCol;
+			const int32_t kiNeigh = kiCur - 1;
+			// BS_EDGE pastes its first argument textually, so hand it a plain variable
+			const uint8_t kuiNnz  = kpNnz[kiNeigh] | kpNnz[kiCur];
+
+			nBS[0][iCol][iRow] = BS_EDGE(kuiNnz, iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], kiCur, kiNeigh);
+		}
+	}
+
+	// inner horizontal edges: every block in rows 1..3 against its upper neighbour
+	for( iRow = 1; iRow < 4; iRow++ )
+	{
+		for( iCol = 0; iCol < 4; iCol++ )
+		{
+			const int32_t kiCur   = (iRow << 2) + iCol;
+			const int32_t kiNeigh = kiCur - 4;
+			const uint8_t kuiNnz  = kpNnz[kiNeigh] | kpNnz[kiCur];
+
+			nBS[1][iRow][iCol] = BS_EDGE(kuiNnz, iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], kiCur, kiNeigh);
+		}
+	}
+}
+
+/*!
+ * \brief	compute the four boundary strengths of the edge between a macroblock and one
+ *			of its neighbouring macroblocks
+ *
+ * For each of the four block pairs listed in g_kuiTableBIdx[iEdge] the strength is
+ * 2 when either block has a non-zero pNzc entry, otherwise the MB_BS_MV result (0 or 1)
+ * for the LIST_0 reference indices and motion vectors of the pair.
+ *
+ * The index table is read entry by entry, so the pairing does not depend on host byte
+ * order or on the alignment of the table.
+ *
+ * \param	pCurDqLayer	layer providing pNzc, pRefIndex[LIST_0] and pMv[LIST_0]
+ * \param	iEdge		row of g_kuiTableBIdx to use (0 or 1)
+ * \param	iNeighMb	index of the neighbouring macroblock
+ * \param	iMbXy		index of the current macroblock
+ *
+ * \return	the four strengths packed into a uint32_t in memory order, i.e. byte k of the
+ *			returned object (as stored in memory) is the strength of block pair k
+ */
+uint32_t DeblockingBsMarginalMBAvcbase( PDqLayer pCurDqLayer, int32_t iEdge, int32_t iNeighMb, int32_t iMbXy)
+{
+	int32_t i;
+	uint32_t uiBSx4;
+	uint8_t* pBS = (uint8_t*)(&uiBSx4);	// all four bytes are written in the loop below
+	const uint8_t* kpBIdx  = &g_kuiTableBIdx[iEdge][0];	// blocks of the current macroblock
+	const uint8_t* kpBnIdx = &g_kuiTableBIdx[iEdge][4];	// blocks of the neighbouring macroblock
+
+	for( i = 0; i < 4; i++ )
+	{
+		const int32_t kiBIdx  = kpBIdx[i];
+		const int32_t kiBnIdx = kpBnIdx[i];
+
+		if( pCurDqLayer->pNzc[iMbXy][kiBIdx] | pCurDqLayer->pNzc[iNeighMb][kiBnIdx] )
+		{
+			pBS[i] = 2;
+		}
+		else
+		{
+			pBS[i] = MB_BS_MV(pCurDqLayer->pRefIndex[LIST_0], pCurDqLayer->pMv[LIST_0], iMbXy, iNeighMb, kiBIdx, kiBnIdx);
+		}
+	}
+	return uiBSx4;
+}
+/*!
+ * \brief	tell which macroblock boundary edges (left / top) of the current macroblock
+ *			may be filtered
+ *
+ * An edge is available when the macroblock is not in the first column / row.  When
+ * iFilterIdc is 2 the neighbour must in addition carry the same pSliceIdc value as the
+ * current macroblock.
+ *
+ * \param	pCurDqLayer	layer providing iMbX, iMbY, iMbXyIndex, iMbWidth and pSliceIdc
+ * \param	iFilterIdc	filter mode; only the value 2 changes the behaviour
+ *
+ * \return	left availability in bit LEFT_FLAG_BIT, top availability in bit TOP_FLAG_BIT
+ */
+int32_t DeblockingAvailableNoInterlayer( PDqLayer pCurDqLayer, int32_t iFilterIdc )
+{
+	const int32_t kiCurIdx = pCurDqLayer->iMbXyIndex;
+	BOOL_T bHasLeft = ( pCurDqLayer->iMbX > 0 );
+	BOOL_T bHasTop  = ( pCurDqLayer->iMbY > 0 );
+
+	if ( 2 == iFilterIdc )
+	{
+		// the slice test is evaluated only when the neighbour exists
+		bHasLeft = bHasLeft && ( pCurDqLayer->pSliceIdc[kiCurIdx] == pCurDqLayer->pSliceIdc[kiCurIdx-1] );
+		bHasTop  = bHasTop  && ( pCurDqLayer->pSliceIdc[kiCurIdx] == pCurDqLayer->pSliceIdc[kiCurIdx-pCurDqLayer->iMbWidth] );
+	}
+
+	return (bHasLeft<<LEFT_FLAG_BIT)|(bHasTop<<TOP_FLAG_BIT);
+}
+
+/*!
+ * \brief	filter one luma edge with the pfLumaDeblockingLT4Ver kernel
+ *
+ * Alpha / beta are looked up from pFilter->iLumaQP and the slice offsets; when both are
+ * zero nothing is filtered.  The tc values handed to the kernel are looked up from pBS[0..3].
+ *
+ * \param	pFilter	deblocking context (QP, offsets, kernel table)
+ * \param	pPix	first pixel of the edge
+ * \param	iStride	stride passed on to the kernel
+ * \param	pBS		four boundary strengths of the edge
+ */
+void_t FilteringEdgeLumaH(SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride, uint8_t* pBS )
+{
+	int32_t iIdxA, iAlphaVal, iBetaVal;
+	FORCE_STACK_ALIGN_1D( int8_t, iTcArr, 4, 16 );
+
+	GET_ALPHA_BETA_FROM_QP(pFilter->iLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdxA, iAlphaVal, iBetaVal);
+
+	if( 0 == (iAlphaVal | iBetaVal) )
+	{
+		return;	// both thresholds are zero: leave the edge untouched
+	}
+
+	TC0_TBL_LOOKUP(iTcArr, iIdxA, pBS, 0);
+	pFilter->pLoopf->pfLumaDeblockingLT4Ver(pPix, iStride, iAlphaVal, iBetaVal, iTcArr);
+}
+
+
+/*!
+ * \brief	filter one luma edge with the pfLumaDeblockingLT4Hor kernel
+ *
+ * Alpha / beta are looked up from pFilter->iLumaQP and the slice offsets; when both are
+ * zero nothing is filtered.  The tc values handed to the kernel are looked up from pBS[0..3].
+ *
+ * \param	pFilter	deblocking context (QP, offsets, kernel table)
+ * \param	pPix	first pixel of the edge
+ * \param	iStride	stride passed on to the kernel
+ * \param	pBS		four boundary strengths of the edge
+ */
+void_t FilteringEdgeLumaV(SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride, uint8_t* pBS )
+{
+	int32_t iIdxA, iAlphaVal, iBetaVal;
+	FORCE_STACK_ALIGN_1D( int8_t, iTcArr, 4, 16 );
+
+	GET_ALPHA_BETA_FROM_QP(pFilter->iLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdxA, iAlphaVal, iBetaVal);
+
+	if( 0 == (iAlphaVal | iBetaVal) )
+	{
+		return;	// both thresholds are zero: leave the edge untouched
+	}
+
+	TC0_TBL_LOOKUP(iTcArr, iIdxA, pBS, 0);
+	pFilter->pLoopf->pfLumaDeblockingLT4Hor(pPix, iStride, iAlphaVal, iBetaVal, iTcArr);
+}
+
+
+/*!
+ * \brief	filter one luma edge with the pfLumaDeblockingEQ4Ver kernel
+ *
+ * Alpha / beta are looked up from pFilter->iLumaQP and the slice offsets; when both are
+ * zero nothing is filtered.
+ *
+ * \param	pFilter	deblocking context (QP, offsets, kernel table)
+ * \param	pPix	first pixel of the edge
+ * \param	iStride	stride passed on to the kernel
+ * \param	pBS		not used by this function
+ */
+void_t FilteringEdgeLumaIntraH( SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride, uint8_t* pBS )
+{
+	int32_t iIdxA, iAlphaVal, iBetaVal;
+
+	GET_ALPHA_BETA_FROM_QP(pFilter->iLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdxA, iAlphaVal, iBetaVal);
+
+	if( 0 == (iAlphaVal | iBetaVal) )
+	{
+		return;	// both thresholds are zero: leave the edge untouched
+	}
+
+	pFilter->pLoopf->pfLumaDeblockingEQ4Ver(pPix, iStride, iAlphaVal, iBetaVal);
+}
+
+/*!
+ * \brief	filter one luma edge with the pfLumaDeblockingEQ4Hor kernel
+ *
+ * Alpha / beta are looked up from pFilter->iLumaQP and the slice offsets; when both are
+ * zero nothing is filtered.
+ *
+ * \param	pFilter	deblocking context (QP, offsets, kernel table)
+ * \param	pPix	first pixel of the edge
+ * \param	iStride	stride passed on to the kernel
+ * \param	pBS		not used by this function
+ */
+void_t FilteringEdgeLumaIntraV( SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride, uint8_t* pBS )
+{
+	int32_t iIdxA, iAlphaVal, iBetaVal;
+
+	GET_ALPHA_BETA_FROM_QP(pFilter->iLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdxA, iAlphaVal, iBetaVal);
+
+	if( 0 == (iAlphaVal | iBetaVal) )
+	{
+		return;	// both thresholds are zero: leave the edge untouched
+	}
+
+	pFilter->pLoopf->pfLumaDeblockingEQ4Hor(pPix, iStride, iAlphaVal, iBetaVal);
+}
+/*!
+ * \brief	filter one chroma edge (Cb and Cr) with the pfChromaDeblockingLT4Ver kernel
+ *
+ * Alpha / beta are looked up from pFilter->iChromaQP and the slice offsets; when both are
+ * zero nothing is filtered.  The tc values are looked up from pBS[0..3] with the chroma
+ * increment of 1.
+ *
+ * \param	pFilter	deblocking context (QP, offsets, kernel table)
+ * \param	pPixCb	first Cb pixel of the edge
+ * \param	pPixCr	first Cr pixel of the edge
+ * \param	iStride	stride passed on to the kernel
+ * \param	pBS		four boundary strengths of the edge
+ */
+void_t FilteringEdgeChromaH( SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, uint8_t* pBS )
+{
+	int32_t iIdxA, iAlphaVal, iBetaVal;
+	FORCE_STACK_ALIGN_1D( int8_t, iTcArr, 4, 16 );
+
+	GET_ALPHA_BETA_FROM_QP(pFilter->iChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdxA, iAlphaVal, iBetaVal);
+
+	if( 0 == (iAlphaVal | iBetaVal) )
+	{
+		return;	// both thresholds are zero: leave the edge untouched
+	}
+
+	TC0_TBL_LOOKUP(iTcArr, iIdxA, pBS, 1);
+	pFilter->pLoopf->pfChromaDeblockingLT4Ver(pPixCb, pPixCr, iStride, iAlphaVal, iBetaVal, iTcArr);
+}
+/*!
+ * \brief	filter one chroma edge (Cb and Cr) with the pfChromaDeblockingLT4Hor kernel
+ *
+ * Alpha / beta are looked up from pFilter->iChromaQP and the slice offsets; when both are
+ * zero nothing is filtered.  The tc values are looked up from pBS[0..3] with the chroma
+ * increment of 1.
+ *
+ * \param	pFilter	deblocking context (QP, offsets, kernel table)
+ * \param	pPixCb	first Cb pixel of the edge
+ * \param	pPixCr	first Cr pixel of the edge
+ * \param	iStride	stride passed on to the kernel
+ * \param	pBS		four boundary strengths of the edge
+ */
+void_t FilteringEdgeChromaV( SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, uint8_t* pBS )
+{
+	int32_t iIdxA, iAlphaVal, iBetaVal;
+	FORCE_STACK_ALIGN_1D( int8_t, iTcArr, 4, 16 );
+
+	GET_ALPHA_BETA_FROM_QP(pFilter->iChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdxA, iAlphaVal, iBetaVal);
+
+	if( 0 == (iAlphaVal | iBetaVal) )
+	{
+		return;	// both thresholds are zero: leave the edge untouched
+	}
+
+	TC0_TBL_LOOKUP(iTcArr, iIdxA, pBS, 1);
+	pFilter->pLoopf->pfChromaDeblockingLT4Hor(pPixCb, pPixCr, iStride, iAlphaVal, iBetaVal, iTcArr);
+}
+
+/*!
+ * \brief	filter one chroma edge (Cb and Cr) with the pfChromaDeblockingEQ4Ver kernel
+ *
+ * Alpha / beta are looked up from pFilter->iChromaQP and the slice offsets; when both are
+ * zero nothing is filtered.
+ *
+ * \param	pFilter	deblocking context (QP, offsets, kernel table)
+ * \param	pPixCb	first Cb pixel of the edge
+ * \param	pPixCr	first Cr pixel of the edge
+ * \param	iStride	stride passed on to the kernel
+ * \param	pBS		not used by this function
+ */
+void_t FilteringEdgeChromaIntraH( SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, uint8_t* pBS )
+{
+	int32_t iIdxA, iAlphaVal, iBetaVal;
+
+	GET_ALPHA_BETA_FROM_QP(pFilter->iChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdxA, iAlphaVal, iBetaVal);
+
+	if( 0 == (iAlphaVal | iBetaVal) )
+	{
+		return;	// both thresholds are zero: leave the edge untouched
+	}
+
+	pFilter->pLoopf->pfChromaDeblockingEQ4Ver(pPixCb, pPixCr, iStride, iAlphaVal, iBetaVal);
+}
+
+/*!
+ * \brief	filter one chroma edge (Cb and Cr) with the pfChromaDeblockinEQ4Hor kernel
+ *
+ * Alpha / beta are looked up from pFilter->iChromaQP and the slice offsets; when both are
+ * zero nothing is filtered.
+ *
+ * \param	pFilter	deblocking context (QP, offsets, kernel table)
+ * \param	pPixCb	first Cb pixel of the edge
+ * \param	pPixCr	first Cr pixel of the edge
+ * \param	iStride	stride passed on to the kernel
+ * \param	pBS		not used by this function
+ */
+void_t FilteringEdgeChromaIntraV( SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, uint8_t* pBS )
+{
+	int32_t iIdxA, iAlphaVal, iBetaVal;
+
+	GET_ALPHA_BETA_FROM_QP(pFilter->iChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdxA, iAlphaVal, iBetaVal);
+
+	if( 0 == (iAlphaVal | iBetaVal) )
+	{
+		return;	// both thresholds are zero: leave the edge untouched
+	}
+
+	pFilter->pLoopf->pfChromaDeblockinEQ4Hor(pPixCb, pPixCr, iStride, iAlphaVal, iBetaVal);
+}
+
+
+/*!
+ * \brief	filter all edges of the current macroblock according to the given boundary strengths
+ *
+ * Processing order: left macroblock edge, inner edges 1..3 of nBS[0], top macroblock edge,
+ * inner edges 1..3 of nBS[1].  A macroblock boundary edge is filtered with the average of
+ * the two macroblocks' QPs and uses the "Intra" filters when its first strength is 4;
+ * inner edges use the QP of the current macroblock.  Chroma is filtered on the macroblock
+ * boundary edges and on inner edge 2 only.  An edge whose four strengths are all zero is
+ * skipped.
+ *
+ * \param	pCurDqLayer		layer providing macroblock position, QPs and width
+ * \param	pFilter			deblocking context; iLumaQP / iChromaQP are overwritten
+ * \param	nBS				boundary strengths of all edges of the macroblock
+ * \param	iBoundryFlag	LEFT_FLAG_MASK / TOP_FLAG_MASK availability bits
+ */
+void_t DeblockingInterMb( PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, uint8_t nBS[2][4][4], int32_t iBoundryFlag )
+{
+	const int32_t kiMbIdx = pCurDqLayer->iMbXyIndex;
+	const int32_t kiMbX   = pCurDqLayer->iMbX;
+	const int32_t kiMbY   = pCurDqLayer->iMbY;
+
+	const int32_t kiLumaQp   = pCurDqLayer->pLumaQp[kiMbIdx];
+	const int32_t kiChromaQp = pCurDqLayer->pChromaQp[kiMbIdx];
+	const int32_t kiStrideY  = pFilter->iCsStride[0];
+	const int32_t kiStrideUV = pFilter->iCsStride[1];
+
+	uint8_t* pLuma = pFilter->pCsData[0] + ((kiMbY * kiStrideY + kiMbX) << 4);
+	uint8_t* pCb   = pFilter->pCsData[1] + ((kiMbY * kiStrideUV + kiMbX) << 3);
+	uint8_t* pCr   = pFilter->pCsData[2] + ((kiMbY * kiStrideUV + kiMbX) << 3);
+	int32_t iEdge;
+
+	// ---- left macroblock edge ----
+	if( iBoundryFlag & LEFT_FLAG_MASK )
+	{
+		const int32_t kiLeftIdx = kiMbIdx - 1;
+		pFilter->iLumaQP   = (kiLumaQp + pCurDqLayer->pLumaQp[kiLeftIdx] + 1) >> 1;
+		pFilter->iChromaQP = (kiChromaQp + pCurDqLayer->pChromaQp[kiLeftIdx] + 1) >> 1;
+
+		if( 0x04 == nBS[0][0][0] )
+		{
+			FilteringEdgeLumaIntraV( pFilter, pLuma, kiStrideY, NULL );
+			FilteringEdgeChromaIntraV( pFilter, pCb, pCr, kiStrideUV, NULL );
+		}
+		else if( *(uint32_t *)nBS[0][0] != 0 )
+		{
+			FilteringEdgeLumaV( pFilter, pLuma, kiStrideY, nBS[0][0] );
+			FilteringEdgeChromaV( pFilter, pCb, pCr, kiStrideUV, nBS[0][0] );
+		}
+	}
+
+	// ---- inner edges of nBS[0], filtered with the macroblock's own QP ----
+	pFilter->iLumaQP   = kiLumaQp;
+	pFilter->iChromaQP = kiChromaQp;
+
+	for( iEdge = 1; iEdge < 4; iEdge++ )
+	{
+		if( 0 == *(uint32_t *)nBS[0][iEdge] )
+		{
+			continue;
+		}
+		FilteringEdgeLumaV( pFilter, &pLuma[iEdge<<2], kiStrideY, nBS[0][iEdge] );
+		if( 2 == iEdge )
+		{
+			// only the middle luma edge has a chroma counterpart
+			FilteringEdgeChromaV( pFilter, &pCb[2<<1], &pCr[2<<1], kiStrideUV, nBS[0][2] );
+		}
+	}
+
+	// ---- top macroblock edge ----
+	if( iBoundryFlag & TOP_FLAG_MASK )
+	{
+		const int32_t kiTopIdx = kiMbIdx - pCurDqLayer->iMbWidth;
+		pFilter->iLumaQP   = (kiLumaQp + pCurDqLayer->pLumaQp[kiTopIdx] + 1) >> 1;
+		pFilter->iChromaQP = (kiChromaQp + pCurDqLayer->pChromaQp[kiTopIdx] + 1) >> 1;
+
+		if( 0x04 == nBS[1][0][0] )
+		{
+			FilteringEdgeLumaIntraH( pFilter, pLuma, kiStrideY, NULL );
+			FilteringEdgeChromaIntraH( pFilter, pCb, pCr, kiStrideUV, NULL );
+		}
+		else if( *(uint32_t *)nBS[1][0] != 0 )
+		{
+			FilteringEdgeLumaH( pFilter, pLuma, kiStrideY, nBS[1][0] );
+			FilteringEdgeChromaH( pFilter, pCb, pCr, kiStrideUV, nBS[1][0] );
+		}
+	}
+
+	// ---- inner edges of nBS[1], filtered with the macroblock's own QP ----
+	pFilter->iLumaQP   = kiLumaQp;
+	pFilter->iChromaQP = kiChromaQp;
+
+	for( iEdge = 1; iEdge < 4; iEdge++ )
+	{
+		if( 0 == *(uint32_t *)nBS[1][iEdge] )
+		{
+			continue;
+		}
+		FilteringEdgeLumaH( pFilter, &pLuma[(iEdge<<2)*kiStrideY], kiStrideY, nBS[1][iEdge] );
+		if( 2 == iEdge )
+		{
+			FilteringEdgeChromaH( pFilter, &pCb[(2<<1)*kiStrideUV], &pCr[(2<<1)*kiStrideUV], kiStrideUV, nBS[1][2] );
+		}
+	}
+}
+
+/*!
+ * \brief	filter all luma edges of the current macroblock with fixed strengths
+ *
+ * The left and top macroblock edges (when flagged in iBoundryFlag) go through the
+ * "Intra" luma filters with the average QP of the two macroblocks.  The three inner
+ * edges of each direction are filtered with the current macroblock's QP and tc values
+ * looked up for a boundary strength of 3; alpha, beta and tc are looked up once and
+ * shared by both directions.
+ *
+ * \param	pCurDqLayer		layer providing macroblock position, luma QPs and width
+ * \param	pFilter			deblocking context; iLumaQP is overwritten
+ * \param	iBoundryFlag	LEFT_FLAG_MASK / TOP_FLAG_MASK availability bits
+ */
+void_t /*__FASTCALL*/ FilteringEdgeLumaHV( PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag )
+{
+	const int32_t kiMbIdx   = pCurDqLayer->iMbXyIndex;
+	const int32_t kiMbX     = pCurDqLayer->iMbX;
+	const int32_t kiMbY     = pCurDqLayer->iMbY;
+	const int32_t kiMbWidth = pCurDqLayer->iMbWidth;
+	const int32_t kiStride  = pFilter->iCsStride[0];
+
+	uint8_t* pLuma = pFilter->pCsData[0] + ((kiMbY * kiStride + kiMbX) << 4);
+	const int32_t kiCurQp = pCurDqLayer->pLumaQp[kiMbIdx];
+	int32_t iIdxA, iAlphaVal, iBetaVal;
+	int32_t iEdge;
+
+	FORCE_STACK_ALIGN_1D(int8_t,  iTcArr, 4, 16 );
+	FORCE_STACK_ALIGN_1D(uint8_t, uiBs,   4, 4  );
+
+	// every inner edge uses strength 3
+	uiBs[0] = uiBs[1] = uiBs[2] = uiBs[3] = 3;
+
+	// luma v: left macroblock edge with the averaged QP
+	if( iBoundryFlag & LEFT_FLAG_MASK )
+	{
+		pFilter->iLumaQP = ( kiCurQp + pCurDqLayer->pLumaQp[kiMbIdx-1] + 1 ) >> 1;
+		FilteringEdgeLumaIntraV( pFilter, pLuma, kiStride, NULL );
+	}
+
+	// inner edges: thresholds from the macroblock's own QP
+	pFilter->iLumaQP = kiCurQp;
+	GET_ALPHA_BETA_FROM_QP(pFilter->iLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdxA, iAlphaVal, iBetaVal);
+	if( iAlphaVal | iBetaVal )
+	{
+		TC0_TBL_LOOKUP(iTcArr, iIdxA, uiBs, 0);
+		for( iEdge = 1; iEdge < 4; iEdge++ )
+		{
+			pFilter->pLoopf->pfLumaDeblockingLT4Hor( &pLuma[iEdge << 2], kiStride, iAlphaVal, iBetaVal, iTcArr );
+		}
+	}
+
+	// luma h: top macroblock edge with the averaged QP
+	if( iBoundryFlag & TOP_FLAG_MASK )
+	{
+		pFilter->iLumaQP = ( kiCurQp + pCurDqLayer->pLumaQp[kiMbIdx-kiMbWidth] + 1 ) >> 1;
+		FilteringEdgeLumaIntraH( pFilter, pLuma, kiStride, NULL );
+	}
+
+	// inner edges of the other direction reuse alpha, beta and tc from above
+	pFilter->iLumaQP = kiCurQp;
+	if( iAlphaVal | iBetaVal )
+	{
+		for( iEdge = 1; iEdge < 4; iEdge++ )
+		{
+			pFilter->pLoopf->pfLumaDeblockingLT4Ver( &pLuma[(iEdge<<2)*kiStride], kiStride, iAlphaVal, iBetaVal, iTcArr );
+		}
+	}
+}
+/*!
+ * \brief	filter all chroma edges (Cb and Cr) of the current macroblock with fixed strengths
+ *
+ * The left and top macroblock edges (when flagged in iBoundryFlag) go through the
+ * "Intra" chroma filters with the average QP of the two macroblocks.  The single inner
+ * edge of each direction is filtered with the current macroblock's QP and tc values
+ * looked up for a boundary strength of 3 with the chroma increment of 1; alpha, beta and
+ * tc are looked up once and shared by both directions.
+ *
+ * \param	pCurDqLayer		layer providing macroblock position, chroma QPs and width
+ * \param	pFilter			deblocking context; iChromaQP is overwritten
+ * \param	iBoundryFlag	LEFT_FLAG_MASK / TOP_FLAG_MASK availability bits
+ */
+void_t /*__FASTCALL*/ FilteringEdgeChromaHV( PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag )
+{
+	const int32_t kiMbIdx   = pCurDqLayer->iMbXyIndex;
+	const int32_t kiMbX     = pCurDqLayer->iMbX;
+	const int32_t kiMbY     = pCurDqLayer->iMbY;
+	const int32_t kiMbWidth = pCurDqLayer->iMbWidth;
+	const int32_t kiStride  = pFilter->iCsStride[1];
+
+	uint8_t* pCb = pFilter->pCsData[1] + ((kiMbY * kiStride + kiMbX) << 3);
+	uint8_t* pCr = pFilter->pCsData[2] + ((kiMbY * kiStride + kiMbX) << 3);
+	const int32_t kiCurQp = pCurDqLayer->pChromaQp[kiMbIdx];
+	int32_t iIdxA, iAlphaVal, iBetaVal;
+
+	FORCE_STACK_ALIGN_1D( int8_t,  iTcArr, 4, 16 );
+	FORCE_STACK_ALIGN_1D( uint8_t, uiBs,   4, 4  );
+
+	// the inner edge uses strength 3
+	uiBs[0] = uiBs[1] = uiBs[2] = uiBs[3] = 3;
+
+	// chroma v: left macroblock edge with the averaged QP
+	if( iBoundryFlag & LEFT_FLAG_MASK )
+	{
+		pFilter->iChromaQP = ( kiCurQp + pCurDqLayer->pChromaQp[kiMbIdx-1] + 1 ) >> 1;
+		FilteringEdgeChromaIntraV( pFilter, pCb, pCr, kiStride, NULL );
+	}
+
+	// inner edge: thresholds from the macroblock's own QP
+	pFilter->iChromaQP = kiCurQp;
+	GET_ALPHA_BETA_FROM_QP(pFilter->iChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdxA, iAlphaVal, iBetaVal);
+	if( iAlphaVal | iBetaVal )
+	{
+		TC0_TBL_LOOKUP(iTcArr, iIdxA, uiBs, 1);
+		pFilter->pLoopf->pfChromaDeblockingLT4Hor( &pCb[2 << 1], &pCr[2 << 1], kiStride, iAlphaVal, iBetaVal, iTcArr );
+	}
+
+	// chroma h: top macroblock edge with the averaged QP
+	if( iBoundryFlag & TOP_FLAG_MASK )
+	{
+		pFilter->iChromaQP = ( kiCurQp + pCurDqLayer->pChromaQp[kiMbIdx-kiMbWidth] + 1 ) >> 1;
+		FilteringEdgeChromaIntraH( pFilter, pCb, pCr, kiStride, NULL );
+	}
+
+	// inner edge of the other direction reuses alpha, beta and tc from above
+	pFilter->iChromaQP = kiCurQp;
+	if( iAlphaVal | iBetaVal )
+	{
+		pFilter->pLoopf->pfChromaDeblockingLT4Ver( &pCb[(2<<1)*kiStride], &pCr[(2<<1)*kiStride], kiStride, iAlphaVal, iBetaVal, iTcArr );
+	}
+}
+
+/*!
+ * \brief	deblock the current macroblock with the fixed-strength luma and chroma paths
+ *
+ * Each of the two helpers handles both edge directions in one call, so the threshold
+ * table lookups are done once per plane type instead of once per direction.
+ *
+ * \param	pCurDqLayer		layer describing the current macroblock
+ * \param	pFilter			deblocking context
+ * \param	iBoundryFlag	LEFT_FLAG_MASK / TOP_FLAG_MASK availability bits
+ */
+void_t DeblockingIntraMb( PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag )
+{
+	FilteringEdgeLumaHV( pCurDqLayer, pFilter, iBoundryFlag );		// luma plane first
+	FilteringEdgeChromaHV( pCurDqLayer, pFilter, iBoundryFlag );	// then both chroma planes
+}
+
+/*!
+ * \brief	deblock the current macroblock
+ *
+ * Macroblocks of type INTRA4x4, INTRA16x16 and INTRA_PCM are handed to DeblockingIntraMb().
+ * For every other type the boundary strengths are derived first and the macroblock is
+ * then filtered by DeblockingInterMb():
+ *   - macroblock boundary edges: 4 for an intra neighbour, otherwise the result of
+ *     DeblockingBsMarginalMBAvcbase(); 0 when the neighbour is flagged unavailable
+ *   - inner edges: all 0 for MB_TYPE_SKIP, non-zero-table based for MB_TYPE_16x16,
+ *     non-zero-table plus motion based for everything else
+ *
+ * \param	pCurDqLayer		layer describing the current macroblock
+ * \param	pFilter			deblocking context
+ * \param	iBoundryFlag	LEFT_FLAG_MASK / TOP_FLAG_MASK availability bits
+ */
+void_t WelsDeblockingMb( PDqLayer pCurDqLayer, PDeblockingFilter  pFilter, int32_t iBoundryFlag )
+{
+	// zero-initialised: every strength that is not computed below stays 0
+	uint8_t nBS[2][4][4] = { 0 };
+
+	const int32_t kiMbIdx  = pCurDqLayer->iMbXyIndex;
+	const int32_t kiMbType = pCurDqLayer->pMbType[kiMbIdx];
+
+	if( MB_TYPE_INTRA4x4 == kiMbType || MB_TYPE_INTRA16x16 == kiMbType || MB_TYPE_INTRA_PCM == kiMbType )
+	{
+		DeblockingIntraMb( pCurDqLayer, pFilter, iBoundryFlag );
+		return;
+	}
+
+	// left macroblock edge
+	if( iBoundryFlag & LEFT_FLAG_MASK )
+	{
+		const int32_t kiLeftIdx = kiMbIdx - 1;
+
+		if( IS_INTRA(pCurDqLayer->pMbType[kiLeftIdx]) )
+		{
+			*(uint32_t*)nBS[0][0] = 0x04040404;
+		}
+		else
+		{
+			*(uint32_t*)nBS[0][0] = DeblockingBsMarginalMBAvcbase( pCurDqLayer, 0, kiLeftIdx, kiMbIdx );
+		}
+	}
+
+	// top macroblock edge
+	if( iBoundryFlag & TOP_FLAG_MASK )
+	{
+		const int32_t kiTopIdx = kiMbIdx - pCurDqLayer->iMbWidth;
+
+		if( IS_INTRA(pCurDqLayer->pMbType[kiTopIdx]) )
+		{
+			*(uint32_t*)nBS[1][0] = 0x04040404;
+		}
+		else
+		{
+			*(uint32_t*)nBS[1][0] = DeblockingBsMarginalMBAvcbase( pCurDqLayer, 1, kiTopIdx, kiMbIdx );
+		}
+	}
+
+	// inner edges; a skipped macroblock keeps the zero strengths from the initialiser
+	if( MB_TYPE_SKIP != kiMbType )
+	{
+		if( MB_TYPE_16x16 == kiMbType )
+		{
+			DeblockingBSInsideMBAvsbase( pCurDqLayer->pNzc[kiMbIdx], nBS, 1 );
+		}
+		else
+		{
+			DeblockingBSInsideMBNormal( pCurDqLayer, nBS, pCurDqLayer->pNzc[kiMbIdx], kiMbIdx );
+		}
+	}
+
+	DeblockingInterMb( pCurDqLayer, pFilter, nBS, iBoundryFlag );
+}
+
+//  C code only
+/*!
+ * \brief	plain C luma edge filter driven by per-segment tc values
+ *
+ * Walks 16 lines across the edge.  Line i uses pTc[i >> 2]; a negative tc value leaves
+ * the line untouched.  A line is filtered only if |p0 - q0| < iAlpha, |p1 - p0| < iBeta
+ * and |q1 - q0| < iBeta; p1 / q1 are additionally modified when |p2 - p0| / |q2 - q0|
+ * is below iBeta, and each such case widens the clipping range of p0 / q0 by one.
+ *
+ * \param	pPix		first q0 sample of the edge; p samples are at negative multiples of iStrideX
+ * \param	iStrideX	distance between neighbouring samples across the edge
+ * \param	iStrideY	distance between consecutive lines along the edge
+ * \param	iAlpha		alpha threshold
+ * \param	iBeta		beta threshold
+ * \param	pTc			four tc values, one per group of four lines
+ */
+void_t DeblockLumaLt4_c( uint8_t *pPix, int32_t iStrideX,int32_t iStrideY, int32_t iAlpha, int32_t iBeta, int8_t *pTc )
+{
+	for( int32_t i = 0; i < 16; i++, pPix += iStrideY )
+	{
+		const int32_t iTc0 = pTc[i>>2];
+		if( iTc0 < 0 )
+		{
+			continue;	// this group of lines is not to be filtered
+		}
+
+		const int32_t p0 = pPix[-iStrideX];
+		const int32_t p1 = pPix[-2*iStrideX];
+		const int32_t p2 = pPix[-3*iStrideX];
+		const int32_t q0 = pPix[0];
+		const int32_t q1 = pPix[iStrideX];
+		const int32_t q2 = pPix[2*iStrideX];
+		const bool_t bDetaP0Q0 = WELS_ABS( p0 - q0 ) < iAlpha;
+		const bool_t bDetaP1P0 = WELS_ABS( p1 - p0 ) < iBeta;
+		const bool_t bDetaQ1Q0 = WELS_ABS( q1 - q0 ) < iBeta;
+
+		if( !(bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) )
+		{
+			continue;	// real edge in the picture content: keep it
+		}
+
+		const bool_t bDetaP2P0 = WELS_ABS( p2 - p0 ) < iBeta;
+		const bool_t bDetaQ2Q0 = WELS_ABS( q2 - q0 ) < iBeta;
+		int32_t iTc = iTc0;
+
+		if( bDetaP2P0 )
+		{
+			pPix[-2*iStrideX] = p1 + WELS_CLIP3( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -iTc0, iTc0 );    /* p1' */
+			iTc++;
+		}
+		if( bDetaQ2Q0 )
+		{
+			pPix[iStrideX] = q1 + WELS_CLIP3( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -iTc0, iTc0 );    /* q1' */
+			iTc++;
+		}
+
+		// (q0 - p0) may be negative, and left-shifting a negative value is undefined
+		// behaviour, so scale by multiplication instead of "<< 2"
+		const int32_t iDeta = WELS_CLIP3( ( ((q0 - p0) * 4) + (p1 - q1) + 4 ) >> 3, -iTc, iTc );
+		pPix[-iStrideX] = WELS_CLIP1( p0 + iDeta );    /* p0' */
+		pPix[0]  = WELS_CLIP1( q0 - iDeta );    /* q0' */
+	}
+}
+/*!
+ * \brief	luma edge filter without tc clipping, 16 lines along the edge
+ *
+ * \param	pPix		sample q0 of the first line; p samples sit at negative iStrideX multiples
+ * \param	iStrideX	distance between neighbouring samples across the edge
+ * \param	iStrideY	distance between successive lines along the edge
+ * \param	iAlpha		threshold for |p0 - q0|
+ * \param	iBeta		threshold for |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
+ *
+ * \return	NONE
+ */
+void_t DeblockLumaEq4_c( uint8_t *pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta )
+{
+	uint8_t *pLine = pPix;
+	for( int32_t iLine = 0; iLine < 16; iLine++, pLine += iStrideY )
+	{
+		const int32_t p0 = pLine[-iStrideX];
+		const int32_t p1 = pLine[-2*iStrideX];
+		const int32_t p2 = pLine[-3*iStrideX];
+		const int32_t q0 = pLine[0];
+		const int32_t q1 = pLine[iStrideX];
+		const int32_t q2 = pLine[2*iStrideX];
+		const int32_t kiGapP0Q0 = WELS_ABS( p0 - q0 );
+		if( kiGapP0Q0 >= iAlpha || WELS_ABS( p1 - p0 ) >= iBeta || WELS_ABS( q1 - q0 ) >= iBeta )
+			continue;	// thresholds not met, line stays untouched
+
+		// the 3-sample filters are only considered when |p0 - q0| is below (iAlpha >> 2) + 2
+		const bool_t kbSmallGap = kiGapP0Q0 < (( iAlpha >> 2 ) + 2 );
+		// p side
+		if( kbSmallGap && WELS_ABS( p2 - p0 ) < iBeta )
+		{
+			const int32_t p3 = pLine[-4*iStrideX];
+			pLine[-iStrideX] = ( p2 + (p1 << 1) + (p0 << 1) + (q0 << 1) + q1 + 4 ) >> 3;	 //p0
+			pLine[-2*iStrideX] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;	 //p1
+			pLine[-3*iStrideX] = ( (p3 << 1) + p2 + (p2 << 1) + p1 + p0 + q0 + 4 ) >> 3;//p2
+		}
+		else
+		{
+			pLine[-iStrideX] = ( (p1 << 1) + p0 + q1 + 2 ) >> 2;	//p0
+		}
+		// q side
+		if( kbSmallGap && WELS_ABS( q2 - q0 ) < iBeta )
+		{
+			const int32_t q3 = pLine[3*iStrideX];
+			pLine[0] = ( p1 + (p0 << 1) + (q0 << 1) + (q1 << 1) + q2 + 4 ) >> 3; //q0
+			pLine[iStrideX] = ( p0 + q0 + q1 + q2 + 2 ) >> 2; //q1
+			pLine[2*iStrideX] = ( (q3 << 1) + q2 + (q2 << 1) + q1 + q0 + p0 + 4 ) >> 3;//q2
+		}
+		else
+		{
+			pLine[0] = ( (q1 << 1) + q0 + p1 + 2 ) >> 2; //q0
+		}
+	}
+}
+void_t DeblockLumaLt4V_c( uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *tc )
+{	// p/q samples are iStride apart, the 16 filtered lines are 1 byte apart
+	DeblockLumaLt4_c( pPix, iStride, 1, iAlpha, iBeta, tc );
+}
+void_t DeblockLumaLt4H_c( uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *tc )
+{	// p/q samples are adjacent bytes, the 16 filtered lines are iStride apart
+	DeblockLumaLt4_c( pPix, 1, iStride, iAlpha, iBeta, tc );
+}
+void_t DeblockLumaEq4V_c( uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta )
+{	// p/q samples are iStride apart, the 16 filtered lines are 1 byte apart
+	DeblockLumaEq4_c( pPix, iStride, 1, iAlpha, iBeta);
+}
+void_t DeblockLumaEq4H_c( uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta )
+{	// p/q samples are adjacent bytes, the 16 filtered lines are iStride apart
+	DeblockLumaEq4_c( pPix, 1, iStride, iAlpha, iBeta );
+}
+/*!
+ * \brief	chroma edge filter using pTc clipping, 8 lines along the edge in both chroma planes
+ *
+ * \param	pPixCb		Cb sample q0 of the first line; p samples sit at negative iStrideX multiples
+ * \param	pPixCr		Cr sample q0 of the first line
+ * \param	iStrideX	distance between neighbouring samples across the edge
+ * \param	iStrideY	distance between successive lines along the edge
+ * \param	iAlpha		threshold for |p0 - q0|
+ * \param	iBeta		threshold for |p1 - p0| and |q1 - q0|
+ * \param	pTc			4 clipping values, one per 2 lines; a value <= 0 skips those lines
+ *
+ * \note	only p0 and q0 are rewritten
+ *
+ * \return	NONE
+ */
+void_t DeblockChromaLt4_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta, int8_t *pTc )
+{
+	for( int32_t iLine = 0; iLine < 8; iLine++, pPixCb += iStrideY, pPixCr += iStrideY )
+	{
+		// one clipping value is shared by two consecutive lines
+		const int32_t kiTc0 = pTc[iLine>>1];
+		if( kiTc0 <= 0 )
+			continue;
+
+		// Cb is filtered before Cr on every line
+		uint8_t *pPlane[2] = { pPixCb, pPixCr };
+		for( int32_t iPlane = 0; iPlane < 2; iPlane++ )
+		{
+			uint8_t *pLine = pPlane[iPlane];
+			const int32_t p0 = pLine[-iStrideX];
+			const int32_t p1 = pLine[-2*iStrideX];
+			const int32_t q0 = pLine[0];
+			const int32_t q1 = pLine[iStrideX];
+
+			// all three thresholds must hold, otherwise the samples stay untouched
+			if( WELS_ABS( p0 - q0 ) < iAlpha
+				&& WELS_ABS( p1 - p0 ) < iBeta
+				&& WELS_ABS( q1 - q0 ) < iBeta )
+			{
+				const int32_t kiDelta = WELS_CLIP3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -kiTc0, kiTc0 );
+				pLine[-iStrideX] = WELS_CLIP1( p0 + kiDelta );    /* p0' */
+				pLine[0]  = WELS_CLIP1( q0 - kiDelta );    /* q0' */
+			}
+		}
+	}
+}
+/*!
+ * \brief	chroma edge filter without tc clipping, 8 lines along the edge in both chroma planes
+ *
+ * \param	pPixCb		Cb sample q0 of the first line; p samples sit at negative iStrideX multiples
+ * \param	pPixCr		Cr sample q0 of the first line
+ * \param	iStrideX	distance between neighbouring samples across the edge
+ * \param	iStrideY	distance between successive lines along the edge
+ * \param	iAlpha		threshold for |p0 - q0|
+ * \param	iBeta		threshold for |p1 - p0| and |q1 - q0|
+ *
+ * \note	only p0 and q0 are rewritten
+ * \return	NONE
+ */
+void_t DeblockChromaEq4_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta )
+{
+	for( int32_t iLine = 0; iLine < 8; iLine++, pPixCb += iStrideY, pPixCr += iStrideY )
+	{
+		// Cb is filtered before Cr on every line
+		uint8_t *pPlane[2] = { pPixCb, pPixCr };
+		for( int32_t iPlane = 0; iPlane < 2; iPlane++ )
+		{
+			uint8_t *pLine = pPlane[iPlane];
+			const int32_t p0 = pLine[-iStrideX];
+			const int32_t p1 = pLine[-2*iStrideX];
+			const int32_t q0 = pLine[0];
+			const int32_t q1 = pLine[iStrideX];
+
+			// all three thresholds must hold, otherwise the samples stay untouched
+			if( WELS_ABS( p0 - q0 ) < iAlpha
+				&& WELS_ABS( p1 - p0 ) < iBeta
+				&& WELS_ABS( q1 - q0 ) < iBeta )
+			{
+				pLine[-iStrideX] = ( (p1 << 1) + p0 + q1 + 2 ) >> 2;   /* p0' */
+				pLine[0]  = ( (q1 << 1) + q0 + p1 + 2 ) >> 2;   /* q0' */
+			}
+		}
+	}
+}
+void_t DeblockChromaLt4V_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *tc )
+{	// p/q samples are iStride apart, the 8 filtered lines are 1 byte apart
+	DeblockChromaLt4_c( pPixCb, pPixCr, iStride, 1, iAlpha, iBeta, tc );
+}
+void_t DeblockChromaLt4H_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *tc )
+{	// p/q samples are adjacent bytes, the 8 filtered lines are iStride apart
+	DeblockChromaLt4_c( pPixCb, pPixCr, 1, iStride, iAlpha, iBeta, tc );
+}
+void_t DeblockChromaEq4V_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta )
+{	// p/q samples are iStride apart, the 8 filtered lines are 1 byte apart
+	DeblockChromaEq4_c( pPixCb, pPixCr, iStride, 1, iAlpha, iBeta );
+}
+void_t DeblockChromaEq4H_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta )
+{	// p/q samples are adjacent bytes, the 8 filtered lines are iStride apart
+	DeblockChromaEq4_c( pPixCb, pPixCr, 1, iStride, iAlpha, iBeta );
+}
+
+#ifdef X86_ASM
+extern "C" {
+void DeblockLumaLt4H_sse2(uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc)
+{
+    FORCE_STACK_ALIGN_1D(uint8_t,  uiBuf,   16*8, 16);
+    // NOTE(review): presumably uiBuf receives the 8 columns around the edge as 8 rows of 16 bytes - confirm against the asm
+    DeblockLumaTransposeH2V_sse2(pPixY - 4, iStride, &uiBuf[0]);
+	DeblockLumaLt4V_sse2(&uiBuf[4*16], 16, iAlpha, iBeta, pTc);	// filter inside uiBuf, starting at byte 4*16 with stride 16
+	DeblockLumaTransposeV2H_sse2(pPixY - 4, iStride, &uiBuf[0]);	// same picture position and buffer as the first call
+}
+
+void DeblockLumaEq4H_sse2(uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta)
+{
+	FORCE_STACK_ALIGN_1D(uint8_t,  uiBuf,   16*8, 16);
+    // NOTE(review): presumably uiBuf receives the 8 columns around the edge as 8 rows of 16 bytes - confirm against the asm
+    DeblockLumaTransposeH2V_sse2(pPixY - 4, iStride, &uiBuf[0]);
+	DeblockLumaEq4V_sse2(&uiBuf[4*16], 16, iAlpha, iBeta);	// filter inside uiBuf, starting at byte 4*16 with stride 16
+	DeblockLumaTransposeV2H_sse2(pPixY - 4, iStride, &uiBuf[0]);	// same picture position and buffer as the first call
+}
+
+}
+
+#endif
+/*!
+ * \brief	AVC slice deblocking filtering target layer
+ *
+ * \param	pCtx		Wels avc decoder context
+ * \param	pDeblockMb	routine called once per macroblock of the current slice
+ *
+ * \note	nothing is filtered unless uiDisableDeblockingFilterIdc of the slice is 0 or 2
+ * \return	NONE
+ */
+void_t WelsDeblockingFilterSlice( PWelsDecoderContext pCtx, PDeblockingFilterMbFunc pDeblockMb )
+{
+	PDqLayer pLayer = pCtx->pCurDqLayer;
+	PSliceHeaderExt pShExt = &pLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt;
+	PFmo pFmo = pCtx->pFmo;
+
+	const int32_t kiMbWidth   = pLayer->iMbWidth;
+	const int32_t kiMbInFrame = pShExt->sSliceHeader.pSps->uiTotalMbCount;
+	const int32_t kiMbInSlice = pLayer->sLayerInfo.sSliceInLayer.iTotalMbInCurSlice;
+	const int32_t kiFilterIdc = pShExt->sSliceHeader.uiDisableDeblockingFilterIdc;
+
+	SDeblockingFilter sFilter = {0};
+	int32_t iMbXy;
+	int32_t iMbDone;
+
+	/* Step1: parameters set */
+	sFilter.pCsData[0] = pCtx->pDec->pData[0];
+	sFilter.pCsData[1] = pCtx->pDec->pData[1];
+	sFilter.pCsData[2] = pCtx->pDec->pData[2];
+
+	sFilter.iCsStride[0] = pCtx->pDec->iLinesize[0];
+	sFilter.iCsStride[1] = pCtx->pDec->iLinesize[1];
+
+	sFilter.eSliceType = (ESliceType) pLayer->sLayerInfo.sSliceInLayer.eSliceType;
+
+	sFilter.iSliceAlphaC0Offset = pShExt->sSliceHeader.iSliceAlphaC0Offset;
+	sFilter.iSliceBetaOffset    = pShExt->sSliceHeader.iSliceBetaOffset;
+
+	sFilter.pLoopf = &pCtx->sDeblockingFunc;
+
+	/* Step2: macroblock deblocking */
+	if( 0 != kiFilterIdc && 2 != kiFilterIdc )
+	{
+		return;	// only filter idc 0 and 2 are processed here
+	}
+
+	iMbXy = pShExt->sSliceHeader.iFirstMbInSlice;
+	for( iMbDone = 1; ; ++iMbDone )
+	{
+		// publish the position of the macroblock about to be filtered
+		pLayer->iMbX  = iMbXy % kiMbWidth;
+		pLayer->iMbY  = iMbXy / kiMbWidth;
+		pLayer->iMbXyIndex = iMbXy;
+
+		const int32_t kiBoundryFlag = DeblockingAvailableNoInterlayer( pLayer, kiFilterIdc );
+		pDeblockMb( pLayer, &sFilter, kiBoundryFlag );
+
+		if( iMbDone >= kiMbInSlice )
+		{
+			break;	// every macroblock counted for this slice has been filtered
+		}
+
+		// step to the next macroblock of the slice
+		if( pShExt->sSliceHeader.pPps->uiNumSliceGroups > 1 )
+		{
+			iMbXy = FmoNextMb( pFmo, iMbXy );
+		}
+		else
+		{
+			++iMbXy;
+		}
+
+		if( -1 == iMbXy || iMbXy >= kiMbInFrame )
+		{
+			break;	// slice group boundary or end of a frame
+		}
+	}
+}
+/*!
+ * \brief	deblocking module initialize 
+ *
+ * \param	pFunc	function table that receives the luma/chroma deblocking routines
+ * \param	iCpu	cpu feature flags; WELS_CPU_SSE2 selects the sse2 routines when X86_ASM is defined
+ *
+ * \return	NONE
+ */
+ 
+void_t  DeblockingInit( SDeblockingFunc  *pFunc,  int32_t iCpu )
+{
+	pFunc->pfLumaDeblockingLT4Ver		= DeblockLumaLt4V_c;
+	pFunc->pfLumaDeblockingEQ4Ver		= DeblockLumaEq4V_c;
+	pFunc->pfLumaDeblockingLT4Hor		= DeblockLumaLt4H_c;
+	pFunc->pfLumaDeblockingEQ4Hor		= DeblockLumaEq4H_c;
+
+	pFunc->pfChromaDeblockingLT4Ver	    = DeblockChromaLt4V_c;
+	pFunc->pfChromaDeblockingEQ4Ver	    = DeblockChromaEq4V_c;
+	pFunc->pfChromaDeblockingLT4Hor	    = DeblockChromaLt4H_c;
+	pFunc->pfChromaDeblockinEQ4Hor	    = DeblockChromaEq4H_c;
+
+#ifdef X86_ASM
+	if( iCpu & WELS_CPU_SSE2 ){
+	    pFunc->pfLumaDeblockingLT4Ver	= DeblockLumaLt4V_sse2;
+	    pFunc->pfLumaDeblockingEQ4Ver	= DeblockLumaEq4V_sse2;
+		pFunc->pfLumaDeblockingLT4Hor   = DeblockLumaLt4H_sse2;
+		pFunc->pfLumaDeblockingEQ4Hor   = DeblockLumaEq4H_sse2;
+	    pFunc->pfChromaDeblockingLT4Ver	= DeblockChromaLt4V_sse2;
+	    pFunc->pfChromaDeblockingEQ4Ver	= DeblockChromaEq4V_sse2;
+	    pFunc->pfChromaDeblockingLT4Hor	= DeblockChromaLt4H_sse2;
+	    pFunc->pfChromaDeblockinEQ4Hor	= DeblockChromaEq4H_sse2;
+	}
+#endif
+
+}
+
+} // namespace WelsDec
\ No newline at end of file
--- /dev/null
+++ b/codec/decoder/core/src/decode_mb_aux.cpp
@@ -1,0 +1,134 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <string.h>
+
+#include "decode_mb_aux.h"
+#include "wels_common_basis.h"
+
+namespace WelsDec {
+
+#define MAX_NEG_CROP 1024
+uint8_t g_ClipTable[256 + 2 * MAX_NEG_CROP];	//the front 1024 is 0, the back 1024 is 255, the middle 256 elements is 0-255
+
+
+/*!
+ * \brief	init g_ClipTable, the table used to clip the final dct data
+ *
+ * Layout after the call: the first MAX_NEG_CROP entries are 0, the next 256
+ * entries hold 0..255 and the last MAX_NEG_CROP entries are 255.
+ *
+ * \return	NONE
+ */
+void_t InitDctClipTable(void_t)
+{
+	uint8_t *pLow	= &g_ClipTable[0];
+	uint8_t *pMid	= pLow + MAX_NEG_CROP;
+	uint8_t *pHigh	= pMid + 256;
+	int32_t iVal;
+
+	for( iVal = 0; iVal < 256; iVal++ )
+	{
+		pMid[iVal] = iVal;
+	}
+	memset( pLow, 0, MAX_NEG_CROP * sizeof(uint8_t) );
+	memset( pHigh, 0xFF, MAX_NEG_CROP * sizeof(uint8_t) );
+}
+
+//NOTE::: pRs should NOT be modified, otherwise it will lead to mismatch with JSVM,
+//        so a local array stores the temporary values of the idct.
+/*!
+ * \brief	4x4 inverse transform of pRs, result added to the prediction in place
+ * \param	pPred		4x4 prediction block, overwritten with the clipped sum
+ * \param	kiStride	distance between two rows of pPred
+ * \param	pRs			16 residual coefficients, 4 per row; only read
+ */
+void_t IdctResAddPred_c(uint8_t *pPred, const int32_t kiStride, int16_t *pRs)
+{
+	int16_t iTmp[16];	// output of the row pass; stored as int16_t exactly as before
+	const uint8_t *kpClip = &g_ClipTable[MAX_NEG_CROP];	// indexed with (residual + prediction)
+	for(int32_t iRow=0; iRow<4; iRow++)	// row pass
+	{
+		const int16_t *pIn = pRs + (iRow<<2);
+		int16_t *pOut = iTmp + (iRow<<2);
+		const int32_t kiEven0 = pIn[0] + pIn[2];
+		const int32_t kiEven1 = pIn[0] - pIn[2];
+		const int32_t kiOdd0 = pIn[1] + (pIn[3]>>1);
+		const int32_t kiOdd1 = (pIn[1]>>1) - pIn[3];
+		pOut[0] = kiEven0 + kiOdd0;
+		pOut[1] = kiEven1 + kiOdd1;
+		pOut[2] = kiEven1 - kiOdd1;
+		pOut[3] = kiEven0 - kiOdd0;
+	}
+
+	for(int32_t iCol=0; iCol<4; iCol++)	// column pass, rounding, add prediction, clip
+	{
+		const int32_t kiEven0 = iTmp[iCol] + iTmp[iCol+8];
+		const int32_t kiEven1 = iTmp[iCol] - iTmp[iCol+8];
+		const int32_t kiOdd0 = iTmp[iCol+4] + (iTmp[iCol+12]>>1);
+		const int32_t kiOdd1 = (iTmp[iCol+4]>>1) - iTmp[iCol+12];
+		uint8_t *pRow0 = pPred + iCol;
+		uint8_t *pRow1 = pRow0 + kiStride;
+		uint8_t *pRow2 = pRow1 + kiStride;
+		uint8_t *pRow3 = pRow2 + kiStride;
+		*pRow0 = kpClip[ ((32 + kiEven0 + kiOdd0) >> 6) + *pRow0 ];
+		*pRow1 = kpClip[ ((32 + kiEven1 + kiOdd1) >> 6) + *pRow1 ];
+		*pRow2 = kpClip[ ((32 + kiEven1 - kiOdd1) >> 6) + *pRow2 ];
+		*pRow3 = kpClip[ ((32 + kiEven0 - kiOdd0) >> 6) + *pRow3 ];
+	}
+}
+
+/*!
+ * \brief	fill the table of 4x4 block offsets inside a macroblock
+ * \param	pBlockOffset	output; [0..15] use kiYStride, [16..19] and [20..23] both hold the kiUVStride offsets
+ * \param	kiYStride		stride applied to the 16 luma entries
+ * \param	kiUVStride		stride applied to the chroma entries
+ */
+void_t GetI4LumaIChromaAddrTable(int32_t *pBlockOffset, const int32_t kiYStride, const int32_t kiUVStride)
+{
+	const uint8_t kuiOrigin = g_kuiScan8[0];
+	for( int32_t iBlk = 0; iBlk < 16; iBlk++ )
+	{
+		const uint32_t kuiPos = g_kuiScan8[iBlk] - kuiOrigin;	// position relative to block 0
+		const uint32_t kuiCol = kuiPos & 0x07;
+		const uint32_t kuiRow = kuiPos >> 3;
+
+		pBlockOffset[iBlk] = (kuiCol + kiYStride * kuiRow) << 2;
+		if( iBlk < 4 )	// the first four positions also give the chroma offsets
+		{
+			pBlockOffset[20+iBlk] = (kuiCol + kiUVStride * kuiRow) << 2;
+			pBlockOffset[16+iBlk] = pBlockOffset[20+iBlk];
+		}
+	}
+}
+
+} // namespace WelsDec
\ No newline at end of file
--- /dev/null
+++ b/codec/decoder/core/src/decode_slice.cpp
@@ -1,0 +1,1361 @@
+/*!
+ * \copy
+ *     Copyright (c)  2008-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ *  Abstract
+ *      current slice decoding
+ *
+ *  History
+ *      07/10/2008 Created
+ *      08/09/2013 Modified
+ *
+ *****************************************************************************/
+#include <memory.h>
+
+#include "typedefs.h"
+#include "dec_golomb.h"
+
+#include "fmo.h"
+#include "deblocking.h"
+#include "utils.h"
+
+#include "decode_slice.h"
+
+#include "error_code.h"
+#include "decode_mb_aux.h"
+#include "parse_mb_syn_cavlc.h"
+#include "rec_mb.h"
+#include "mv_pred.h"
+
+#include "as264_common.h"
+#include "cpu_core.h"
+#include "expand_pic.h"
+
+namespace WelsDec {
+
+/*!
+ * \brief	reconstruct every macroblock of the current slice, then deblock the slice
+ *
+ * \param	pCtx	decoder context; pCtx->pCurDqLayer describes the slice to reconstruct
+ *
+ * \return	0 on success (deblocking is skipped for slice types other than I/P and when
+ *			uiDisableDeblockingFilterIdc is 1), -1 when the layer width does not match the
+ *			sequence, a macroblock fails to reconstruct, or more macroblocks than the
+ *			layer holds have been reconstructed
+ */
+int32_t WelsTargetSliceConstruction( PWelsDecoderContext pCtx )
+{
+	PDqLayer pCurLayer = pCtx->pCurDqLayer;
+	PSlice pCurSlice = &pCurLayer->sLayerInfo.sSliceInLayer;
+	PSliceHeader pSliceHeader = &pCurSlice->sSliceHeaderExt.sSliceHeader;
+	PFmo pFmo = pCtx->pFmo;
+
+	const int32_t kiMbInLayer   = pSliceHeader->pSps->uiTotalMbCount;
+	const int32_t kiMbInSlice   = pCurSlice->iTotalMbInCurSlice;
+	const int32_t kiLayerWidth  = pCurLayer->iMbWidth << 4;
+	const int32_t kiLayerHeight = pCurLayer->iMbHeight << 4;
+
+	int32_t iMbXy   = 0;
+	int32_t iMbDone = 0;
+
+	if ( !pCtx->bAvcBasedFlag && kiLayerWidth != pCtx->iCurSeqIntervalMaxPicWidth )
+	{
+		return -1;
+	}
+
+	iMbXy = pSliceHeader->iFirstMbInSlice;
+
+	// a slice starting at macroblock 0 stamps the parameter set ids and quality id on the picture
+	if ( 0 == iMbXy )
+	{
+		pCurLayer->pDec->iSpsId = pSliceHeader->iSpsId;
+		pCurLayer->pDec->iPpsId = pSliceHeader->iPpsId;
+
+		pCurLayer->pDec->uiQualityId = pCurLayer->sLayerInfo.sNalHeaderExt.uiQualityId;
+	}
+
+	for ( ;; )
+	{
+		// publish the position of the macroblock to reconstruct
+		pCurLayer->iMbX  = iMbXy % pCurLayer->iMbWidth;
+		pCurLayer->iMbY  = iMbXy / pCurLayer->iMbWidth;
+		pCurLayer->iMbXyIndex = iMbXy;
+
+		if ( WelsTargetMbConstruction( pCtx ) )
+		{
+			WelsLog( pCtx, WELS_LOG_WARNING, "WelsTargetSliceConstruction():::MB(%d, %d) construction error. pCurSlice_type:%d\n",
+				pCurLayer->iMbX, pCurLayer->iMbY, pCurSlice->eSliceType );
+
+			return -1;
+		}
+
+		++iMbDone;
+		++pCurLayer->pDec->iTotalNumMbRec;
+		if ( iMbDone >= kiMbInSlice )
+		{
+			break;	// every macroblock counted for this slice is reconstructed
+		}
+		if ( pCurLayer->pDec->iTotalNumMbRec > kiMbInLayer )
+		{
+			WelsLog( pCtx, WELS_LOG_WARNING, "WelsTargetSliceConstruction():::fdec->iTotalNumMbRec:%d, iTotalMbTargetLayer:%d\n",
+				pCurLayer->pDec->iTotalNumMbRec, kiMbInLayer );
+
+			return -1;
+		}
+
+		// step to the next macroblock of the slice
+		if ( pSliceHeader->pPps->uiNumSliceGroups > 1 )
+		{
+			iMbXy = FmoNextMb( pFmo, iMbXy );
+		}
+		else
+		{
+			++iMbXy;
+		}
+		if ( -1 == iMbXy || iMbXy >= kiMbInLayer )
+		{
+			break;	// slice group boundary or end of a frame
+		}
+	}
+
+	pCtx->pDec->iWidthInPixel  = kiLayerWidth;
+	pCtx->pDec->iHeightInPixel = kiLayerHeight;
+
+	// only I and P slices are deblocked
+	if ( I_SLICE != pCurSlice->eSliceType && P_SLICE != pCurSlice->eSliceType )
+	{
+		return 0;
+	}
+
+	// filter idc 1 disables deblocking; any other filter_idc not supported here, 7/22/2010
+	if ( 1 != pSliceHeader->uiDisableDeblockingFilterIdc )
+	{
+		WelsDeblockingFilterSlice( pCtx, WelsDeblockingMb );
+	}
+
+	return 0;
+}
+
+/*!
+ * \brief	add the inverse transformed residual of one inter macroblock to its prediction
+ * \param	pCtx		decoder context, supplies pIdctResAddPredFunc
+ * \param	pCurLayer	layer holding pScaledTCoeff and pNzc of the current macroblock
+ * \param	pDstY, pDstU, pDstV		prediction of the macroblock in each plane, updated in place
+ * \param	iStrideL, iStrideC		luma and chroma stride
+ *
+ * \return	always 0
+ */
+int32_t WelsMbInterSampleConstruction( PWelsDecoderContext pCtx, PDqLayer pCurLayer, 
+											  uint8_t* pDstY, uint8_t* pDstU, uint8_t* pDstV, int32_t iStrideL, int32_t iStrideC )
+{
+	const int32_t kiMbXy = pCurLayer->iMbXyIndex;
+	int16_t *pCoeff = pCurLayer->pScaledTCoeff[kiMbXy];
+	int32_t iBlk, iIdx;
+
+	WelsChromaDcIdct( pCoeff + 256 );	// 256 = 16*16
+	WelsChromaDcIdct( pCoeff + 320 );	// 320 = 16*16 + 16*4
+
+	for( iBlk = 0; iBlk < 16; iBlk++ ) //luma
+	{
+		iIdx = g_kuiMbNonZeroCountIdx[iBlk];
+		if( pCurLayer->pNzc[kiMbXy][iIdx] )
+			pCtx->pIdctResAddPredFunc( pDstY + ((iIdx>>2)<<2) * iStrideL + ((iIdx%4)<<2), iStrideL, pCoeff + (iBlk<<4) );
+	}
+	for( iBlk = 16; iBlk < 20; iBlk++ ) //chroma: also reconstructed when only the first coefficient of the block is set
+	{
+		int16_t *pCoeffCb = pCoeff + (iBlk<<4);
+		int16_t *pCoeffCr = pCoeff + ((iBlk+4)<<4);
+		iIdx = g_kuiMbNonZeroCountIdx[iBlk]; //Cb
+		if( pCurLayer->pNzc[kiMbXy][iIdx] || *pCoeffCb )
+			pCtx->pIdctResAddPredFunc( pDstU + (((iIdx-16)>>2)<<2) * iStrideC + (((iIdx-16)%4)<<2), iStrideC, pCoeffCb );
+		iIdx = g_kuiMbNonZeroCountIdx[iBlk+4]; //Cr
+		if( pCurLayer->pNzc[kiMbXy][iIdx] || *pCoeffCr )
+			pCtx->pIdctResAddPredFunc( pDstV + (((iIdx-18)>>2)<<2) * iStrideC + (((iIdx-18)%4)<<2), iStrideC, pCoeffCr );
+	}
+	return 0;
+}
+// inter macroblock with residual: prediction from GetInterPred, then the residual is added in place
+int32_t WelsMbInterConstruction(PWelsDecoderContext pCtx, PDqLayer pCurLayer)
+{
+	const int32_t kiStrideL = pCtx->pDec->iLinesize[0];
+	const int32_t kiStrideC = pCtx->pDec->iLinesize[1];
+	// origin of macroblock (iMbX, iMbY): 16 samples per step in luma, 8 in chroma
+	const int32_t kiOffsetL = (pCurLayer->iMbY * kiStrideL + pCurLayer->iMbX)<<4;
+	const int32_t kiOffsetC = (pCurLayer->iMbY * kiStrideC + pCurLayer->iMbX)<<3;
+	uint8_t *pDstY  = pCurLayer->pDec->pData[0] + kiOffsetL;
+	uint8_t *pDstCb = pCurLayer->pDec->pData[1] + kiOffsetC;
+	uint8_t *pDstCr = pCurLayer->pDec->pData[2] + kiOffsetC;
+
+	GetInterPred(pDstY, pDstCb, pDstCr, pCtx);
+	WelsMbInterSampleConstruction( pCtx, pCurLayer, pDstY, pDstCb, pDstCr, kiStrideL, kiStrideC );
+
+	// set all none-zero nzc to 1; dbk can be opti!
+	pCtx->sBlockFunc.pWelsSetNonZeroCountFunc(NULL, pCurLayer->pNzc[pCurLayer->iMbXyIndex]);
+	return 0;
+}
+
+/*!
+ * \brief	4x4 butterfly transform of 16 coefficients of pBlock followed by scaling, in place
+ *
+ * \param	pBlock	coefficients; only pBlock[0], pBlock[16], ... pBlock[240] are read and rewritten
+ * \param	iQp		selects the multiplier g_kuiDequantCoeff[iQp][0]
+ */
+void_t WelsLumaDcDequantIdct(int16_t *pBlock, int32_t iQp){
+    // positions inside pBlock of the 16 values, all multiples of 16
+    static const int32_t kiColOffset[4] = {0, 16, 64, 80};
+    static const int32_t kiRowOffset[4] = {0, 32, 128, 160};
+    const int32_t kiQMul = g_kuiDequantCoeff[iQp][0];
+    int32_t iMid[16];
+    int32_t i;
+
+    for(i=0; i<4; i++){	// first pass, results kept in iMid
+        const int16_t *pIn = pBlock + kiRowOffset[i];
+        int32_t *pOut = iMid + (i<<2);
+        const int32_t kiSum0 = pIn[kiColOffset[0]] + pIn[kiColOffset[2]];
+        const int32_t kiDif0 = pIn[kiColOffset[0]] - pIn[kiColOffset[2]];
+        const int32_t kiDif1 = pIn[kiColOffset[1]] - pIn[kiColOffset[3]];
+        const int32_t kiSum1 = pIn[kiColOffset[1]] + pIn[kiColOffset[3]];
+
+        pOut[0] = kiSum0 + kiSum1;
+        pOut[1] = kiDif0 + kiDif1;
+        pOut[2] = kiDif0 - kiDif1;
+        pOut[3] = kiSum0 - kiSum1;
+    }
+
+    for(i=0; i<4; i++){	// second pass, scaled by kiQMul and rounded
+        int16_t *pOut = pBlock + kiColOffset[i];
+        const int32_t kiSum0 = iMid[i] + iMid[8+i];
+        const int32_t kiDif0 = iMid[i] - iMid[8+i];
+        const int32_t kiDif1 = iMid[4+i] - iMid[12+i];
+        const int32_t kiSum1 = iMid[4+i] + iMid[12+i];
+
+        pOut[kiRowOffset[0]] = ((kiSum0 + kiSum1)*kiQMul + 2)>>2;
+        pOut[kiRowOffset[1]] = ((kiDif0 + kiDif1)*kiQMul + 2)>>2;
+        pOut[kiRowOffset[2]] = ((kiDif0 - kiDif1)*kiQMul + 2)>>2;
+        pOut[kiRowOffset[3]] = ((kiSum0 - kiSum1)*kiQMul + 2)>>2;
+    }
+}
+
+/*!
+ * \brief	reconstruct an intra 16x16 or intra 4x4 macroblock from a private copy of its coefficients
+ * \param	pCtx		decoder context
+ * \param	pCurLayer	current layer; iMbXyIndex selects the macroblock
+ * \param	bOutput		forwarded to WelsFillRecNeededMbInfo
+ * \return	always 0
+ * \note	seems IPCM should not enter this path
+ */
+int32_t WelsMbIntraPredictionConstruction(PWelsDecoderContext pCtx, PDqLayer pCurLayer, bool_t bOutput)
+{
+	const int32_t kiMbXy = pCurLayer->iMbXyIndex;
+	FORCE_STACK_ALIGN_1D( int16_t, pTempScaledTCoeff, MB_COEFF_LIST_SIZE, 16 );
+
+	memcpy(pTempScaledTCoeff, pCurLayer->pScaledTCoeff[kiMbXy], 384*sizeof(pCurLayer->pScaledTCoeff[kiMbXy][0]));
+
+	WelsFillRecNeededMbInfo(pCtx, bOutput, pCurLayer);
+
+	if(IS_INTRA16x16(pCurLayer->pMbType[kiMbXy]))
+	{
+		// really need? re-copies the positions listed in g_kuiLumaDcZigzagScan from the layer
+		for(int32_t i=0; i<16; i++)
+			pTempScaledTCoeff[g_kuiLumaDcZigzagScan[i]] = pCurLayer->pScaledTCoeff[kiMbXy][g_kuiLumaDcZigzagScan[i]];
+		WelsLumaDcDequantIdct(pTempScaledTCoeff, pCurLayer->pLumaQp[kiMbXy]);
+		RecI16x16Mb(kiMbXy, pCtx,pTempScaledTCoeff,pCurLayer);
+	}
+	else if(IS_INTRA4x4(pCurLayer->pMbType[kiMbXy]))
+	{
+		RecI4x4Mb(kiMbXy, pCtx,pTempScaledTCoeff,pCurLayer);
+	}
+	return 0;
+}
+
+// inter macroblock without residual: only the prediction produced by GetInterPred is written
+int32_t WelsMbInterPrediction(PWelsDecoderContext pCtx, PDqLayer pCurLayer)
+{
+	const int32_t kiStrideL = pCtx->pDec->iLinesize[0];
+	const int32_t kiStrideC = pCtx->pDec->iLinesize[1];
+	// origin of macroblock (iMbX, iMbY): 16 samples per step in luma, 8 in chroma
+	const int32_t kiOffsetL = (pCurLayer->iMbY * kiStrideL + pCurLayer->iMbX)<<4;
+	const int32_t kiOffsetC = (pCurLayer->iMbY * kiStrideC + pCurLayer->iMbX)<<3;
+	uint8_t *pDstY  = pCurLayer->pDec->pData[0] + kiOffsetL;
+	uint8_t *pDstCb = pCurLayer->pDec->pData[1] + kiOffsetC;
+	uint8_t *pDstCr = pCurLayer->pDec->pData[2] + kiOffsetC;
+
+	GetInterPred(pDstY, pDstCb, pDstCr, pCtx);
+
+	// always reports success
+	return 0;
+}
+
+// copy iHeight rows of iWidth bytes from pSrc to pDst, each buffer with its own stride
+void_t WelsMbCopy( uint8_t *pDst, int32_t iStrideDst, uint8_t *pSrc, int32_t iStrideSrc, 
+				 int32_t iHeight, int32_t iWidth )
+{
+	for ( int32_t iRow = 0; iRow < iHeight; iRow++ )
+	{
+		uint8_t *pRowDst = pDst + iRow * iStrideDst;
+		uint8_t *pRowSrc = pSrc + iRow * iStrideSrc;
+
+		memcpy( pRowDst, pRowSrc, iWidth );
+	}
+}
+
+
+/*!
+ * \brief	reconstruct the current macroblock according to its type
+ * \param	pCtx	decoder context; the macroblock is pCtx->pCurDqLayer->iMbXyIndex
+ * \return	0 on success, -1 when the macroblock type is neither I_PCM, intra nor inter
+ */
+int32_t WelsTargetMbConstruction(PWelsDecoderContext pCtx)
+{
+	PDqLayer pCurLayer = pCtx->pCurDqLayer;
+	const int32_t kiMbXy = pCurLayer->iMbXyIndex;
+
+	if ( MB_TYPE_INTRA_PCM == pCurLayer->pMbType[kiMbXy] )
+	{
+		//copy cs into fdec
+		const int32_t kiMbX = pCurLayer->iMbX;
+		const int32_t kiMbY = pCurLayer->iMbY;
+		const int32_t kiCsStrideL  = pCurLayer->iCsStride[0];
+		const int32_t kiCsStrideC  = pCurLayer->iCsStride[1];
+		const int32_t kiDecStrideL = pCurLayer->pDec->iLinesize[0];
+		const int32_t kiDecStrideC = pCurLayer->pDec->iLinesize[1];
+		// macroblock origin inside each buffer: 16 samples per step in luma, 8 in chroma
+		const int32_t kiCsOffsetL  = ( kiMbX + kiMbY * kiCsStrideL ) << 4;
+		const int32_t kiCsOffsetC  = ( kiMbX + kiMbY * kiCsStrideC ) << 3;
+		const int32_t kiDecOffsetL = ( kiMbX + kiMbY * kiDecStrideL ) << 4;
+		const int32_t kiDecOffsetC = ( kiMbX + kiMbY * kiDecStrideC ) << 3;
+
+		WelsMbCopy( pCurLayer->pDec->pData[0] + kiDecOffsetL, kiDecStrideL, pCurLayer->pCsData[0] + kiCsOffsetL, kiCsStrideL, 16, 16 );
+		WelsMbCopy( pCurLayer->pDec->pData[1] + kiDecOffsetC, kiDecStrideC, pCurLayer->pCsData[1] + kiCsOffsetC, kiCsStrideC, 8, 8 );
+		WelsMbCopy( pCurLayer->pDec->pData[2] + kiDecOffsetC, kiDecStrideC, pCurLayer->pCsData[2] + kiCsOffsetC, kiCsStrideC, 8, 8 );
+
+		return 0;
+	}
+
+	if ( IS_INTRA( pCurLayer->pMbType[kiMbXy] ) )
+	{
+		WelsMbIntraPredictionConstruction( pCtx, pCurLayer, 1 );
+		return 0;
+	}
+
+	if ( !IS_INTER( pCurLayer->pMbType[kiMbXy] ) )
+	{
+		WelsLog( pCtx, WELS_LOG_WARNING, "WelsTargetMbConstruction():::::Unknown MB type: %d\n", pCurLayer->pMbType[kiMbXy] );
+		return -1;
+	}
+
+	//InterMB
+	if ( 0 == pCurLayer->pCbp[kiMbXy] ) //uiCbp==0 include SKIP
+	{
+		WelsMbInterPrediction( pCtx, pCurLayer );
+	}
+	else
+	{
+		WelsMbInterConstruction( pCtx, pCurLayer );
+	}
+	return 0;
+}
+
+/*!
+ * \brief	2x2 butterfly transform of four coefficients located 16 entries apart, in place
+ *
+ * \param	pBlock	coefficients; only pBlock[0], pBlock[16], pBlock[32] and pBlock[48] are used
+ *
+ * \return	NONE
+ */
+void_t WelsChromaDcIdct( int16_t *pBlock )
+{
+	// the four inputs, one at the start of each group of 16 coefficients
+	const int32_t kiDc0 = pBlock[0];
+	const int32_t kiDc1 = pBlock[16];
+	const int32_t kiDc2 = pBlock[32];
+	const int32_t kiDc3 = pBlock[48];
+
+	const int32_t kiSum01 = kiDc0 + kiDc1, kiDif01 = kiDc0 - kiDc1;
+	const int32_t kiSum23 = kiDc2 + kiDc3, kiDif23 = kiDc2 - kiDc3;
+
+	pBlock[0]  = (kiSum01 + kiSum23) >> 1;
+	pBlock[16] = (kiDif01 + kiDif23) >> 1;
+	pBlock[32] = (kiSum01 - kiSum23) >> 1;
+	pBlock[48] = (kiDif01 - kiDif23) >> 1;
+}
+
+/*!
+ * \brief	decode the macroblocks of the current slice (CAVLC only)
+ * \param	pCtx				decoder context
+ * \param	bFirstSliceInLayer	not used by this function
+ * \param	pNalCur				current NAL unit, forwarded to the macroblock decoding routine
+ * \return	ERR_NONE (or 0 for a skipped slice) on success; -1 for CABAC streams or when more
+ *			bits were consumed than pBs->iBits; dsNoParamSets when iFirstMbInSlice is outside
+ *			the picture; otherwise the error returned by the macroblock decoding routine
+ */
+int32_t WelsDecodeSlice(PWelsDecoderContext pCtx, bool_t bFirstSliceInLayer, PNalUnit pNalCur)
+{
+	PDqLayer pCurLayer = pCtx->pCurDqLayer;
+	PFmo pFmo = pCtx->pFmo;
+	int32_t i, iRet;
+	int32_t iNextMbXyIndex, iSliceIdc;
+
+	PSlice pSlice = &pCurLayer->sLayerInfo.sSliceInLayer;
+	PSliceHeaderExt pSliceHeaderExt = &pSlice->sSliceHeaderExt;
+	PSliceHeader pSliceHeader = &pSliceHeaderExt->sSliceHeader;
+	const int32_t kiCountNumMb = pSliceHeader->pSps->uiTotalMbCount; //need to be correct when fmo or multi slice
+	PBitStringAux pBs = pCurLayer->pBitStringAux;
+	int32_t iUsedBits  = 0;
+	PWelsDecMbCavlcFunc pDecMbCavlcFunc;
+
+	pSlice->iTotalMbInCurSlice = 0; //initialize at the starting of slice decoding.
+
+	if ( P_SLICE == pSliceHeader->eSliceType )
+	{
+		pDecMbCavlcFunc = WelsDecodeMbCavlcPSlice;
+	}
+	else //I_SLICE
+	{
+		pDecMbCavlcFunc = WelsDecodeMbCavlcISlice;
+	}
+
+	if ( pSliceHeader->pPps->bConstainedIntraPredFlag )
+	{
+		pCtx->pFillInfoCacheIntra4x4Func = WelsFillCacheConstrain1Intra4x4;
+		pCtx->pParseIntra4x4ModeFunc      = ParseIntra4x4ModeConstrain1;
+		pCtx->pParseIntra16x16ModeFunc    = ParseIntra16x16ModeConstrain1;
+	}
+	else
+	{
+		pCtx->pFillInfoCacheIntra4x4Func = WelsFillCacheConstrain0Intra4x4;
+		pCtx->pParseIntra4x4ModeFunc      = ParseIntra4x4ModeConstrain0;
+		pCtx->pParseIntra16x16ModeFunc    = ParseIntra16x16ModeConstrain0;
+	}
+
+	pCtx->eSliceType = pSliceHeader->eSliceType;
+
+	if (pCurLayer->sLayerInfo.pPps->bEntropyCodingModeFlag == 1)
+	{
+		//CABAC encoding is unsupported yet!
+		return -1;
+	}
+
+	iNextMbXyIndex = pSliceHeader->iFirstMbInSlice;
+	if ( iNextMbXyIndex >= kiCountNumMb )
+	{
+		WelsLog(pCtx, WELS_LOG_ERROR, "WelsDecodeSlice()::iFirstMbInSlice(%d) > pSps->kiTotalMb(%d). ERROR!!! resolution change....\n", 
+			iNextMbXyIndex, kiCountNumMb);
+		pCtx->iErrorCode |= dsNoParamSets;
+		return dsNoParamSets;
+	}
+
+	pSlice->iMbSkipRun = -1;
+	iSliceIdc = (pSliceHeader->iFirstMbInSlice<<7)+pCurLayer->uiLayerDqId;
+
+	pCurLayer->iMbX = iNextMbXyIndex % pCurLayer->iMbWidth;
+	pCurLayer->iMbY = iNextMbXyIndex / pCurLayer->iMbWidth; // error is introduced by multiple slices case, 11/23/2009
+	pCurLayer->iMbXyIndex = iNextMbXyIndex;
+
+	if(pSliceHeaderExt->bSliceSkipFlag == 1)
+	{
+		for(i=0; i<(int32_t)pSliceHeaderExt->uiNumMbsInSlice; i++)
+		{
+			pCurLayer->pSliceIdc[iNextMbXyIndex] = iSliceIdc;
+			pCurLayer->pResidualPredFlag[iNextMbXyIndex] = 1;
+
+			if ( pSliceHeaderExt->sSliceHeader.pPps->uiNumSliceGroups > 1 )
+			{
+				iNextMbXyIndex = FmoNextMb( pFmo, iNextMbXyIndex );
+			}
+			else
+			{
+				++iNextMbXyIndex;
+			}
+			// stop before the per-macroblock arrays get indexed with -1 or past the picture
+			if ( (-1 == iNextMbXyIndex) || (iNextMbXyIndex >= kiCountNumMb) )
+			{
+				break;
+			}
+
+			pCurLayer->iMbX = iNextMbXyIndex % pCurLayer->iMbWidth;
+			pCurLayer->iMbY = iNextMbXyIndex / pCurLayer->iMbWidth;	// row = index / width, as in the other loops
+			pCurLayer->iMbXyIndex = iNextMbXyIndex;
+		}
+		return 0;
+	}
+
+	do{
+		pCurLayer->pSliceIdc[iNextMbXyIndex] = iSliceIdc;
+		iRet = pDecMbCavlcFunc( pCtx,  pNalCur );
+		if (iRet != ERR_NONE){
+			return iRet;
+		}
+		++pSlice->iTotalMbInCurSlice;
+
+		if ( pSliceHeader->pPps->uiNumSliceGroups > 1 )
+		{
+			iNextMbXyIndex = FmoNextMb( pFmo, iNextMbXyIndex );
+		}
+		else
+		{
+			++iNextMbXyIndex;
+		}
+		if ( (-1 == iNextMbXyIndex) || (iNextMbXyIndex >= kiCountNumMb) )	// slice group boundary or end of a frame
+		{
+			break;
+		}
+
+		// check whether there is left bits to read next time in case multiple slices
+		iUsedBits = ((pBs->pCurBuf-pBs->pStartBuf)<<3) - (16-pBs->iLeftBits);
+		if ( iUsedBits == pBs->iBits && 0 >= pCurLayer->sLayerInfo.sSliceInLayer.iMbSkipRun )	// slice boundary
+		{
+			break;
+		}
+		if ( iUsedBits > pBs->iBits )//When BS incomplete, as long as find it, SHOULD stop decoding to avoid mosaic or crash.
+		{
+			WelsLog( pCtx, WELS_LOG_WARNING, "WelsDecodeSlice()::::pBs incomplete, iUsedBits:%d > pBs->iBits:%d, MUST stop decoding.\n", 
+				iUsedBits, pBs->iBits );
+			return -1;
+		}
+		pCurLayer->iMbX = iNextMbXyIndex % pCurLayer->iMbWidth;
+		pCurLayer->iMbY = iNextMbXyIndex / pCurLayer->iMbWidth;
+		pCurLayer->iMbXyIndex = iNextMbXyIndex;
+	}while(1);
+
+	return ERR_NONE;
+}
+
+/*
+ *	Parse one intra macroblock (I slice, CAVLC) of the current DQ layer.
+ *
+ *	mb_type is read from the layer's bit-string and selects one of three paths:
+ *	  - 25      : I_PCM, 384 raw bytes are copied from the bit-stream into pCsData[0..2];
+ *	  - 0       : intra 4x4, prediction modes and the coded block pattern are parsed;
+ *	  - 1..24   : intra 16x16, prediction mode and coded block pattern are derived from mb_type.
+ *	For the two non-PCM paths the coefficient buffer and the per-MB non-zero counts are
+ *	cleared, and when residual data is present the QP delta and the luma/chroma blocks
+ *	are parsed with WelsResidualBlockCavlc().
+ *
+ *	Returns 0 on success; ERR_INFO_INVALID_MB_TYPE, ERR_INFO_INVALID_CBP,
+ *	ERR_INFO_INVALID_QP or -1 when parsing fails.
+ */
+int32_t WelsActualDecodeMbCavlcISlice(PWelsDecoderContext pCtx)
+{	
+	SVlcTable* pVlcTable     = &pCtx->sVlcTable;
+	PDqLayer pCurLayer		 = pCtx->pCurDqLayer;
+	PBitStringAux pBs		 = pCurLayer->pBitStringAux;
+	PSlice pSlice			 = &pCurLayer->sLayerInfo.sSliceInLayer;
+	PSliceHeader pSliceHeader		     = &pSlice->sSliceHeaderExt.sSliceHeader;
+
+	SNeighAvail sNeighAvail;
+
+	int32_t iScanIdxStart = pSlice->sSliceHeaderExt.uiScanIdxStart;
+	int32_t iScanIdxEnd   = pSlice->sSliceHeaderExt.uiScanIdxEnd;	
+
+	int32_t iMbX = pCurLayer->iMbX;
+	int32_t iMbY = pCurLayer->iMbY;
+	int32_t iMbXy = pCurLayer->iMbXyIndex;
+	int32_t iNMbMode, i;
+	uint32_t uiMbType = 0, uiCbp = 0, uiCbpL = 0, uiCbpC = 0;
+
+	// 48-entry non-zero-count cache handed to the cache-fill helpers and the residual parser
+	FORCE_STACK_ALIGN_1D( uint8_t, pNonZeroCount, 48, 16 );
+	
+	pCurLayer->pInterPredictionDoneFlag[iMbXy] = 0;
+	pCurLayer->pResidualPredFlag[iMbXy] = pSlice->sSliceHeaderExt.bDefaultResidualPredFlag;
+	
+	// only mb_type values 0..25 are accepted here
+	uiMbType = BsGetUe(pBs);
+	if ( uiMbType > 25 )
+	{
+		return ERR_INFO_INVALID_MB_TYPE;
+	}	
+
+	if ( 25 == uiMbType )	// I_PCM: raw samples are stored in the bit-stream
+	{
+		int32_t iDecStrideL = pCurLayer->pDec->iLinesize[0]; 
+		int32_t iDecStrideC = pCurLayer->pDec->iLinesize[1]; 
+		
+		int32_t iOffsetL = ( iMbX + iMbY * iDecStrideL ) << 4;
+		int32_t iOffsetC = ( iMbX + iMbY * iDecStrideC ) << 3;
+		
+		uint8_t* pDecY = pCurLayer->pCsData[0] + iOffsetL;
+		uint8_t* pDecU = pCurLayer->pCsData[1] + iOffsetC;
+		uint8_t* pDecV = pCurLayer->pCsData[2] + iOffsetC;
+		
+		uint8_t *pTmpBsBuf;
+		
+		int32_t i;
+		int32_t iCopySizeY  = ( sizeof( uint8_t ) << 4 );	// 16 bytes per luma row
+		int32_t iCopySizeUV = ( sizeof( uint8_t ) << 3 );	// 8 bytes per chroma row
+
+		// number of bytes pCurBuf is moved back by, derived from the reader's iLeftBits
+		// NOTE(review): presumably undoes the bit reader's read-ahead -- confirm against the BsGet* implementation
+		int32_t iIndex = ((-pBs->iLeftBits)>>3) + 2;
+		
+		pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA_PCM;
+		
+		//step 1: locating bit-stream pointer [must align into integer byte]	
+		pBs->pCurBuf -= iIndex;
+		
+		//step 2: copy pixel from bit-stream into fdec [reconstruction]		
+		pTmpBsBuf = pBs->pCurBuf;
+		for ( i = 0; i < 16; i++ ) //luma
+		{
+			memcpy( pDecY , pTmpBsBuf, iCopySizeY );
+			pDecY += iDecStrideL;				
+			pTmpBsBuf += 16;
+		}
+		for ( i = 0; i < 8; i++ ) //cb
+		{				
+			memcpy( pDecU, pTmpBsBuf, iCopySizeUV );
+			pDecU += iDecStrideC;				
+			pTmpBsBuf += 8;
+		}
+		for ( i = 0; i < 8; i++ ) //cr
+		{				
+			memcpy( pDecV, pTmpBsBuf, iCopySizeUV );
+			pDecV += iDecStrideC;
+			pTmpBsBuf += 8;
+		}	
+
+		// advance past the 256 luma + 64 cb + 64 cr bytes copied above
+		pBs->pCurBuf += 384;
+		InitReadBits( pBs );
+		
+		//step 3: update QP and pNonZeroCount
+		pCurLayer->pLumaQp[iMbXy] = 0;
+		pCurLayer->pChromaQp[iMbXy] = 0;
+		// every non-zero-count entry of an I_PCM macroblock is stored as 16
+		memset( pCurLayer->pNzc[iMbXy], 16, sizeof( pCurLayer->pNzc[iMbXy] ) ); //JVT-x201wcm1.doc, page229, 2009.10.23		
+		return 0;				
+	}
+	else if (0 == uiMbType) //reference to JM
+	{
+		// intra 4x4: prediction modes are parsed, then the coded block pattern is read explicitly
+		FORCE_STACK_ALIGN_1D( int8_t, pIntraPredMode, 48, 16 );
+		pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA4x4;
+		pCtx->pFillInfoCacheIntra4x4Func( &sNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer );
+		if ( pCtx->pParseIntra4x4ModeFunc( &sNeighAvail, pIntraPredMode, pBs, pCurLayer ) )
+		{
+			return -1;
+		}
+
+		//uiCbp
+		uiCbp = BsGetUe(pBs);
+		//G.9.1 Alternative parsing process for coded pBlock pattern
+		if ( uiCbp > 47 ) 
+			return ERR_INFO_INVALID_CBP;
+
+		// map the code number to the actual pattern: low 4 bits luma, upper bits chroma
+		uiCbp = g_kuiIntra4x4CbpTable[uiCbp];
+
+		pCurLayer->pCbp[iMbXy] = uiCbp;
+		uiCbpC = uiCbp >> 4;
+		uiCbpL = uiCbp & 15;
+	}
+	else //I_PCM exclude, we can ignore it
+	{
+		// intra 16x16 (mb_type 1..24): prediction mode and coded block pattern come from mb_type itself
+		pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA16x16;
+		pCurLayer->pIntraPredMode[iMbXy][7] = (uiMbType-1) & 3;
+		pCurLayer->pCbp[iMbXy] = g_kuiI16CbpTable[(uiMbType-1)>>2];
+		uiCbpC = pCurLayer->pCbp[iMbXy] >> 4;
+		uiCbpL = pCurLayer->pCbp[iMbXy] & 15;
+		WelsFillCacheNonZeroCount( &sNeighAvail, pNonZeroCount, pCurLayer );
+		if ( pCtx->pParseIntra16x16ModeFunc( &sNeighAvail, pBs, pCurLayer ) )
+		{
+			return -1;
+		}
+	}
+	
+	iNMbMode = BASE_MB;					
+
+	// clear all 384 coefficients and the 24 per-MB non-zero counts before residual parsing
+	memset(pCurLayer->pScaledTCoeff[iMbXy], 0, 384*sizeof(pCurLayer->pScaledTCoeff[iMbXy][0]));
+	ST32(&pCurLayer->pNzc[iMbXy][0], 0);
+	ST32(&pCurLayer->pNzc[iMbXy][4], 0);
+	ST32(&pCurLayer->pNzc[iMbXy][8], 0);
+	ST32(&pCurLayer->pNzc[iMbXy][12], 0);
+	ST32(&pCurLayer->pNzc[iMbXy][16], 0);
+	ST32(&pCurLayer->pNzc[iMbXy][20], 0);
+
+	// intra 4x4 without residual carries no QP delta: reuse the QP of the previous macroblock
+	if( pCurLayer->pCbp[iMbXy] == 0 && IS_INTRA4x4(pCurLayer->pMbType[iMbXy]))
+	{
+		pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
+		pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3(pCurLayer->pLumaQp[iMbXy] + 
+											pSliceHeader->pPps->iChromaQpIndexOffset, 0, 51)];
+
+	}
+
+	// residual data (and a QP delta) follow when any cbp bit is set, and always for intra 16x16
+	if ( pCurLayer->pCbp[iMbXy] || MB_TYPE_INTRA16x16 == pCurLayer->pMbType[iMbXy] ) 
+	{
+		int32_t iQpDelta, iId8x8, iId4x4;		
+
+		iQpDelta = BsGetSe(pBs);
+
+        if (iQpDelta > 25 || iQpDelta < -26) //out of iQpDelta range
+		{
+			return ERR_INFO_INVALID_QP;
+		}
+
+		pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp + iQpDelta; //update iLastMbQp
+		//refer to JVT-X201wcm1.doc equation(7-35)
+		// wrap the sum back into range by adding or subtracting 52
+		if ( (unsigned)(pCurLayer->pLumaQp[iMbXy]) > 51 )
+		{
+			if ( pCurLayer->pLumaQp[iMbXy] < 0 )
+			{
+				pCurLayer->pLumaQp[iMbXy] += 52;
+			} 
+			else
+			{
+				pCurLayer->pLumaQp[iMbXy] -= 52;
+			}
+		}
+		//QP should be in the range of [0, 51]
+		if ( pCurLayer->pLumaQp[iMbXy] < 0 || pCurLayer->pLumaQp[iMbXy] > 51 )
+		{
+			return ERR_INFO_INVALID_QP;
+		}
+		pSlice->iLastMbQp = pCurLayer->pLumaQp[iMbXy];
+		pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3(pSlice->iLastMbQp + pSliceHeader->pPps->iChromaQpIndexOffset, 0, 51)];
+
+
+		BsStartCavlc( pBs );
+
+		if ( MB_TYPE_INTRA16x16 == pCurLayer->pMbType[iMbXy] ) 
+		{
+			//step1: Luma DC
+			if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, 0, 16,
+				g_kuiLumaDcZigzagScan, I16_LUMA_DC, pCurLayer->pScaledTCoeff[iMbXy], iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) )
+			{
+				return -1;//abnormal
+			}
+			//step2: Luma AC
+			if (uiCbpL)
+			{
+				// AC blocks skip scan position 0, hence WELS_MAX(iScanIdxStart, 1)
+				for (i = 0; i < 16; i++) 
+				{	
+                    if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, i,
+								iScanIdxEnd - WELS_MAX(iScanIdxStart, 1) + 1, g_kuiZigzagScan+ WELS_MAX(iScanIdxStart,1),
+								I16_LUMA_AC, pCurLayer->pScaledTCoeff[iMbXy] + (i<<4), iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) ) 
+					{
+							return -1;//abnormal
+					}					
+				}
+				// copy four rows of four luma counts from the 8-wide cache into the per-MB array
+				ST32(&pCurLayer->pNzc[iMbXy][0], LD32(&pNonZeroCount[1+8*1]));
+				ST32(&pCurLayer->pNzc[iMbXy][4], LD32(&pNonZeroCount[1+8*2]));
+				ST32(&pCurLayer->pNzc[iMbXy][8], LD32(&pNonZeroCount[1+8*3]));
+				ST32(&pCurLayer->pNzc[iMbXy][12], LD32(&pNonZeroCount[1+8*4]));
+			}
+			else //pNonZeroCount = 0
+			{
+				ST32(&pCurLayer->pNzc[iMbXy][0], 0);
+				ST32(&pCurLayer->pNzc[iMbXy][4], 0);	
+				ST32(&pCurLayer->pNzc[iMbXy][8], 0);
+				ST32(&pCurLayer->pNzc[iMbXy][12], 0);
+			}
+		}
+		else //non-MB_TYPE_INTRA16x16
+		{	
+			// one cbp bit per 8x8 luma block; each coded 8x8 holds four 4x4 blocks
+			for (iId8x8 = 0; iId8x8 < 4; iId8x8++) 
+			{
+				if (uiCbpL & (1 << iId8x8)) 
+				{
+					int32_t iIndex = (iId8x8 << 2);
+					for (iId4x4 = 0; iId4x4 < 4; iId4x4++) 
+					{
+						//Luma (DC and AC decoding together)
+						if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, iIndex,
+							iScanIdxEnd - iScanIdxStart + 1, g_kuiZigzagScan+iScanIdxStart, 
+							LUMA_DC_AC, pCurLayer->pScaledTCoeff[iMbXy] + (iIndex<<4), iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) )
+						{
+							return -1;//abnormal
+						}
+						iIndex++;
+					}
+				}
+				else
+				{
+					// uncoded 8x8: clear its four cache entries (two 16-bit stores)
+					ST16(&pNonZeroCount[g_kuiCacheNzcScanIdx[(iId8x8<<2)]], 0);
+					ST16(&pNonZeroCount[g_kuiCacheNzcScanIdx[(iId8x8<<2)+2]], 0);
+				}
+			}	
+			ST32(&pCurLayer->pNzc[iMbXy][0], LD32(&pNonZeroCount[1+8*1]));
+			ST32(&pCurLayer->pNzc[iMbXy][4], LD32(&pNonZeroCount[1+8*2]));
+			ST32(&pCurLayer->pNzc[iMbXy][8], LD32(&pNonZeroCount[1+8*3]));
+			ST32(&pCurLayer->pNzc[iMbXy][12], LD32(&pNonZeroCount[1+8*4]));
+		}
+
+		//chroma 
+		//step1: DC
+		if ( 1 == uiCbpC || 2 == uiCbpC )
+		{	
+			for (i = 0; i < 2; i++) //Cb Cr
+			{			
+				if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs,
+					16 + (i << 2), 4, g_kuiChromaDcScan, CHROMA_DC, pCurLayer->pScaledTCoeff[iMbXy] + 256 + (i<<6),
+					iNMbMode, pCurLayer->pChromaQp[iMbXy], pCtx) ) 
+				{
+					return -1;//abnormal
+				}
+			}
+		}
+
+		//step2: AC
+		if (2 == uiCbpC)
+		{
+			for (i = 0; i < 2; i++) //Cb Cr
+			{
+				int32_t iIndex = 16 + (i<<2);
+				for (iId4x4 = 0; iId4x4 < 4; iId4x4++) 
+				{
+					if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, iIndex, 
+						iScanIdxEnd - WELS_MAX(iScanIdxStart, 1) + 1, g_kuiZigzagScan + WELS_MAX(iScanIdxStart,1), 
+						CHROMA_AC, pCurLayer->pScaledTCoeff[iMbXy]+(iIndex<<4), iNMbMode, pCurLayer->pChromaQp[iMbXy], pCtx) )
+					{
+						return -1;//abnormal
+					}
+					iIndex++;
+				}
+			}
+			// copy the chroma counts from the cache into entries 16..23 of the per-MB array
+			ST16(&pCurLayer->pNzc[iMbXy][16], LD16(&pNonZeroCount[6+8*1]));
+			ST16(&pCurLayer->pNzc[iMbXy][20], LD16(&pNonZeroCount[6+8*2]));
+			ST16(&pCurLayer->pNzc[iMbXy][18], LD16(&pNonZeroCount[6+8*4]));
+			ST16(&pCurLayer->pNzc[iMbXy][22], LD16(&pNonZeroCount[6+8*5]));
+		}
+		else 
+		{
+			ST16(&pCurLayer->pNzc[iMbXy][16], 0);
+			ST16(&pCurLayer->pNzc[iMbXy][20], 0);
+			ST16(&pCurLayer->pNzc[iMbXy][18], 0);
+			ST16(&pCurLayer->pNzc[iMbXy][22], 0);
+		}
+		BsEndCavlc( pBs ); 
+	}
+	else
+	{
+		// no residual at all: every non-zero count of this macroblock is 0
+		ST32(&pCurLayer->pNzc[iMbXy][0], 0);
+		ST32(&pCurLayer->pNzc[iMbXy][4], 0);
+		ST32(&pCurLayer->pNzc[iMbXy][8], 0);
+		ST32(&pCurLayer->pNzc[iMbXy][12], 0);
+		ST32(&pCurLayer->pNzc[iMbXy][16], 0);
+		ST32(&pCurLayer->pNzc[iMbXy][20], 0);
+	}	
+
+	return 0;
+}
+
+/*
+ *	Decode one macroblock of an I slice (CAVLC).
+ *
+ *	The base-mode flag is read from the bit-stream when the slice header extension
+ *	enables adaptive signalling, otherwise the slice default is used. A set flag
+ *	(inter-layer prediction) is not supported and is reported as an error; otherwise
+ *	the result of WelsActualDecodeMbCavlcISlice() is returned (0 on success).
+ *	pNalCur is not used by this function.
+ */
+int32_t WelsDecodeMbCavlcISlice(PWelsDecoderContext pCtx, PNalUnit pNalCur)
+{
+	PDqLayer pLayer = pCtx->pCurDqLayer;
+	PBitStringAux pBitStr = pLayer->pBitStringAux;
+	PSliceHeaderExt pShExt = &pLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt;
+	int32_t iBaseModeFlag = pShExt->bDefaultBaseModeFlag;
+
+	// an explicitly coded flag overrides the slice-level default
+	if( 1 == pShExt->bAdaptiveBaseModeFlag )
+	{
+		iBaseModeFlag = BsGetOneBit(pBitStr);
+	}
+
+	if( iBaseModeFlag )
+	{
+		WelsLog( pCtx, WELS_LOG_WARNING, "iBaseModeFlag (%d) != 0, inter-layer prediction not supported.\n", iBaseModeFlag);
+		return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_ILP);
+	}
+
+	// any non-zero value is a parsing error and is passed straight to the caller
+	return WelsActualDecodeMbCavlcISlice( pCtx );
+}
+
+/*
+ *	Parse one non-skipped macroblock of a P slice (CAVLC) in the current DQ layer.
+ *
+ *	mb_type is read from the layer's bit-string:
+ *	  - 0..4  : inter macroblock; motion data is parsed by ParseInterInfo() and the
+ *	            residual prediction flag must resolve to 0 (1 is not supported, returns -1);
+ *	  - >= 5  : intra macroblock, (mb_type - 5) is interpreted like in an I slice:
+ *	            25 = I_PCM (raw samples copied from the bit-stream), 0 = intra 4x4,
+ *	            1..24 = intra 16x16.
+ *	For non-PCM macroblocks the coded block pattern, the QP delta and the luma/chroma
+ *	residual blocks are parsed afterwards with WelsResidualBlockCavlc().
+ *
+ *	Returns 0 on success; ERR_INFO_INVALID_MB_TYPE, ERR_INFO_INVALID_CBP,
+ *	ERR_INFO_INVALID_QP or -1 when parsing fails.
+ */
+int32_t WelsActualDecodeMbCavlcPSlice(PWelsDecoderContext pCtx)
+{
+	SVlcTable* pVlcTable     = &pCtx->sVlcTable;
+	PDqLayer pCurLayer		 = pCtx->pCurDqLayer;
+	PBitStringAux pBs		 = pCurLayer->pBitStringAux;
+	PSlice pSlice			 = &pCurLayer->sLayerInfo.sSliceInLayer;
+	PSliceHeader pSliceHeader		     = &pSlice->sSliceHeaderExt.sSliceHeader;
+
+	SNeighAvail sNeighAvail;
+
+	int32_t iScanIdxStart = pSlice->sSliceHeaderExt.uiScanIdxStart;
+	int32_t iScanIdxEnd   = pSlice->sSliceHeaderExt.uiScanIdxEnd;	
+
+	int32_t iMbX = pCurLayer->iMbX;
+	int32_t iMbY = pCurLayer->iMbY;
+	int32_t iMbXy = pCurLayer->iMbXyIndex;
+
+	int32_t iNMbMode, i;
+	uint32_t uiMbType = 0, uiCbp = 0, uiCbpL = 0, uiCbpC = 0;
+
+	// 48-entry non-zero-count cache handed to the cache-fill helpers and the residual parser
+	FORCE_STACK_ALIGN_1D( uint8_t, pNonZeroCount, 48, 16 );
+	pCurLayer->pInterPredictionDoneFlag[iMbXy] = 0;//2009.10.23
+	
+	uiMbType = BsGetUe(pBs);	
+	if (uiMbType < 5) //inter MB type
+	{
+		int16_t iMotionVector[LIST_A][30][MV_A];	
+
+		int8_t	iRefIndex[LIST_A][30];
+		pCurLayer->pMbType[iMbXy] = g_ksInterMbTypeInfo[uiMbType].iType;
+		WelsFillCacheInter( &sNeighAvail, pNonZeroCount, iMotionVector, iRefIndex, pCurLayer );
+		if ( ParseInterInfo(pCtx, iMotionVector, iRefIndex, pBs) )
+		{
+			return -1;//abnormal
+		}
+
+		// residual prediction flag: coded per macroblock when adaptive, otherwise the slice default
+		if( pSlice->sSliceHeaderExt.bAdaptiveResidualPredFlag ==1 )
+		{
+			pCurLayer->pResidualPredFlag[iMbXy] =  BsGetOneBit(pBs);
+		}
+		else
+		{
+			pCurLayer->pResidualPredFlag[iMbXy] = pSlice->sSliceHeaderExt.bDefaultResidualPredFlag;
+		}
+
+		if(pCurLayer->pResidualPredFlag[iMbXy] == 0)
+		{
+			iNMbMode = BASE_MB;
+			pCurLayer->pInterPredictionDoneFlag[iMbXy] = 0;
+		}
+		else 
+		{
+            WelsLog(pCtx, WELS_LOG_WARNING, "residual_pred_flag = 1 not supported.\n");
+            return -1;
+		}
+	}
+	else //intra MB type
+	{
+		// intra types in a P slice are offset by the 5 inter types
+		uiMbType -= 5;
+		if ( uiMbType > 25 )
+		{
+			return ERR_INFO_INVALID_MB_TYPE;
+		}
+		
+		if ( 25 == uiMbType )	// I_PCM: raw samples are stored in the bit-stream
+		{
+			int32_t iDecStrideL = pCurLayer->pDec->iLinesize[0]; 
+			int32_t iDecStrideC = pCurLayer->pDec->iLinesize[1]; 
+		
+			int32_t iOffsetL = ( iMbX + iMbY * iDecStrideL ) << 4;
+			int32_t iOffsetC = ( iMbX + iMbY * iDecStrideC ) << 3;
+		
+			uint8_t* pDecY = pCurLayer->pCsData[0] + iOffsetL;
+			uint8_t* pDecU = pCurLayer->pCsData[1] + iOffsetC;
+			uint8_t* pDecV = pCurLayer->pCsData[2] + iOffsetC;
+		
+			uint8_t *pTmpBsBuf;
+		
+			int32_t iCopySizeY  = ( sizeof( uint8_t ) << 4 );	// 16 bytes per luma row
+			int32_t iCopySizeUV = ( sizeof( uint8_t ) << 3 );	// 8 bytes per chroma row
+
+			// number of bytes pCurBuf is moved back by, derived from the reader's iLeftBits
+			int32_t iIndex = ((-pBs->iLeftBits)>>3) + 2;
+			
+			pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA_PCM;
+
+			//step 1: locating bit-stream pointer [must align into integer byte]
+			pBs->pCurBuf -= iIndex;
+		
+		    //step 2: copy pixel from bit-stream into fdec [reconstruction]			
+			pTmpBsBuf = pBs->pCurBuf;
+			for ( i = 0; i < 16; i++ ) //luma
+			{
+				memcpy( pDecY , pTmpBsBuf, iCopySizeY );
+				pDecY += iDecStrideL;				
+				pTmpBsBuf += 16;
+			}
+			
+			for ( i = 0; i < 8; i++ ) //cb
+			{				
+				memcpy( pDecU, pTmpBsBuf, iCopySizeUV );
+				pDecU += iDecStrideC;				
+				pTmpBsBuf += 8;
+			}
+			for ( i = 0; i < 8; i++ ) //cr
+			{				
+				memcpy( pDecV, pTmpBsBuf, iCopySizeUV );
+				pDecV += iDecStrideC;
+				pTmpBsBuf += 8;
+			}		
+
+			// advance past the 256 luma + 64 cb + 64 cr bytes copied above
+			pBs->pCurBuf += 384;
+			InitReadBits( pBs );
+		
+		    //step 3: update QP and pNonZeroCount
+			pCurLayer->pLumaQp[iMbXy] = 0;
+			pCurLayer->pChromaQp[iMbXy] = 0;		
+			// An I_PCM macroblock counts as 16 non-zero coefficients per block when its
+			// neighbours derive nC (H.264 9.2.1); fill the whole entry, luma and chroma,
+			// exactly as WelsActualDecodeMbCavlcISlice() does.
+			memset( pCurLayer->pNzc[iMbXy], 16, sizeof( pCurLayer->pNzc[iMbXy] ) );
+			return 0;
+		}
+		else
+		{
+			if (0 == uiMbType) 
+			{
+				// intra 4x4: prediction modes are parsed here, the cbp is read further below
+				FORCE_STACK_ALIGN_1D( int8_t, pIntraPredMode, 48, 16 );
+				pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA4x4;
+				pCtx->pFillInfoCacheIntra4x4Func( &sNeighAvail, pNonZeroCount, pIntraPredMode, pCurLayer );
+				if ( pCtx->pParseIntra4x4ModeFunc( &sNeighAvail, pIntraPredMode, pBs, pCurLayer ) )
+				{
+					return -1;
+				}
+				iNMbMode = BASE_MB;
+			}
+			else //I_PCM exclude, we can ignore it
+			{
+				// intra 16x16 (1..24): prediction mode and coded block pattern come from mb_type itself
+				pCurLayer->pMbType[iMbXy] = MB_TYPE_INTRA16x16;
+				pCurLayer->pIntraPredMode[iMbXy][7] = (uiMbType-1) & 3;
+				pCurLayer->pCbp[iMbXy] = g_kuiI16CbpTable[(uiMbType-1)>>2];
+				uiCbpC = pCurLayer->pCbp[iMbXy] >> 4;
+				uiCbpL = pCurLayer->pCbp[iMbXy] & 15;
+				WelsFillCacheNonZeroCount( &sNeighAvail, pNonZeroCount, pCurLayer );
+				if ( pCtx->pParseIntra16x16ModeFunc( &sNeighAvail, pBs, pCurLayer ) )
+				{
+					return -1;
+				}
+				iNMbMode = BASE_MB;
+			}
+		}
+	}	
+	
+	// everything except intra 16x16 carries an explicitly coded block pattern
+	if ( MB_TYPE_INTRA16x16 != pCurLayer->pMbType[iMbXy] ) 
+	{
+		uiCbp = BsGetUe(pBs);	
+		if ( uiCbp > 47 ) 
+			return ERR_INFO_INVALID_CBP;
+
+		// the code number is mapped through the intra or the inter table
+		if (MB_TYPE_INTRA4x4 == pCurLayer->pMbType[iMbXy]) 
+		{
+			uiCbp = g_kuiIntra4x4CbpTable[uiCbp];
+		}
+		else //inter
+		{
+			uiCbp = g_kuiInterCbpTable[uiCbp];
+		}
+
+		pCurLayer->pCbp[iMbXy] = uiCbp;
+		uiCbpC = pCurLayer->pCbp[iMbXy] >> 4;
+		uiCbpL = pCurLayer->pCbp[iMbXy] & 15;
+	}		
+
+	if(iNMbMode == BASE_MB)
+	{
+		// clear the 16x16 luma and both 8x8 chroma coefficient areas
+		pCtx->sBlockFunc.pWelsBlockZero16x16Func(pCurLayer->pScaledTCoeff[iMbXy], 16);
+		pCtx->sBlockFunc.pWelsBlockZero8x8Func(pCurLayer->pScaledTCoeff[iMbXy]+256, 8);
+		pCtx->sBlockFunc.pWelsBlockZero8x8Func(pCurLayer->pScaledTCoeff[iMbXy]+256+64, 8);
+
+		// reset all 24 non-zero counts so an early error return leaves no stale entry
+		ST32(&pCurLayer->pNzc[iMbXy][0], 0);
+		ST32(&pCurLayer->pNzc[iMbXy][4], 0);
+		ST32(&pCurLayer->pNzc[iMbXy][8], 0);
+		ST32(&pCurLayer->pNzc[iMbXy][12], 0);
+		ST32(&pCurLayer->pNzc[iMbXy][16], 0);
+		ST32(&pCurLayer->pNzc[iMbXy][20], 0);
+		// no residual and no QP delta: reuse the QP of the previous macroblock
+		if( pCurLayer->pCbp[iMbXy] == 0 && !IS_INTRA16x16(pCurLayer->pMbType[iMbXy]) && !IS_I_BL(pCurLayer->pMbType[iMbXy]))
+		{
+			pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp;
+			pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3(pCurLayer->pLumaQp[iMbXy] + pSliceHeader->pPps->iChromaQpIndexOffset, 0, 51)];
+		}
+	}	
+
+	// residual data (and a QP delta) follow when any cbp bit is set, and always for intra 16x16
+	if ( pCurLayer->pCbp[iMbXy] || MB_TYPE_INTRA16x16 == pCurLayer->pMbType[iMbXy] )
+	{
+		int32_t iQpDelta, iId8x8, iId4x4;	
+		
+		iQpDelta = BsGetSe(pBs);
+
+        if (iQpDelta > 25 || iQpDelta < -26) //out of iQpDelta range
+		{
+			return ERR_INFO_INVALID_QP;
+		}
+
+		pCurLayer->pLumaQp[iMbXy] = pSlice->iLastMbQp + iQpDelta; //update iLastMbQp
+		//refer to JVT-X201wcm1.doc equation(7-35)	
+		// wrap the sum back into range by adding or subtracting 52
+		if ( (unsigned)(pCurLayer->pLumaQp[iMbXy]) > 51 )
+		{
+			if ( pCurLayer->pLumaQp[iMbXy] < 0 )
+			{
+				pCurLayer->pLumaQp[iMbXy] += 52;
+			} 
+			else
+			{
+				pCurLayer->pLumaQp[iMbXy] -= 52;
+			}
+		}
+		//QP should be in the range of [0, 51]
+		if ( pCurLayer->pLumaQp[iMbXy] < 0 || pCurLayer->pLumaQp[iMbXy] > 51 )
+		{
+			return ERR_INFO_INVALID_QP;
+		}
+		pSlice->iLastMbQp = pCurLayer->pLumaQp[iMbXy];
+		pCurLayer->pChromaQp[iMbXy] = g_kuiChromaQp[WELS_CLIP3(pSlice->iLastMbQp + pSliceHeader->pPps->iChromaQpIndexOffset, 0, 51)];
+
+		BsStartCavlc( pBs );
+
+		if ( MB_TYPE_INTRA16x16 == pCurLayer->pMbType[iMbXy] ) 
+		{
+			//step1: Luma DC
+			if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, 0, 16, g_kuiLumaDcZigzagScan, 
+				I16_LUMA_DC, pCurLayer->pScaledTCoeff[iMbXy], iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) )
+			{
+				return -1;//abnormal
+			}
+			//step2: Luma AC
+			if (uiCbpL)
+			{
+				// AC blocks skip scan position 0, hence WELS_MAX(iScanIdxStart, 1)
+				for (i = 0; i < 16; i++) 
+				{
+                    if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount,pBs, i,
+							iScanIdxEnd - WELS_MAX(iScanIdxStart, 1) + 1, g_kuiZigzagScan + WELS_MAX(iScanIdxStart,1),
+							I16_LUMA_AC, pCurLayer->pScaledTCoeff[iMbXy] + (i<<4), iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) )
+					{
+						return -1;//abnormal
+					}
+				}
+				// copy four rows of four luma counts from the 8-wide cache into the per-MB array
+				ST32(&pCurLayer->pNzc[iMbXy][0], LD32(&pNonZeroCount[1+8*1]));
+				ST32(&pCurLayer->pNzc[iMbXy][4], LD32(&pNonZeroCount[1+8*2]));
+				ST32(&pCurLayer->pNzc[iMbXy][8], LD32(&pNonZeroCount[1+8*3]));
+				ST32(&pCurLayer->pNzc[iMbXy][12], LD32(&pNonZeroCount[1+8*4]));
+			}
+			else //pNonZeroCount = 0
+			{
+				ST32(&pCurLayer->pNzc[iMbXy][0], 0);
+				ST32(&pCurLayer->pNzc[iMbXy][4], 0);
+				ST32(&pCurLayer->pNzc[iMbXy][8], 0);
+				ST32(&pCurLayer->pNzc[iMbXy][12], 0);
+			}
+		}
+		else //non-MB_TYPE_INTRA16x16
+		{	
+			// one cbp bit per 8x8 luma block; each coded 8x8 holds four 4x4 blocks
+			for (iId8x8 = 0; iId8x8 < 4; iId8x8++) 
+			{
+				if (uiCbpL & (1 << iId8x8)) 
+				{
+					int32_t iIndex = (iId8x8 << 2);
+					for (iId4x4 = 0; iId4x4 < 4; iId4x4++) 
+					{
+						//Luma (DC and AC decoding together)
+						if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, iIndex,
+							iScanIdxEnd - iScanIdxStart + 1, g_kuiZigzagScan+iScanIdxStart, LUMA_DC_AC,
+							pCurLayer->pScaledTCoeff[iMbXy] + (iIndex<<4), iNMbMode, pCurLayer->pLumaQp[iMbXy], pCtx) )
+						{
+							return -1;//abnormal
+						}
+						iIndex++;
+					}
+				}
+				else
+				{					
+					// uncoded 8x8: clear its four cache entries (two 16-bit stores)
+					ST16(&pNonZeroCount[g_kuiCacheNzcScanIdx[iId8x8<<2]],0);
+					ST16(&pNonZeroCount[g_kuiCacheNzcScanIdx[(iId8x8<<2)+2]],0);
+				}
+			}	
+			ST32(&pCurLayer->pNzc[iMbXy][0], LD32(&pNonZeroCount[1+8*1]));
+			ST32(&pCurLayer->pNzc[iMbXy][4], LD32(&pNonZeroCount[1+8*2]));
+			ST32(&pCurLayer->pNzc[iMbXy][8], LD32(&pNonZeroCount[1+8*3]));
+			ST32(&pCurLayer->pNzc[iMbXy][12], LD32(&pNonZeroCount[1+8*4]));
+        }
+
+		
+		//chroma 
+		//step1: DC
+		if ( 1 == uiCbpC || 2 == uiCbpC )
+		{	
+			for (i = 0; i < 2; i++) //Cb Cr
+			{	
+				if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs,
+					16 + (i << 2), 4, g_kuiChromaDcScan, CHROMA_DC, pCurLayer->pScaledTCoeff[iMbXy] + 256 + (i<<6),
+					iNMbMode, pCurLayer->pChromaQp[iMbXy], pCtx) ) 
+				{
+					return -1;//abnormal
+				}
+			}
+		}
+		//step2: AC
+		if (2 == uiCbpC)
+		{
+			for (i = 0; i < 2; i++) //Cb Cr
+			{
+				int32_t iIndex= 16 + (i<<2);
+				for (iId4x4 = 0; iId4x4 < 4; iId4x4++) 
+				{
+					if ( WelsResidualBlockCavlc( pVlcTable, pNonZeroCount, pBs, iIndex,
+						iScanIdxEnd - WELS_MAX(iScanIdxStart, 1) + 1, g_kuiZigzagScan + WELS_MAX(iScanIdxStart,1),
+						CHROMA_AC, pCurLayer->pScaledTCoeff[iMbXy]+(iIndex<<4), iNMbMode, pCurLayer->pChromaQp[iMbXy], pCtx) )
+					{
+						return -1;//abnormal
+					}
+					iIndex++;
+				}
+			}
+			// copy the chroma counts from the cache into entries 16..23 of the per-MB array
+			ST16(&pCurLayer->pNzc[iMbXy][16], LD16(&pNonZeroCount[6+8*1]));
+			ST16(&pCurLayer->pNzc[iMbXy][20], LD16(&pNonZeroCount[6+8*2]));
+			ST16(&pCurLayer->pNzc[iMbXy][18], LD16(&pNonZeroCount[6+8*4]));
+			ST16(&pCurLayer->pNzc[iMbXy][22], LD16(&pNonZeroCount[6+8*5]));
+		}
+		else 
+		{
+			ST32(&pCurLayer->pNzc[iMbXy][16], 0);
+			ST32(&pCurLayer->pNzc[iMbXy][20], 0); 
+		}
+		BsEndCavlc( pBs );
+	}
+	else
+	{
+		// no residual at all: every non-zero count of this macroblock is 0
+		ST32(&pCurLayer->pNzc[iMbXy][0], 0);
+		ST32(&pCurLayer->pNzc[iMbXy][4], 0);
+		ST32(&pCurLayer->pNzc[iMbXy][8], 0);
+		ST32(&pCurLayer->pNzc[iMbXy][12], 0);
+		ST32(&pCurLayer->pNzc[iMbXy][16], 0);
+		ST32(&pCurLayer->pNzc[iMbXy][20], 0);
+	}	
+
+	return 0;
+}
+
+/*
+ *	Decode one macroblock of a P slice (CAVLC), consuming the skip run first.
+ *
+ *	When no skip run is pending (iMbSkipRun == -1) a new run length is read from the
+ *	bit-stream. While the run is non-zero the macroblock is a P_Skip: its non-zero
+ *	counts, reference indices and cbp are cleared and all 16 motion vectors are set to
+ *	the vector predicted by PredPSkipMvFromNeighbor(). Otherwise the base-mode flag is
+ *	resolved and the macroblock is parsed by WelsActualDecodeMbCavlcPSlice().
+ *
+ *	Returns 0 on success, -1 or an error code on failure.
+ */
+int32_t WelsDecodeMbCavlcPSlice(PWelsDecoderContext pCtx, PNalUnit pNalCur)
+{
+	PDqLayer pLayer			= pCtx->pCurDqLayer;
+	PBitStringAux pBitStr	= pLayer->pBitStringAux;
+	PSlice pCurSlice		= &pLayer->sLayerInfo.sSliceInLayer;
+	PSliceHeaderExt pShExt	= &pCurSlice->sSliceHeaderExt;
+	const int32_t kiMbXy	= pLayer->iMbXyIndex;
+	int32_t iBaseModeFlag;
+
+	// -1 marks "no run pending": fetch the next run length
+	if ( -1 == pCurSlice->iMbSkipRun )
+	{
+		pCurSlice->iMbSkipRun = BsGetUe(pBitStr);
+		if ( -1 == pCurSlice->iMbSkipRun )
+		{
+			return -1;
+		}
+	}
+
+	// post-decrement on purpose: a run of 0 falls through and leaves -1 behind,
+	// so the next call reads a fresh run length
+	if ( pCurSlice->iMbSkipRun-- )
+	{
+		int16_t iMv[2] = {0};
+		uint32_t uiPackedMv;
+		int32_t iOfs, iBlk;
+
+		pLayer->pMbType[kiMbXy] = MB_TYPE_SKIP;
+		for ( iOfs = 0; iOfs < 24; iOfs += 4 )	// 24 non-zero counts, four bytes per store
+		{
+			ST32(&pLayer->pNzc[kiMbXy][iOfs], 0);
+		}
+
+		pLayer->pInterPredictionDoneFlag[kiMbXy] = 0;
+		memset(pLayer->pRefIndex[0][kiMbXy], 0, sizeof(int8_t) * 16);
+
+		// one predicted vector (x,y packed into 32 bits) is replicated to all 16 blocks
+		PredPSkipMvFromNeighbor( pLayer, iMv );
+		uiPackedMv = *(uint32_t*)iMv;
+		for ( iBlk = 0; iBlk < 16; iBlk++ )
+		{
+			ST32( pLayer->pMv[0][kiMbXy][iBlk], uiPackedMv );
+		}
+
+		if ( !pShExt->bDefaultResidualPredFlag )
+		{
+			memset(pLayer->pScaledTCoeff[kiMbXy], 0, 384*sizeof(int16_t));
+		}
+
+		// QP is inherited from the previous macroblock unless residual prediction is the
+		// default on a layer with non-zero quality or dependency id
+		if ( !pShExt->bDefaultResidualPredFlag ||
+			(pNalCur->sNalHeaderExt.uiQualityId==0 && pNalCur->sNalHeaderExt.uiDependencyId==0) )
+		{
+			pLayer->pLumaQp[kiMbXy] = pCurSlice->iLastMbQp;
+			pLayer->pChromaQp[kiMbXy] = g_kuiChromaQp[WELS_CLIP3(pLayer->pLumaQp[kiMbXy] + pShExt->sSliceHeader.pPps->iChromaQpIndexOffset, 0, 51)];
+		}
+
+		pLayer->pCbp[kiMbXy] = 0;
+
+		return 0;
+	}
+
+	// an explicitly coded base-mode flag overrides the slice-level default
+	iBaseModeFlag = pShExt->bDefaultBaseModeFlag;
+	if ( 1 == pShExt->bAdaptiveBaseModeFlag )
+	{
+		iBaseModeFlag = BsGetOneBit(pBitStr);
+	}
+
+	if ( iBaseModeFlag )
+	{
+		WelsLog( pCtx, WELS_LOG_WARNING, "iBaseModeFlag (%d) != 0, inter-layer prediction not supported.\n", iBaseModeFlag);
+		return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_ILP);
+	}
+
+	// any non-zero value is a parsing error and is passed straight to the caller
+	return WelsActualDecodeMbCavlcPSlice( pCtx );
+}
+
+/*
+ *	Fill an iWidth x iHeight block of int16_t samples, row by row.
+ *	Rows are iStride elements apart. uiVal is written with memset(), i.e. into
+ *	every byte of each sample, so only 0 yields samples equal to uiVal.
+ */
+void_t WelsBlockInit(int16_t* pBlock, int32_t iWidth, int32_t iHeight, int32_t iStride, uint8_t uiVal)
+{
+	const size_t kuiRowBytes = iWidth*sizeof(int16_t);
+	int16_t* pRow = pBlock;
+	int32_t iRowsLeft;
+
+	for( iRowsLeft = iHeight; iRowsLeft > 0; --iRowsLeft, pRow += iStride )
+	{
+		memset(pRow, uiVal, kuiRowBytes);
+	}
+}
+
+/*
+ *	Install the block helper functions into pFunc.
+ *	The C implementations are the default; with X86_ASM and the WELS_CPU_SSE2 bit set in
+ *	iCpu the two block-zeroing routines are the SSE2 versions instead.
+ */
+void_t WelsBlockFuncInit(SBlockFunc  * pFunc,  int32_t iCpu)
+{
+	pFunc->pWelsSetNonZeroCountFunc	= SetNonZeroCount_c;
+
+#ifdef  X86_ASM
+	if( iCpu & WELS_CPU_SSE2 )
+	{
+		pFunc->pWelsBlockZero16x16Func	= WelsResBlockZero16x16_sse2;
+		pFunc->pWelsBlockZero8x8Func	= WelsResBlockZero8x8_sse2;
+		return;
+	}
+#endif
+
+	pFunc->pWelsBlockZero16x16Func	= WelsBlockZero16x16_c;
+	pFunc->pWelsBlockZero8x8Func	= WelsBlockZero8x8_c;
+}
+/* Zero a 16x16 block of int16_t samples whose rows are iStride elements apart. */
+void_t WelsBlockZero16x16_c(int16_t * pBlock, int32_t iStride)
+{
+	const int32_t kiBlockSize = 16;
+
+	WelsBlockInit( pBlock, kiBlockSize, kiBlockSize, iStride, 0 );
+}
+
+/* Zero an 8x8 block of int16_t samples whose rows are iStride elements apart. */
+void_t WelsBlockZero8x8_c(int16_t * pBlock, int32_t iStride)
+{
+	const int32_t kiBlockSize = 8;
+
+	WelsBlockInit( pBlock, kiBlockSize, kiBlockSize, iStride, 0 );
+}
+
+/*
+ *	Collapse the 24 non-zero counts addressed through g_kuiMbNonZeroCountIdx to
+ *	0/1 flags, in place. pBlock is not used by this implementation.
+ */
+void_t SetNonZeroCount_c(int16_t* pBlock, int8_t* pNonZeroCount)
+{
+	int32_t k;
+
+	for( k = 0; k < 24; ++k )
+	{
+		int8_t* pCount = &pNonZeroCount[ g_kuiMbNonZeroCountIdx[k] ];
+
+		*pCount = ( 0 != *pCount ) ? 1 : 0;
+	}
+}
+
+} // namespace WelsDec
--- /dev/null
+++ b/codec/decoder/core/src/decoder.cpp
@@ -1,0 +1,850 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	decoder.cpp
+ *
+ * \brief	Interfaces implementation introduced in decoder system architecture
+ *
+ * \date	03/10/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include "macros.h"
+#include "codec_def.h"
+#include "decoder.h"
+#include "error_code.h"
+#include "cpu.h"
+#include "cpu_core.h"
+#include "au_parser.h"
+#include "utils.h"
+#include "nal_prefix.h"
+#include "dec_frame.h"
+#include "pic_queue.h"
+#include "vlc_decoder.h"
+#include "get_intra_predictor.h"
+#include "rec_mb.h"
+#include "mc.h"
+#include "decode_mb_aux.h"
+#include "manage_dec_ref.h"
+#include "codec_app_def.h"
+#include "decoder_core.h"
+#include "deblocking.h"
+#include "expand_pic.h"
+#include "decode_slice.h"
+#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross platforms
+#include "mem_align.h"
+
+namespace WelsDec {
+
+extern PPicture AllocPicture( PWelsDecoderContext pCtx,const int32_t kiPicWidth, const int32_t kiPicHeight );
+
+extern void_t FreePicture( PPicture pPic );
+
+/*
+ *	Store the four bytes of iDdstIdx into pDstNal[0..3]:
+ *	most significant byte first when WORDS_BIGENDIAN is defined,
+ *	least significant byte first otherwise.
+ */
+inline void_t GetValueOf4Bytes( uint8_t* pDstNal, int32_t iDdstIdx )
+{
+	const uint32_t kuiValue = (uint32_t)iDdstIdx;
+
+#ifdef WORDS_BIGENDIAN
+	pDstNal[0] = (uint8_t)(kuiValue >> 24);
+	pDstNal[1] = (uint8_t)(kuiValue >> 16);
+	pDstNal[2] = (uint8_t)(kuiValue >> 8);
+	pDstNal[3] = (uint8_t)kuiValue;
+#else //WORDS_BIGENDIAN
+	pDstNal[0] = (uint8_t)kuiValue;
+	pDstNal[1] = (uint8_t)(kuiValue >> 8);
+	pDstNal[2] = (uint8_t)(kuiValue >> 16);
+	pDstNal[3] = (uint8_t)(kuiValue >> 24);
+#endif //WORDS_BIGENDIAN
+}
+
+/*
+ *	Allocate a picture buffer holding kiSize pictures of kiPicWidth x kiPicHeight.
+ *
+ *	On success the new buffer is stored in *ppPicBuf and 0 is returned.
+ *	On invalid arguments or allocation failure 1 is returned, *ppPicBuf is left
+ *	untouched and everything allocated so far is released again.
+ */
+static int32_t CreatePicBuff(PWelsDecoderContext pCtx, PPicBuff *ppPicBuf, const int32_t kiSize, const int32_t kiPicWidth, const int32_t kiPicHeight)
+{
+	PPicBuff pPicBuf = NULL;
+	int32_t iPicIdx = 0;
+	if (NULL == ppPicBuf || kiSize <= 0 || kiPicWidth <= 0 || kiPicHeight <= 0)
+	{
+		return 1;
+	}
+
+	pPicBuf	= (PPicBuff)WelsMalloc( sizeof(SPicBuff), "PPicBuff" );
+
+	if ( NULL == pPicBuf )
+	{
+		return 1;
+	}
+
+	pPicBuf->ppPic = (PPicture *)WelsMalloc(kiSize * sizeof(PPicture), "PPicture*");
+
+	if ( NULL == pPicBuf->ppPic )
+	{
+		// do not leak the buffer descriptor
+		WelsFree( pPicBuf, "pPicBuf" );
+		return 1;
+	}
+	for (iPicIdx = 0; iPicIdx < kiSize; ++ iPicIdx)
+	{	
+		PPicture pPic = AllocPicture( pCtx, kiPicWidth, kiPicHeight );
+		if ( NULL == pPic )
+		{
+			// roll back: release the pictures allocated so far, then the table and the descriptor
+			while ( iPicIdx > 0 )
+			{
+				FreePicture( pPicBuf->ppPic[-- iPicIdx] );
+			}
+			WelsFree( pPicBuf->ppPic, "pPicBuf->queue" );
+			WelsFree( pPicBuf, "pPicBuf" );
+			return 1;
+		}
+		pPicBuf->ppPic[iPicIdx] = pPic;
+	}
+
+	// initialize context in queue
+	pPicBuf->iCapacity	 = kiSize;	
+	pPicBuf->iCurrentIdx = 0;
+	*ppPicBuf			 = pPicBuf;
+
+	return 0;
+}
+
+/*
+ *	Release a picture buffer: every non-NULL picture in the table, the table itself
+ *	and the buffer descriptor. *ppPicBuf is set to NULL afterwards.
+ *	A NULL ppPicBuf or a NULL *ppPicBuf is ignored.
+ */
+static void_t DestroyPicBuff( PPicBuff *ppPicBuf )
+{
+	PPicBuff pBuf = NULL;
+	int32_t iIdx = 0;
+
+	if ( NULL == ppPicBuf )
+		return;
+
+	pBuf = *ppPicBuf;
+	if ( NULL == pBuf )
+		return;
+
+	if ( NULL != pBuf->ppPic )
+	{
+		for ( iIdx = 0; iIdx < pBuf->iCapacity; ++ iIdx )
+		{
+			if ( NULL != pBuf->ppPic[iIdx] )
+			{
+				FreePicture( pBuf->ppPic[iIdx] );
+			}
+		}
+
+		WelsFree(pBuf->ppPic, "pPicBuf->queue");
+		pBuf->ppPic	= NULL;
+	}
+
+	pBuf->iCapacity		= 0;
+	pBuf->iCurrentIdx	= 0;
+
+	WelsFree( pBuf, "pPicBuf" );
+
+	*ppPicBuf = NULL;
+}
+/*
+ *	Reset a decoder context to its default state.
+ *
+ *	The whole structure is zeroed first, then the fields with a non-zero or
+ *	explicitly documented default are assigned. The global g_uiCacheLineSize is set
+ *	to 16 and, with X86_ASM and HAVE_CACHE_LINE_ALIGN, raised to 32 or 64 according
+ *	to the detected CPU flags.
+ */
+void_t WelsDecoderDefaults( PWelsDecoderContext pCtx )
+{
+#if defined(X86_ASM)
+	int32_t iCpuCores = 1;	// out-parameter of WelsCPUFeatureDetect(), not used afterwards
+#endif//X86_ASM
+
+	memset( pCtx, 0, sizeof(SWelsDecoderContext) );	// start from an all-zero context
+
+	// general state
+	pCtx->pArgDec				= NULL;
+	pCtx->iOutputColorFormat	= videoFormatI420;	// YUV 4:2:0 output by default
+	pCtx->bHaveGotMemory		= false;			// no memory blocks requested for this context yet
+	pCtx->uiCpuFlag				= 0;
+	pCtx->bAuReadyFlag			= 0;				// no access unit data ready yet
+
+	// CPU features and cache line size
+	g_uiCacheLineSize			= 16;
+#if defined(X86_ASM)
+	pCtx->uiCpuFlag = WelsCPUFeatureDetect(&iCpuCores);
+#ifdef HAVE_CACHE_LINE_ALIGN
+	if ( pCtx->uiCpuFlag & WELS_CPU_CACHELINE_64 )
+	{
+		g_uiCacheLineSize	= 64;
+	}
+	else if ( pCtx->uiCpuFlag & WELS_CPU_CACHELINE_32 )
+	{
+		g_uiCacheLineSize	= 32;
+	}
+#endif//HAVE_CACHE_LINE_ALIGN
+#endif//X86_ASM
+
+	// picture size is unknown until it becomes available; picture data is allocated then
+	pCtx->iImgWidthInPixel		= 0;
+	pCtx->iImgHeightInPixel		= 0;
+
+	pCtx->iFrameNum				= -1;
+	pCtx->iPrevFrameNum			= -1;
+	pCtx->iErrorCode			= ERR_NONE;
+
+	pCtx->pDec					= NULL;
+
+	WelsResetRefPic(pCtx);
+
+	pCtx->iActiveFmoNum			= 0;
+
+	pCtx->pPicBuff[LIST_0]		= NULL;
+	pCtx->pPicBuff[LIST_1]		= NULL;
+
+	pCtx->bAvcBasedFlag			= true;
+}
+
+/*
+ *	destroy_mb_blocks
+ */
+
+
+/*
+ *	Get the size of the reference picture queue for the incoming target layer.
+ *
+ *	The available subset SPS with the highest index is used; if there is none, the
+ *	available SPS with the highest index. The size is that SPS's iNumRefFrames + 1,
+ *	or MAX_REF_PIC_COUNT when no SPS is available at all. A result of 0 is replaced
+ *	by MIN_REF_PIC_COUNT, and with LONG_TERM_REF the result is at least 2.
+ */
+static inline int32_t GetTargetRefListSize( PWelsDecoderContext pCtx )
+{	
+	PSps pActiveSps			= NULL;
+	int32_t iIdx			= 0;
+	int32_t iNumRefFrames	= 0;
+
+	// subset SPS first, scanning from the highest index down
+	for ( iIdx = MAX_SPS_COUNT - 1; iIdx >= 0; -- iIdx )
+	{
+		if ( pCtx->bSubspsAvailFlags[iIdx] )
+		{
+			pActiveSps = &pCtx->sSubsetSpsBuffer[iIdx].sSps;
+			break;
+		}
+	}
+
+	// fall back to the plain SPS set
+	if ( NULL == pActiveSps )
+	{
+		for ( iIdx = MAX_SPS_COUNT - 1; iIdx >= 0; -- iIdx )
+		{
+			if ( pCtx->bSpsAvailFlags[iIdx] )
+			{
+				pActiveSps = &pCtx->sSpsBuffer[iIdx];
+				break;
+			}
+		}
+	}
+
+	if ( NULL == pActiveSps )
+	{
+		iNumRefFrames = MAX_REF_PIC_COUNT;
+	}
+	else
+	{
+		iNumRefFrames = (pActiveSps->iNumRefFrames ) + 1;
+	}
+
+	if ( 0 == iNumRefFrames )
+	{
+		iNumRefFrames = (MIN_REF_PIC_COUNT);
+	}
+
+#ifdef LONG_TERM_REF
+	// picture queue holds at least 2 entries
+	if ( iNumRefFrames < 2 )
+	{
+		iNumRefFrames = 2;
+	}
+#endif
+
+	return iNumRefFrames;
+}
+
+/*
+ *	Request the picture buffers used by the decoder (avc part) for a given picture size.
+ *
+ *	kiMbWidth / kiMbHeight are given in macroblocks and are shifted left by 4 to get pixels.
+ *	Nothing is reallocated when memory was already requested for the same pixel size and the
+ *	LIST_0 picture queue already has the required capacity.
+ *
+ *	return:	ERR_NONE on success, ERR_INFO_INVALID_PARAM for a NULL context or a non-positive
+ *			size, otherwise the error returned by CreatePicBuff().
+ */
+int32_t WelsRequestMem( PWelsDecoderContext pCtx, const int32_t kiMbWidth, const int32_t kiMbHeight )
+{
+	const int32_t kiWidthInPixel	= kiMbWidth << 4;
+	const int32_t kiHeightInPixel	= kiMbHeight << 4;
+	int32_t iQueueSize	= 0;	// adaptive size of the picture queue
+	int32_t iList		= 0;
+	int32_t iRet		= ERR_NONE;
+	bool_t  bQueueFits	= false;
+
+	WELS_VERIFY_RETURN_IF( ERR_INFO_INVALID_PARAM, ( NULL == pCtx || kiWidthInPixel <= 0 || kiHeightInPixel <= 0 ) )
+
+	// the queue size may differ from the previous request (e.g. different gop size)
+	iQueueSize = GetTargetRefListSize( pCtx );
+	pCtx->iPicQueueNumber = iQueueSize;
+
+	// compare the requested queue size with the capacity of the previous allocation
+	bQueueFits = ( pCtx->pPicBuff[LIST_0] != NULL && pCtx->pPicBuff[LIST_0]->iCapacity == iQueueSize );
+
+	// same picture size and same queue size: keep the buffers that are already there
+	WELS_VERIFY_RETURN_IF( ERR_NONE, pCtx->bHaveGotMemory && ( kiWidthInPixel == pCtx->iImgWidthInPixel && kiHeightInPixel == pCtx->iImgHeightInPixel ) && bQueueFits )
+
+	// the reference lists are reset before the pictures are destroyed
+	WelsResetRefPic( pCtx );
+
+	// drop every existing picture queue
+	for ( iList = LIST_0; iList < LIST_A; ++ iList )
+	{
+		if ( NULL != pCtx->pPicBuff[iList] )
+		{
+			DestroyPicBuff( &pCtx->pPicBuff[iList] );
+		}
+	}
+
+	// currently only LIST_0 is created due to have no B frames
+	iRet = CreatePicBuff( pCtx, &pCtx->pPicBuff[LIST_0], iQueueSize, kiWidthInPixel, kiHeightInPixel );
+	if ( ERR_NONE != iRet )
+		return iRet;
+
+	pCtx->iImgWidthInPixel	= kiWidthInPixel;	// target width of image to be reconstructed while decoding
+	pCtx->iImgHeightInPixel	= kiHeightInPixel;	// target height of image to be reconstructed while decoding
+	pCtx->bHaveGotMemory	= true;				// memory for the decoder context has been requested
+	pCtx->pDec				= NULL;				// a new picture must be fetched after the buffers changed
+
+	return ERR_NONE;
+}
+
+/*
+ *	Free the picture buffers of the avc part.
+ *
+ *	Calls ResetFmoList() and WelsResetRefPic(), destroys every existing picture queue and
+ *	marks the context as holding no picture memory.
+ */
+void_t WelsFreeMem( PWelsDecoderContext pCtx )
+{
+	int32_t iList = 0;
+
+	/* TODO: free memory blocks introduced in avc */
+	ResetFmoList( pCtx );
+	WelsResetRefPic( pCtx );
+
+	// picture queues
+	for ( iList = LIST_0; iList < LIST_A; ++ iList )
+	{
+		if ( NULL != pCtx->pPicBuff[iList] )
+		{
+			DestroyPicBuff( &pCtx->pPicBuff[iList] );
+		}
+	}
+
+	// leave the context in the "no memory requested" state
+	pCtx->bHaveGotMemory	= false;
+	pCtx->iImgWidthInPixel	= 0;
+	pCtx->iImgHeightInPixel	= 0;
+}
+
+/*!
+ * \brief	Open decoder: install the function pointers and VLC tables, then request the
+ *			startup memory.
+ *
+ * \note	When WelsInitMemory() fails the function returns without touching the SPS maxima
+ *			and the lost-flag below; the failure is not reported to the caller.
+ */
+void_t WelsOpenDecoder( PWelsDecoderContext pCtx )
+{
+	// function pointers selected by the cpu flags: MC, picture expanding, reconstruction
+	InitMcFunc( &pCtx->sMcFunc, pCtx->uiCpuFlag );
+	InitExpandPictureFunc( &pCtx->sExpandPicFunc, pCtx->uiCpuFlag );
+	AssignFuncPointerForRec( pCtx );
+
+	// vlc tables
+	InitVlcTable( &pCtx->sVlcTable );
+
+	// startup memory
+	if ( ERR_NONE == WelsInitMemory( pCtx ) )
+	{
+		pCtx->iMaxWidthInSps	= 0;
+		pCtx->iMaxHeightInSps	= 0;
+#ifdef LONG_TERM_REF
+		pCtx->bParamSetsLostFlag = true;
+#else
+		pCtx->bReferenceLostAtT0Flag	= true;	// should be true to wait for an IDR in the incoming AU bits
+#endif //LONG_TERM_REF
+	}
+}
+
+/*!
+ * \brief	Close decoder: release picture buffers, startup memory and the dq layer contexts,
+ *			then clear the lost-flag.
+ */
+void_t WelsCloseDecoder( PWelsDecoderContext pCtx )
+{
+	WelsFreeMem( pCtx );				// picture buffers
+	WelsFreeMemory( pCtx );				// memory requested by WelsInitMemory()
+	UninitialDqLayersContext( pCtx );
+
+#ifndef LONG_TERM_REF
+	pCtx->bReferenceLostAtT0Flag = false;
+#else
+	pCtx->bParamSetsLostFlag       = false;
+#endif
+}
+
+/*!
+ * \brief	configure decoder parameters
+ *
+ * \param	pCtx	decoder context that receives a private copy of the parameters
+ * \param	kpParam	points to a SDecodingParam; sizeof(SDecodingParam) bytes are copied from it
+ *
+ * \return	0 - successful; 1 - NULL argument or the parameter block could not be allocated
+ *
+ * \note	The parameter block is allocated on the first call only and is reused by later
+ *			calls, so configuring the same context again no longer leaks the previous block.
+ *			This relies on pCtx->pParam being NULL while no block is owned, which is the
+ *			same convention WelsFreeMemory() uses when it releases the block.
+ */
+int32_t DecoderConfigParam ( PWelsDecoderContext pCtx, const void_t* kpParam )
+{
+	if ( NULL == pCtx || NULL == kpParam )
+		return 1;
+
+	// allocate only when the context does not own a parameter block yet
+	if ( NULL == pCtx->pParam )
+	{
+		pCtx->pParam	= (SDecodingParam *)WelsMalloc( sizeof(SDecodingParam), "SDecodingParam" );
+
+		if ( NULL == pCtx->pParam )
+			return 1;
+	}
+
+	memcpy( pCtx->pParam, kpParam, sizeof(SDecodingParam) );
+	pCtx->iOutputColorFormat	= pCtx->pParam->iOutputColorFormat;
+	pCtx->bErrorResilienceFlag	= pCtx->pParam->uiEcActiveFlag ? true : false;
+
+	// only SVC and AVC bit stream types are taken over, anything else selects the default
+	if ( VIDEO_BITSTREAM_SVC == pCtx->pParam->sVideoProperty.eVideoBsType ||
+		 VIDEO_BITSTREAM_AVC == pCtx->pParam->sVideoProperty.eVideoBsType )
+	{
+		pCtx->eVideoType = pCtx->pParam->sVideoProperty.eVideoBsType;
+	}
+	else
+	{
+		pCtx->eVideoType = VIDEO_BITSTREAM_DEFAULT;
+	}
+
+	WelsLog(pCtx, WELS_LOG_INFO, "eVideoType: %d\n", pCtx->eVideoType);
+
+	return 0;
+}
+
+/*!
+ *************************************************************************************
+ * \brief	Initialize Wels decoder parameters and memory
+ *
+ * \param	pCtx			input context to be initialized at first stage
+ * \param	pTraceHandle	trace handle stored into the context
+ * \param	pLog			log callback stored into the global g_pLog
+ *
+ * \return	ERR_NONE - successful
+ * \return	ERR_INFO_INVALID_PTR - pCtx is NULL
+ *
+ * \note	WelsOpenDecoder() returns nothing, so a failure inside it is not reflected in
+ *			the return value of this function.
+ *************************************************************************************
+ */
+int32_t WelsInitDecoder( PWelsDecoderContext pCtx, void_t * pTraceHandle, PWelsLogCallbackFunc pLog )
+{
+	if ( NULL == pCtx )
+		return ERR_INFO_INVALID_PTR;
+
+	// default values of the context
+	WelsDecoderDefaults( pCtx );
+
+	// tracing / logging hooks
+	pCtx->pTraceHandle	= pTraceHandle;
+	g_pLog				= pLog;
+
+	// open decoder
+	WelsOpenDecoder( pCtx );
+
+	// decode mode setting
+	pCtx->iDecoderMode				= SW_MODE;
+	pCtx->iSetMode					= AUTO_MODE;
+	pCtx->iDecoderOutputProperty	= BUFFER_HOST;
+	pCtx->iModeSwitchType			= 0;	// 0: do not do mode switch
+
+	return ERR_NONE;
+}
+
+/*!
+ *************************************************************************************
+ * \brief	Uninitialize Wels decoder parameters and memory; forwards to WelsCloseDecoder()
+ *
+ * \param	pCtx	input context to be uninitialized at release stage
+ *
+ * \return	NONE
+ *************************************************************************************
+ */
+void_t WelsEndDecoder( PWelsDecoderContext pCtx )
+{
+	WelsCloseDecoder( pCtx );	// close decoder
+}
+
+/*
+ *	Record the feedback information of the current access unit: the VCL NAL feedback marker
+ *	and the temporal id of the NAL unit found at the start position of the access unit.
+ */
+void_t GetVclNalTemporalId( PWelsDecoderContext pCtx )
+{
+	PAccessUnit pCurAu			= pCtx->pAccessUnitList;
+	const int32_t kiStartIdx	= pCurAu->uiStartPos;
+
+	pCtx->iFeedbackTidInAu    = pCurAu->pNalUnitsList[kiStartIdx]->sNalHeaderExt.uiTemporalId;
+	pCtx->iFeedbackVclNalInAu = FEEDBACK_VCL_NAL;
+}
+
+/*! 
+ *************************************************************************************
+ * \brief	First entrance to decoding core interface.
+ *
+ *			While pCtx->bEndOfStreamFlag is not set, the input buffer is split into NAL
+ *			units at each 00 00 01 start code, the 0x03 byte of each 00 00 03 sequence is
+ *			dropped, and the resulting bytes are copied into the raw data buffer of the
+ *			context.  Every NAL is handed to ParseNalHeader(); parameter set and SEI NALs
+ *			are additionally parsed by ParseNonVclNal(), and ConstructAccessUnit() is called
+ *			whenever pCtx->bAuReadyFlag is set.
+ *			When pCtx->bEndOfStreamFlag is set, the input buffer is ignored and the NAL
+ *			units still stored in the access unit list are constructed.
+ *
+ * \param 	pCtx	        decoder context
+ * \param	kpBsBuf	        bit streaming buffer
+ * \param	kiBsLen	        size in bytes length of bit streaming buffer input
+ * \param	ppDst	        picture payload data to be output
+ * \param	pDstBufInfo	    buf information of ouput data
+ *
+ * \return	dsBitstreamError when no start code prefix is found in the input,
+ *			otherwise pCtx->iErrorCode
+ *
+ * \note	N/A
+ *************************************************************************************
+ */
+int32_t WelsDecodeBs( PWelsDecoderContext pCtx, const uint8_t *kpBsBuf, const int32_t kiBsLen, 
+			   uint8_t **ppDst, SBufferInfo* pDstBufInfo)
+{	
+	if ( !pCtx->bEndOfStreamFlag)
+	{
+		SDataBuffer* pRawData   = &pCtx->sRawData;
+
+		int32_t iSrcIdx        = 0; //the index of source bit-stream till now after parsing one or more NALs
+		int32_t iSrcConsumed   = 0; // consumed byte count of source bs
+		int32_t iDstIdx        = 0; //the size of current NAL after 0x03 removal and 00 00 01 removal
+		int32_t iSrcLength     = 0;	//the total size of current AU or NAL
+
+		int32_t iConsumedBytes = 0;	
+		int32_t iOffset        = 0;	
+
+		uint8_t* pSrcNal       = NULL;
+		uint8_t* pDstNal       = NULL;
+		uint8_t *pNalPayload   = NULL;	
+		
+		
+		if ( NULL == DetectStartCodePrefix( kpBsBuf, &iOffset, kiBsLen ) ) //CAN'T find the 00 00 01 start prefix from the source buffer
+		{
+			return dsBitstreamError;
+		}
+
+		pSrcNal    = const_cast<uint8_t*> (kpBsBuf) + iOffset;
+		iSrcLength = kiBsLen - iOffset;
+
+		// not enough room left behind pCurPos: restart writing at the head of the raw buffer
+		// NOTE(review): nothing visible here rejects an input larger than the whole raw buffer;
+		// confirm that callers bound kiBsLen.
+		if ( (kiBsLen + 4) > ( pRawData->pEnd - pRawData->pCurPos ) )
+		{
+			pRawData->pCurPos = pRawData->pHead;
+		}
+
+
+		//copy raw data from source buffer (application) to raw data buffer (codec inside)
+		//0x03 removal and extract all of NAL Unit from current raw data
+		pDstNal = pRawData->pCurPos + 4; //4-bytes used to write the length of current NAL rbsp
+
+		while ( iSrcConsumed < iSrcLength )
+		{
+			// 00 00 03 (emulation prevention) or 00 00 01 (next start code) at the current position
+			if ( ( 2 + iSrcConsumed < iSrcLength ) && 
+				( 0 == LD16(pSrcNal+iSrcIdx) ) &&
+				( (pSrcNal[2+iSrcIdx]==0x03) || (pSrcNal[2+iSrcIdx]==0x01) ) )
+			{
+				if ( pSrcNal[2+iSrcIdx] == 0x03 )
+				{
+					// keep the two zero bytes, skip the 0x03
+					ST16(pDstNal+iDstIdx, 0);
+					iDstIdx	+= 2;
+					iSrcIdx	+= 3;	
+					iSrcConsumed += 3;
+				}
+				else
+				{
+					// start code of the next NAL found: the current NAL is complete
+					GetValueOf4Bytes( pDstNal-4, iDstIdx );   //pDstNal-4 (non-aligned by 4) in Solaris10(SPARC). Given value by byte.
+
+					iConsumedBytes = 0;
+					pNalPayload	= ParseNalHeader( pCtx, &pCtx->sCurNalHead, pDstNal, iDstIdx, pSrcNal-3, iSrcIdx+3, &iConsumedBytes );
+					
+					if (pCtx->bAuReadyFlag)
+					{	
+						ConstructAccessUnit( pCtx, ppDst, pDstBufInfo );	
+
+						if ( (dsOutOfMemory | dsNoParamSets) & pCtx->iErrorCode)
+						{							
+#ifdef LONG_TERM_REF
+							pCtx->bParamSetsLostFlag = true;
+#else
+							pCtx->bReferenceLostAtT0Flag = true;
+#endif
+							ResetParameterSetsState( pCtx );
+
+                            // only out-of-memory aborts here; with dsNoParamSets alone parsing goes on
+                            if( dsOutOfMemory & pCtx->iErrorCode){
+  							   return pCtx->iErrorCode;
+                            }
+						}
+					}
+					
+					if( (IS_PARAM_SETS_NALS(pCtx->sCurNalHead.eNalUnitType) || IS_SEI_NAL(pCtx->sCurNalHead.eNalUnitType)) &&
+						pNalPayload )
+					{	
+						if ( ParseNonVclNal( pCtx, pNalPayload, iDstIdx-iConsumedBytes ) )
+						{
+							if ( dsNoParamSets & pCtx->iErrorCode )
+							{
+#ifdef LONG_TERM_REF
+								pCtx->bParamSetsLostFlag = true;
+#else
+								pCtx->bReferenceLostAtT0Flag = true;
+#endif
+								ResetParameterSetsState( pCtx );
+							}
+							return pCtx->iErrorCode;
+						}
+					}
+
+					pDstNal += iDstIdx; //update current position
+					// wrap to the head of the raw buffer when the rest of the input does not fit behind pDstNal
+					if ( (iSrcLength - iSrcConsumed + 4) > (pRawData->pEnd - pDstNal) )
+					{
+						pRawData->pCurPos = pRawData->pHead;
+					}
+					else
+					{
+						pRawData->pCurPos = pDstNal;
+					}
+					pDstNal = pRawData->pCurPos + 4; //init, 4 bytes used to store the next NAL
+
+					pSrcNal += iSrcIdx+3;
+					iSrcConsumed += 3;						
+					iSrcIdx = 0;	
+					iDstIdx  = 0; //reset 0, used to statistic the length of next NAL					
+				}
+				continue;
+			}
+			// ordinary payload byte
+			pDstNal[iDstIdx++] = pSrcNal[iSrcIdx++];
+			iSrcConsumed++;
+		}
+		
+		//last NAL decoding
+		GetValueOf4Bytes( pDstNal-4, iDstIdx ); //pDstNal-4 (non-aligned by 4) in Solaris10(SPARC). Given value by byte.
+
+		iConsumedBytes = 0;
+		pNalPayload = ParseNalHeader( pCtx, &pCtx->sCurNalHead, pDstNal, iDstIdx, pSrcNal-3, iSrcIdx+3, &iConsumedBytes );
+
+		if (pCtx->bAuReadyFlag)
+		{	
+			ConstructAccessUnit( pCtx, ppDst, pDstBufInfo );
+
+			// unlike inside the loop, dsNoParamSets alone also returns here
+			if ( (dsOutOfMemory | dsNoParamSets) & pCtx->iErrorCode)
+			{				
+#ifdef LONG_TERM_REF
+				pCtx->bParamSetsLostFlag = true;
+#else
+				pCtx->bReferenceLostAtT0Flag = true;
+#endif
+				ResetParameterSetsState( pCtx );
+				return pCtx->iErrorCode;
+			}			
+		}
+
+		if( (IS_PARAM_SETS_NALS(pCtx->sCurNalHead.eNalUnitType) || IS_SEI_NAL(pCtx->sCurNalHead.eNalUnitType)) && pNalPayload )
+		{
+			if ( ParseNonVclNal( pCtx, pNalPayload, iDstIdx-iConsumedBytes ) )
+			{
+				if ( dsNoParamSets & pCtx->iErrorCode )
+				{
+#ifdef LONG_TERM_REF
+					pCtx->bParamSetsLostFlag = true;
+#else
+					pCtx->bReferenceLostAtT0Flag = true;
+#endif
+					ResetParameterSetsState( pCtx );
+				}
+				return pCtx->iErrorCode;
+			}
+		}	
+
+		pDstNal += iDstIdx;
+		pRawData->pCurPos = pDstNal; //init the pCurPos for next NAL(s) storage
+	}	
+	else  /* no supplementary picture payload input, but stored a picture */
+	{
+		PAccessUnit pCurAu	= pCtx->pAccessUnitList;	// current access unit, it will never point to NULL after decode's successful initialization
+		
+		if ( pCurAu->uiAvailUnitsNum == 0 )
+		{
+			return pCtx->iErrorCode;
+		}
+		else
+		{			
+			// treat every stored NAL unit as belonging to the access unit to be constructed
+			pCtx->pAccessUnitList->uiEndPos = pCtx->pAccessUnitList->uiAvailUnitsNum - 1;
+			
+			ConstructAccessUnit( pCtx, ppDst, pDstBufInfo );
+
+			if ( (dsOutOfMemory | dsNoParamSets) & pCtx->iErrorCode)
+			{				
+#ifdef LONG_TERM_REF
+				pCtx->bParamSetsLostFlag = true;
+#else
+				pCtx->bReferenceLostAtT0Flag = true;
+#endif
+				ResetParameterSetsState( pCtx );
+				return pCtx->iErrorCode;
+			}
+			
+		}
+	}
+
+	return pCtx->iErrorCode;
+}
+
+/*
+ * set colorspace format in decoder
+ *
+ * The format is stored in the context and, when a parameter block exists, in that block
+ * as well.  Returns 0 on success and 1 when pCtx is NULL.
+ */
+int32_t DecoderSetCsp(PWelsDecoderContext pCtx, const int32_t kiColorFormat)
+{
+	WELS_VERIFY_RETURN_IF( 1, (NULL == pCtx) );
+
+	if ( NULL != pCtx->pParam )
+		pCtx->pParam->iOutputColorFormat	= kiColorFormat;
+
+	pCtx->iOutputColorFormat	= kiColorFormat;
+
+	return 0;
+}
+
+/*!
+ * \brief	keep the picture resolution (taken from the slice header) synchronized among the
+ *			different parts of the decoder: the picture buffers are requested first, then
+ *			the dq layer contexts are initialized for the same size
+ * \param	pCtx		Wels decoder context
+ * \param	kiMbWidth	picture width in MBs
+ * \param	kiMbHeight	picture height in MBs
+ * \return	0 - successful; none 0 - something wrong, pCtx->iErrorCode is set to dsOutOfMemory
+ */
+int32_t SyncPictureResolutionExt( PWelsDecoderContext pCtx, const int32_t kiMbWidth, const int32_t kiMbHeight )
+{
+	int32_t iRet = WelsRequestMem( pCtx, kiMbWidth, kiMbHeight );	// common memory used
+
+	if ( ERR_NONE != iRet )
+	{
+		WelsLog( pCtx, WELS_LOG_WARNING, "SyncPictureResolutionExt()::WelsRequestMem--buffer allocated failure.\n" );
+		pCtx->iErrorCode = dsOutOfMemory;
+		return iRet;
+	}
+
+	// the dq layers work on pixel dimensions (16 pixels per MB)
+	iRet = InitialDqLayersContext( pCtx, kiMbWidth << 4, kiMbHeight << 4 );
+	if ( ERR_NONE != iRet )
+	{
+		WelsLog( pCtx, WELS_LOG_WARNING, "SyncPictureResolutionExt()::InitialDqLayersContext--buffer allocated failure.\n" );
+		pCtx->iErrorCode = dsOutOfMemory;
+	}
+
+	return iRet;
+}
+
+/*!
+ * \brief	update maximal picture width and height if applicable when receiving a SPS NAL
+ *
+ * \note	As soon as one of the two dimensions exceeds the stored maximum, BOTH stored
+ *			values are replaced by the current dimensions.
+ */
+void_t UpdateMaxPictureResolution( PWelsDecoderContext pCtx, const int32_t kiCurWidth, const int32_t kiCurHeight )
+{
+	// neither dimension exceeds the stored maximum: nothing to update
+	if ( kiCurWidth <= pCtx->iMaxWidthInSps && kiCurHeight <= pCtx->iMaxHeightInSps )
+		return;
+
+	pCtx->iMaxWidthInSps	= kiCurWidth;
+	pCtx->iMaxHeightInSps	= kiCurHeight;
+}
+
+/*
+ *	Install the function pointers used for reconstruction: intra prediction (16x16 luma,
+ *	4x4 luma, chroma), the IDCT + prediction add, deblocking and block functions.
+ *	The plain C versions are installed first; with X86_ASM they are then overridden by the
+ *	MMX / SSE2 versions according to pCtx->uiCpuFlag.
+ */
+void_t AssignFuncPointerForRec(PWelsDecoderContext pCtx)
+{
+	// C defaults: intra 16x16 luma
+	pCtx->pGetI16x16LumaPredFunc[I16_PRED_V]		= WelsI16x16LumaPredV_c;
+	pCtx->pGetI16x16LumaPredFunc[I16_PRED_H]		= WelsI16x16LumaPredH_c;
+	pCtx->pGetI16x16LumaPredFunc[I16_PRED_P]		= WelsI16x16LumaPredPlane_c;
+	pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC]		= WelsI16x16LumaPredDc_c;
+	pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_L]		= WelsI16x16LumaPredDcLeft_c;
+	pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_T]		= WelsI16x16LumaPredDcTop_c;
+	pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_128]	= WelsI16x16LumaPredDcNA_c;
+
+	// C defaults: intra 4x4 luma
+	pCtx->pGetI4x4LumaPredFunc[I4_PRED_V]		= WelsI4x4LumaPredV_c;
+	pCtx->pGetI4x4LumaPredFunc[I4_PRED_H]		= WelsI4x4LumaPredH_c;
+	pCtx->pGetI4x4LumaPredFunc[I4_PRED_DC]		= WelsI4x4LumaPredDc_c;
+	pCtx->pGetI4x4LumaPredFunc[I4_PRED_DC_L]	= WelsI4x4LumaPredDcLeft_c;
+	pCtx->pGetI4x4LumaPredFunc[I4_PRED_DC_T]	= WelsI4x4LumaPredDcTop_c;
+	pCtx->pGetI4x4LumaPredFunc[I4_PRED_DC_128]	= WelsI4x4LumaPredDcNA_c;
+	pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL]		= WelsI4x4LumaPredDDL_c;
+	pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL_TOP]	= WelsI4x4LumaPredDDLTop_c;
+	pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR]		= WelsI4x4LumaPredDDR_c;
+	pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL]		= WelsI4x4LumaPredVL_c;
+	pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL_TOP]	= WelsI4x4LumaPredVLTop_c;
+	pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR]		= WelsI4x4LumaPredVR_c;
+	pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU]		= WelsI4x4LumaPredHU_c;
+	pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD]		= WelsI4x4LumaPredHD_c;
+
+	// C defaults: intra chroma
+	pCtx->pGetIChromaPredFunc[C_PRED_V]			= WelsIChromaPredV_c;
+	pCtx->pGetIChromaPredFunc[C_PRED_H]			= WelsIChromaPredH_c;
+	pCtx->pGetIChromaPredFunc[C_PRED_P]			= WelsIChromaPredPlane_c;
+	pCtx->pGetIChromaPredFunc[C_PRED_DC]		= WelsIChromaPredDc_c;
+	pCtx->pGetIChromaPredFunc[C_PRED_DC_L]		= WelsIChromaPredDcLeft_c;
+	pCtx->pGetIChromaPredFunc[C_PRED_DC_T]		= WelsIChromaPredDcTop_c;
+	pCtx->pGetIChromaPredFunc[C_PRED_DC_128]	= WelsIChromaPredDcNA_c;
+
+	// C default: IDCT + prediction add
+	InitDctClipTable();
+	pCtx->pIdctResAddPredFunc	= IdctResAddPred_c;
+
+#if defined(X86_ASM)
+	if ( pCtx->uiCpuFlag & WELS_CPU_MMXEXT )
+	{
+		// mmx overrides
+		pCtx->pIdctResAddPredFunc	= IdctResAddPred_mmx;
+
+		pCtx->pGetIChromaPredFunc[C_PRED_V]			= WelsIChromaPredV_mmx;
+		pCtx->pGetIChromaPredFunc[C_PRED_H]			= WelsIChromaPredH_mmx;
+		pCtx->pGetIChromaPredFunc[C_PRED_DC_L]		= WelsIChromaPredDcLeft_mmx;
+		pCtx->pGetIChromaPredFunc[C_PRED_DC_128]	= WelsIChromaPredDcNA_mmx;
+
+		pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL]		= WelsI4x4LumaPredDDL_mmx;
+		pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR]		= WelsI4x4LumaPredDDR_mmx;
+		pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL]		= WelsI4x4LumaPredVL_mmx;
+		pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR]		= WelsI4x4LumaPredVR_mmx;
+		pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU]		= WelsI4x4LumaPredHU_mmx;
+		pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD]		= WelsI4x4LumaPredHD_mmx;
+	}
+	if ( pCtx->uiCpuFlag & WELS_CPU_SSE2 )
+	{
+		// sse2 overrides
+		pCtx->pGetI16x16LumaPredFunc[I16_PRED_V]		= WelsI16x16LumaPredV_sse2;
+		pCtx->pGetI16x16LumaPredFunc[I16_PRED_H]		= WelsI16x16LumaPredH_sse2;
+		pCtx->pGetI16x16LumaPredFunc[I16_PRED_P]		= WelsI16x16LumaPredPlane_sse2;
+		pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC]		= WelsI16x16LumaPredDc_sse2;
+		pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_T]		= WelsI16x16LumaPredDcTop_sse2;
+		pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC_128]	= WelsI16x16LumaPredDcNA_sse2;
+
+		pCtx->pGetIChromaPredFunc[C_PRED_P]				= WelsIChromaPredPlane_sse2;
+		pCtx->pGetIChromaPredFunc[C_PRED_DC]			= WelsIChromaPredDc_sse2;
+		pCtx->pGetIChromaPredFunc[C_PRED_DC_T]			= WelsIChromaPredDcTop_sse2;
+	}
+#endif
+
+	// deblocking and block functions select their own implementation from the cpu flags
+	DeblockingInit( &pCtx->sDeblockingFunc, pCtx->uiCpuFlag );
+	WelsBlockFuncInit( &pCtx->sBlockFunc, pCtx->uiCpuFlag );
+}
+
+} // namespace WelsDec
\ No newline at end of file
--- /dev/null
+++ b/codec/decoder/core/src/decoder_core.cpp
@@ -1,0 +1,2106 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *	decoder_core.c:	Wels decoder framework core implementation
+ */
+
+#include <string.h>
+#include "codec_def.h"
+#include "decoder_core.h"
+#include "typedefs.h"
+#include "wels_const.h"
+#include "wels_common_basis.h"
+#include "codec_app_def.h"
+#include "decoder_context.h"
+#include "dec_golomb.h"
+#include "bit_stream.h"
+#include "error_code.h"
+#include "parameter_sets.h"
+#include "fmo.h"
+#include "utils.h"
+#include "memmgr_nal_unit.h"
+#include "dec_frame.h"
+#include "au_parser.h"
+#include "pic_queue.h"
+#include "ls_defines.h"
+#include "decode_slice.h"
+#include "manage_dec_ref.h"
+#include "expand_pic.h"
+#include "decoder.h"
+#include "decode_mb_aux.h"
+#include "mem_align.h"
+
+namespace WelsDec {
+
+/*
+ *	Hand the decoded picture (pCtx->pDec) over to the output.
+ *
+ *	Fails with -1 when the number of reconstructed MBs of the picture differs from the MB
+ *	count of the current dq layer.  Otherwise the plane pointers (ppDst), the line sizes
+ *	(pDstLen[0..1]), the uncropped size (pWidth / pHeight) and the system buffer description
+ *	in pDstInfo are filled, with the frame cropping applied to the plane pointers and to the
+ *	size stored in pDstInfo.  The cropping is refreshed from the SPS on I slices only.
+ *	Returns 0 on success.
+ */
+static inline int32_t DecodeFrameConstruction( PWelsDecoderContext pCtx, uint8_t **ppDst, int32_t *pDstLen, int32_t *pWidth, int32_t *pHeight, SBufferInfo *pDstInfo )
+{
+	PDqLayer pLayer	= pCtx->pCurDqLayer;
+	PPicture pDecPic	= pCtx->pDec;
+
+	const int32_t kiLayerWidth	= pLayer->iMbWidth << 4;
+	const int32_t kiLayerHeight	= pLayer->iMbHeight << 4;
+	const int32_t kiLayerMbNum	= pLayer->iMbWidth * pLayer->iMbHeight;
+
+	// every MB of the layer must have been reconstructed
+	if ( pDecPic->iTotalNumMbRec != kiLayerMbNum )
+	{
+		WelsLog( pCtx, WELS_LOG_WARNING, "DecodeFrameConstruction():::iTotalNumMbRec:%d, total_num_mb_sps:%d, cur_layer_mb_width:%d, cur_layer_mb_height:%d \n",
+			pDecPic->iTotalNumMbRec, kiLayerMbNum, pLayer->iMbWidth, pLayer->iMbHeight );
+		return -1;
+	}
+#ifdef NO_WAITING_AU
+    pDecPic->iTotalNumMbRec = 0;
+#endif
+
+	if ( I_SLICE == pLayer->sLayerInfo.sSliceInLayer.eSliceType )
+	{
+		// take over the frame cropping of the SPS referred by the slice
+		memcpy( &(pCtx->sFrameCrop), &(pLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.pSps->sFrameCrop), sizeof(SPosOffset) );//confirmed_safe_unsafe_usage
+#ifdef LONG_TERM_REF
+		pCtx->bParamSetsLostFlag      = false;
+#else
+		pCtx->bReferenceLostAtT0Flag = false;	// need initialize it due I_SLICE
+#endif //LONG_TERM_REF
+		WelsLog( pCtx, WELS_LOG_INFO, "DecodeFrameConstruction()::::output good I frame, %d x %d, crop_left:%d, crop_right:%d, crop_top:%d, crop_bottom:%d.\n",
+			kiLayerWidth, kiLayerHeight, pCtx->sFrameCrop.iLeftOffset, pCtx->sFrameCrop.iRightOffset, pCtx->sFrameCrop.iTopOffset, pCtx->sFrameCrop.iBottomOffset );
+		WelsLog( pCtx, WELS_LOG_INFO, "After decoding, set_mode:[%s], eWorkMode:[%s], eBufferProperty:[%s]\n",
+			DECODER_MODE_NAME(pCtx->iSetMode), DECODER_MODE_NAME(pCtx->iDecoderMode), OUTPUT_PROPERTY_NAME(pDstInfo->eBufferProperty));
+	}
+
+	// line sizes and uncropped dimensions
+	pDstLen[0]	= pDecPic->iLinesize[0];
+	pDstLen[1]	= pDecPic->iLinesize[1];
+	*pWidth		= kiLayerWidth;
+	*pHeight	= kiLayerHeight;
+
+	// system buffer description, cropping offsets count in units of 2 luma samples
+	pDstInfo->UsrData.sSystemBuffer.iFormat		= videoFormatI420;
+	pDstInfo->UsrData.sSystemBuffer.iWidth		= kiLayerWidth - (pCtx->sFrameCrop.iLeftOffset + pCtx->sFrameCrop.iRightOffset)*2;
+	pDstInfo->UsrData.sSystemBuffer.iHeight		= kiLayerHeight - (pCtx->sFrameCrop.iTopOffset + pCtx->sFrameCrop.iBottomOffset)*2;
+	pDstInfo->UsrData.sSystemBuffer.iStride[0]	= pDecPic->iLinesize[0];
+	pDstInfo->UsrData.sSystemBuffer.iStride[1]	= pDecPic->iLinesize[1];
+
+	// plane pointers moved to the top-left sample of the cropped picture
+	ppDst[0] = pDecPic->pData[0] + pCtx->sFrameCrop.iTopOffset*2*pDecPic->iLinesize[0] + pCtx->sFrameCrop.iLeftOffset*2;
+	ppDst[1] = pDecPic->pData[1] + pCtx->sFrameCrop.iTopOffset  *pDecPic->iLinesize[1] + pCtx->sFrameCrop.iLeftOffset;
+	ppDst[2] = pDecPic->pData[2] + pCtx->sFrameCrop.iTopOffset  *pDecPic->iLinesize[1] + pCtx->sFrameCrop.iLeftOffset;
+
+	pDstInfo->eBufferProperty	= BUFFER_HOST;
+	pDstInfo->iBufferStatus		= 1;
+
+	return 0;
+}
+
+/*
+ *	A slice is reconstructed when its layer is the target layer, or when it is a base
+ *	quality slice of the highest dependency layer whose base picture has to be stored.
+ */
+inline BOOL_T    CheckSliceNeedReconstruct(int16_t iCurDid, int16_t iCurQid, bool_t bStoreRefBasePicFlag, 
+	uint8_t uiDidMax, uint8_t uiLayerDqId, uint8_t uiTargetDqId)
+{
+	// target layer
+	if ( uiLayerDqId == uiTargetDqId )
+		return true;
+
+	// store base
+	return (iCurDid == uiDidMax) && (iCurQid == BASE_QUALITY_ID) && (bStoreRefBasePicFlag);
+}
+
+/*
+ *	Limit the target dq id by the layer requested in the decoding parameters; without
+ *	parameters the requested layer counts as 255.
+ */
+inline uint8_t GetTargetDqId(uint8_t uiTargetDqId,  SDecodingParam * psParam)
+{
+	uint8_t uiRequiredDqId = (uint8_t)255;
+
+	if ( psParam )
+		uiRequiredDqId = psParam->uiTargetDqLayer;
+
+	return WELS_MIN(uiTargetDqId, uiRequiredDqId);
+}
+	
+
+/*
+ *	Error handling after a failed slice construction: flags the reference as lost when the
+ *	NAL belongs to temporal layer 0 and adds dsBitstreamError to the error code.  Without
+ *	LONG_TERM_REF the parameter set state is reset while the lost flag is set.
+ */
+inline void_t    HandleReferenceLostL0(PWelsDecoderContext pCtx, PNalUnit pCurNal)
+{
+	const bool_t kbBaseTemporalLayer = ( 0 == pCurNal->sNalHeaderExt.uiTemporalId );
+
+	if ( kbBaseTemporalLayer )
+		pCtx->bReferenceLostAtT0Flag = true;
+
+#ifndef LONG_TERM_REF
+	if ( pCtx->bReferenceLostAtT0Flag )
+		ResetParameterSetsState(pCtx);
+#endif
+
+	pCtx->iErrorCode |= dsBitstreamError;
+}
+
+/*
+ *	Error handling for a lost reference: flags the reference as lost when the NAL belongs to
+ *	temporal layer 0 or 1 and adds dsRefLost to the error code.  Without LONG_TERM_REF the
+ *	parameter set state is reset while the lost flag is set.
+ */
+inline void_t    HandleReferenceLost(PWelsDecoderContext pCtx, PNalUnit pCurNal)
+{
+	const bool_t kbLowTemporalLayer = ( (0 == pCurNal->sNalHeaderExt.uiTemporalId) || (1 == pCurNal->sNalHeaderExt.uiTemporalId) );
+
+	if ( kbLowTemporalLayer )
+		pCtx->bReferenceLostAtT0Flag = true;
+
+#ifndef LONG_TERM_REF
+	if ( pCtx->bReferenceLostAtT0Flag )
+		ResetParameterSetsState(pCtx);
+#endif
+
+	pCtx->iErrorCode |= dsRefLost;
+}
+
+/*
+ *	Construct the target slice and run the reference-lost handling when that fails.
+ *	Returns the result of WelsTargetSliceConstruction().
+ */
+inline int32_t  WelsDecodeConstructSlice(PWelsDecoderContext pCtx, PNalUnit pCurNal)
+{
+	const int32_t kiRet = WelsTargetSliceConstruction(pCtx);
+
+	if ( 0 != kiRet )
+		HandleReferenceLostL0(pCtx, pCurNal);
+
+	return kiRet;
+}
+
+/*
+ *	Predeclared function routines ..
+ */
+/*
+ *	Parse the reference picture list reordering syntax of a slice header into
+ *	pSh->pRefPicListReordering.
+ *
+ *	Nothing is read for I and SI slices.  List 0 is parsed for every other slice type,
+ *	list 1 only for B slices.
+ *
+ *	return:	ERR_NONE on success; an ERR_INFO_INVALID_REF_REORDERING error number when the
+ *			reordering idc is larger than 3 or more entries are coded than the list can hold.
+ */
+int32_t ParseRefPicListReordering ( PBitStringAux pBs, PSliceHeader pSh )
+{
+	int32_t iList = 0;
+	const ESliceType keSt = pSh->eSliceType;
+	PRefPicListReorderSyn pRefPicListReordering = &pSh->pRefPicListReordering;
+	
+	if ( keSt == I_SLICE || keSt == SI_SLICE )
+		return ERR_NONE;
+
+	// Common syntaxs for P or B slices: list0, list1 followed if B slices used.
+	do {
+		pRefPicListReordering->bRefPicListReorderingFlag[iList]	= !!BsGetOneBit( pBs);
+
+		if ( pRefPicListReordering->bRefPicListReorderingFlag[iList] ){
+			int32_t iIdx = 0;
+			do {
+				// Keep the full width of the parsed value for the range check: narrowing it
+				// to 8 bits first would let e.g. 256 or 259 pass as 0 or 3.
+				const uint32_t kuiIdc = BsGetUe( pBs );
+				
+				// reject entries beyond the table and idc values outside 0..3
+				if ( (iIdx >= MAX_REF_PIC_COUNT) || (kuiIdc > 3) )
+				{
+					return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_REF_REORDERING);
+				}
+				pRefPicListReordering->sReorderingSyn[iList][iIdx].uiReorderingOfPicNumsIdc	= (uint8_t)kuiIdc;
+				if ( kuiIdc == 3 )	// end of the reordering commands of this list
+					break;
+
+				// iIdx < MAX_REF_PIC_COUNT is already guaranteed by the check above
+				if ( iIdx >= pSh->uiRefCount[iList] )
+					return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_REF_REORDERING);
+				
+				if (kuiIdc == 0 || kuiIdc == 1){
+					pRefPicListReordering->sReorderingSyn[iList][iIdx].uiAbsDiffPicNumMinus1 = BsGetUe( pBs );	// uiAbsDiffPicNumMinus1
+				}
+				else if (kuiIdc == 2){				
+					pRefPicListReordering->sReorderingSyn[iList][iIdx].uiLongTermPicNum= BsGetUe( pBs );			
+				}
+				
+				++ iIdx;
+			} while(true);
+		}
+		if (keSt != B_SLICE)
+			break;
+		++ iList;
+	} while(iList < LIST_A);
+	
+	return ERR_NONE;
+}
+
+/*
+ *	Parse the decoded reference picture marking syntax of a slice header into
+ *	pSh->sRefMarking.
+ *
+ *	IDR slices carry two flags only.  Other slices carry the adaptive marking flag and, when
+ *	it is set, a list of memory management control operations that ends at MMCO_END or after
+ *	MAX_MMCO_COUNT entries.  Always returns ERR_NONE; pCtx is not used.
+ */
+int32_t ParseDecRefPicMarking ( PWelsDecoderContext pCtx, PBitStringAux pBs, PSliceHeader pSh, PSps pSps, const bool_t kbIdrFlag)
+{
+	PRefPicMarking const kpMarking = &pSh->sRefMarking;
+	int32_t iIdx = 0;
+
+	if ( kbIdrFlag )
+	{
+		kpMarking->bNoOutputOfPriorPicsFlag	= !!BsGetOneBit( pBs );
+		kpMarking->bLongTermRefFlag			= !!BsGetOneBit( pBs );
+		return ERR_NONE;
+	}
+
+	kpMarking->bAdaptiveRefPicMarkingModeFlag	= !!BsGetOneBit( pBs );
+	if ( !kpMarking->bAdaptiveRefPicMarkingModeFlag )
+		return ERR_NONE;
+
+	do {
+		const int32_t kiMmco = BsGetUe( pBs );
+
+		kpMarking->sMmcoRef[iIdx].uiMmcoType = kiMmco;
+		if ( MMCO_END == kiMmco )
+			break;
+
+		switch ( kiMmco )
+		{
+		case MMCO_SHORT2UNUSED:
+		case MMCO_SHORT2LONG:
+			// short term picture addressed by its distance to the current frame number
+			kpMarking->sMmcoRef[iIdx].iDiffOfPicNum = 1 + BsGetUe( pBs );
+			kpMarking->sMmcoRef[iIdx].iShortFrameNum = (pSh->iFrameNum - kpMarking->sMmcoRef[iIdx].iDiffOfPicNum) & ((1<<pSps->uiLog2MaxFrameNum)-1);
+			if ( MMCO_SHORT2LONG == kiMmco )
+				kpMarking->sMmcoRef[iIdx].iLongTermFrameIdx = BsGetUe( pBs );
+			break;
+		case MMCO_LONG2UNUSED:
+			kpMarking->sMmcoRef[iIdx].uiLongTermPicNum = BsGetUe( pBs );
+			break;
+		case MMCO_LONG:
+			kpMarking->sMmcoRef[iIdx].iLongTermFrameIdx = BsGetUe( pBs );
+			break;
+		case MMCO_SET_MAX_LONG:
+			kpMarking->sMmcoRef[iIdx].iMaxLongTermFrameIdx = -1 + BsGetUe( pBs );
+			break;
+		default:	// operations without further syntax elements
+			break;
+		}
+
+		++ iIdx;
+	} while ( iIdx < MAX_MMCO_COUNT );
+
+	return ERR_NONE;
+}
+
+/*
+ *	Fill a slice header extension with default values.
+ *
+ *	The base pred weight table flag depends on the NAL header extension, the scaled
+ *	reference layer size is derived from the MB dimensions of the contained slice header.
+ *	sScaledRefLayer is not touched.  Returns false when one of the pointers is NULL.
+ */
+bool_t FillDefaultSliceHeaderExt ( PSliceHeaderExt pShExt, PNalUnitHeaderExt pNalExt )
+{
+	if ( NULL == pShExt || NULL == pNalExt )
+		return false;
+
+	// only set when inter layer prediction is allowed and the NAL is of base quality
+	pShExt->bBasePredWeightTableFlag	= !( pNalExt->iNoInterLayerPredFlag || pNalExt->uiQualityId > 0 );
+
+	// inter layer reference and deblocking
+	pShExt->uiRefLayerDqId							= (uint8_t)-1;
+	pShExt->uiDisableInterLayerDeblockingFilterIdc	= 0;
+	pShExt->iInterLayerSliceAlphaC0Offset			= 0;
+	pShExt->iInterLayerSliceBetaOffset				= 0;
+
+	// resampling
+	pShExt->bConstrainedIntraResamplingFlag			= false;
+	pShExt->uiRefLayerChromaPhaseXPlus1Flag			= 0;
+	pShExt->uiRefLayerChromaPhaseYPlus1				= 1;
+	pShExt->iScaledRefLayerPicWidthInSampleLuma		= pShExt->sSliceHeader.iMbWidth << 4;
+	pShExt->iScaledRefLayerPicHeightInSampleLuma	= pShExt->sSliceHeader.iMbHeight << 4;
+
+	// prediction modes
+	pShExt->bSliceSkipFlag				= false;
+	pShExt->bAdaptiveBaseModeFlag		= false;
+	pShExt->bDefaultBaseModeFlag		= false;
+	pShExt->bAdaptiveMotionPredFlag		= false;
+	pShExt->bDefaultMotionPredFlag		= false;
+	pShExt->bAdaptiveResidualPredFlag	= false;
+	pShExt->bDefaultResidualPredFlag	= false;
+	pShExt->bTCoeffLevelPredFlag		= false;
+
+	// full coefficient scan range
+	pShExt->uiScanIdxStart				= 0;
+	pShExt->uiScanIdxEnd				= 15;
+
+	return true;
+}
+
+/*
+ * WelsInitMemory
+ * Request the startup memory of the decoder: the NAL list of the access unit and the raw
+ * data buffer (MAX_ACCESS_UINT_CAPACITY bytes).  Also resets the target dq id, the end of
+ * stream flag and the image size of the context.
+ * return:
+ *	ERR_NONE - success; ERR_INFO_INVALID_PTR for a NULL context; ERR_INFO_OUT_OF_MEMORY when
+ *	one of the two allocations fails.
+*/
+int32_t WelsInitMemory( PWelsDecoderContext pCtx )
+{
+	uint8_t *pRawBuf = NULL;
+
+	if ( NULL == pCtx )
+		return ERR_INFO_INVALID_PTR;
+
+	// NAL unit list of the access unit
+	if ( 0 != MemInitNalList( &pCtx->pAccessUnitList, MAX_NAL_UNIT_NUM_IN_AU ) )
+		return ERR_INFO_OUT_OF_MEMORY;
+
+	// raw data buffer
+	pRawBuf = static_cast<uint8_t*> (WelsMalloc( MAX_ACCESS_UINT_CAPACITY, "pCtx->sRawData->pHead" ));
+	pCtx->sRawData.pHead = pRawBuf;
+	if ( NULL == pRawBuf )
+		return ERR_INFO_OUT_OF_MEMORY;
+
+	pCtx->sRawData.pCurPos		= pRawBuf;
+	pCtx->sRawData.pStartPos	= pRawBuf;
+	pCtx->sRawData.pEnd			= pRawBuf + MAX_ACCESS_UINT_CAPACITY;
+
+	pCtx->uiTargetDqId		= (uint8_t)-1;
+	pCtx->bEndOfStreamFlag	= false;
+	pCtx->iImgWidthInPixel	= 0;
+	pCtx->iImgHeightInPixel	= 0;
+
+	return ERR_NONE;
+}
+
+/*
+ * WelsFreeMemory
+ * Free memory introduced in WelsInitMemory at destruction of decoder, together with the
+ * parameter block of the context when there is one.  A NULL context is ignored.
+ */
+void_t WelsFreeMemory( PWelsDecoderContext pCtx )
+{
+	if ( NULL == pCtx )
+		return;
+
+	// decoding parameters
+	if ( pCtx->pParam != NULL )
+	{
+		WelsFree( pCtx->pParam, "pCtx->pParam" );
+		pCtx->pParam = NULL;
+	}
+
+	// NAL unit list of the access unit
+	MemFreeNalList( &pCtx->pAccessUnitList );
+
+	// raw data buffer; every pointer into it is cleared afterwards
+	if ( NULL != pCtx->sRawData.pHead )
+		WelsFree(pCtx->sRawData.pHead, "pCtx->sRawData->pHead");
+
+	pCtx->sRawData.pCurPos		= NULL;
+	pCtx->sRawData.pStartPos	= NULL;
+	pCtx->sRawData.pEnd			= NULL;
+	pCtx->sRawData.pHead		= NULL;
+}
+
+/*
+ *	DecodeNalHeaderExt
+ *	Trigger condition: NAL_UNIT_TYPE = NAL_UNIT_PREFIX or NAL_UNIT_CODED_SLICE_EXT
+ *	Unpacks the three bytes at pSrc into pNal->sNalHeaderExt and derives the layer dq id
+ *	as (dependency id << 4) | quality id.
+ *	Parameter:
+ *	pNal:	target NALUnit ptr
+ *	pSrc:	NAL Unit bitstream, 3 bytes are read
+ */
+void_t DecodeNalHeaderExt( PNalUnit pNal, uint8_t* pSrc )
+{
+	PNalUnitHeaderExt pExt	= &pNal->sNalHeaderExt;
+	const uint8_t kuiByte0	= pSrc[0];
+	const uint8_t kuiByte1	= pSrc[1];
+	const uint8_t kuiByte2	= pSrc[2];
+
+	// byte 0: idr flag (bit 6), priority id (bits 5..0)
+	pExt->bIdrFlag				= !!(kuiByte0 & 0x40);
+	pExt->uiPriorityId			= kuiByte0 & 0x3F;
+
+	// byte 1: no inter layer pred flag (bit 7), dependency id (bits 6..4), quality id (bits 3..0)
+	pExt->iNoInterLayerPredFlag	= kuiByte1 >> 7;
+	pExt->uiDependencyId		= (kuiByte1 & 0x70) >> 4;
+	pExt->uiQualityId			= kuiByte1 & 0x0F;
+
+	// byte 2: temporal id (bits 7..5), three flags (bits 4..2), reserved (bits 1..0)
+	pExt->uiTemporalId			= kuiByte2 >> 5;
+	pExt->bUseRefBasePicFlag	= !!(kuiByte2 & 0x10);
+	pExt->bDiscardableFlag		= !!(kuiByte2 & 0x08);
+	pExt->bOutputFlag			= !!(kuiByte2 & 0x04);
+	pExt->uiReservedThree2Bits	= kuiByte2 & 0x03;
+
+	pExt->uiLayerDqId			= (pExt->uiDependencyId << 4) | pExt->uiQualityId;
+}
+
+
+#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+/*
+ *	CheckPpsId
+ *	Search the PPS list of the context for the entry whose iPpsId equals uiPpsId.
+ *	On a hit *ppPps points to that entry and 0 is returned. When the list is empty or holds
+ *	no such entry, dsNoParamSets is added to pCtx->iErrorCode and returned (*ppPps untouched).
+ */
+int32_t CheckPpsId( PWelsDecoderContext pCtx, PPps* ppPps, uint32_t uiPpsId )
+{
+	PPps pList				= pCtx->sPpsBuffer;
+	const int32_t kiCount	= pCtx->iPpsTotalNum;
+	int32_t iIdx			= 0;
+
+	if ( kiCount <= 0 )
+	{
+		pCtx->iErrorCode |= dsNoParamSets;
+
+		WelsLog( pCtx, WELS_LOG_WARNING, "CheckPpsId():::::PPS list is empty...NO PPS!!!\n" );
+		return dsNoParamSets;
+	}
+
+	for ( iIdx = 0; iIdx < kiCount; ++iIdx )
+	{
+		if ( uiPpsId == pList[iIdx].iPpsId )
+		{
+			*ppPps = &pList[iIdx];
+			return 0;
+		}
+	}
+
+	// whole list visited without a match
+	pCtx->iErrorCode |= dsNoParamSets;
+
+	WelsLog( pCtx, WELS_LOG_WARNING, "CheckPpsId()::::::CAN NOT find the matching from the PPS List.  iPpsId:%d\n", uiPpsId );
+	return dsNoParamSets;
+}
+
+/*
+ *	CheckSpsId
+ *	Search for the parameter set whose iSpsId equals iSpsId.
+ *	bExtensionFlag set:		the subset SPS list is searched, on a hit *ppSubsetSps points to the
+ *							entry and *ppSps to the SPS embedded in it.
+ *	bExtensionFlag clear:	the SPS list is searched, on a hit *ppSps points to the entry and
+ *							*ppSubsetSps is set to NULL.
+ *	return: 0 on a hit; otherwise dsNoParamSets, which is also added to pCtx->iErrorCode.
+ */
+int32_t CheckSpsId( PWelsDecoderContext pCtx, PSubsetSps* ppSubsetSps, PSps* ppSps, int32_t iSpsId, bool_t bExtensionFlag )
+{
+	int32_t iIdx = 0;
+
+	if ( !bExtensionFlag )
+	{
+		PSps pList				= pCtx->sSpsBuffer;
+		const int32_t kiCount	= pCtx->iSpsTotalNum;
+
+		if ( kiCount <= 0 )
+		{
+			pCtx->iErrorCode |= dsNoParamSets;
+
+			WelsLog( pCtx, WELS_LOG_WARNING, "CheckSpsId()::::SPS list is empty....NO SPS\n" );
+			return dsNoParamSets;
+		}
+
+		for ( iIdx = 0; iIdx < kiCount; ++iIdx )
+		{
+			if ( iSpsId == pList[iIdx].iSpsId )
+			{
+				*ppSubsetSps	= NULL;
+				*ppSps			= &pList[iIdx];
+				return 0;
+			}
+		}
+
+		pCtx->iErrorCode |= dsNoParamSets;
+
+		WelsLog( pCtx, WELS_LOG_WARNING, "CheckSpsId()::::::CAN NOT find the matching from the SPS List.  iSpsId:%d\n", iSpsId );
+		return dsNoParamSets;
+	}
+	else
+	{
+		PSubsetSps pList		= pCtx->sSubsetSpsBuffer;
+		const int32_t kiCount	= pCtx->iSubspsTotalNum;
+
+		if ( kiCount <= 0 )
+		{
+			pCtx->iErrorCode |= dsNoParamSets;
+
+			WelsLog( pCtx, WELS_LOG_WARNING, "CheckSpsId()::::SUBSPS list is empty....NO SUBSPS\n" );
+			return dsNoParamSets;
+		}
+
+		for ( iIdx = 0; iIdx < kiCount; ++iIdx )
+		{
+			if ( iSpsId == pList[iIdx].sSps.iSpsId )
+			{
+				*ppSubsetSps	= &pList[iIdx];
+				*ppSps			= &pList[iIdx].sSps;
+				return 0;
+			}
+		}
+
+		pCtx->iErrorCode |= dsNoParamSets;
+
+		WelsLog( pCtx, WELS_LOG_WARNING, "CheckSpsId()::::::CAN NOT find the matching from the SUBSPS List.  iSpsId:%d\n", iSpsId );
+		return dsNoParamSets;
+	}
+}
+
+#endif
+/*
+ *	ParseSliceHeaderSyntaxs
+ *	Parse the slice header of the newest NAL unit in pCtx->pAccessUnitList from pBs and store the
+ *	syntax elements in that NAL unit (sSliceHeaderExt and the sSliceHeader embedded in it).
+ *	Parameter:
+ *	pCtx:				decoder context, supplies the access unit list and the PPS/SPS/subset SPS buffers
+ *	pBs:				bit string reader the syntax elements are read from
+ *	kbExtensionFlag:	true when the slice header carries the extension syntax elements too
+ *	return:
+ *	ERR_NONE on success; otherwise an error code telling which syntax element is invalid,
+ *	which parameter set is missing or which feature is not supported.
+ */
+int32_t ParseSliceHeaderSyntaxs ( PWelsDecoderContext pCtx, PBitStringAux pBs, const bool_t kbExtensionFlag )
+{
+	// newest NAL unit of the list; NULL for an empty list so that (uiAvailUnitsNum - 1) can not wrap around
+	PNalUnit const kpCurNal				= ( pCtx->pAccessUnitList->uiAvailUnitsNum > 0 )
+										? pCtx->pAccessUnitList->pNalUnitsList[pCtx->pAccessUnitList->uiAvailUnitsNum-1]
+										: NULL;
+
+	PNalUnitHeaderExt pNalHeaderExt	= NULL;
+	PSliceHeader pSliceHead			= NULL;
+	PSliceHeaderExt pSliceHeadExt	= NULL;
+	PSubsetSps pSubsetSps				= NULL;
+	PSps pSps							= NULL;
+	PPps pPps							= NULL;
+	ENalUnitType eNalType				= static_cast<ENalUnitType> (0);
+	int32_t iPpsId						= 0;
+	int32_t iRet						= ERR_NONE;
+	uint32_t uiSliceType				= 0;	// full ue(v) width: an 8 bits variable would truncate before the range check
+	uint8_t uiQualityId					= BASE_QUALITY_ID;
+	bool_t	bIdrFlag					= false;
+	bool_t	bSgChangeCycleInvolved	= false;	// involved slice group change cycle ?
+
+	if (kpCurNal == NULL)
+	{
+		return ERR_INFO_OUT_OF_MEMORY;
+	}
+
+	pNalHeaderExt	= &kpCurNal->sNalHeaderExt;
+	pSliceHead		= &kpCurNal->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
+	eNalType		= pNalHeaderExt->sNalUnitHeader.eNalUnitType;
+
+	pSliceHeadExt	= &kpCurNal->sNalData.sVclNal.sSliceHeaderExt;
+
+	{
+		// Clear the whole extended slice header, but keep bStoreRefBasePicFlag and sRefBasePicMarking:
+		// PrefetchNalHeaderExtSyntax() may have filled them in already.
+		SRefBasePicMarking sBaseMarking;
+		const bool_t kbStoreRefBaseFlag = pSliceHeadExt->bStoreRefBasePicFlag;
+		memcpy(&sBaseMarking, &pSliceHeadExt->sRefBasePicMarking, sizeof(SRefBasePicMarking));//confirmed_safe_unsafe_usage
+		memset(pSliceHeadExt, 0, sizeof(SSliceHeaderExt));
+		pSliceHeadExt->bStoreRefBasePicFlag	= kbStoreRefBaseFlag;
+		memcpy(&pSliceHeadExt->sRefBasePicMarking, &sBaseMarking, sizeof(SRefBasePicMarking));//confirmed_safe_unsafe_usage
+	}
+
+	kpCurNal->sNalData.sVclNal.bSliceHeaderExtFlag	= kbExtensionFlag;
+
+	pSliceHead->iFirstMbInSlice	= BsGetUe( pBs );
+
+	uiSliceType= BsGetUe( pBs );
+	if(uiSliceType > 9)
+	{
+		WelsLog( pCtx, WELS_LOG_WARNING, "slice type too large (%d) at first_mb(%d)\n", uiSliceType, pSliceHead->iFirstMbInSlice);
+		return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_SLICE_TYPE);
+	}
+	// types 5..9 name the same slice types as 0..4
+	if(uiSliceType > 4)
+		uiSliceType -= 5;
+
+	if ( kbExtensionFlag ){
+		if (uiSliceType > 2){
+			WelsLog( pCtx, WELS_LOG_WARNING, "Invalid slice type(%d).\n", uiSliceType);
+			return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_SLICE_TYPE);
+		}
+	}
+
+	pSliceHead->eSliceType	= static_cast <ESliceType> (uiSliceType);
+
+	iPpsId= BsGetUe( pBs );
+
+	// a huge ue(v) turns negative in iPpsId, so both bounds have to be checked before indexing
+	if(iPpsId < 0 || iPpsId >= MAX_PPS_COUNT){
+		WelsLog( pCtx, WELS_LOG_WARNING, "iPpsId out of range\n");
+		return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_PPS_ID_OVERFLOW);
+	}
+
+#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+	if ( CheckPpsId( pCtx, &pPps, iPpsId ) )
+	{
+		return dsNoParamSets;
+	}
+#else
+	pPps    = &pCtx->sPpsBuffer[iPpsId];
+#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+
+	if (pPps->uiNumSliceGroups == 0){
+		WelsLog( pCtx, WELS_LOG_WARNING, "non existing PPS referenced\n");
+		return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_NO_PARAM_SETS);
+	}
+
+	if (pPps->iSpsId < 0 || pPps->iSpsId >= MAX_SPS_COUNT){
+		WelsLog( pCtx, WELS_LOG_WARNING, "iSpsId out of range\n");
+		return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_SPS_ID_OVERFLOW);
+	}
+
+
+#ifdef MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+	if ( CheckSpsId( pCtx, &pSubsetSps, &pSps, pPps->iSpsId, kbExtensionFlag ) )
+	{
+		return dsNoParamSets;
+	}
+#else
+	if ( kbExtensionFlag )
+	{
+		pSubsetSps	= &pCtx->sSubsetSpsBuffer[pPps->iSpsId];
+		pSps		= &pSubsetSps->sSps;
+	}
+	else
+	{
+		pSps		= &pCtx->sSpsBuffer[pPps->iSpsId];
+	}
+	pCtx->pSps			= pSps;
+#endif //MOSAIC_AVOID_BASED_ON_SPS_PPS_ID
+	pSliceHead->iPpsId = iPpsId;
+	pSliceHead->iSpsId = pPps->iSpsId;
+	pSliceHead->pPps   = pPps;
+	pSliceHead->pSps   = pSps;
+
+	pSliceHeadExt->pSubsetSps = pSubsetSps;
+
+	bIdrFlag = (!kbExtensionFlag && eNalType == NAL_UNIT_CODED_SLICE_IDR) || (kbExtensionFlag && pNalHeaderExt->bIdrFlag);
+
+	// a zero uiLog2MaxFrameNum is treated as "this SPS was never received"
+	if(pSps->uiLog2MaxFrameNum == 0){
+		WelsLog( pCtx, WELS_LOG_WARNING, "non existing SPS referenced\n");
+		return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_NO_PARAM_SETS);
+	}
+	pSliceHead->iFrameNum = BsGetBits(pBs, pSps->uiLog2MaxFrameNum);
+
+	// only frame pictures are supported, hence no field syntax elements are read
+	pSliceHead->bFieldPicFlag		= false;
+	pSliceHead->bBottomFiledFlag	= false;
+	if( !pSps->bFrameMbsOnlyFlag ){
+		WelsLog( pCtx, WELS_LOG_WARNING, "ParseSliceHeaderSyntaxs(): frame_mbs_only_flag = %d not supported. \n", pSps->bFrameMbsOnlyFlag );
+		return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_MBAFF);
+	}
+	pSliceHead->iMbWidth	= pSps->iMbWidth;
+	pSliceHead->iMbHeight	= pSps->iMbHeight / (1 + pSliceHead->bFieldPicFlag);
+
+	if ( bIdrFlag ){
+		if ( pSliceHead->iFrameNum != 0 ){
+			WelsLog( pCtx, WELS_LOG_WARNING, "ParseSliceHeaderSyntaxs(), invaild frame number: %d due to IDR frame introduced!\n", pSliceHead->iFrameNum);
+			return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_FRAME_NUM);
+		}
+		pSliceHead->uiIdrPicId	= BsGetUe(pBs); /* uiIdrPicId */
+#ifdef LONG_TERM_REF
+		pCtx->uiCurIdrPicId      = pSliceHead->uiIdrPicId;
+#endif
+	}
+
+	pSliceHead->iDeltaPicOrderCntBottom	= 0;
+	pSliceHead->iDeltaPicOrderCnt[0]		=
+	pSliceHead->iDeltaPicOrderCnt[1]		= 0;
+	if(pSps->uiPocType == 0){
+		pSliceHead->iPicOrderCntLsb	= BsGetBits(pBs, pSps->iLog2MaxPocLsb);
+		if( pPps->bPicOrderPresentFlag && !pSliceHead->bFieldPicFlag ){
+			pSliceHead->iDeltaPicOrderCntBottom	= BsGetSe(pBs);
+		}
+	}
+	else if(pSps->uiPocType == 1 && !pSps->bDeltaPicOrderAlwaysZeroFlag ){
+		pSliceHead->iDeltaPicOrderCnt[0]	= BsGetSe(pBs);
+		if( pPps->bPicOrderPresentFlag && !pSliceHead->bFieldPicFlag )
+			pSliceHead->iDeltaPicOrderCnt[1]= BsGetSe(pBs);
+	}
+
+	pSliceHead->iRedundantPicCnt	= 0;
+	if( pPps->bRedundantPicCntPresentFlag ){
+		pSliceHead->iRedundantPicCnt = BsGetUe(pBs);
+	}
+
+	//set defaults, might be overriden a few line later
+	pSliceHead->uiRefCount[0]	= pPps->uiNumRefIdxL0Active;
+	pSliceHead->uiRefCount[1]	= pPps->uiNumRefIdxL1Active;
+	if ( kbExtensionFlag ){
+		uiQualityId = pNalHeaderExt->uiQualityId;
+		if ( BASE_QUALITY_ID == uiQualityId && (EP_SLICE == uiSliceType || EB_SLICE == uiSliceType) ){
+			const bool_t kbBipredFlag = (EB_SLICE == uiSliceType);
+			if ( kbBipredFlag )
+			{
+				WelsLog ( pCtx, WELS_LOG_WARNING, "ParseSliceHeaderSyntaxs(): kbBipredFlag = 1 not supported.\n");
+				return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_BIPRED);
+			}
+			pSliceHead->bNumRefIdxActiveOverrideFlag	= !!BsGetOneBit(pBs);
+			if ( pSliceHead->bNumRefIdxActiveOverrideFlag ){
+				pSliceHead->uiRefCount[0]	= 1 + BsGetUe(pBs);
+			}
+		}
+	}
+	else if(uiSliceType == P_SLICE || uiSliceType == SP_SLICE || uiSliceType == B_SLICE){
+		const bool_t kbBipredFlag = (B_SLICE == uiSliceType);
+		if( kbBipredFlag ){
+			WelsLog ( pCtx, WELS_LOG_WARNING, "ParseSliceHeaderSyntaxs(): kbBipredFlag = 1 not supported.\n");
+			return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_BIPRED);
+		}
+		pSliceHead->bNumRefIdxActiveOverrideFlag	= !!BsGetOneBit(pBs);
+		if( pSliceHead->bNumRefIdxActiveOverrideFlag ){
+			pSliceHead->uiRefCount[0]	= 1 + BsGetUe(pBs);
+		}
+	}
+
+	if( pSliceHead->uiRefCount[0] > MAX_REF_PIC_COUNT || pSliceHead->uiRefCount[1] > MAX_REF_PIC_COUNT){
+		WelsLog( pCtx, WELS_LOG_WARNING, "reference overflow\n");
+		return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_REF_COUNT_OVERFLOW);
+	}
+
+	if ( BASE_QUALITY_ID == uiQualityId ){
+		iRet = ParseRefPicListReordering(pBs, pSliceHead);
+		if (iRet != ERR_NONE){
+			WelsLog( pCtx, WELS_LOG_WARNING, "invalid ref pPic list reordering syntaxs!\n");
+			return iRet;
+		}
+
+		if ( kbExtensionFlag ){
+			if ( pNalHeaderExt->iNoInterLayerPredFlag || pNalHeaderExt->uiQualityId > 0 )
+				pSliceHeadExt->bBasePredWeightTableFlag	= false;
+			else
+				pSliceHeadExt->bBasePredWeightTableFlag	= true;
+		}
+
+		if( kpCurNal->sNalHeaderExt.sNalUnitHeader.uiNalRefIdc != 0 ){
+			iRet = ParseDecRefPicMarking(pCtx, pBs, pSliceHead, pSps, bIdrFlag );
+			if (iRet != ERR_NONE){
+				return iRet;
+			}
+
+			if ( kbExtensionFlag && !pSubsetSps->sSpsSvcExt.bSliceHeaderRestrictionFlag ){
+				pSliceHeadExt->bStoreRefBasePicFlag	= !!BsGetOneBit(pBs);
+				if ( (pNalHeaderExt->bUseRefBasePicFlag || pSliceHeadExt->bStoreRefBasePicFlag) && !bIdrFlag ){
+					WelsLog( pCtx, WELS_LOG_WARNING, "ParseSliceHeaderSyntaxs(): bUseRefBasePicFlag or bStoreRefBasePicFlag = 1 not supported.\n" );
+					return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_ILP);
+				}
+			}
+		}
+	}
+
+	if ( pPps->bEntropyCodingModeFlag ){
+		WelsLog( pCtx, WELS_LOG_WARNING, "ParseSliceHeaderSyntaxs(): CABAC in Enhancement layer not supported.\n" );
+		return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_CABAC_EL);
+	}
+
+	pSliceHead->iSliceQpDelta	= BsGetSe(pBs);
+	pSliceHead->iSliceQp		= pPps->iPicInitQp + pSliceHead->iSliceQpDelta;
+	if( pSliceHead->iSliceQp < 0 || pSliceHead->iSliceQp > 51 ){
+		WelsLog( pCtx, WELS_LOG_WARNING, "QP %d out of range\n", pSliceHead->iSliceQp);
+		return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_INVALID_QP);
+	}
+
+	//FIXME qscale / qp ... stuff
+	if ( !kbExtensionFlag ){
+		if( uiSliceType == SP_SLICE || uiSliceType == SI_SLICE )
+		{
+			WelsLog( pCtx, WELS_LOG_WARNING, "SP/SI not supported\n");
+			return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_SPSI);
+		}
+	}
+
+	pSliceHead->uiDisableDeblockingFilterIdc	= 0;
+	pSliceHead->iSliceAlphaC0Offset			= 0;
+	pSliceHead->iSliceBetaOffset				= 0;
+	if ( pPps->bDeblockingFilterControlPresentFlag ){
+		pSliceHead->uiDisableDeblockingFilterIdc	= BsGetUe(pBs);
+		//refer to JVT-X201wcm1.doc G.7.4.3.4--2010.4.20
+		if ( pSliceHead->uiDisableDeblockingFilterIdc > 6 )
+		{
+			WelsLog( pCtx, WELS_LOG_WARNING, "disable_deblock_filter_idc (%d) out of range [0, 6]\n", pSliceHead->uiDisableDeblockingFilterIdc );
+			return ERR_INFO_INVALID_DBLOCKING_IDC;
+		}
+		if (pSliceHead->uiDisableDeblockingFilterIdc != 1){
+			pSliceHead->iSliceAlphaC0Offset	= BsGetSe(pBs) << 1;	// slice_alpha_c0_offset_div2
+			pSliceHead->iSliceBetaOffset		= BsGetSe(pBs) << 1;	// iSliceBetaOffset
+		}
+	}
+
+	bSgChangeCycleInvolved	= (pPps->uiNumSliceGroups > 1 && pPps->uiSliceGroupMapType >= 3 && pPps->uiSliceGroupMapType <= 5);
+	if ( kbExtensionFlag && bSgChangeCycleInvolved )
+		bSgChangeCycleInvolved= (bSgChangeCycleInvolved && (uiQualityId == BASE_QUALITY_ID));
+	if ( bSgChangeCycleInvolved ){
+		if ( pPps->uiSliceGroupChangeRate > 0 ){
+			// H.264 7.4.3: slice_group_change_cycle takes Ceil( Log2( PicSizeInMapUnits / SliceGroupChangeRate + 1 ) ) bits,
+			// with an exact division and a base 2 logarithm. That is the bit length of the rounded up quotient,
+			// computed here with integers only.
+			const uint32_t kuiMapUnits	= pPps->uiPicSizeInMapUnits;
+			const uint32_t kuiRate		= pPps->uiSliceGroupChangeRate;
+			const uint32_t kuiMaxCycle	= kuiMapUnits / kuiRate + ( (kuiMapUnits % kuiRate) ? 1 : 0 );
+			int32_t iNumBits			= 0;
+			while ( iNumBits < 32 && (kuiMaxCycle >> iNumBits) != 0 )
+				++ iNumBits;
+			pSliceHead->iSliceGroupChangeCycle	= BsGetBits(pBs, iNumBits);	// For FMO extra types
+		}
+		else
+			pSliceHead->iSliceGroupChangeCycle	= 0;
+	}
+
+	if ( !kbExtensionFlag ){
+		FillDefaultSliceHeaderExt ( pSliceHeadExt, pNalHeaderExt );
+	}
+	else{
+		/* Extra syntax elements newly introduced */
+		pSliceHeadExt->pSubsetSps	= pSubsetSps;
+
+		if ( !pNalHeaderExt->iNoInterLayerPredFlag && BASE_QUALITY_ID == uiQualityId ){
+			//the following should be deleted for CODE_CLEAN
+			pSliceHeadExt->uiRefLayerDqId	= BsGetUe(pBs);
+			if ( pSubsetSps->sSpsSvcExt.bInterLayerDeblockingFilterCtrlPresentFlag )
+			{
+				pSliceHeadExt->uiDisableInterLayerDeblockingFilterIdc	= BsGetUe(pBs);
+				//refer to JVT-X201wcm1.doc G.7.4.3.4--2010.4.20
+				if ( pSliceHeadExt->uiDisableInterLayerDeblockingFilterIdc > 6 )
+				{
+					WelsLog( pCtx, WELS_LOG_WARNING, "disable_inter_layer_deblock_filter_idc (%d) out of range [0, 6]\n", pSliceHeadExt->uiDisableInterLayerDeblockingFilterIdc );
+					return ERR_INFO_INVALID_DBLOCKING_IDC;
+				}
+				if ( pSliceHeadExt->uiDisableInterLayerDeblockingFilterIdc != 1 ){
+					pSliceHeadExt->iInterLayerSliceAlphaC0Offset	= BsGetSe(pBs) << 1;
+					pSliceHeadExt->iInterLayerSliceBetaOffset		= BsGetSe(pBs) << 1;
+				}
+			}
+
+			pSliceHeadExt->uiRefLayerChromaPhaseXPlus1Flag	= pSubsetSps->sSpsSvcExt.uiSeqRefLayerChromaPhaseXPlus1Flag;
+			pSliceHeadExt->uiRefLayerChromaPhaseYPlus1		= pSubsetSps->sSpsSvcExt.uiSeqRefLayerChromaPhaseYPlus1;
+
+			pSliceHeadExt->bConstrainedIntraResamplingFlag	= !!BsGetOneBit(pBs);
+
+			{
+				// size of the scaled reference layer: picture size minus the offsets taken from the subset SPS
+				SPosOffset pos;
+				pos.iLeftOffset	= pSubsetSps->sSpsSvcExt.sSeqScaledRefLayer.iLeftOffset;
+				pos.iTopOffset	= pSubsetSps->sSpsSvcExt.sSeqScaledRefLayer.iTopOffset * (2 - pSps->bFrameMbsOnlyFlag);
+				pos.iRightOffset= pSubsetSps->sSpsSvcExt.sSeqScaledRefLayer.iRightOffset;
+				pos.iBottomOffset=pSubsetSps->sSpsSvcExt.sSeqScaledRefLayer.iBottomOffset * (2 - pSps->bFrameMbsOnlyFlag);
+				//memcpy(&pSliceHeadExt->sScaledRefLayer, &pos, sizeof(SPosOffset));//confirmed_safe_unsafe_usage
+				pSliceHeadExt->iScaledRefLayerPicWidthInSampleLuma	= (pSliceHead->iMbWidth << 4) - (pos.iLeftOffset+pos.iRightOffset);
+				pSliceHeadExt->iScaledRefLayerPicHeightInSampleLuma	= (pSliceHead->iMbHeight << 4) - (pos.iTopOffset+pos.iBottomOffset) / (1 + pSliceHead->bFieldPicFlag);
+			}
+		}
+		else if (uiQualityId > BASE_QUALITY_ID){
+			WelsLog( pCtx, WELS_LOG_WARNING, "MGS not supported.\n" );
+			return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_MGS);
+		}
+		else{
+			pSliceHeadExt->uiRefLayerDqId	= (uint8_t)-1;
+		}
+
+		pSliceHeadExt->bSliceSkipFlag	= false;
+		pSliceHeadExt->bAdaptiveBaseModeFlag	= false;
+		pSliceHeadExt->bDefaultBaseModeFlag	= false;
+		pSliceHeadExt->bAdaptiveMotionPredFlag	= false;
+		pSliceHeadExt->bDefaultMotionPredFlag	= false;
+		pSliceHeadExt->bAdaptiveResidualPredFlag	= false;
+		pSliceHeadExt->bDefaultResidualPredFlag	= false;
+		if ( pNalHeaderExt->iNoInterLayerPredFlag )
+			pSliceHeadExt->bTCoeffLevelPredFlag	= false;
+		else
+			pSliceHeadExt->bTCoeffLevelPredFlag	= pSubsetSps->sSpsSvcExt.bSeqTCoeffLevelPredFlag;
+
+		if ( !pNalHeaderExt->iNoInterLayerPredFlag ){
+			pSliceHeadExt->bSliceSkipFlag	= !!BsGetOneBit(pBs);
+			if ( pSliceHeadExt->bSliceSkipFlag ){
+				pSliceHeadExt->uiNumMbsInSlice	= 1 + BsGetUe(pBs);
+			}
+			else{
+				pSliceHeadExt->bAdaptiveBaseModeFlag	= !!BsGetOneBit(pBs);
+				if ( !pSliceHeadExt->bAdaptiveBaseModeFlag ){
+					pSliceHeadExt->bDefaultBaseModeFlag	= !!BsGetOneBit(pBs);
+				}
+				if ( !pSliceHeadExt->bDefaultBaseModeFlag ){
+					pSliceHeadExt->bAdaptiveMotionPredFlag	= !!BsGetOneBit(pBs);
+					if ( !pSliceHeadExt->bAdaptiveMotionPredFlag )
+						pSliceHeadExt->bDefaultMotionPredFlag	= !!BsGetOneBit(pBs);
+				}
+
+				pSliceHeadExt->bAdaptiveResidualPredFlag	= !!BsGetOneBit(pBs);
+				if ( !pSliceHeadExt->bAdaptiveResidualPredFlag ){
+					pSliceHeadExt->bDefaultResidualPredFlag = !!BsGetOneBit(pBs);
+				}
+			}
+			if ( pSubsetSps->sSpsSvcExt.bAdaptiveTCoeffLevelPredFlag )
+				pSliceHeadExt->bTCoeffLevelPredFlag	= !!BsGetOneBit(pBs);
+		}
+
+		if ( !pSubsetSps->sSpsSvcExt.bSliceHeaderRestrictionFlag )
+		{
+			pSliceHeadExt->uiScanIdxStart	= BsGetBits(pBs, 4);
+			pSliceHeadExt->uiScanIdxEnd	= BsGetBits(pBs, 4);
+			if( pSliceHeadExt->uiScanIdxStart != 0 || pSliceHeadExt->uiScanIdxEnd != 15 )
+			{
+				WelsLog( pCtx, WELS_LOG_WARNING, "uiScanIdxStart (%d) != 0 and uiScanIdxEnd (%d) !=15 not supported here\n", pSliceHeadExt->uiScanIdxStart, pSliceHeadExt->uiScanIdxEnd );
+				return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_UNSUPPORTED_MGS);
+			}
+		}
+		else{
+			pSliceHeadExt->uiScanIdxStart	= 0;
+			pSliceHeadExt->uiScanIdxEnd	= 15;
+		}
+	}
+
+	return ERR_NONE;
+}
+
+/*
+ *	PrefetchNalHeaderExtSyntax
+ *	Hand the syntax elements of a prefix NAL unit over to another NAL unit: the fields of the
+ *	NAL unit header extension, bStoreRefBasePicFlag and the reference base picture marking.
+ *	For every MMCO_SHORT2UNUSED operation copied, iShortFrameNum is derived from the frame number
+ *	of the destination slice header.
+ *	pCtx:	decoder context, its PPS/SPS buffers supply uiLog2MaxFrameNum
+ *	kppDst:	NAL unit to copy to
+ *	kpSrc:	prefix NAL unit to copy from
+ *	return:	false when one of the two NAL unit pointers is NULL, true otherwise
+ */
+bool_t PrefetchNalHeaderExtSyntax ( PWelsDecoderContext pCtx, PNalUnit const kppDst, PNalUnit const kpSrc)
+{
+	if ( NULL == kppDst || NULL == kpSrc )
+	{
+		return false;
+	}
+
+	PNalUnitHeaderExt pDstHdrExt	= &kppDst->sNalHeaderExt;
+	PNalUnitHeaderExt pSrcHdrExt	= &kpSrc->sNalHeaderExt;
+	PSliceHeaderExt pDstShExt		= &kppDst->sNalData.sVclNal.sSliceHeaderExt;
+	PPrefixNalUnit pSrcPrefix		= &kpSrc->sNalData.sPrefixNal;
+	PSps pSps						= &pCtx->sSpsBuffer[pCtx->sPpsBuffer[pDstShExt->sSliceHeader.iPpsId].iSpsId];
+
+	// NAL unit header extension
+	pDstHdrExt->uiDependencyId			= pSrcHdrExt->uiDependencyId;
+	pDstHdrExt->uiQualityId				= pSrcHdrExt->uiQualityId;
+	pDstHdrExt->uiTemporalId			= pSrcHdrExt->uiTemporalId;
+	pDstHdrExt->uiPriorityId			= pSrcHdrExt->uiPriorityId;
+	pDstHdrExt->bIdrFlag				= pSrcHdrExt->bIdrFlag;
+	pDstHdrExt->iNoInterLayerPredFlag	= pSrcHdrExt->iNoInterLayerPredFlag;
+	pDstHdrExt->bDiscardableFlag		= pSrcHdrExt->bDiscardableFlag;
+	pDstHdrExt->bOutputFlag				= pSrcHdrExt->bOutputFlag;
+	pDstHdrExt->bUseRefBasePicFlag		= pSrcHdrExt->bUseRefBasePicFlag;
+	pDstHdrExt->uiLayerDqId				= pSrcHdrExt->uiLayerDqId;
+
+	// reference base picture marking
+	pDstShExt->bStoreRefBasePicFlag		= pSrcPrefix->bStoreRefBasePicFlag;
+	memcpy(&pDstShExt->sRefBasePicMarking, &pSrcPrefix->sRefPicBaseMarking, sizeof(SRefBasePicMarking));//confirmed_safe_unsafe_usage
+
+	if ( pDstShExt->sRefBasePicMarking.bAdaptiveRefBasePicMarkingModeFlag )
+	{
+		PRefBasePicMarking pMarking = &pDstShExt->sRefBasePicMarking;
+		int32_t iMmco;
+
+		// walk the operations up to MMCO_END, at most MAX_MMCO_COUNT of them
+		for ( iMmco = 0; iMmco < MAX_MMCO_COUNT; ++iMmco )
+		{
+			if ( pMarking->mmco_base[iMmco].uiMmcoType == MMCO_END )
+			{
+				break;
+			}
+			if ( pMarking->mmco_base[iMmco].uiMmcoType == MMCO_SHORT2UNUSED )
+			{
+				// frame number minus the coded difference, wrapped to uiLog2MaxFrameNum bits
+				pMarking->mmco_base[iMmco].iShortFrameNum = (pDstShExt->sSliceHeader.iFrameNum-pMarking->mmco_base[iMmco].uiDiffOfPicNums) & ((1<<pSps->uiLog2MaxFrameNum)-1);
+			}
+		}
+	}
+
+	return true;
+}
+
+
+
+/*
+ *	UpdateAccessUnit
+ *	Mark the current access unit as completed: the DQ id of the NAL unit at uiEndPos becomes the
+ *	target DQ id and uiActualUnitsNum is set to uiEndPos + 1.
+ *	While the loss flag of the context is set (bParamSetsLostFlag with LONG_TERM_REF, otherwise
+ *	bReferenceLostAtT0Flag), the access unit has to hold an IDR NAL unit. If it holds none, the
+ *	error is recorded in pCtx->iErrorCode and returned.
+ *	return: ERR_NONE, or dsNoParamSets (LONG_TERM_REF) / ERR_INFO_REFERENCE_PIC_LOST.
+ */
+int32_t UpdateAccessUnit ( PWelsDecoderContext pCtx )
+{
+	PAccessUnit pAu			= pCtx->pAccessUnitList;
+	const int32_t kiEndIdx	= pAu->uiEndPos;
+
+	pCtx->uiTargetDqId		= pAu->pNalUnitsList[kiEndIdx]->sNalHeaderExt.uiLayerDqId;
+	pAu->uiActualUnitsNum	= kiEndIdx + 1;
+	pAu->bCompletedAuFlag	= true;
+
+	// Added for mosaic avoidance, 11/19/2009
+#ifdef LONG_TERM_REF
+	if ( pCtx->bParamSetsLostFlag )
+#else
+	if ( pCtx->bReferenceLostAtT0Flag )
+#endif
+	{
+		uint32_t uiNalIdx;
+
+		// look for the first IDR NAL unit of the access unit
+		for ( uiNalIdx = 0; uiNalIdx < pAu->uiActualUnitsNum; ++uiNalIdx )
+		{
+			PNalUnit pNal = pAu->pNalUnitsList[uiNalIdx];
+
+			if ( pNal->sNalHeaderExt.sNalUnitHeader.eNalUnitType == NAL_UNIT_CODED_SLICE_IDR || pNal->sNalHeaderExt.bIdrFlag )
+			{
+				break;
+			}
+		}
+
+		if ( uiNalIdx == pAu->uiActualUnitsNum )	// no IDR found, leave to avoid mosaic
+		{
+			WelsLog( pCtx, WELS_LOG_WARNING, "UpdateAccessUnit():::::Key frame lost.....CAN NOT find IDR from current AU.\n" );
+#ifdef LONG_TERM_REF
+			pCtx->iErrorCode |= dsNoParamSets;
+			return dsNoParamSets;
+#else
+			pCtx->iErrorCode |= dsRefLost;
+			return ERR_INFO_REFERENCE_PIC_LOST;
+#endif
+		}
+	}
+
+	return ERR_NONE;
+}
+
+/*
+ *	InitialDqLayersContext
+ *	Allocate the LAYER_NUM_EXCHANGEABLE DQ layers of the context for pictures of up to
+ *	kiMaxWidth x kiMaxHeight. In SW_MODE each layer also gets its three cs/rs planes and
+ *	the per macroblock arrays in pCtx->sMb.
+ *	The macroblock dimension in pCtx->sMb is always updated. The allocation itself is skipped when
+ *	memory for at least the requested dimension is already in place; otherwise the memory
+ *	allocated before is released first.
+ *	return:
+ *	ERR_NONE               - success;
+ *	ERR_INFO_INVALID_PARAM - pCtx is NULL or a dimension is not positive;
+ *	ERR_INFO_OUT_OF_MEMORY - an allocation failed, everything allocated so far is released again.
+ */
+int32_t InitialDqLayersContext ( PWelsDecoderContext pCtx, const int32_t kiMaxWidth, const int32_t kiMaxHeight )
+{
+	// both dimensions rounded up to a multiple of 16, the stride is padded on both sides
+	const int32_t kiPicStride		= ((kiMaxWidth + 15) & 0xfffff0) + (PADDING_LENGTH<<1);
+	const int32_t kiPicLines		= ((kiMaxHeight + 15) & 0xfffff0);
+
+	int32_t i = 0;
+
+	WELS_VERIFY_RETURN_IF( ERR_INFO_INVALID_PARAM, ( NULL == pCtx || kiMaxWidth <= 0 || kiMaxHeight <= 0 ) )
+	pCtx->sMb.iMbWidth		= (kiMaxWidth + 15) >> 4;
+	pCtx->sMb.iMbHeight		= (kiMaxHeight + 15) >> 4;
+
+	if ( pCtx->bInitialDqLayersMem && kiMaxWidth <= pCtx->iPicWidthReq && kiMaxHeight <= pCtx->iPicHeightReq )	// have same dimension memory, skipped
+		return ERR_NONE;
+
+	UninitialDqLayersContext( pCtx );
+
+	do {
+		const int32_t kiMbNum	= pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight;
+		int32_t iPlaneIdx		= 0;
+		PDqLayer pDq			= (PDqLayer )WelsMalloc(sizeof(SDqLayer), "PDqLayer");
+
+		if ( pDq == NULL )
+		{
+			UninitialDqLayersContext( pCtx );
+			return ERR_INFO_OUT_OF_MEMORY;
+		}
+
+		memset(pDq, 0, sizeof(SDqLayer));
+		// Register the layer right away: UninitialDqLayersContext() skips layers without a pDq,
+		// so a failure further down would otherwise leak pDq and the buffers of this layer.
+		pCtx->pDqLayersList[i] = pDq;
+
+		if(pCtx->iDecoderMode == SW_MODE)
+		{
+			do {
+				// plane 0 has full resolution, planes 1 and 2 are halved in both directions
+				const int32_t kiHshift	= iPlaneIdx ? 1 : 0;
+				const int32_t kiVshift	= kiHshift;
+				const int32_t kiStride	= WELS_ALIGN( (kiPicStride >> kiHshift), (16 << (1-kiHshift)) );
+				const int32_t kiLine	= (kiPicLines + (PADDING_LENGTH<<1)) >> kiVshift;
+				const int32_t kiSize	= kiStride * kiLine;
+
+				pCtx->pCsListXchg[i][iPlaneIdx]	= (uint8_t *)WelsMalloc( kiSize * sizeof(uint8_t), "pCtx->pCsListXchg[][]" );
+				if ( NULL == pCtx->pCsListXchg[i][iPlaneIdx] )
+				{
+					UninitialDqLayersContext( pCtx );
+					return ERR_INFO_OUT_OF_MEMORY;
+				}
+				pCtx->iCsStride[iPlaneIdx]	= kiStride;
+
+				pCtx->pRsListXchg[i][iPlaneIdx]	= (int16_t *)WelsMalloc( kiSize * sizeof(int16_t), "pCtx->pRsListXchg[][]" );
+				if ( NULL == pCtx->pRsListXchg[i][iPlaneIdx] )
+				{
+					UninitialDqLayersContext( pCtx );
+					return ERR_INFO_OUT_OF_MEMORY;
+				}
+				pCtx->iRsStride[iPlaneIdx]	= kiStride;
+
+				++ iPlaneIdx;
+			} while(iPlaneIdx < 3);
+
+			// per macroblock arrays, checked all at once below
+			pCtx->sMb.pMbType[i] = (int8_t *)WelsMalloc( kiMbNum * sizeof(int8_t), "pCtx->sMb.pMbType[]" );
+			pCtx->sMb.pMv[i][0] = (int16_t (*)[16][2])WelsMalloc( kiMbNum * sizeof(int16_t) * MV_A * MB_BLOCK4x4_NUM, "pCtx->sMb.pMv[][]");
+			pCtx->sMb.pRefIndex[i][0] = (int8_t (*)[MB_BLOCK4x4_NUM])WelsMalloc( kiMbNum * sizeof(int8_t) * MB_BLOCK4x4_NUM, "pCtx->sMb.pRefIndex[][]");
+			pCtx->sMb.pLumaQp[i] = (int8_t *)WelsMalloc( kiMbNum * sizeof(int8_t), "pCtx->sMb.pLumaQp[]");
+			pCtx->sMb.pChromaQp[i] = (int8_t *)WelsMalloc( kiMbNum * sizeof(int8_t), "pCtx->sMb.pChromaQp[]");
+			pCtx->sMb.pNzc[i] = (int8_t (*)[24])WelsMalloc( kiMbNum * sizeof(int8_t) * 24, "pCtx->sMb.pNzc[]");
+			pCtx->sMb.pNzcRs[i] = (int8_t (*)[24])WelsMalloc( kiMbNum * sizeof(int8_t) * 24, "pCtx->sMb.pNzcRs[]");
+			pCtx->sMb.pScaledTCoeff[i] = (int16_t (*)[MB_COEFF_LIST_SIZE])WelsMalloc( kiMbNum * sizeof(int16_t) * MB_COEFF_LIST_SIZE, "pCtx->sMb.pScaledTCoeff[]");
+			pCtx->sMb.pIntraPredMode[i] = (int8_t (*)[8])WelsMalloc( kiMbNum * sizeof(int8_t) * 8, "pCtx->sMb.pIntraPredMode[]");
+			pCtx->sMb.pIntra4x4FinalMode[i] = (int8_t (*)[MB_BLOCK4x4_NUM])WelsMalloc( kiMbNum * sizeof(int8_t) * MB_BLOCK4x4_NUM, "pCtx->sMb.pIntra4x4FinalMode[]");
+			pCtx->sMb.pChromaPredMode[i] = (int8_t *)WelsMalloc( kiMbNum * sizeof(int8_t), "pCtx->sMb.pChromaPredMode[]");
+			pCtx->sMb.pCbp[i] = (int8_t *)WelsMalloc( kiMbNum * sizeof(int8_t), "pCtx->sMb.pCbp[]");
+			pCtx->sMb.pSubMbType[i] = (int8_t (*)[MB_PARTITION_SIZE])WelsMalloc( kiMbNum * sizeof(int8_t) * MB_PARTITION_SIZE, "pCtx->sMb.pSubMbType[]");
+			pCtx->sMb.pSliceIdc[i] = (int32_t *) WelsMalloc( kiMbNum * sizeof(int32_t), "pCtx->sMb.pSliceIdc[]");	// using int32_t for slice_idc, 4/21/2010
+			if ( pCtx->sMb.pSliceIdc[i] != NULL )
+				memset(pCtx->sMb.pSliceIdc[i], 0xff, (kiMbNum * sizeof(int32_t)) );	// every slice idc starts as -1
+			pCtx->sMb.pResidualPredFlag[i] = (int8_t *) WelsMalloc( kiMbNum * sizeof(int8_t), "pCtx->sMb.pResidualPredFlag[]");
+			pCtx->sMb.pInterPredictionDoneFlag[i] = (int8_t *) WelsMalloc( kiMbNum * sizeof(int8_t), "pCtx->sMb.pInterPredictionDoneFlag[]");
+
+			// check memory block valid due above allocated..
+			if ( (NULL == pCtx->sMb.pMbType[i]) ||
+				 (NULL == pCtx->sMb.pMv[i][0]) ||
+				 (NULL == pCtx->sMb.pRefIndex[i][0]) ||
+				 (NULL == pCtx->sMb.pLumaQp[i]) ||
+				 (NULL == pCtx->sMb.pChromaQp[i]) ||
+				 (NULL == pCtx->sMb.pNzc[i]) ||
+				 (NULL == pCtx->sMb.pNzcRs[i]) ||
+				 (NULL == pCtx->sMb.pScaledTCoeff[i]) ||
+				 (NULL == pCtx->sMb.pIntraPredMode[i]) ||
+				 (NULL == pCtx->sMb.pIntra4x4FinalMode[i]) ||
+				 (NULL == pCtx->sMb.pChromaPredMode[i]) ||
+				 (NULL == pCtx->sMb.pCbp[i]) ||
+				 (NULL == pCtx->sMb.pSubMbType[i]) ||
+				 (NULL == pCtx->sMb.pSliceIdc[i]) ||
+				 (NULL == pCtx->sMb.pResidualPredFlag[i]) ||
+				 (NULL == pCtx->sMb.pInterPredictionDoneFlag[i]) )
+			{
+				UninitialDqLayersContext( pCtx );
+				return ERR_INFO_OUT_OF_MEMORY;
+			}
+		} // end of if(pCtx->iDecoderMode == SW_MODE)
+
+		++ i;
+	} while( i < LAYER_NUM_EXCHANGEABLE );
+
+
+	pCtx->bInitialDqLayersMem	= true;
+	pCtx->iPicWidthReq			= kiMaxWidth;
+	pCtx->iPicHeightReq			= kiMaxHeight;
+
+	return ERR_NONE;
+}
+
+/*
+ *	UninitialDqLayersContext
+ *	Release the DQ layers of the context. For every layer registered in pCtx->pDqLayersList the
+ *	cs/rs planes, the per macroblock arrays in pCtx->sMb and the layer itself are freed and the
+ *	pointers set to NULL. Layers without a registered pDq are left untouched.
+ *	The requested picture dimension and bInitialDqLayersMem are reset at the end.
+ */
+void_t UninitialDqLayersContext ( PWelsDecoderContext pCtx )
+{
+	int32_t iLayer = 0;
+	int32_t iPlane = 0;
+
+	for ( iLayer = 0; iLayer < LAYER_NUM_EXCHANGEABLE; ++iLayer )
+	{
+		PDqLayer pDq = pCtx->pDqLayersList[iLayer];
+
+		if ( NULL == pDq )
+		{
+			continue;
+		}
+
+		if ( pCtx->pCsListXchg[iLayer] )	// cs picture
+		{
+			for ( iPlane = 0; iPlane < 3; ++iPlane )
+			{
+				if ( pCtx->pCsListXchg[iLayer][iPlane] != NULL )
+				{
+					WelsFree( pCtx->pCsListXchg[iLayer][iPlane], "pCtx->pCsListXchg[][]" );
+					pCtx->pCsListXchg[iLayer][iPlane] = NULL;
+				}
+				pCtx->iCsStride[iPlane] = 0;
+			}
+
+			pDq->pCsData[iLayer]	= NULL;	// for safe
+			pDq->iCsStride[iLayer]	= 0;
+		}
+
+		if ( pCtx->pRsListXchg[iLayer] )	// rs picture
+		{
+			for ( iPlane = 0; iPlane < 3; ++iPlane )
+			{
+				if ( pCtx->pRsListXchg[iLayer][iPlane] != NULL )
+				{
+					WelsFree( pCtx->pRsListXchg[iLayer][iPlane], "pCtx->pRsListXchg[][]" );
+					pCtx->pRsListXchg[iLayer][iPlane] = NULL;
+				}
+				pCtx->iRsStride[iPlane] = 0;
+			}
+		}
+
+		// per macroblock arrays
+		if ( NULL != pCtx->sMb.pMbType[iLayer] )
+		{
+			WelsFree( pCtx->sMb.pMbType[iLayer], "pCtx->sMb.pMbType[]");
+			pCtx->sMb.pMbType[iLayer] = NULL;
+		}
+		if ( NULL != pCtx->sMb.pMv[iLayer][0] )
+		{
+			WelsFree( pCtx->sMb.pMv[iLayer][0], "pCtx->sMb.pMv[][]" );
+			pCtx->sMb.pMv[iLayer][0] = NULL;
+		}
+		if ( NULL != pCtx->sMb.pRefIndex[iLayer][0] )
+		{
+			WelsFree( pCtx->sMb.pRefIndex[iLayer][0], "pCtx->sMb.pRefIndex[][]" );
+			pCtx->sMb.pRefIndex[iLayer][0] = NULL;
+		}
+		if ( NULL != pCtx->sMb.pLumaQp[iLayer] )
+		{
+			WelsFree(pCtx->sMb.pLumaQp[iLayer], "pCtx->sMb.pLumaQp[]");
+			pCtx->sMb.pLumaQp[iLayer] = NULL;
+		}
+		if ( NULL != pCtx->sMb.pChromaQp[iLayer] )
+		{
+			WelsFree(pCtx->sMb.pChromaQp[iLayer], "pCtx->sMb.pChromaQp[]");
+			pCtx->sMb.pChromaQp[iLayer] = NULL;
+		}
+		if ( NULL != pCtx->sMb.pNzc[iLayer] )
+		{
+			WelsFree(pCtx->sMb.pNzc[iLayer], "pCtx->sMb.pNzc[]");
+			pCtx->sMb.pNzc[iLayer] = NULL;
+		}
+		if ( NULL != pCtx->sMb.pNzcRs[iLayer] )
+		{
+			WelsFree(pCtx->sMb.pNzcRs[iLayer], "pCtx->sMb.pNzcRs[]");
+			pCtx->sMb.pNzcRs[iLayer] = NULL;
+		}
+		if ( NULL != pCtx->sMb.pScaledTCoeff[iLayer] )
+		{
+			WelsFree(pCtx->sMb.pScaledTCoeff[iLayer], "pCtx->sMb.pScaledTCoeff[]");
+			pCtx->sMb.pScaledTCoeff[iLayer] = NULL;
+		}
+		if ( NULL != pCtx->sMb.pIntraPredMode[iLayer] )
+		{
+			WelsFree(pCtx->sMb.pIntraPredMode[iLayer], "pCtx->sMb.pIntraPredMode[]");
+			pCtx->sMb.pIntraPredMode[iLayer] = NULL;
+		}
+		if ( NULL != pCtx->sMb.pIntra4x4FinalMode[iLayer] )
+		{
+			WelsFree(pCtx->sMb.pIntra4x4FinalMode[iLayer], "pCtx->sMb.pIntra4x4FinalMode[]");
+			pCtx->sMb.pIntra4x4FinalMode[iLayer] = NULL;
+		}
+		if ( NULL != pCtx->sMb.pChromaPredMode[iLayer] )
+		{
+			WelsFree(pCtx->sMb.pChromaPredMode[iLayer], "pCtx->sMb.pChromaPredMode[]");
+			pCtx->sMb.pChromaPredMode[iLayer] = NULL;
+		}
+		if ( NULL != pCtx->sMb.pCbp[iLayer] )
+		{
+			WelsFree( pCtx->sMb.pCbp[iLayer], "pCtx->sMb.pCbp[]" );
+			pCtx->sMb.pCbp[iLayer] = NULL;
+		}
+		if ( NULL != pCtx->sMb.pSubMbType[iLayer] )
+		{
+			WelsFree(pCtx->sMb.pSubMbType[iLayer], "pCtx->sMb.pSubMbType[]");
+			pCtx->sMb.pSubMbType[iLayer] = NULL;
+		}
+		if ( NULL != pCtx->sMb.pSliceIdc[iLayer] )
+		{
+			WelsFree( pCtx->sMb.pSliceIdc[iLayer], "pCtx->sMb.pSliceIdc[]" );
+			pCtx->sMb.pSliceIdc[iLayer] = NULL;
+		}
+		if ( NULL != pCtx->sMb.pResidualPredFlag[iLayer] )
+		{
+			WelsFree( pCtx->sMb.pResidualPredFlag[iLayer], "pCtx->sMb.pResidualPredFlag[]" );
+			pCtx->sMb.pResidualPredFlag[iLayer] = NULL;
+		}
+		if ( NULL != pCtx->sMb.pInterPredictionDoneFlag[iLayer] )
+		{
+			WelsFree( pCtx->sMb.pInterPredictionDoneFlag[iLayer], "pCtx->sMb.pInterPredictionDoneFlag[]" );
+			pCtx->sMb.pInterPredictionDoneFlag[iLayer] = NULL;
+		}
+
+		// the layer itself goes last
+		WelsFree( pDq, "pDq" );
+		pCtx->pDqLayersList[iLayer] = NULL;
+	}
+
+	pCtx->iPicWidthReq			= 0;
+	pCtx->iPicHeightReq			= 0;
+	pCtx->bInitialDqLayersMem	= false;
+}
+
+// Reset the access unit list of pCtx after the current AU: clear uiEndPos / bCompletedAuFlag and,
+// if any NAL units were active, move the nodes queued behind them to the head of the list.
+void_t ResetCurrentAccessUnit ( PWelsDecoderContext pCtx )
+{
+	PAccessUnit pAu = pCtx->pAccessUnitList;
+	uint32_t uiConsumed, uiRemain, uiPos;
+
+	pAu->bCompletedAuFlag	= false;
+	pAu->uiEndPos		= 0;
+
+	if ( !( pAu->uiActualUnitsNum > 0 ) )
+		return;	// no active NAL units: counters and list stay as they are
+
+	uiConsumed	= pAu->uiActualUnitsNum;
+	// nodes left behind the active ones (plain unsigned subtraction)
+	uiRemain	= pAu->uiAvailUnitsNum - uiConsumed;
+	for ( uiPos = 0; uiPos < uiRemain; ++ uiPos )
+	{	// swap each succeeding node with the node at the front of the list
+		PNalUnit pTmp = pAu->pNalUnitsList[uiConsumed + uiPos];
+		pAu->pNalUnitsList[uiConsumed + uiPos] = pAu->pNalUnitsList[uiPos];
+		pAu->pNalUnitsList[uiPos] = pTmp;
+	}
+	pAu->uiActualUnitsNum = pAu->uiAvailUnitsNum = uiRemain;
+}
+
+/*!
+ * \brief	Force reset current Access Unit NAL list in case of error parsing/decoding in current AU
+ * \param	pAu	access unit list; the nodes [0, uiEndPos] are dropped from the count and the
+ *			nodes queued behind them are swapped to the front of pNalUnitsList
+ * \author
+ * \history	11/16/2009
+ */
+void_t ForceResetCurrentAccessUnit( PAccessUnit pAu )
+{
+	const uint32_t kuiDropNum	= pAu->uiEndPos + 1;	// count of nodes in [0, uiEndPos]
+	uint32_t uiFront		= 0;
+	uint32_t uiBehind;
+
+	// swap the succeeding AU's nal units to the front
+	for ( uiBehind = kuiDropNum; uiBehind < pAu->uiAvailUnitsNum; ++ uiBehind, ++ uiFront )
+	{
+		PNalUnit pTmp = pAu->pNalUnitsList[uiBehind];
+		pAu->pNalUnitsList[uiBehind]	= pAu->pNalUnitsList[uiFront];
+		pAu->pNalUnitsList[uiFront]	= pTmp;
+	}
+
+	// Update avail units num for next AU parsing, clamping at zero
+	pAu->uiAvailUnitsNum	= ( pAu->uiAvailUnitsNum > pAu->uiEndPos ) ? ( pAu->uiAvailUnitsNum - kuiDropNum ) : 0;
+
+	// restart the bookkeeping of the current AU
+	pAu->bCompletedAuFlag	= false;
+	pAu->uiActualUnitsNum	= 0;
+	pAu->uiEndPos		= 0;
+}
+
+// Clear the current corrupted NAL from pNalUnitsList by dropping the last available unit from the count.
+void_t ForceClearCurrentNal( PAccessUnit pAu )
+{	
+	if (pAu->uiAvailUnitsNum > 0)	// keep the counter from going below zero
+		-- pAu->uiAvailUnitsNum;	// only the count shrinks; the list entry itself is not touched
+}
+
+
+// Walk pNalUnitsList forwards from iStartIdx towards iEndIdx and stop at the first NAL unit that
+// breaks the layer chain. The last NAL unit still in the chain becomes uiEndPos of the access
+// unit list and its uiLayerDqId becomes pCtx->uiTargetDqId.
+//
+// A NAL unit continues the chain when
+//   - it has the same uiDependencyId as its predecessor in the chain, or
+//   - it opens a new dependency with uiQualityId == 0 and its uiRefLayerDqId equals the
+//     uiLayerDqId of its predecessor in the chain.
+// The NAL unit at iStartIdx is always part of the chain.
+void_t CheckAvailNalUnitsListContinuity( PWelsDecoderContext pCtx, int32_t iStartIdx, int32_t iEndIdx )
+{
+	PAccessUnit pAu = pCtx->pAccessUnitList;
+
+	// chain state, seeded from the starting nal unit
+	uint8_t uiPrevDependencyId	= pAu->pNalUnitsList[iStartIdx]->sNalHeaderExt.uiDependencyId;
+	uint8_t uiPrevLayerDqId		= pAu->pNalUnitsList[iStartIdx]->sNalHeaderExt.uiLayerDqId;
+	int32_t iNalIdx;
+
+	for ( iNalIdx = iStartIdx + 1; iNalIdx <= iEndIdx; ++ iNalIdx )
+	{
+		PNalUnit pNal = pAu->pNalUnitsList[iNalIdx];
+
+		// layer identity of the nal unit under test
+		const uint8_t kuiDependencyId	= pNal->sNalHeaderExt.uiDependencyId;
+		const uint8_t kuiQualityId	= pNal->sNalHeaderExt.uiQualityId;
+		const uint8_t kuiLayerDqId	= pNal->sNalHeaderExt.uiLayerDqId;
+		const uint8_t kuiRefLayerDqId	= pNal->sNalData.sVclNal.sSliceHeaderExt.uiRefLayerDqId;
+
+		if ( kuiDependencyId != uiPrevDependencyId )
+		{
+			// new dependency arrives, but no base quality layer: we must stop at this point
+			if ( kuiQualityId != 0 )
+				break;
+
+			// its reference layer is not the layer seen last: the chain is broken at this point
+			if ( kuiRefLayerDqId != uiPrevLayerDqId )
+				break;
+
+			uiPrevDependencyId = kuiDependencyId;
+		}
+
+		// this nal unit continues the chain
+		uiPrevLayerDqId = kuiLayerDqId;
+	}
+
+	// iNalIdx is one past the last nal unit that kept the chain intact
+	-- iNalIdx;
+
+	pAu->uiEndPos		= iNalIdx;
+	pCtx->uiTargetDqId	= pAu->pNalUnitsList[iNalIdx]->sNalHeaderExt.uiLayerDqId;
+}
+
+// Main purpose: to support multi-slice and to include all slices which have the same uiDependencyId,
+// uiQualityId, uiTemporalId, frame_num and POC as the NAL unit at *pIdxNoInterLayerPred.
+// For single slice, *pIdxNoInterLayerPred is NOT modified.
+void_t RefineIdxNoInterLayerPred( PAccessUnit pCurAu, int32_t* pIdxNoInterLayerPred )
+{
+	PNalUnit pRefNal	= pCurAu->pNalUnitsList[*pIdxNoInterLayerPred];
+	PSliceHeader pRefSh	= &pRefNal->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
+	// properties of the reference NAL unit that a sibling slice has to share ...
+	const int32_t kiRefDependId	= pRefNal->sNalHeaderExt.uiDependencyId;
+	const int32_t kiRefQualityId	= pRefNal->sNalHeaderExt.uiQualityId;
+	const uint8_t kuiRefTId		= pRefNal->sNalHeaderExt.uiTemporalId;
+	const int32_t kiRefFrameNum	= pRefSh->iFrameNum;
+	const int32_t kiRefPoc		= pRefSh->iPicOrderCntLsb;
+	// ... and the one it has to differ in
+	const int32_t kiRefFirstMb	= pRefSh->iFirstMbInSlice;
+
+	int32_t iFirstSliceIdx	= -1;	// lowest index of a sibling slice, -1 while none is found
+	int32_t iScanIdx;
+
+	// scan backwards, starting at the NAL unit right before the reference one
+	for ( iScanIdx = *pIdxNoInterLayerPred - 1; iScanIdx >= 0; -- iScanIdx )
+	{
+		PNalUnit pNal		= pCurAu->pNalUnitsList[iScanIdx];
+		PSliceHeader pSh	= &pNal->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
+		int32_t iDependId, iQualityId, iTId, iFrameNum, iPoc, iFirstMb;
+
+		// NAL units without iNoInterLayerPredFlag are stepped over
+		if ( !pNal->sNalHeaderExt.iNoInterLayerPredFlag )
+			continue;
+
+		iDependId	= pNal->sNalHeaderExt.uiDependencyId;
+		iQualityId	= pNal->sNalHeaderExt.uiQualityId;
+		iTId		= pNal->sNalHeaderExt.uiTemporalId;
+		iFrameNum	= pSh->iFrameNum;
+		iPoc		= pSh->iPicOrderCntLsb;
+		iFirstMb	= pSh->iFirstMbInSlice;
+
+		// the first flagged NAL unit that is not a sibling slice ends the search
+		if ( iDependId	!= kiRefDependId	||
+			iQualityId	!= kiRefQualityId	||
+			iTId		!= kuiRefTId		||
+			iFrameNum	!= kiRefFrameNum	||
+			iPoc		!= kiRefPoc		||
+			iFirstMb	== kiRefFirstMb )
+			break;
+
+		iFirstSliceIdx = iScanIdx;
+	}
+
+	if ( iFirstSliceIdx >= 0 )
+		*pIdxNoInterLayerPred = iFirstSliceIdx;
+}
+
+// Return true when all NAL units at indices pIdxNoInterLayerPred+1 .. uiEndPos-1 have the same
+// iPicOrderCntLsb as the NAL unit at pIdxNoInterLayerPred, false on the first mismatch.
+bool_t CheckPocOfCurValidNalUnits( PAccessUnit pCurAu, int32_t pIdxNoInterLayerPred )
+{
+	const int32_t kiStopIdx	= pCurAu->uiEndPos;
+	const int32_t kiRefPoc	= pCurAu->pNalUnitsList[pIdxNoInterLayerPred]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iPicOrderCntLsb;
+	int32_t iNalIdx		= pIdxNoInterLayerPred + 1;
+	while ( iNalIdx < kiStopIdx )	// NOTE(review): the NAL unit at uiEndPos itself is not compared -- confirm intended
+	{
+		const int32_t kiPoc = pCurAu->pNalUnitsList[iNalIdx]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iPicOrderCntLsb;
+		if ( kiPoc != kiRefPoc )
+			return false;
+		++ iNalIdx;
+	}
+	return true;
+}
+// Check the NAL unit list of a completed AU: select uiStartPos / uiEndPos (and, via CheckAvailNalUnitsListContinuity, pCtx->uiTargetDqId) and verify POC consistency; returns false when the AU must not be decoded.
+bool_t CheckIntegrityNalUnitsList( PWelsDecoderContext pCtx )
+{
+	PAccessUnit pCurAu = pCtx->pAccessUnitList;
+	const int32_t kiEndPos = pCurAu->uiEndPos;
+	int32_t iIdxNoInterLayerPred = 0;
+	int32_t iCurNalUnitIdx = kiEndPos;
+
+	ESliceType eSliceType = static_cast<ESliceType> (0);//EC 2009.11.12
+	// an AU that is not completed yet is rejected right away
+	if ( !pCurAu->bCompletedAuFlag )
+		return false;
+	// the slice type of the NAL unit at uiEndPos selects the I_SLICE path or the other path below
+	eSliceType = pCurAu->pNalUnitsList[iCurNalUnitIdx]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.eSliceType;
+
+	if ( I_SLICE == eSliceType )
+	{
+		pCurAu->uiStartPos = 0;
+		//step1: search the pNalUnit whose iNoInterLayerPredFlag equal to 1 backwards (from uiEndPos to 0)
+		iIdxNoInterLayerPred = kiEndPos;
+		while ( iIdxNoInterLayerPred >= 0 ) 
+		{
+			if ( pCurAu->pNalUnitsList[iIdxNoInterLayerPred]->sNalHeaderExt.iNoInterLayerPredFlag ) 
+			{
+				break;
+			}
+			--iIdxNoInterLayerPred;
+		}
+		if ( iIdxNoInterLayerPred < 0 )
+		{
+			//can not find the Nal Unit whose no_inter_pred_flag equal to 1, MUST STOP decode
+			return false;
+		}		
+		
+		//step2: support multi-slice, to include all base layer slice
+		RefineIdxNoInterLayerPred( pCurAu, &iIdxNoInterLayerPred );		
+		pCurAu->uiStartPos = iIdxNoInterLayerPred;
+		CheckAvailNalUnitsListContinuity( pCtx, iIdxNoInterLayerPred, kiEndPos );
+		// step3: a POC mismatch among the selected NAL units rejects the AU
+		if ( !CheckPocOfCurValidNalUnits( pCurAu, iIdxNoInterLayerPred ) ) 
+		{
+			return false;
+		}
+		// step4: remember target dependency id and picture size (iMbWidth / iMbHeight << 4) from the NAL unit at the refined uiEndPos
+		pCtx->iCurSeqIntervalTargetDependId = pCurAu->pNalUnitsList[pCurAu->uiEndPos]->sNalHeaderExt.uiDependencyId;
+		pCtx->iCurSeqIntervalMaxPicWidth  = pCurAu->pNalUnitsList[pCurAu->uiEndPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iMbWidth << 4;
+		pCtx->iCurSeqIntervalMaxPicHeight = pCurAu->pNalUnitsList[pCurAu->uiEndPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.iMbHeight << 4;
+	}
+	else //P_SLICE
+	{
+		//step 1: search uiDependencyId equal to pCtx->iCurSeqIntervalTargetDependId, backwards from uiEndPos
+		bool_t bGetDependId = false;
+		int32_t iIdxDependId = 0;
+
+		iIdxDependId = kiEndPos;
+		while ( iIdxDependId >= 0 ) 
+		{
+			if ( pCtx->iCurSeqIntervalTargetDependId == pCurAu->pNalUnitsList[iIdxDependId]->sNalHeaderExt.uiDependencyId )
+			{
+				bGetDependId = true;
+				break;
+			}
+			else
+			{
+				--iIdxDependId;
+			}
+		}
+		
+		//step 2: switch according to whether or not find the index of pNalUnit whose uiDependencyId equal to iCurSeqIntervalTargetDependId
+		if ( bGetDependId ) //get the index of pNalUnit whose uiDependencyId equal to iCurSeqIntervalTargetDependId
+		{
+			bool_t bGetNoInterPredFront = false;
+			//step 2a: search iNoInterLayerPredFlag [0....iIdxDependId]
+			iIdxNoInterLayerPred = iIdxDependId;
+			while ( iIdxNoInterLayerPred >= 0 )
+			{
+				if ( pCurAu->pNalUnitsList[iIdxNoInterLayerPred]->sNalHeaderExt.iNoInterLayerPredFlag )
+				{
+					bGetNoInterPredFront = true;
+					break;
+				}
+				--iIdxNoInterLayerPred;
+			}
+			//step 2b: switch, whether or not find the NAL unit whose no_inter_pred_flag equal to 1 among [0....iIdxDependId] 
+			if ( bGetNoInterPredFront ) //YES
+			{
+				RefineIdxNoInterLayerPred( pCurAu, &iIdxNoInterLayerPred );
+				pCurAu->uiStartPos = iIdxNoInterLayerPred;
+				CheckAvailNalUnitsListContinuity( pCtx, iIdxNoInterLayerPred, iIdxDependId );
+				
+				if ( !CheckPocOfCurValidNalUnits( pCurAu, iIdxNoInterLayerPred ) ) 
+				{
+					return false;
+				}
+			}
+			else //NO, should find the NAL unit whose no_inter_pred_flag equal to 1 among [iIdxDependId....uiEndPos]
+			{
+				iIdxNoInterLayerPred = iIdxDependId;
+				while ( iIdxNoInterLayerPred <= kiEndPos )
+				{
+					if ( pCurAu->pNalUnitsList[iIdxNoInterLayerPred]->sNalHeaderExt.iNoInterLayerPredFlag )
+					{
+						break;
+					}					
+					++iIdxNoInterLayerPred;
+				}
+
+				if ( iIdxNoInterLayerPred > kiEndPos )
+				{
+					return false; //can't find the index of pNalUnit whose no_inter_pred_flag = 1
+				}
+
+				RefineIdxNoInterLayerPred( pCurAu, &iIdxNoInterLayerPred );			
+				pCurAu->uiStartPos = iIdxNoInterLayerPred;
+				CheckAvailNalUnitsListContinuity( pCtx, iIdxNoInterLayerPred, kiEndPos );
+				
+				if ( !CheckPocOfCurValidNalUnits( pCurAu, iIdxNoInterLayerPred ) ) 
+				{
+					return false;
+				}				
+			}
+		}
+		else //without the index of pNalUnit, should process this AU as common case
+		{
+			iIdxNoInterLayerPred = kiEndPos;
+			while (iIdxNoInterLayerPred >= 0)
+			{
+				if ( pCurAu->pNalUnitsList[iIdxNoInterLayerPred]->sNalHeaderExt.iNoInterLayerPredFlag ) 
+				{
+					break;
+				}
+				--iIdxNoInterLayerPred;
+			}
+			if (iIdxNoInterLayerPred < 0) 
+			{
+				return false; //can't find the index of pNalUnit whose iNoInterLayerPredFlag = 1
+			}
+
+			RefineIdxNoInterLayerPred( pCurAu, &iIdxNoInterLayerPred );	
+			pCurAu->uiStartPos = iIdxNoInterLayerPred;
+			CheckAvailNalUnitsListContinuity( pCtx, iIdxNoInterLayerPred, kiEndPos );
+
+			if ( !CheckPocOfCurValidNalUnits( pCurAu, iIdxNoInterLayerPred ) ) 
+			{
+				return false;
+			}
+		}
+	}	
+
+	return true;	
+}
+
+// Set pCtx->bOnlyOneLayerInCurAuFlag: true when every NAL unit in [uiStartPos, uiEndPos] has the
+// same uiDependencyId, uiQualityId and uiTemporalId as the NAL unit at uiStartPos, else false.
+void_t CheckOnlyOneLayerInAu( PWelsDecoderContext pCtx )
+{
+	PAccessUnit pAu = pCtx->pAccessUnitList;
+
+	const int32_t kiLastIdx	= pAu->uiEndPos;
+	const int32_t kiFirstIdx	= pAu->uiStartPos;
+
+	// layer identity of the first NAL unit, the reference for all the others
+	const uint8_t kuiRefDId = pAu->pNalUnitsList[kiFirstIdx]->sNalHeaderExt.uiDependencyId;
+	const uint8_t kuiRefQId = pAu->pNalUnitsList[kiFirstIdx]->sNalHeaderExt.uiQualityId;
+	const uint8_t kuiRefTId = pAu->pNalUnitsList[kiFirstIdx]->sNalHeaderExt.uiTemporalId;
+
+	int32_t iNalIdx;
+
+	// assume a single layer until a NAL unit proves otherwise
+	pCtx->bOnlyOneLayerInCurAuFlag = true;
+
+	// with only one NAL unit in the range the loop body never runs
+	for ( iNalIdx = kiFirstIdx + 1; iNalIdx <= kiLastIdx; ++ iNalIdx )
+	{
+		PNalUnit pNal = pAu->pNalUnitsList[iNalIdx];
+		const uint8_t kuiDId = pNal->sNalHeaderExt.uiDependencyId;
+		const uint8_t kuiQId = pNal->sNalHeaderExt.uiQualityId;
+		const uint8_t kuiTId = pNal->sNalHeaderExt.uiTemporalId;
+
+		if ( kuiRefDId == kuiDId && kuiRefQId == kuiQId && kuiRefTId == kuiTId )
+			continue;
+
+		// a NAL unit with another D/Q/T combination was found
+		pCtx->bOnlyOneLayerInCurAuFlag = false;
+		break;
+	}
+}
+
+// Start decoding of the current access unit: update the AU list and, unless pCtx->bAvcBasedFlag is
+// set, check the NAL unit list and whether the AU holds one layer only. Returns ERR_NONE, the
+// error of UpdateAccessUnit(), or dsBitstreamError when the integrity check fails.
+int32_t WelsDecodeAccessUnitStart ( PWelsDecoderContext pCtx )
+{
+	// Roll back NAL units not belonging to current access unit list for proceeded access unit
+	const int32_t kiUpdateRet = UpdateAccessUnit ( pCtx );
+	if ( ERR_NONE != kiUpdateRet )
+		return kiUpdateRet;
+
+	pCtx->pAccessUnitList->uiStartPos = 0;
+	if ( pCtx->bAvcBasedFlag )
+		return ERR_NONE;	// neither of the checks below is run in this case
+
+	if ( !CheckIntegrityNalUnitsList( pCtx ) )
+	{
+		pCtx->iErrorCode |= dsBitstreamError;
+		return dsBitstreamError;
+	}
+	// check current AU has only one layer or not; if YES, can use deblocking based on AVC
+	CheckOnlyOneLayerInAu( pCtx );
+	return ERR_NONE;
+}
+
+void_t WelsDecodeAccessUnitEnd ( PWelsDecoderContext pCtx )
+{
+	// End of access unit decoding: reset the access unit list state of pCtx via ResetCurrentAccessUnit()
+	ResetCurrentAccessUnit ( pCtx );	
+}
+
+
+// Report whether the given picture size / slice count / PPS id matches the platform specific bound
+// selected at compile time (WIN32, MACOS or ANDROID). Returns 1 on a match, otherwise 0; always 0
+// when none of these macros is defined. The height is compared ('=='), never assigned.
+int32_t CheckBSBound(int32_t iWidth, int32_t iHeight, int32_t sliceNum, int32_t ppsId)
+{
+	int32_t iRet = 0;
+
+#if defined(WIN32)
+	iRet = ((iWidth == 80) && (iHeight == 45) && (sliceNum < 60));
+#elif defined(MACOS)
+	iRet = ((iWidth == 80) && (iHeight == 45) && (ppsId < 57));
+#elif defined(ANDROID)
+	iRet = ((iWidth == 40) && (iHeight == 22));
+#endif
+	// parameters not referenced by the selected branch are intentionally unused
+	(void) iWidth; (void) iHeight; (void) sliceNum; (void) ppsId;
+	return iRet;
+}
+
+
+
+/*
+ * ConstructAccessUnit
+ * Decode the access unit held in pCtx->pAccessUnitList: check its NAL unit list, take SPS/PPS from
+ * the NAL unit at uiStartPos, reset references / sync resolution on IDR, then decode the AU.
+ * parameter\
+ *	pCtx:		decoder context owning the access unit list
+ *	ppDst:		destination pointers, handed on to DecodeCurrentAccessUnit()
+ *	pDstInfo:	buffer info; iBufferStatus is set to 0 when the start check or the decoding fails
+ * return:
+ *	0 - success; otherwise the error code returned by the failing step
+ * NOTE(review): the SyncPictureResolutionExt() failure path returns without touching iBufferStatus.
+ */
+int32_t ConstructAccessUnit( PWelsDecoderContext pCtx, uint8_t** ppDst, SBufferInfo *pDstInfo)
+{
+	int32_t iErr;
+	int32_t iWidth;
+	int32_t iHeight;
+	int32_t iStride[2] = { 0 };
+
+	PAccessUnit pCurAu = pCtx->pAccessUnitList;
+
+	pCtx->bAuReadyFlag = false;
+    pCtx->bLastHasMmco5 = false;
+
+	iErr = WelsDecodeAccessUnitStart( pCtx );
+	GetVclNalTemporalId( pCtx );
+	// a failed start check force-resets the AU list and reports no output buffer
+	if ( ERR_NONE != iErr )
+	{
+		ForceResetCurrentAccessUnit( pCtx->pAccessUnitList );
+		pDstInfo->iBufferStatus = 0;
+		return iErr;
+	}
+	// pCtx->pSps / pCtx->pPps are taken from the slice header of the NAL unit at uiStartPos
+	pCtx->pSps = pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.pSps;
+	pCtx->pPps = pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.pPps;
+	
+	//try to allocate or relocate DPB memory only when IDR arrival.
+	if ( NAL_UNIT_CODED_SLICE_IDR == pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalHeaderExt.sNalUnitHeader.eNalUnitType ||
+		pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalHeaderExt.bIdrFlag )
+	{
+		WelsResetRefPic(pCtx); //clear ref pPic when IDR NAL
+		iErr = SyncPictureResolutionExt( pCtx, (pCtx->iMaxWidthInSps+15)>>4, (pCtx->iMaxHeightInSps+15)>>4 );		
+
+		if( ERR_NONE != iErr ){
+            WelsLog(pCtx, WELS_LOG_WARNING, "sync picture resolution ext failed,  the error is %d", iErr);
+			return iErr;
+		}		
+	}
+	
+	
+	pDstInfo->eBufferProperty = (EBufferProperty)pCtx->iDecoderOutputProperty;
+	// iStride, iWidth and iHeight are only handed to DecodeCurrentAccessUnit() and not read afterwards here
+	iErr = DecodeCurrentAccessUnit( pCtx, ppDst, iStride, &iWidth, &iHeight, pDstInfo );
+	// the AU list is reset whether or not the decoding succeeded
+	WelsDecodeAccessUnitEnd( pCtx );
+	
+	if ( ERR_NONE != iErr )
+	{
+		WelsLog( pCtx, WELS_LOG_INFO, "returned error from decoding:[0x%x]\n", iErr);
+		
+		pDstInfo->iBufferStatus = 0;
+		return iErr;
+	}
+	
+	return 0;
+}
+
+// Fill pDqLayer from pLayerInfo and from the NAL unit header extension / slice headers of pNalUnit;
+// pPicDec becomes the picture (pDec) of the layer.
+static inline void_t InitDqLayerInfo( PDqLayer pDqLayer, PLayerInfo pLayerInfo, PNalUnit pNalUnit, PPicture pPicDec )
+{
+	PNalUnitHeaderExt pHdrExt	= &pNalUnit->sNalHeaderExt;
+	PSliceHeaderExt pSliceExt	= &pNalUnit->sNalData.sVclNal.sSliceHeaderExt;
+	PSliceHeader pSlice		= &pSliceExt->sSliceHeader;
+	const uint8_t kuiQId		= pHdrExt->uiQualityId;
+
+	memcpy(&pDqLayer->sLayerInfo, pLayerInfo, sizeof(SLayerInfo));//confirmed_safe_unsafe_usage
+
+	pDqLayer->pDec			= pPicDec;
+	pDqLayer->iMbWidth		= pSlice->iMbWidth;	// MB width of this picture
+	pDqLayer->iMbHeight		= pSlice->iMbHeight;	// MB height of this picture
+	pDqLayer->iSliceIdcBackup	= (pSlice->iFirstMbInSlice << 7) | (pHdrExt->uiDependencyId << 4) | (pHdrExt->uiQualityId);
+	pDqLayer->uiLayerDqId		= pHdrExt->uiLayerDqId;	// dq_id of current layer
+	pDqLayer->bUseRefBasePicFlag	= pHdrExt->bUseRefBasePicFlag;
+
+	/* Common syntax elements across all slices of a DQLayer */
+	pDqLayer->uiPpsId					= pLayerInfo->pPps->iPpsId;
+	pDqLayer->uiDisableInterLayerDeblockingFilterIdc	= pSliceExt->uiDisableInterLayerDeblockingFilterIdc;
+	pDqLayer->iInterLayerSliceAlphaC0Offset		= pSliceExt->iInterLayerSliceAlphaC0Offset;
+	pDqLayer->iInterLayerSliceBetaOffset			= pSliceExt->iInterLayerSliceBetaOffset;
+	pDqLayer->iSliceGroupChangeCycle			= pSlice->iSliceGroupChangeCycle;
+	pDqLayer->bStoreRefBasePicFlag				= pSliceExt->bStoreRefBasePicFlag;
+	pDqLayer->bTCoeffLevelPredFlag				= pSliceExt->bTCoeffLevelPredFlag;
+	pDqLayer->bConstrainedIntraResamplingFlag		= pSliceExt->bConstrainedIntraResamplingFlag;
+	pDqLayer->uiRefLayerDqId				= pSliceExt->uiRefLayerDqId;
+	pDqLayer->uiRefLayerChromaPhaseXPlus1Flag		= pSliceExt->uiRefLayerChromaPhaseXPlus1Flag;
+	pDqLayer->uiRefLayerChromaPhaseYPlus1			= pSliceExt->uiRefLayerChromaPhaseYPlus1;
+
+	// reordering / marking pointers are only updated by a base quality NAL unit
+	if ( BASE_QUALITY_ID == kuiQId ) {
+		pDqLayer->pRefPicListReordering	= &pSlice->pRefPicListReordering;
+		pDqLayer->pRefPicMarking	= &pSlice->sRefMarking;
+		pDqLayer->pRefPicBaseMarking	= &pSliceExt->sRefBasePicMarking;
+	}
+}
+
+// Latch slice level state of pCurNal into pCtx (slice type, slice header, frame_num) and reset the
+// reference pictures when an IDR with frame_num 0 comes in. pSps and pPps are not used here.
+void_t WelsDqLayerDecodeStart ( PWelsDecoderContext pCtx, PNalUnit pCurNal, PSps pSps, PPps pPps )
+{
+	PNalUnitHeaderExt pHdrExt	= &pCurNal->sNalHeaderExt;
+	PSliceHeader pSlice		= &pCurNal->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
+	// IDR is signalled by the NAL unit type, or by bIdrFlag on a base quality NAL unit
+	const bool_t kbIdrNal		= ( NAL_UNIT_CODED_SLICE_IDR == pHdrExt->sNalUnitHeader.eNalUnitType );
+	const bool_t kbIdrBaseQuality	= ( BASE_QUALITY_ID == pHdrExt->uiQualityId && pHdrExt->bIdrFlag );
+
+	pCtx->pSliceHeader	= pSlice;
+	pCtx->eSliceType	= pSlice->eSliceType;
+	pCtx->iFrameNum		= pSlice->iFrameNum;
+
+	if ( ( kbIdrNal || kbIdrBaseQuality ) && 0 == pSlice->iFrameNum )	// open question kept from the original: can the frame_num test be dropped?
+		WelsResetRefPic ( pCtx );	// Reset decoded picture buffer lists due to an incoming IDR frame
+}
+
+// For the first slice only: init the reference list, then reorder it unless the slice is I/SI; kuiNRi is unused.
+// NOTE(review): the WelsInitRefList() result is dropped when WelsReorderRefList() runs -- confirm intended.
+int32_t InitRefPicList ( PWelsDecoderContext pCtx, const uint8_t kuiNRi, const bool_t kbFirstSlice, int32_t iPoc)
+{
+	if ( !kbFirstSlice )
+		return ERR_NONE;
+	const int32_t kiInitRet = WelsInitRefList( pCtx, iPoc );
+	if ( I_SLICE == pCtx->eSliceType || SI_SLICE == pCtx->eSliceType )
+		return kiInitRet;
+	return WelsReorderRefList ( pCtx );
+}
+
+// Point the pCsData / iCsStride entries and the per-MB data pointers of pCurDq at entry [0] of the
+// corresponding buffers held by pCtx. Does nothing when either argument is NULL.
+void_t InitCurDqLayerData( PWelsDecoderContext pCtx, PDqLayer pCurDq )
+{
+	int32_t iPlane;
+	if ( NULL == pCtx || NULL == pCurDq )
+		return;
+	for ( iPlane = 0; iPlane < 3; ++ iPlane )
+	{
+		pCurDq->pCsData[iPlane]		= pCtx->pCsListXchg[0][iPlane];
+		pCurDq->iCsStride[iPlane]	= pCtx->iCsStride[iPlane];
+	}
+	pCurDq->pMbType				= pCtx->sMb.pMbType[0];
+	pCurDq->pSubMbType			= pCtx->sMb.pSubMbType[0];
+	pCurDq->pSliceIdc			= pCtx->sMb.pSliceIdc[0];
+	pCurDq->pMv[0]				= pCtx->sMb.pMv[0][0];
+	pCurDq->pRefIndex[0]			= pCtx->sMb.pRefIndex[0][0];
+	pCurDq->pLumaQp				= pCtx->sMb.pLumaQp[0];
+	pCurDq->pChromaQp			= pCtx->sMb.pChromaQp[0];
+	pCurDq->pCbp				= pCtx->sMb.pCbp[0];
+	pCurDq->pNzc				= pCtx->sMb.pNzc[0];
+	pCurDq->pNzcRs				= pCtx->sMb.pNzcRs[0];
+	pCurDq->pScaledTCoeff			= pCtx->sMb.pScaledTCoeff[0];
+	pCurDq->pIntraPredMode			= pCtx->sMb.pIntraPredMode[0];
+	pCurDq->pIntra4x4FinalMode		= pCtx->sMb.pIntra4x4FinalMode[0];
+	pCurDq->pChromaPredMode			= pCtx->sMb.pChromaPredMode[0];
+	pCurDq->pResidualPredFlag		= pCtx->sMb.pResidualPredFlag[0];
+	pCurDq->pInterPredictionDoneFlag	= pCtx->sMb.pInterPredictionDoneFlag[0];
+}
+
+// Added 6/4/2010: reset the "exist ahead" state of the parameter sets (SPS, subset SPS, PPS) to wait for a successive incoming IDR.
+// It will be called in case packets are lost/broken and decoding failed at temporal level 0
+void_t ResetParameterSetsState( PWelsDecoderContext pCtx )
+{
+	pCtx->bSpsExistAheadFlag	   = false;
+	pCtx->bSubspsExistAheadFlag = false;
+	pCtx->bPpsExistAheadFlag	   = false;
+}
+
+/*
+ * DecodeCurrentAccessUnit
+ * Decode the NAL units [uiStartPos, uiEndPos] of the completed AU, one dq layer per outer loop pass; returns ERR_NONE or the error of the failing step.
+ */
+int32_t DecodeCurrentAccessUnit( PWelsDecoderContext pCtx, uint8_t **ppDst, int32_t *pDstLen, int32_t *pWidth, int32_t *pHeight, SBufferInfo *pDstInfo )
+{	
+	int32_t iRefCount[LIST_A];	
+	PNalUnit pNalCur = NULL;
+	PAccessUnit pCurAu = pCtx->pAccessUnitList;
+
+	int32_t iIdx = pCurAu->uiStartPos;
+	int32_t iEndIdx = pCurAu->uiEndPos;
+	// iIdx walks the NAL units of the AU from uiStartPos up to and including uiEndPos
+	int32_t iPpsId = 0;
+	int32_t iRet = ERR_NONE;
+	// NAL units with a uiLayerDqId above the target are not decoded; bits 4..6 of the target dq id are used as the max dependency id
+	const uint8_t kuiTargetLayerDqId = GetTargetDqId(pCtx->uiTargetDqId, pCtx->pParam); 
+	const uint8_t kuiDependencyIdMax = (kuiTargetLayerDqId & 0x7F) >> 4;
+	int16_t iLastIdD = -1, iLastIdQ = -1;
+	int16_t iCurrIdD = 0, iCurrIdQ = 0;
+	uint8_t uiNalRefIdc = 0;
+	bool_t	bFreshSliceAvailable = true;	// Another fresh slice comingup for given dq layer, for multiple slices in case of header parts of slices sometimes loss over error-prone channels, 8/14/2008
+	PPicture  pStoreBasePic = NULL;	
+
+	//update pCurDqLayer at the starting of AU decoding
+	if ( pCtx->bInitialDqLayersMem )
+	{		
+		pCtx->pCurDqLayer				= pCtx->pDqLayersList[0];
+	}
+
+	InitCurDqLayerData( pCtx, pCtx->pCurDqLayer );
+
+	pNalCur = pCurAu->pNalUnitsList[iIdx];	
+	while ( iIdx <= iEndIdx )
+	{
+		PDqLayer dq_cur							= pCtx->pCurDqLayer;
+		SLayerInfo pLayerInfo;
+		PSliceHeaderExt pShExt					= NULL;
+		PSliceHeader pSh							= NULL;		
+		// fetch a picture from pPicBuff[0] when pCtx->pDec is not set
+		if( pCtx->pDec == NULL ){
+			pCtx->pDec = PrefetchPic(pCtx->pPicBuff[0]);
+
+			if( NULL == pCtx->pDec ){
+				WelsLog( pCtx, WELS_LOG_ERROR, "DecodeCurrentAccessUnit()::::::PrefetchPic ERROR, pSps->iNumRefFrames:%d.\n", 
+					pCtx->pSps->iNumRefFrames );
+				pCtx->iErrorCode |= dsOutOfMemory;
+				return ERR_INFO_REF_COUNT_OVERFLOW;
+			}
+		}
+
+#ifdef NO_WAITING_AU
+		//For fixing the nal lossing issue
+		if ((pCtx->pDec->iTotalNumMbRec != 0)&&
+			(CheckAccessUnitBoundaryExt(&pCtx->sLastNalHdrExt, &pNalCur->sNalHeaderExt, &pCtx->sLastSliceHeader,  &pNalCur->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader)))
+		{
+             pCtx->pDec->iTotalNumMbRec = 0;	
+        }
+#else
+		//initialize at the starting of AU.
+		pCtx->pDec->iTotalNumMbRec = 0;			
+#endif
+        if(pCtx->pDec->iTotalNumMbRec == 0) //Picture start to decode
+        {
+            for( int32_t i = 0; i < LAYER_NUM_EXCHANGEABLE; ++ i)
+                memset(pCtx->sMb.pSliceIdc[i], 0xff, (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof(int32_t)) );
+        }
+		GetI4LumaIChromaAddrTable(pCtx->iDecBlockOffsetArray, pCtx->pDec->iLinesize[0], pCtx->pDec->iLinesize[1]);
+
+		if ( pNalCur->sNalHeaderExt.uiLayerDqId > kuiTargetLayerDqId ) {
+			break;	// Per formance it need not to decode the remaining bits any more due to given uiLayerDqId required, 9/2/2009
+		}
+
+		memset(&pLayerInfo, 0, sizeof(SLayerInfo));
+		
+		/*
+		 *	Loop decoding for slices (even FMO and/ multiple slices) within a dq layer
+		 */
+		while ( iIdx <= iEndIdx )
+		{	
+		    BOOL_T         bReconstructSlice;
+			iCurrIdQ	= pNalCur->sNalHeaderExt.uiQualityId;
+			iCurrIdD	= pNalCur->sNalHeaderExt.uiDependencyId;
+			pSh		= &pNalCur->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
+			pShExt	= &pNalCur->sNalData.sVclNal.sSliceHeaderExt;
+
+			bReconstructSlice = CheckSliceNeedReconstruct(iCurrIdD, iCurrIdQ, pShExt->bStoreRefBasePicFlag, 
+				kuiDependencyIdMax, pNalCur->sNalHeaderExt.uiLayerDqId, kuiTargetLayerDqId);
+
+			memcpy(&pLayerInfo.sNalHeaderExt, &pNalCur->sNalHeaderExt, sizeof(SNalUnitHeaderExt));//confirmed_safe_unsafe_usage
+
+			pCtx->pDec->iFrameNum = pSh->iFrameNum;		
+
+			memcpy(&pLayerInfo.sSliceInLayer.sSliceHeaderExt, pShExt, sizeof(SSliceHeaderExt));//confirmed_safe_unsafe_usage
+			pLayerInfo.sSliceInLayer.bSliceHeaderExtFlag	= pNalCur->sNalData.sVclNal.bSliceHeaderExtFlag;
+			pLayerInfo.sSliceInLayer.eSliceType			= pSh->eSliceType;
+			pLayerInfo.sSliceInLayer.iLastMbQp			= pSh->iSliceQp;
+				dq_cur->pBitStringAux	= &pNalCur->sNalData.sVclNal.sSliceBitsRead;
+			
+			uiNalRefIdc	= pNalCur->sNalHeaderExt.sNalUnitHeader.uiNalRefIdc;	
+
+			iPpsId	= pSh->iPpsId;
+
+			pLayerInfo.pPps = pSh->pPps;
+			pLayerInfo.pSps = pSh->pSps;
+			pLayerInfo.pSubsetSps = pShExt->pSubsetSps;				
+
+			pCtx->pFmo = &pCtx->sFmoList[iPpsId];
+			if ( !FmoParamUpdate( pCtx->pFmo, pLayerInfo.pSps, pLayerInfo.pPps, &pCtx->iActiveFmoNum ) ) {
+				pCtx->iErrorCode |= dsBitstreamError;
+				WelsLog( pCtx, WELS_LOG_WARNING, "DecodeCurrentAccessUnit(), FmoParamUpdate failed, eSliceType: %d.\n", pSh->eSliceType);
+				return GENERATE_ERROR_NO(ERR_LEVEL_SLICE_HEADER, ERR_INFO_FMO_INIT_FAIL);
+			}
+
+			bFreshSliceAvailable	= (iCurrIdD != iLastIdD || iCurrIdQ != iLastIdQ);	// do not need condition of (first_mb == 0) due multiple slices might be disorder
+			
+			WelsDqLayerDecodeStart ( pCtx, pNalCur, pLayerInfo.pSps, pLayerInfo.pPps );
+
+			if ( iCurrIdQ == BASE_QUALITY_ID )
+			{
+				ST64(iRefCount, LD64(pLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.uiRefCount));
+			}
+			
+			if ( (iLastIdD < 0) || //case 1: first layer
+				( iLastIdD == iCurrIdD) ) //case 2: same uiDId
+			{
+				InitDqLayerInfo( dq_cur, &pLayerInfo, pNalCur, pCtx->pDec );
+
+				if ( !dq_cur->sLayerInfo.pSps->bGapsInFrameNumValueAllowedFlag )
+				{
+					const bool_t kbIdrFlag = dq_cur->sLayerInfo.sNalHeaderExt.bIdrFlag || (dq_cur->sLayerInfo.sNalHeaderExt.sNalUnitHeader.eNalUnitType == NAL_UNIT_CODED_SLICE_IDR);
+					// Subclause 8.2.5.2 Decoding process for gaps in frame_num
+					if (	!kbIdrFlag  && 
+						pSh->iFrameNum != pCtx->iPrevFrameNum &&
+						pSh->iFrameNum != ((pCtx->iPrevFrameNum+1) & ((1<<dq_cur->sLayerInfo.pSps->uiLog2MaxFrameNum)-1))	)
+					{
+						WelsLog( pCtx, WELS_LOG_WARNING, "referencing pictures lost due frame gaps exist, prev_frame_num: %d, curr_frame_num: %d\n", pCtx->iPrevFrameNum, pSh->iFrameNum);
+
+#ifdef LONG_TERM_REF
+						pCtx->bParamSetsLostFlag = true;
+#else
+						pCtx->bReferenceLostAtT0Flag = true;
+#endif
+						ResetParameterSetsState( pCtx );				
+
+						pCtx->iErrorCode |= dsRefLost;
+						return ERR_INFO_REFERENCE_PIC_LOST;
+					}
+				}
+
+				if ( iCurrIdD == kuiDependencyIdMax && iCurrIdQ == BASE_QUALITY_ID )
+				{
+					iRet = InitRefPicList ( pCtx, uiNalRefIdc, bFreshSliceAvailable, pSh->iPicOrderCntLsb);
+					if ( iRet )
+					{
+						HandleReferenceLost(pCtx, pNalCur);
+						WelsLog( pCtx, WELS_LOG_WARNING, "reference picture introduced by this frame is lost during transmission! uiTId: %d\n", pNalCur->sNalHeaderExt.uiTemporalId );
+						return iRet;
+					}
+				}
+
+				iRet = WelsDecodeSlice ( pCtx, bFreshSliceAvailable, pNalCur );
+
+				//Output good store_base reconstruction when enhancement quality layer occurred error for MGS key picture case
+				if ( iRet != ERR_NONE )
+				{
+					WelsLog( pCtx, WELS_LOG_WARNING, "DecodeCurrentAccessUnit() failed (%d) in frame: %d uiDId: %d uiQId: %d\n",
+						iRet, pSh->iFrameNum, iCurrIdD, iCurrIdQ);
+					HandleReferenceLostL0(pCtx, pNalCur);
+					return iRet;
+				}
+				if( bReconstructSlice )	{					
+					if( WelsDecodeConstructSlice(pCtx, pNalCur) ){
+						return -1;
+					}
+				}				
+			}
+#if defined (_DEBUG) &&  !defined (CODEC_FOR_TESTBED)
+			fprintf( stderr, "cur_frame : %d	iCurrIdD : %d\n ", 
+				dq_cur->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.iFrameNum, iCurrIdD );
+#endif//#if !CODEC_FOR_TESTBED
+			iLastIdD	= iCurrIdD;
+			iLastIdQ	= iCurrIdQ;		
+	
+			//pNalUnitsList overflow.
+			++ iIdx;
+			if (iIdx <= iEndIdx)
+			{				
+				pNalCur	= pCurAu->pNalUnitsList[iIdx];
+			}
+			else
+			{
+				pNalCur	= NULL;
+			}
+
+			if ( pNalCur == NULL ||
+				iLastIdD != pNalCur->sNalHeaderExt.uiDependencyId || 
+				iLastIdQ != pNalCur->sNalHeaderExt.uiQualityId )
+				break;
+		} 
+
+		// A dq layer decoded here
+#if defined (_DEBUG) &&  !defined (CODEC_FOR_TESTBED)
+#undef fprintf
+		fprintf(stderr, "POC: #%d, FRAME: #%d, D: %d, Q: %d, T: %d, P: %d,	%d\n",
+			pSh->iPicOrderCntLsb, pSh->iFrameNum, iCurrIdD, iCurrIdQ, dq_cur->sLayerInfo.sNalHeaderExt.uiTemporalId, dq_cur->sLayerInfo.sNalHeaderExt.uiPriorityId,dq_cur->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.iSliceQp);
+#endif//#if !CODEC_FOR_TESTBED
+		// target layer reached: construct the frame and, when uiNalRefIdc > 0, mark / expand the picture as reference
+        if( dq_cur->uiLayerDqId == kuiTargetLayerDqId ){
+		    if( DecodeFrameConstruction( pCtx, ppDst, pDstLen, pWidth, pHeight, pDstInfo) ){
+#ifdef NO_WAITING_AU
+                memcpy(&pCtx->sLastNalHdrExt, &pCurAu->pNalUnitsList[iIdx-1]->sNalHeaderExt, sizeof(SNalUnitHeaderExt));
+                memcpy(&pCtx->sLastSliceHeader, &pCurAu->pNalUnitsList[iIdx-1]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader, sizeof(SSliceHeader));
+				return ERR_NONE;
+#else
+				pCtx->iErrorCode |= dsBitstreamError;
+				return -1;
+#endif
+				
+		    }
+			if( (uiNalRefIdc > 0) && ( iCurrIdQ || (!dq_cur->bStoreRefBasePicFlag) ) ){
+				WelsMarkAsRef(pCtx, false);
+                ExpandReferencingPicture(pCtx->pDec, pCtx->sExpandPicFunc.pExpandLumaPicture, pCtx->sExpandPicFunc.pExpandChromaPicture);
+				pCtx->pDec = NULL;
+			}
+        }
+
+		if( (iCurrIdD == kuiDependencyIdMax) && (iCurrIdQ == BASE_QUALITY_ID) && (dq_cur->bStoreRefBasePicFlag) ){
+			pStoreBasePic = pCtx->pDec;
+
+			if( uiNalRefIdc > 0 ){
+				WelsMarkAsRef(pCtx, true);
+                ExpandReferencingPicture(pCtx->pDec, pCtx->sExpandPicFunc.pExpandLumaPicture, pCtx->sExpandPicFunc.pExpandChromaPicture);
+				pCtx->pDec = NULL;
+			}
+		}		
+		// need update frame_num due current frame is well decoded
+		pCtx->iPrevFrameNum	= pSh->iFrameNum;
+        if( pCtx->bLastHasMmco5 )   
+            pCtx->iPrevFrameNum = 0;
+	} 
+
+	return ERR_NONE;
+}
+
+} // namespace WelsDec
\ No newline at end of file
--- /dev/null
+++ b/codec/decoder/core/src/decoder_data_tables.cpp
@@ -1,0 +1,661 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+// exp_data.c
+// exported data shared across various modules (.c)
+#include "typedefs.h"
+#include "wels_common_basis.h"
+#include "mb_cache.h"
+#include "utils.h"
+#include "vlc_decoder.h"
+
+namespace WelsDec {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Index remap for non_zero_count[16+8]: a permutation of 0..23 (entries 0-15 luma, 16-23 chroma). The diagram pairs the 8x8-block-grouped numbering (left) with plain raster numbering (right); presumably table[i] converts one into the other -- confirm direction at the call sites.
+const uint8_t g_kuiMbNonZeroCountIdx[24] =
+{                     //  0   1 | 4  5      luma 8*8 block           non_zero_count[16+8] 
+	0,  1,  4,  5,   //  2   3 | 6  7        0  |  1                  0   1   2   3 
+	2,  3,  6,  7,   //---------------      ---------                 4   5   6   7 
+	8,  9, 12, 13,   //  8   9 | 12 13       2  |  3                  8   9  10  11 
+	10, 11, 14, 15,   // 10  11 | 14 15-----------------------------> 12  13  14  15 
+	16, 17, 20, 21,   //----------------    chroma 8*8 block          16  17  18  19  
+	18, 19, 22, 23   // 16  17 | 20 21        0    1                 20  21  22  23 
+};
+//cache element equal to 26 -- NOTE(review): the entries below are computed as x + y*8 and reach 47, so "26" looks stale; confirm the real cache size
+
+const uint8_t g_kuiCacheNzcScanIdx[24] = 
+{
+	/* Luma: 16 entries, x in 1..4, y in 1..4 */
+	9, 10, 17, 18,	// 1+1*8, 2+1*8, 1+2*8, 2+2*8,
+	11, 12, 19, 20,	// 3+1*8, 4+1*8, 3+2*8, 4+2*8,
+	25, 26, 33, 34,	// 1+3*8, 2+3*8, 1+4*8, 2+4*8,
+	27, 28, 35, 36,	// 3+3*8, 4+3*8, 3+4*8, 4+4*8,
+    /* Cb: 4 entries, x in 6..7, y in 1..2 */
+	14, 15,			// 6+1*8, 7+1*8,
+	22, 23,			// 6+2*8, 7+2*8,
+
+    /* Cr: 4 entries, x in 6..7, y in 4..5 */
+	38, 39,			// 6+4*8, 7+4*8,
+	46, 47,			// 6+5*8, 7+5*8,
+};
+
+//cache element equal to 30: entries are x + y*6 with x,y in 1..4 (30 = 6*5; row/column 0 presumably hold neighbour data -- confirm)
+const uint8_t g_kuiCache30ScanIdx[16] = //mv or ref_index cache scan index, 4*4 block as basic unit
+{
+	7,  8, 13, 14,	// 1+1*6, 2+1*6, 1+2*6, 2+2*6,
+	9, 10, 15, 16,	// 3+1*6, 4+1*6, 3+2*6, 4+2*6,
+	19, 20, 25, 26,	// 1+3*6, 2+3*6, 1+4*6, 2+4*6,
+	21, 22, 27, 28	// 3+3*6, 4+3*6, 3+4*6, 4+4*6
+};
+
+const uint8_t g_kuiScan4[16] = //maps a 4*4-block scan index to its raster position in the mb cache in sMb (only current element, without neighbor) 
+{                         // 4*4block scan    mb cache order
+	0,  1,  4,  5,        // 0  1 | 4  5      0  1 | 2  3
+	2,  3,  6,  7,        // 2  3 | 6  7      4  5 | 6  7
+	8,  9, 12, 13,        //----------------->----------- 
+	10, 11, 14, 15        // 8  9 |12 13      8  9 |10 11
+};                        //10 11 |14 15     12 13 |14 15 
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// extern at wels_common_basis.h
+
+const uint8_t g_kuiChromaQp[52]={	// chroma QP for index 0..51: identity for 0..29, then grows more slowly and saturates at 39
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,
+	12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,
+	28,29,29,30,31,32,32,33,34,34,35,35,36,36,37,37,
+	37,38,38,38,39,39,39,39
+};
+
+/*
+ *	VCL classification per NAL unit type (row = type 0..31); NOTE(review): column [0] looks like plain H.264/AVC and column [1] like the SVC extension (type 20 is VCL only in [1], types 2-4 are NOT_APP in [1]) -- confirm
+ */
+const VclType g_kuiVclTypeMap[32][2] =   
+{
+	{ NON_VCL,	NON_VCL },	// 0: NAL_UNIT_UNSPEC_0
+	{ VCL,		VCL,	},	// 1: NAL_UNIT_CODED_SLICE
+	{ VCL,		NOT_APP },	// 2: NAL_UNIT_CODED_SLICE_DPA
+	{ VCL,		NOT_APP },	// 3: NAL_UNIT_CODED_SLICE_DPB
+	{ VCL,		NOT_APP },	// 4: NAL_UNIT_CODED_SLICE_DPC
+	{ VCL,		VCL		},	// 5: NAL_UNIT_CODED_SLICE_IDR
+	{ NON_VCL,	NON_VCL },	// 6: NAL_UNIT_SEI
+	{ NON_VCL,	NON_VCL },	// 7: NAL_UNIT_SPS
+	{ NON_VCL,	NON_VCL },	// 8: NAL_UNIT_PPS
+	{ NON_VCL,	NON_VCL },	// 9: NAL_UNIT_AU_DELIMITER
+	{ NON_VCL,	NON_VCL },	// 10: NAL_UNIT_END_OF_SEQ
+	{ NON_VCL,	NON_VCL },	// 11: NAL_UNIT_END_OF_STR
+	{ NON_VCL,	NON_VCL	},	// 12: NAL_UNIT_FILLER_DATA
+	{ NON_VCL,	NON_VCL },	// 13: NAL_UNIT_SPS_EXT
+	{ NON_VCL,	NON_VCL },	// 14: NAL_UNIT_PREFIX, NEED associate succeeded NAL to make a VCL
+	{ NON_VCL,	NON_VCL },	// 15: NAL_UNIT_SUBSET_SPS
+	{ NON_VCL,	NON_VCL },	// 16: NAL_UNIT_RESV_16
+	{ NON_VCL,	NON_VCL },	// 17: NAL_UNIT_RESV_17
+	{ NON_VCL,	NON_VCL },	// 18: NAL_UNIT_RESV_18
+	{ NON_VCL,	NON_VCL },	// 19: NAL_UNIT_AUX_CODED_SLICE
+	{ NON_VCL,	VCL		},	// 20: NAL_UNIT_CODED_SLICE_EXT
+	{ NON_VCL,	NON_VCL },	// 21: NAL_UNIT_RESV_21
+	{ NON_VCL,	NON_VCL },	// 22: NAL_UNIT_RESV_22
+	{ NON_VCL,	NON_VCL },	// 23: NAL_UNIT_RESV_23
+	{ NON_VCL,	NON_VCL },	// 24: NAL_UNIT_UNSPEC_24
+	{ NON_VCL,	NON_VCL },	// 25: NAL_UNIT_UNSPEC_25
+	{ NON_VCL,	NON_VCL },	// 26: NAL_UNIT_UNSPEC_26
+	{ NON_VCL,	NON_VCL	},	// 27: NAL_UNIT_UNSPEC_27
+	{ NON_VCL,	NON_VCL },	// 28: NAL_UNIT_UNSPEC_28
+	{ NON_VCL,	NON_VCL },	// 29: NAL_UNIT_UNSPEC_29
+	{ NON_VCL,	NON_VCL },	// 30: NAL_UNIT_UNSPEC_30
+	{ NON_VCL,	NON_VCL }	// 31: NAL_UNIT_UNSPEC_31
+};
+
+/* common use table: positions addressed as x + y*8; holds the same 24 values as g_kuiCacheNzcScanIdx */
+const uint8_t g_kuiScan8[24]={	// [16 + 2*4]: 16 entries followed by two groups of 4
+    9, 10, 17, 18,	// 1+1*8, 2+1*8, 1+2*8, 2+2*8,
+	11, 12, 19, 20,	// 3+1*8, 4+1*8, 3+2*8, 4+2*8,
+	25, 26, 33, 34,	// 1+3*8, 2+3*8, 1+4*8, 2+4*8,
+	27, 28, 35, 36,	// 3+3*8, 4+3*8, 3+4*8, 4+4*8,
+	14, 15,			// 6+1*8, 7+1*8,
+	22, 23,			// 6+2*8, 7+2*8,
+	38, 39,			// 6+4*8, 7+4*8,
+	46, 47,			// 6+5*8, 7+5*8,
+};
+
+const uint8_t g_kuiLumaDcZigzagScan[16]={	// every entry is an offset a*16 + b*64 with a,b in 0..3 (breakdown per row); the step of 16 is presumably one 4x4 block of coefficients -- confirm
+	0, 16, 32, 128,			// 0*16 + 0*64, 1*16 + 0*64, 2*16 + 0*64, 0*16 + 2*64,
+	48, 64, 80, 96,			// 3*16 + 0*64, 0*16 + 1*64, 1*16 + 1*64, 2*16 + 1*64,
+	144, 160, 176, 192,		// 1*16 + 2*64, 2*16 + 2*64, 3*16 + 2*64, 0*16 + 3*64,
+	112, 208, 224, 240		// 3*16 + 1*64, 1*16 + 3*64, 2*16 + 3*64, 3*16 + 3*64,
+};
+
+const uint8_t g_kuiChromaDcScan[4]={	// four offsets spaced 16 apart (same 16-step as g_kuiLumaDcZigzagScan)
+	0, 16, 32, 48
+};
+
+__align16( const uint16_t, g_kuiDequantCoeff[52][8]) = {	// one row per index 0..51 (presumably QP -- confirm); each row has the shape {a,b,a,b,b,c,b,c} and row q+6 is exactly twice row q
+	/* 0*/{   10,   13,   10,   13,   13,   16,   13,   16 },	/* 1*/{   11,   14,   11,   14,   14,   18,   14,   18 },
+	/* 2*/{   13,   16,   13,   16,   16,   20,   16,   20 },	/* 3*/{   14,   18,   14,   18,   18,   23,   18,   23 },
+	/* 4*/{   16,   20,   16,   20,   20,   25,   20,   25 },	/* 5*/{   18,   23,   18,   23,   23,   29,   23,   29 },
+	/* 6*/{   20,   26,   20,   26,   26,   32,   26,   32 },	/* 7*/{   22,   28,   22,   28,   28,   36,   28,   36 },
+	/* 8*/{   26,   32,   26,   32,   32,   40,   32,   40 },	/* 9*/{   28,   36,   28,   36,   36,   46,   36,   46 },
+	/*10*/{   32,   40,   32,   40,   40,   50,   40,   50 },	/*11*/{   36,   46,   36,   46,   46,   58,   46,   58 },
+	/*12*/{   40,   52,   40,   52,   52,   64,   52,   64 },	/*13*/{   44,   56,   44,   56,   56,   72,   56,   72 },
+	/*14*/{   52,   64,   52,   64,   64,   80,   64,   80 },	/*15*/{   56,   72,   56,   72,   72,   92,   72,   92 },
+	/*16*/{   64,   80,   64,   80,   80,  100,   80,  100 },	/*17*/{   72,   92,   72,   92,   92,  116,   92,  116 },
+	/*18*/{   80,  104,   80,  104,  104,  128,  104,  128 },	/*19*/{   88,  112,   88,  112,  112,  144,  112,  144 },
+	/*20*/{  104,  128,  104,  128,  128,  160,  128,  160 },	/*21*/{  112,  144,  112,  144,  144,  184,  144,  184 },
+	/*22*/{  128,  160,  128,  160,  160,  200,  160,  200 },	/*23*/{  144,  184,  144,  184,  184,  232,  184,  232 },
+	/*24*/{  160,  208,  160,  208,  208,  256,  208,  256 },	/*25*/{  176,  224,  176,  224,  224,  288,  224,  288 },
+	/*26*/{  208,  256,  208,  256,  256,  320,  256,  320 },	/*27*/{  224,  288,  224,  288,  288,  368,  288,  368 },
+	/*28*/{  256,  320,  256,  320,  320,  400,  320,  400 },	/*29*/{  288,  368,  288,  368,  368,  464,  368,  464 },
+	/*30*/{  320,  416,  320,  416,  416,  512,  416,  512 },	/*31*/{  352,  448,  352,  448,  448,  576,  448,  576 },
+	/*32*/{  416,  512,  416,  512,  512,  640,  512,  640 },	/*33*/{  448,  576,  448,  576,  576,  736,  576,  736 },
+	/*34*/{  512,  640,  512,  640,  640,  800,  640,  800 },	/*35*/{  576,  736,  576,  736,  736,  928,  736,  928 },
+	/*36*/{  640,  832,  640,  832,  832, 1024,  832, 1024 },	/*37*/{  704,  896,  704,  896,  896, 1152,  896, 1152 },
+	/*38*/{  832, 1024,  832, 1024, 1024, 1280, 1024, 1280 },	/*39*/{  896, 1152,  896, 1152, 1152, 1472, 1152, 1472 },
+	/*40*/{ 1024, 1280, 1024, 1280, 1280, 1600, 1280, 1600 },	/*41*/{ 1152, 1472, 1152, 1472, 1472, 1856, 1472, 1856 },
+	/*42*/{ 1280, 1664, 1280, 1664, 1664, 2048, 1664, 2048 },	/*43*/{ 1408, 1792, 1408, 1792, 1792, 2304, 1792, 2304 },
+	/*44*/{ 1664, 2048, 1664, 2048, 2048, 2560, 2048, 2560 },	/*45*/{ 1792, 2304, 1792, 2304, 2304, 2944, 2304, 2944 },
+	/*46*/{ 2048, 2560, 2048, 2560, 2560, 3200, 2560, 3200 },	/*47*/{ 2304, 2944, 2304, 2944, 2944, 3712, 2944, 3712 },
+	/*48*/{ 2560, 3328, 2560, 3328, 3328, 4096, 3328, 4096 },	/*49*/{ 2816, 3584, 2816, 3584, 3584, 4608, 3584, 4608 },
+	/*50*/{ 3328, 4096, 3328, 4096, 4096, 5120, 4096, 5120 },	/*51*/{ 3584, 4608, 3584, 4608, 4608, 5888, 4608, 5888 },
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+const uint8_t g_kuiIntra4x4CbpTable[48] =	// a permutation of 0..47; presumably maps the decoded code number to coded_block_pattern for Intra 4x4 macroblocks -- confirm at the call site
+{
+	47, 31, 15,  0, 23, 27, 29, 30,  7, 11, 13, 14, 39, 43, 45, 46, // entries 0..15
+	16,  3,  5, 10, 12, 19, 21, 26, 28, 35, 37, 42, 44,  1,  2,  4, // entries 16..31
+	8, 17, 18, 20, 24,  6,  9, 22, 25, 32, 33, 34, 36, 40, 38, 41  // entries 32..47
+};
+
+const uint8_t g_kuiInterCbpTable[48] =	// a permutation of 0..47; presumably maps the decoded code number to coded_block_pattern for Inter macroblocks -- confirm at the call site
+{
+	0, 16,  1,  2,  4,  8, 32,  3,  5, 10, 12, 15, 47,  7, 11, 13, // entries 0..15
+	14,  6,  9, 31, 35, 37, 42, 44, 33, 34, 36, 40, 39, 43, 45, 46, // entries 16..31
+	17, 18, 20, 24, 19, 21, 26, 28, 23, 27, 29, 30, 22, 25, 38, 41  // entries 32..47
+};
+
+const uint8_t g_kuiLeadingZeroTable[256] = // entry v = number of leading zero bits in the 8-bit value v (8 for v == 0, 0 for v >= 128)
+{
+	8,  7,  6,  6,  5,  5,  5,  5,  4,  4,  4,  4,  4,  4,  4,  4,
+	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
+	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// extern at vlc_decoder.h
+
+const uint8_t g_kuiVlcChromaTable[256][2] =	// row = value of the next 8 bits; a code of length L fills 2^(8-L) consecutive rows; columns look like {index, code length} as in g_kuiVlcTable_0 -- confirm
+{
+	{13, 7}, {13, 7}, {12, 8}, {11, 8}, {8, 7}, {8, 7}, {7, 7}, {7, 7}, {10, 6}, {10, 6}, {10, 6}, {10, 6}, {6, 6}, {6, 6}, {6, 6}, {6, 6}, //15
+	{ 3, 6}, { 3, 6}, { 3, 6}, { 3, 6}, {9, 6}, {9, 6}, {9, 6}, {9, 6}, { 4, 6}, { 4, 6}, { 4, 6}, { 4, 6}, {1, 6}, {1, 6}, {1, 6}, {1, 6}, //31
+	{ 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, {5, 3}, {5, 3}, //47
+	{ 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, {5, 3}, {5, 3}, //63
+	{ 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, //79
+	{ 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, //95
+	{ 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, //111
+	{ 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, //127
+	{ 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //143
+	{ 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //159
+	{ 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //175
+	{ 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //191
+	{ 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //207
+	{ 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //223
+	{ 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, //239
+	{ 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1}, {2, 1}, {2, 1}, {2, 1}, {2, 1}  //255 
+};
+
+const uint8_t g_kuiVlcTable_0[256][2] = // row = value of the next 8 bits; column [0] = index of vlc table, column [1] = length of vlc code in bits; rows 0..3 hold {0, 0} (presumably resolved through g_kuiVlcTable_0_0..g_kuiVlcTable_0_3 -- confirm)
+{
+	{ 0, 0}, { 0, 0}, { 0, 0}, {0, 0}, {21, 8}, {12, 8}, {7, 8}, {3, 8}, {17, 7}, {17, 7}, {8, 7}, {8, 7}, {13, 6}, {13, 6}, {13, 6}, {13, 6}, //15
+	{ 4, 6}, { 4, 6}, { 4, 6}, {4, 6}, { 1, 6}, { 1, 6}, {1, 6}, {1, 6}, { 9, 5}, { 9, 5}, {9, 5}, {9, 5}, { 9, 5}, { 9, 5}, { 9, 5}, { 9, 5}, //31
+	{ 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, //47
+	{ 5, 3}, { 5, 3}, { 5, 3}, {5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, {5, 3}, {5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, //63
+	{ 2, 2}, { 2, 2}, { 2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //79
+	{ 2, 2}, { 2, 2}, { 2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //95
+	{ 2, 2}, { 2, 2}, { 2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //111
+	{ 2, 2}, { 2, 2}, { 2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, {2, 2}, {2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //127
+	{ 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //143
+	{ 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //159
+	{ 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //175
+	{ 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //191
+	{ 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //207
+	{ 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //223
+	{ 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, //239
+	{ 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1} //255  
+};
+
+const uint8_t g_kuiVlcTable_0_0[256][2] = // read 8 bits to index this table; used for g_kuiVlcTable_0[0]; entries are {index, code length}, rows 0..1 are {0, 0} //checked no error--
+{
+	{ 0, 0}, { 0, 0}, {47, 7}, {47, 7}, {58, 8}, {60, 8}, {59, 8}, {54, 8}, {61, 8}, {56, 8}, {55, 8}, {50, 8}, {57, 8}, {52, 8}, {51, 8}, {46, 8}, //15
+	{53, 7}, {53, 7}, {48, 7}, {48, 7}, {43, 7}, {43, 7}, {42, 7}, {42, 7}, {49, 7}, {49, 7}, {44, 7}, {44, 7}, {39, 7}, {39, 7}, {38, 7}, {38, 7}, //31
+	{45, 6}, {45, 6}, {45, 6}, {45, 6}, {40, 6}, {40, 6}, {40, 6}, {40, 6}, {35, 6}, {35, 6}, {35, 6}, {35, 6}, {34, 6}, {34, 6}, {34, 6}, {34, 6}, //47
+	{41, 6}, {41, 6}, {41, 6}, {41, 6}, {36, 6}, {36, 6}, {36, 6}, {36, 6}, {31, 6}, {31, 6}, {31, 6}, {31, 6}, {30, 6}, {30, 6}, {30, 6}, {30, 6}, //63
+	{26, 5}, {26, 5}, {26, 5}, {26, 5}, {26, 5}, {26, 5}, {26, 5}, {26, 5}, {32, 5}, {32, 5}, {32, 5}, {32, 5}, {32, 5}, {32, 5}, {32, 5}, {32, 5}, //79
+	{27, 5}, {27, 5}, {27, 5}, {27, 5}, {27, 5}, {27, 5}, {27, 5}, {27, 5}, {22, 5}, {22, 5}, {22, 5}, {22, 5}, {22, 5}, {22, 5}, {22, 5}, {22, 5}, //95
+	{37, 5}, {37, 5}, {37, 5}, {37, 5}, {37, 5}, {37, 5}, {37, 5}, {37, 5}, {28, 5}, {28, 5}, {28, 5}, {28, 5}, {28, 5}, {28, 5}, {28, 5}, {28, 5}, //111
+	{23, 5}, {23, 5}, {23, 5}, {23, 5}, {23, 5}, {23, 5}, {23, 5}, {23, 5}, {18, 5}, {18, 5}, {18, 5}, {18, 5}, {18, 5}, {18, 5}, {18, 5}, {18, 5}, //127
+	{33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, //143
+	{33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, {33, 3}, //159
+	{24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, //175
+	{24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, {24, 3}, //191
+	{19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, //207
+	{19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, {19, 3}, //223
+	{14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, //239
+	{14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3}, {14, 3} //255		
+};
+
+const uint8_t g_kuiVlcTable_0_1[4][2] = // read 2 bits to index this table; used for g_kuiVlcTable_0[1]; entries are {index, code length} //checked no error--
+{
+	{29, 2}, {20, 2}, {15, 2}, {10, 2}	
+};
+
+const uint8_t g_kuiVlcTable_0_2[2][2] = // read 1 bit to index this table; used for g_kuiVlcTable_0[2]; entries are {index, code length} //checked no error--
+{
+	{25, 1}, {16, 1}	
+};
+
+const uint8_t g_kuiVlcTable_0_3[2][2] = // read 1 bit to index this table; used for g_kuiVlcTable_0[3]; entries are {index, code length} //checked no error--
+{
+	{11, 1}, {6, 1}	
+};
+
+const uint8_t g_kuiVlcTable_1[256][2] = // row = value of the next 8 bits, entries {index, code length}; rows 0..3 hold {0, 0} (presumably resolved through g_kuiVlcTable_1_0..g_kuiVlcTable_1_3 -- confirm) //checked no error--
+{
+	{ 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, {14, 8}, {20, 8}, {19, 8}, {10, 8}, {29, 7}, {29, 7}, {16, 7}, {16, 7}, {15, 7}, {15, 7}, { 6, 7}, { 6, 7}, //15
+	{25, 6}, {25, 6}, {25, 6}, {25, 6}, {12, 6}, {12, 6}, {12, 6}, {12, 6}, {11, 6}, {11, 6}, {11, 6}, {11, 6}, { 3, 6}, { 3, 6}, { 3, 6}, { 3, 6}, //31
+	{21, 6}, {21, 6}, {21, 6}, {21, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 7, 6}, { 7, 6}, { 7, 6}, { 7, 6}, { 1, 6}, { 1, 6}, { 1, 6}, { 1, 6}, //47
+	{17, 5}, {17, 5}, {17, 5}, {17, 5}, {17, 5}, {17, 5}, {17, 5}, {17, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, //63
+	{13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, //79
+	{ 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, //95
+	{ 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, //111
+	{ 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, //127
+	{ 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //143
+	{ 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //159
+	{ 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //175
+	{ 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, //191
+	{ 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, //207
+	{ 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, //223
+	{ 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, //239
+	{ 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2} //255
+	
+};
+
+const uint8_t g_kuiVlcTable_1_0[64][2] = // read 6 bits to index this table; used for g_kuiVlcTable_1[0]; entries are {index, code length}, rows 0..1 are {0, 0} //checked no error--
+{
+	{ 0, 0}, { 0, 0}, {57, 5}, {57, 5}, {61, 6}, {60, 6}, {59, 6}, {58, 6}, {55, 6}, {54, 6}, {56, 6}, {51, 6}, {52, 5}, {52, 5}, {50, 5}, {50, 5}, //15
+	{53, 5}, {53, 5}, {48, 5}, {48, 5}, {47, 5}, {47, 5}, {46, 5}, {46, 5}, {49, 5}, {49, 5}, {44, 5}, {44, 5}, {43, 5}, {43, 5}, {42, 5}, {42, 5}, //31
+	{38, 4}, {38, 4}, {38, 4}, {38, 4}, {40, 4}, {40, 4}, {40, 4}, {40, 4}, {39, 4}, {39, 4}, {39, 4}, {39, 4}, {34, 4}, {34, 4}, {34, 4}, {34, 4}, //47
+	{45, 4}, {45, 4}, {45, 4}, {45, 4}, {36, 4}, {36, 4}, {36, 4}, {36, 4}, {35, 4}, {35, 4}, {35, 4}, {35, 4}, {30, 4}, {30, 4}, {30, 4}, {30, 4} //63 
+};
+
+const uint8_t g_kuiVlcTable_1_1[8][2] = // read 3 bits to index this table; used for g_kuiVlcTable_1[1]; entries are {index, code length} //checked no error--
+{
+	{41, 3}, {32, 3}, {31, 3}, {26, 3}, {37, 3}, {28, 3}, {27, 3}, {22, 3}	
+};
+
+const uint8_t g_kuiVlcTable_1_2[2][2] = // read 1 bit to index this table; used for g_kuiVlcTable_1[2]; entries are {index, code length} //checked no error--
+{
+	{33, 1}, {24, 1}	
+};
+
+const uint8_t g_kuiVlcTable_1_3[2][2] = // read 1 bit to index this table; used for g_kuiVlcTable_1[3]; entries are {index, code length} //checked no error--
+{
+	{23, 1}, {18, 1}	
+};
+
+const uint8_t g_kuiVlcTable_2[256][2] = // row = value of the next 8 bits, entries {index, code length}; rows 0..7 hold {0, 0} (presumably resolved through g_kuiVlcTable_2_0..g_kuiVlcTable_2_7 -- confirm) //checked no error--
+{
+	{ 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, { 0, 0}, {45, 8}, {40, 8}, {35, 8}, {30, 8}, {41, 8}, {36, 8}, {31, 8}, {26, 8}, //15
+	{22, 7}, {22, 7}, {18, 7}, {18, 7}, {32, 7}, {32, 7}, {14, 7}, {14, 7}, {37, 7}, {37, 7}, {28, 7}, {28, 7}, {27, 7}, {27, 7}, {10, 7}, {10, 7}, //31
+	{ 6, 6}, { 6, 6}, { 6, 6}, { 6, 6}, {24, 6}, {24, 6}, {24, 6}, {24, 6}, {23, 6}, {23, 6}, {23, 6}, {23, 6}, { 3, 6}, { 3, 6}, { 3, 6}, { 3, 6}, //47
+	{33, 6}, {33, 6}, {33, 6}, {33, 6}, {20, 6}, {20, 6}, {20, 6}, {20, 6}, {19, 6}, {19, 6}, {19, 6}, {19, 6}, { 1, 6}, { 1, 6}, { 1, 6}, { 1, 6}, //63
+	{15, 5}, {15, 5}, {15, 5}, {15, 5}, {15, 5}, {15, 5}, {15, 5}, {15, 5}, {16, 5}, {16, 5}, {16, 5}, {16, 5}, {16, 5}, {16, 5}, {16, 5}, {16, 5}, //79
+	{11, 5}, {11, 5}, {11, 5}, {11, 5}, {11, 5}, {11, 5}, {11, 5}, {11, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, //95
+	{ 7, 5}, { 7, 5}, { 7, 5}, { 7, 5}, { 7, 5}, { 7, 5}, { 7, 5}, { 7, 5}, {29, 5}, {29, 5}, {29, 5}, {29, 5}, {29, 5}, {29, 5}, {29, 5}, {29, 5}, //111
+	{ 8, 5}, { 8, 5}, { 8, 5}, { 8, 5}, { 8, 5}, { 8, 5}, { 8, 5}, { 8, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, { 4, 5}, //127
+	{25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, {25, 4}, //143
+	{21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, {21, 4}, //159
+	{17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, {17, 4}, //175
+	{13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, {13, 4}, //191
+	{ 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, { 9, 4}, //207
+	{ 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, { 5, 4}, //223
+	{ 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, //239
+	{ 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4}, { 0, 4} //255
+};
+
+const uint8_t g_kuiVlcTable_2_0[4][2] = // read 2 bits to index this table; used for g_kuiVlcTable_2[0]; entries are {index, code length}, row 0 is {0, 0} //checked
+{
+	{0, 0}, {58, 2}, {61, 2}, {60, 2}	
+};
+
+
+const uint8_t g_kuiVlcTable_2_1[4][2] = // read 2 bits to index this table; used for g_kuiVlcTable_2[1]; entries are {index, code length} //checked
+{
+	{59, 2}, {54, 2}, {57, 2}, {56, 2}	
+};
+
+const uint8_t g_kuiVlcTable_2_2[4][2] = // read 2 bits to index this table; used for g_kuiVlcTable_2[2]; entries are {index, code length} //checked
+{
+	{55, 2}, {50, 2}, {53, 2}, {52, 2}	
+};
+
+const uint8_t g_kuiVlcTable_2_3[4][2] = // read 2 bits to index this table; used for g_kuiVlcTable_2[3]; entries are {index, code length}; rows 2..3 share one 1-bit code, so their length is 1 //checked
+{
+	{51, 2}, {46, 2}, {47, 1}, {47, 1}	
+};
+
+const uint8_t g_kuiVlcTable_2_4[2][2] = // read 1 bit to index this table; used for g_kuiVlcTable_2[4]; entries are {index, code length} //checked
+{
+	{42, 1}, {48, 1}	
+};
+
+const uint8_t g_kuiVlcTable_2_5[2][2] = // read 1 bit to index this table; used for g_kuiVlcTable_2[5]; entries are {index, code length} //checked
+{
+	{43, 1}, {38, 1}	
+};
+
+const uint8_t g_kuiVlcTable_2_6[2][2] = // read 1 bit to index this table; used for g_kuiVlcTable_2[6]; entries are {index, code length} //checked no error--
+{
+	{49, 1}, {44, 1}	
+};
+
+const uint8_t g_kuiVlcTable_2_7[2][2] = // read 1 bit to index this table; used for g_kuiVlcTable_2[7]; entries are {index, code length} //checked no error--
+{
+	{39, 1}, {34, 1}	
+};
+
+const uint8_t g_kuiVlcTable_3[64][2] = // read 6 bits to index this table; every populated entry has code length 6; rows 2 and 7 hold {0, 0} (presumably unassigned codes -- confirm) //corrected
+{
+	{ 1, 6}, { 2, 6}, { 0, 0}, { 0, 6}, { 3, 6}, { 4, 6}, { 5, 6}, { 0, 0}, { 6, 6}, { 7, 6}, { 8, 6}, { 9, 6}, {10, 6}, {11, 6}, {12, 6}, {13, 6}, //15 
+	{14, 6}, {15, 6}, {16, 6}, {17, 6}, {18, 6}, {19, 6}, {20, 6}, {21, 6}, {22, 6}, {23, 6}, {24, 6}, {25, 6}, {26, 6}, {27, 6}, {28, 6}, {29, 6}, //31 
+	{30, 6}, {31, 6}, {32, 6}, {33, 6}, {34, 6}, {35, 6}, {36, 6}, {37, 6}, {38, 6}, {39, 6}, {40, 6}, {41, 6}, {42, 6}, {43, 6}, {44, 6}, {45, 6}, //47
+	{46, 6}, {47, 6}, {48, 6}, {49, 6}, {50, 6}, {51, 6}, {52, 6}, {53, 6}, {54, 6}, {55, 6}, {56, 6}, {57, 6}, {58, 6}, {59, 6}, {60, 6}, {61, 6}, //63
+};
+
+
+const uint8_t g_kuiVlcTableNeedMoreBitsThread[3] = // values equal the number of leading {0, 0} rows in g_kuiVlcTable_0/_1/_2; NOTE(review): "Thread" reads as "threshold" (8-bit values below it need more bits) -- confirm
+{
+	4, 4, 8
+};
+
+const uint8_t g_kuiVlcTableMoreBitsCount0[4] = // matches the "read N bits" widths of g_kuiVlcTable_0_0..g_kuiVlcTable_0_3, in order
+{
+	8, 2, 1, 1
+};
+
+const uint8_t g_kuiVlcTableMoreBitsCount1[4] = // matches the "read N bits" widths of g_kuiVlcTable_1_0..g_kuiVlcTable_1_3, in order
+{
+	6, 3, 1, 1
+};
+
+const uint8_t g_kuiVlcTableMoreBitsCount2[8] = // matches the "read N bits" widths of g_kuiVlcTable_2_0..g_kuiVlcTable_2_7, in order
+{
+	2, 2, 2, 2, 1, 1, 1, 1
+};
+
+const uint8_t g_kuiNcMapTable[17] = // maps an input 0..16 to 0..3: 0-1 -> 0, 2-3 -> 1, 4-7 -> 2, 8-16 -> 3 (presumably nC -> which g_kuiVlcTable_N to use -- confirm)
+{
+	0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3
+};
+
+
+const uint8_t g_kuiVlcTrailingOneTotalCoeffTable[62][2] = // all pairs {a, b} with b in 0..16 and a in 0..min(3, b), ordered by b then a; going by the name, a = trailing ones and b = total coeff
+{
+	{0, 0}, 
+	{0, 1}, {1, 1}, 
+	{0, 2}, {1, 2}, {2, 2}, 
+	{0, 3}, {1, 3}, {2, 3}, {3, 3},
+	{0, 4}, {1, 4}, {2, 4}, {3, 4},
+	{0, 5}, {1, 5}, {2, 5}, {3, 5},
+	{0, 6}, {1, 6}, {2, 6}, {3, 6},
+	{0, 7}, {1, 7}, {2, 7}, {3, 7},
+	{0, 8}, {1, 8}, {2, 8}, {3, 8},
+	{0, 9}, {1, 9}, {2, 9}, {3, 9},
+	{0, 10}, {1, 10}, {2, 10}, {3, 10},
+	{0, 11}, {1, 11}, {2, 11}, {3, 11},
+	{0, 12}, {1, 12}, {2, 12}, {3, 12},
+	{0, 13}, {1, 13}, {2, 13}, {3, 13},
+	{0, 14}, {1, 14}, {2, 14}, {3, 14},
+	{0, 15}, {1, 15}, {2, 15}, {3, 15},
+	{0, 16}, {1, 16}, {2, 16}, {3, 16}  
+};
+
+const uint8_t g_kuiTotalZerosTable0[512][2] = //read 9 bits to index; entries are {total_zeros, code length}, row 0 is {0, 0}; generated by tzVlcIndex=1 in Table 9-7 in H.264/AVC standard
+{
+	{0, 0}, {15, 9}, {14, 9}, {13, 9}, {12, 8}, {12, 8}, {11, 8}, {11, 8}, {10, 7}, {10, 7}, {10, 7}, {10, 7}, {9, 7}, {9, 7}, {9, 7}, {9, 7}, //15
+	{8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 8, 6}, { 7, 6}, { 7, 6}, { 7, 6}, { 7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, //31
+	{6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, { 6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, //47
+	{5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, { 5, 5}, {5, 5}, {5, 5}, {5, 5}, {5, 5}, //63
+	{4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, {4, 4}, {4, 4}, {4, 4}, {4, 4}, //79
+	{4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, {4, 4}, {4, 4}, {4, 4}, {4, 4}, //95
+	{3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, {3, 4}, {3, 4}, {3, 4}, {3, 4}, //111
+	{3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, { 3, 4}, {3, 4}, {3, 4}, {3, 4}, {3, 4}, //127
+	{2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, //143
+	{2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, //159
+	{2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, //175
+	{2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, //191
+	{1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, //207
+	{1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, //223
+	{1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, //239
+	{1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, //255
+	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //271
+	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //287
+	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //303
+	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //319
+	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //335
+	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //351
+	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //367
+	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //383
+	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //399
+	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //415
+	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //431
+	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //447
+	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //463
+	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //479
+	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, //495
+	{0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1} //511
+};
+
+// total_zeros lookup for tzVlcIndex=2: 64 entries, indexed by the next 6 bits of the bitstream.
+// Each entry looks like {decoded total_zeros, code length in bits}; an entry is repeated for
+// every index sharing the same code prefix -- TODO confirm field order against the CAVLC reader.
+const uint8_t g_kuiTotalZerosTable1[64][2] = //read 6 bits, generated by tzVlcIndex=2 in Table 9-7 in H.264/AVC standard
+{
+	{14, 6}, {13, 6}, {12, 6}, {11, 6}, {10, 5}, {10, 5}, {9, 5}, {9, 5}, {8, 4}, {8, 4}, {8, 4}, {8, 4}, {7, 4}, {7, 4}, {7, 4}, {7, 4}, //15
+	{ 6, 4}, { 6, 4}, { 6, 4}, { 6, 4}, { 5, 4}, { 5, 4}, {5, 4}, {5, 4}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, //31
+	{ 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, {3, 3}, {3, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, {2, 3}, //47
+	{ 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, { 1, 3}, {1, 3}, {1, 3}, {0, 3}, {0, 3}, {0, 3}, {0, 3}, {0, 3}, {0, 3}, {0, 3}, {0, 3} //63
+};
+
+// total_zeros lookup for tzVlcIndex=3: 64 entries, indexed by the next 6 bits of the bitstream.
+// Each entry looks like {decoded total_zeros, code length in bits} -- TODO confirm field order
+// against the CAVLC reader.
+const uint8_t g_kuiTotalZerosTable2[64][2] = //read 6 bits, generated by tzVlcIndex=3 in Table 9-7 in H.264/AVC standard
+{
+	{13, 6}, {11, 6}, {12, 5}, {12, 5}, {10, 5}, {10, 5}, {9, 5}, {9, 5}, {8, 4}, {8, 4}, {8, 4}, {8, 4}, {5, 4}, {5, 4}, {5, 4}, {5, 4}, //15
+	{ 4, 4}, { 4, 4}, { 4, 4}, { 4, 4}, { 0, 4}, { 0, 4}, {0, 4}, {0, 4}, {7, 3}, {7, 3}, {7, 3}, {7, 3}, {7, 3}, {7, 3}, {7, 3}, {7, 3}, //31
+	{ 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, {6, 3}, {6, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3}, //47
+	{ 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, {2, 3}, {2, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3} //63
+};
+
+// total_zeros lookup for tzVlcIndex=4: 32 entries, indexed by the next 5 bits of the bitstream.
+// Each entry looks like {decoded total_zeros, code length in bits} -- TODO confirm field order
+// against the CAVLC reader.
+const uint8_t g_kuiTotalZerosTable3[32][2] = //read 5 bits, generated by tzVlcIndex=4 in Table 9-7 in H.264/AVC standard
+{
+	{12, 5}, {11, 5}, {10, 5}, {0, 5}, {9, 4}, {9, 4}, {7, 4}, {7, 4}, {3, 4}, {3, 4}, {2, 4}, {2, 4}, {8, 3}, {8, 3}, {8, 3}, {8, 3}, //15
+	{ 6, 3}, { 6, 3}, { 6, 3}, {6, 3}, {5, 3}, {5, 3}, {5, 3}, {5, 3}, {4, 3}, {4, 3}, {4, 3}, {4, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3}, //31
+};
+ 
+// total_zeros lookup for tzVlcIndex=5: 32 entries, indexed by the next 5 bits of the bitstream.
+// Each entry looks like {decoded total_zeros, code length in bits} -- TODO confirm field order
+// against the CAVLC reader.
+const uint8_t g_kuiTotalZerosTable4[32][2] = //read 5 bits, generated by tzVlcIndex=5 in Table 9-7 in H.264/AVC standard
+{
+	{11, 5}, { 9, 5}, {10, 4}, {10, 4}, { 8, 4}, { 8, 4}, { 2, 4}, { 2, 4}, { 1, 4}, { 1, 4}, { 0, 4}, { 0, 4}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, //15
+	{ 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3} //31
+};
+
+// total_zeros lookup for tzVlcIndex=6: 64 entries, indexed by the next 6 bits of the bitstream.
+// Each entry looks like {decoded total_zeros, code length in bits} -- TODO confirm field order
+// against the CAVLC reader.
+const uint8_t g_kuiTotalZerosTable5[64][2] = //read 6 bits, generated by tzVlcIndex=6 in Table 9-7 in H.264/AVC standard
+{
+	{10, 6}, { 0, 6}, { 1, 5}, { 1, 5}, { 8, 4}, { 8, 4}, { 8, 4}, { 8, 4}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, //15
+	{ 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, //31
+	{ 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, //47
+	{ 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3} //63
+};
+
+// total_zeros lookup for tzVlcIndex=7: 64 entries, indexed by the next 6 bits of the bitstream.
+// Each entry looks like {decoded total_zeros, code length in bits} -- TODO confirm field order
+// against the CAVLC reader.
+const uint8_t g_kuiTotalZerosTable6[64][2] = //read 6 bits, generated by tzVlcIndex=7 in Table 9-7 in H.264/AVC standard
+{
+	{ 9, 6}, { 0, 6}, { 1, 5}, { 1, 5}, { 7, 4}, { 7, 4}, { 7, 4}, { 7, 4}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, //15
+	{ 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, { 4, 3}, //31
+	{ 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, //47
+	{ 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2} //63
+};
+
+// total_zeros lookup for tzVlcIndex=8: 64 entries, indexed by the next 6 bits of the bitstream.
+// Each entry looks like {decoded total_zeros, code length in bits} -- TODO confirm field order
+// against the CAVLC reader.
+const uint8_t g_kuiTotalZerosTable7[64][2] = //read 6 bits, generated by tzVlcIndex=8 in Table 9-7 in H.264/AVC standard
+{
+	{ 8, 6}, { 0, 6}, { 2, 5}, { 2, 5}, { 1, 4}, { 1, 4}, { 1, 4}, { 1, 4}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, { 7, 3}, //15
+	{ 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 6, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, { 3, 3}, //31
+	{ 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, //47
+	{ 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2} //63
+};
+
+// total_zeros lookup for tzVlcIndex=9: 64 entries, indexed by the next 6 bits of the bitstream.
+// Each entry looks like {decoded total_zeros, code length in bits} -- TODO confirm field order
+// against the CAVLC reader.
+const uint8_t g_kuiTotalZerosTable8[64][2] = //read 6 bits, generated by tzVlcIndex=9 in Table 9-7 in H.264/AVC standard
+{
+	{ 1, 6}, { 0, 6}, { 7, 5}, { 7, 5}, { 2, 4}, { 2, 4}, { 2, 4}, { 2, 4}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, { 5, 3}, //15
+	{ 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, //31
+	{ 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, //47
+	{ 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2} //63
+};
+
+// total_zeros lookup for tzVlcIndex=10: 32 entries, indexed by the next 5 bits of the bitstream.
+// Each entry looks like {decoded total_zeros, code length in bits} -- TODO confirm field order
+// against the CAVLC reader.
+const uint8_t g_kuiTotalZerosTable9[32][2] = //read 5 bits, generated by tzVlcIndex=10 in Table 9-7 in H.264/AVC standard
+{
+	{ 1, 5}, { 0, 5}, { 6, 4}, { 6, 4}, { 2, 3}, { 2, 3}, { 2, 3}, { 2, 3}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, { 5, 2}, //15
+	{ 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 4, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2}, { 3, 2} //31
+};
+
+// total_zeros lookup for tzVlcIndex=11: 16 entries, indexed by the next 4 bits of the bitstream.
+// Each entry looks like {decoded total_zeros, code length in bits} -- TODO confirm field order
+// against the CAVLC reader.
+const uint8_t g_kuiTotalZerosTable10[16][2] = //read 4 bits, generated by tzVlcIndex=11 in Table 9-7 in H.264/AVC standard
+{
+	{ 0, 4}, { 1, 4}, { 2, 3}, { 2, 3}, { 3, 3}, { 3, 3}, { 5, 3}, { 5, 3}, { 4, 1}, { 4, 1}, { 4, 1}, { 4, 1}, { 4, 1}, { 4, 1}, { 4, 1}, { 4, 1} //15
+};
+
+// total_zeros lookup for tzVlcIndex=12: 16 entries, indexed by the next 4 bits of the bitstream.
+// Each entry looks like {decoded total_zeros, code length in bits} -- TODO confirm field order
+// against the CAVLC reader.
+const uint8_t g_kuiTotalZerosTable11[16][2] = //read 4 bits, generated by tzVlcIndex=12 in Table 9-7 in H.264/AVC standard
+{
+	{ 0, 4}, { 1, 4}, { 4, 3}, { 4, 3}, { 2, 2}, { 2, 2}, { 2, 2}, { 2, 2}, { 3, 1}, { 3, 1}, { 3, 1}, { 3, 1}, { 3, 1}, { 3, 1}, { 3, 1}, { 3, 1} //15
+};
+
+// total_zeros lookup for tzVlcIndex=13: 8 entries, indexed by the next 3 bits of the bitstream.
+// Each entry looks like {decoded total_zeros, code length in bits} -- TODO confirm field order
+// against the CAVLC reader.
+const uint8_t g_kuiTotalZerosTable12[8][2] = //read 3 bits, generated by tzVlcIndex=13 in Table 9-7 in H.264/AVC standard
+{
+	{ 0, 3}, { 1, 3}, { 3, 2}, { 3, 2}, { 2, 1}, { 2, 1}, { 2, 1}, { 2, 1} //7
+};
+
+// total_zeros lookup for tzVlcIndex=14: 4 entries, indexed by the next 2 bits of the bitstream.
+// Each entry looks like {decoded total_zeros, code length in bits} -- TODO confirm field order
+// against the CAVLC reader.
+const uint8_t g_kuiTotalZerosTable13[4][2] = //read 2 bits, generated by tzVlcIndex=14 in Table 9-7 in H.264/AVC standard
+{
+	{ 0, 2}, { 1, 2}, { 2, 1}, { 2, 1}
+};
+
+// total_zeros lookup for tzVlcIndex=15: 2 entries, indexed by the next single bit of the bitstream.
+// Each entry looks like {decoded total_zeros, code length in bits} -- TODO confirm field order
+// against the CAVLC reader.
+const uint8_t g_kuiTotalZerosTable14[2][2] = //read 1 bit, generated by tzVlcIndex=15 in Table 9-7 in H.264/AVC standard
+{
+	{ 0, 1}, { 1, 1} 
+};
+
+// Number of bits to read for each total_zeros table: element k matches the "read N bits"
+// annotation (and log2 of the entry count) of g_kuiTotalZerosTable<k>, k = 0..14.
+const uint8_t g_kuiTotalZerosBitNumMap[15] = 
+{
+	9, 6, 6, 5, 5, 6, 6, 6, 6, 5, 4, 4, 3, 2, 1	
+};
+
+
+// Chroma DC total_zeros lookup for tzVlcIndex=1: 8 entries, indexed by the next 3 bits.
+// Each entry looks like {decoded total_zeros, code length in bits} -- TODO confirm field order
+// against the CAVLC reader.
+const uint8_t g_kuiTotalZerosChromaTable0[8][2] = //read 3 bits, generated by tzVlcIndex=1 in Table 9-9(a) in H.264/AVC standard
+{
+	{ 3, 3}, { 2, 3}, { 1, 2}, { 1, 2}, { 0, 1}, { 0, 1}, { 0, 1}, { 0, 1}
+};
+
+// Chroma DC total_zeros lookup for tzVlcIndex=2: 4 entries, indexed by the next 2 bits.
+// Each entry looks like {decoded total_zeros, code length in bits} -- TODO confirm field order
+// against the CAVLC reader.
+const uint8_t g_kuiTotalZerosChromaTable1[4][2] = //read 2 bits, generated by tzVlcIndex=2 in Table 9-9(a) in H.264/AVC standard
+{
+	{ 2, 2}, { 1, 2}, { 0, 1}, { 0, 1}
+};
+
+// Chroma DC total_zeros lookup for tzVlcIndex=3: 2 entries, indexed by the next single bit.
+// Each entry looks like {decoded total_zeros, code length in bits} -- TODO confirm field order
+// against the CAVLC reader.
+const uint8_t g_kuiTotalZerosChromaTable2[2][2] = //read 1 bit, generated by tzVlcIndex=3 in Table 9-9(a) in H.264/AVC standard
+{
+	{ 1, 1}, { 0, 1}
+};
+
+// Number of bits to read for each chroma total_zeros table: element k matches the
+// "read N bits" annotation of g_kuiTotalZerosChromaTable<k>, k = 0..2.
+const uint8_t g_kuiTotalZerosBitNumChromaMap[3] = 
+{
+	3, 2, 1
+};
+
+// Lookup indexed by the next single bit of the bitstream; entries look like
+// {decoded value, code length in bits}. Presumably the run_before codes of H.264
+// Table 9-10 for zerosLeft=1 -- TODO confirm against the caller.
+const uint8_t g_kuiZeroLeftTable0[2][2] = //read 1 bit
+{
+	{1, 1}, {0, 1}
+};
+
+// Lookup indexed by the next 2 bits of the bitstream; entries look like
+// {decoded value, code length in bits}. Presumably the run_before codes of H.264
+// Table 9-10 for zerosLeft=2 -- TODO confirm against the caller.
+const uint8_t g_kuiZeroLeftTable1[4][2] = //read 2 bits
+{
+	{2, 2}, {1, 2}, {0, 1}, {0, 1}
+};
+
+// Lookup indexed by the next 2 bits of the bitstream; entries look like
+// {decoded value, code length in bits}. Presumably the run_before codes of H.264
+// Table 9-10 for zerosLeft=3 -- TODO confirm against the caller.
+const uint8_t g_kuiZeroLeftTable2[4][2] = //read 2 bits
+{
+	{3, 2}, {2, 2}, {1, 2}, {0, 2}
+};
+
+// Lookup indexed by the next 3 bits of the bitstream; entries look like
+// {decoded value, code length in bits}. Presumably the run_before codes of H.264
+// Table 9-10 for zerosLeft=4 -- TODO confirm against the caller.
+const uint8_t g_kuiZeroLeftTable3[8][2] = //read 3 bits
+{
+	{4, 3}, {3, 3}, {2, 2}, {2, 2}, {1, 2}, {1, 2}, {0, 2}, {0, 2}
+};
+
+// Lookup indexed by the next 3 bits of the bitstream; entries look like
+// {decoded value, code length in bits}. Presumably the run_before codes of H.264
+// Table 9-10 for zerosLeft=5 -- TODO confirm against the caller.
+const uint8_t g_kuiZeroLeftTable4[8][2] = //read 3 bits
+{
+	{5, 3}, {4, 3}, {3, 3}, {2, 3}, {1, 2}, {1, 2}, {0, 2}, {0, 2}
+};
+
+// Lookup indexed by the next 3 bits of the bitstream; entries look like
+// {decoded value, code length in bits}. Presumably the run_before codes of H.264
+// Table 9-10 for zerosLeft=6 (note the values are not monotonic in the index) --
+// TODO confirm against the caller.
+const uint8_t g_kuiZeroLeftTable5[8][2] = //read 3 bits
+{
+	{1, 3}, {2, 3}, {4, 3}, {3, 3}, {6, 3}, {5, 3}, {0, 2}, {0, 2}
+};
+
+// Lookup indexed by the next 3 bits of the bitstream; entries look like
+// {decoded value, code length in bits}. Presumably covers the 3-bit prefix of the
+// run_before codes of H.264 Table 9-10 for zerosLeft>6 -- TODO confirm against the caller,
+// including how values above 6 (longer codes) are handled.
+const uint8_t g_kuiZeroLeftTable6[8][2] = //read 3 bits
+{
+	{7, 3}, {6, 3}, {5, 3}, {4, 3}, {3, 3}, {2, 3}, {1, 3}, {0, 3}
+};
+
+// Number of bits to read per zero-left table selection. Elements 1..7 equal the "read N bits"
+// annotations of g_kuiZeroLeftTable0..6 in order; element 0 is 0 and elements 8..15 are 3.
+// Presumably indexed directly by the zerosLeft count -- TODO confirm against the caller.
+const uint8_t g_kuiZeroLeftBitNumMap[16] = 
+{
+	0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3	
+};
+
+} // namespace WelsDec
--- /dev/null
+++ b/codec/decoder/core/src/expand_pic.cpp
@@ -1,0 +1,165 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <string.h>
+
+#include "typedefs.h"
+#include "expand_pic.h"
+#include "cpu_core.h"
+
+namespace WelsDec {
+
+// Rewritten as separate luma and chroma routines, which makes MMX/SSE2 optimization easier (9/27/2009)
+/*!
+ * \brief	Replicate the border samples of a luma plane into the padding area around it
+ *
+ * \param	pDst		top-left sample of the picture; padding is written outside of it
+ * \param	kiStride	distance in bytes between two consecutive rows
+ * \param	kiPicWidth	number of samples (bytes) per picture row
+ * \param	kiPicHeight	number of picture rows
+ *
+ * \return	NONE
+ */
+static inline void_t ExpandPictureLuma_c( uint8_t *pDst, const int32_t kiStride, const int32_t kiPicWidth, const int32_t kiPicHeight )
+{
+	const int32_t kiPad		= PADDING_LENGTH;
+	uint8_t *pFirstRow		= pDst;
+	uint8_t *pLastRow		= pDst + (kiPicHeight-1) * kiStride;
+	// the four corner samples are captured once, before any padding is written
+	const uint8_t kuiCornerTL	= pFirstRow[0];
+	const uint8_t kuiCornerTR	= pFirstRow[kiPicWidth-1];
+	const uint8_t kuiCornerBL	= pLastRow[0];
+	const uint8_t kuiCornerBR	= pLastRow[kiPicWidth-1];
+	uint8_t *pAbove			= pFirstRow;	// walks upwards, one row per iteration
+	uint8_t *pBelow			= pLastRow;		// walks downwards, one row per iteration
+	uint8_t *pRow			= pFirstRow;
+	int32_t iLine			= 0;
+
+	// rows above and below the picture, together with the four corner areas
+	do {
+		pAbove -= kiStride;
+		pBelow += kiStride;
+
+		memcpy(pAbove, pFirstRow, kiPicWidth);
+		memcpy(pBelow, pLastRow, kiPicWidth);
+
+		memset(pAbove-kiPad,      kuiCornerTL, kiPad);
+		memset(pAbove+kiPicWidth, kuiCornerTR, kiPad);
+		memset(pBelow-kiPad,      kuiCornerBL, kiPad);
+		memset(pBelow+kiPicWidth, kuiCornerBR, kiPad);
+	} while( ++iLine < kiPad );
+
+	// left and right margins of every picture row
+	iLine = 0;
+	do {
+		memset(pRow-kiPad, pRow[0], kiPad);
+		memset(pRow+kiPicWidth, pRow[kiPicWidth-1], kiPad);
+		pRow += kiStride;
+	} while( ++iLine < kiPicHeight );
+}
+
+/*!
+ * \brief	Replicate the border samples of a chroma plane into the padding area around it
+ *			(the padding width is half of PADDING_LENGTH)
+ *
+ * \param	pDst		top-left sample of the plane; padding is written outside of it
+ * \param	kiStride	distance in bytes between two consecutive rows
+ * \param	kiPicWidth	number of samples (bytes) per plane row
+ * \param	kiPicHeight	number of plane rows
+ *
+ * \return	NONE
+ */
+static inline void_t ExpandPictureChroma_c( uint8_t *pDst, const int32_t kiStride, const int32_t kiPicWidth, const int32_t kiPicHeight )
+{
+	const int32_t kiPad		= (PADDING_LENGTH>>1);
+	uint8_t *pFirstRow		= pDst;
+	uint8_t *pLastRow		= pDst + (kiPicHeight-1) * kiStride;
+	// the four corner samples are captured once, before any padding is written
+	const uint8_t kuiCornerTL	= pFirstRow[0];
+	const uint8_t kuiCornerTR	= pFirstRow[kiPicWidth-1];
+	const uint8_t kuiCornerBL	= pLastRow[0];
+	const uint8_t kuiCornerBR	= pLastRow[kiPicWidth-1];
+	uint8_t *pAbove			= pFirstRow;	// walks upwards, one row per iteration
+	uint8_t *pBelow			= pLastRow;		// walks downwards, one row per iteration
+	uint8_t *pRow			= pFirstRow;
+	int32_t iLine			= 0;
+
+	// rows above and below the plane, together with the four corner areas
+	do {
+		pAbove -= kiStride;
+		pBelow += kiStride;
+
+		memcpy(pAbove, pFirstRow, kiPicWidth);
+		memcpy(pBelow, pLastRow, kiPicWidth);
+
+		memset(pAbove-kiPad,      kuiCornerTL, kiPad);
+		memset(pAbove+kiPicWidth, kuiCornerTR, kiPad);
+		memset(pBelow-kiPad,      kuiCornerBL, kiPad);
+		memset(pBelow+kiPicWidth, kuiCornerBR, kiPad);
+	} while( ++iLine < kiPad );
+
+	// left and right margins of every plane row
+	iLine = 0;
+	do {
+		memset(pRow-kiPad, pRow[0], kiPad);
+		memset(pRow+kiPicWidth, pRow[kiPicWidth-1], kiPad);
+		pRow += kiStride;
+	} while( ++iLine < kiPicHeight );
+}
+
+/*!
+ * \brief	Fill the picture-expanding function table
+ *
+ * \param	pExpandPicFunc	function table to be filled
+ * \param	kuiCpuFlags		CPU feature flags; only consulted when X86_ASM is defined
+ *
+ * \return	NONE
+ */
+void_t InitExpandPictureFunc( SExpandPicFunc *pExpandPicFunc, const uint32_t kuiCpuFlags )
+{
+	// start from the plain C routines; slot [0] and slot [1] of the chroma pair get the same one
+	pExpandPicFunc->pExpandChromaPicture[1] = ExpandPictureChroma_c;
+	pExpandPicFunc->pExpandChromaPicture[0] = ExpandPictureChroma_c;
+	pExpandPicFunc->pExpandLumaPicture	= ExpandPictureLuma_c;
+
+#if defined(X86_ASM)
+	// override all three entries when every bit of WELS_CPU_SSE2 is set in the flags
+	if ( WELS_CPU_SSE2 == (kuiCpuFlags & WELS_CPU_SSE2) )
+	{
+		pExpandPicFunc->pExpandChromaPicture[1] = ExpandPictureChromaAlign_sse2;
+		pExpandPicFunc->pExpandChromaPicture[0] = ExpandPictureChromaUnalign_sse2;
+		pExpandPicFunc->pExpandLumaPicture	   = ExpandPictureLuma_sse2;
+	}
+#endif//X86_ASM
+}
+
+/*!
+ * \brief	Expand (pad) the luma and both chroma planes of a picture
+ *
+ * \param	pPic		picture whose three planes are padded
+ * \param	pExpLuma	routine used for the luma plane
+ * \param	pExpChroma	chroma routines: [1] is used when the chroma width is a multiple of 16,
+ *						[0] otherwise; chroma widths below 16 always use the C routine
+ *
+ * \return	NONE
+ */
+void_t ExpandReferencingPicture(PPicture pPic, PExpandPictureFunc pExpLuma, PExpandPictureFunc pExpChroma[2])
+{
+	uint8_t *pLuma	= pPic->pData[0];
+	uint8_t *pCb	= pPic->pData[1];
+	uint8_t *pCr	= pPic->pData[2];
+	const int32_t kiLumaW	= pPic->iWidthInPixel;
+	const int32_t kiLumaH	= pPic->iHeightInPixel;
+	const int32_t kiChromaW	= kiLumaW >> 1;
+	const int32_t kiChromaH	= kiLumaH >> 1;
+
+	pExpLuma(pLuma, pPic->iLinesize[0], kiLumaW, kiLumaH);
+
+	if ( kiChromaW < 16 )
+	{
+		// narrow chroma planes bypass the supplied routines (16x16 coded picture fix, 7/27/2010)
+		ExpandPictureChroma_c(pCb, pPic->iLinesize[1], kiChromaW, kiChromaH);
+		ExpandPictureChroma_c(pCr, pPic->iLinesize[2], kiChromaW, kiChromaH);
+		return;
+	}
+
+	// select slot 1 when the chroma width is a multiple of 16, slot 0 otherwise
+	const int32_t kiSlot = ((kiChromaW & 0x0F) == 0) ? 1 : 0;
+	pExpChroma[kiSlot](pCb, pPic->iLinesize[1], kiChromaW, kiChromaH);
+	pExpChroma[kiSlot](pCr, pPic->iLinesize[2], kiChromaW, kiChromaH);
+}
+
+} // namespace WelsDec
--- /dev/null
+++ b/codec/decoder/core/src/fmo.cpp
@@ -1,0 +1,355 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	fmo.cpp
+ *
+ * \brief	Flexible Macroblock Ordering implementation
+ *
+ * \date	2/4/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+
+#include "fmo.h" 
+#include "macros.h"
+#include "utils.h"
+#include "mem_align.h"
+
+namespace WelsDec {
+
+/*!
+ * \brief	Generate MB allocated map for interleaved slice group (TYPE 0)
+ *
+ *			Slice groups take turns: group 0 gets uiRunLength[0] consecutive MBs, group 1 gets
+ *			uiRunLength[1], and so on; after the last group the cycle restarts at group 0
+ *			until the whole map is filled.
+ *
+ * \param	pFmo	fmo context; pMbAllocMap and iCountMbNum must already be set up
+ * \param	pPps	pps context supplying uiNumSliceGroups and uiRunLength[]
+ *
+ * \return	0 - successful; non-zero - failed (invalid argument or non-positive run length)
+ */
+static inline int32_t FmoGenerateMbAllocMapType0( PFmo pFmo, PPps pPps )
+{
+	uint32_t uiNumSliceGroups = 0;
+	int32_t iMbNum = 0;
+	int32_t i = 0;
+
+	WELS_VERIFY_RETURN_IF( 1, ( NULL == pFmo || NULL == pPps ) )		
+	uiNumSliceGroups = pPps->uiNumSliceGroups;
+	iMbNum = pFmo->iCountMbNum;
+	WELS_VERIFY_RETURN_IF( 1, ( NULL == pFmo->pMbAllocMap || iMbNum <= 0 || uiNumSliceGroups >= MAX_SLICEGROUP_IDS ) )
+	
+	do
+	{
+		uint8_t uiGroup = 0;
+		do {
+			const int32_t kiRunLen	= (int32_t)pPps->uiRunLength[uiGroup];
+			int32_t iFill			= iMbNum - i;	// room left in the map, always > 0 here
+
+			// A run of zero would never advance i (endless loop), and a value that converts
+			// to a negative int32_t would move i backwards and write in front of the map.
+			if ( kiRunLen <= 0 )
+				return 1;
+
+			if ( kiRunLen < iFill )
+				iFill = kiRunLen;
+
+			memset( pFmo->pMbAllocMap + i, uiGroup, iFill );
+			// advance by the clamped count so that i can neither overflow nor pass iMbNum
+			i += iFill;
+			++ uiGroup;
+		} while(uiGroup < uiNumSliceGroups && i < iMbNum);
+	}while(i < iMbNum);
+	
+	return 0; // well here
+}
+
+/*!
+ * \brief	Generate MB allocated map for dispersed slice group (TYPE 1)	
+ *
+ *			Each MB i is assigned to group
+ *			((i % width) + (((i / width) * numGroups) >> 1)) % numGroups.
+ *
+ * \param	pFmo		fmo context; pMbAllocMap and iCountMbNum must already be set up
+ * \param	pPps		pps context supplying uiNumSliceGroups
+ * \param	kiMbWidth	picture width in MBs, must be positive
+ *
+ * \return	0 - successful; non-zero - failed
+ */
+static inline int32_t FmoGenerateMbAllocMapType1( PFmo pFmo, PPps pPps, const int32_t kiMbWidth )
+{
+	uint32_t uiNumSliceGroups = 0;
+	int32_t iMbNum = 0;
+	// 32-bit index: a 16-bit one overflows once the picture holds more than 32767 MBs
+	int32_t i = 0;
+	WELS_VERIFY_RETURN_IF( 1, ( NULL == pFmo || NULL == pPps ) )
+	uiNumSliceGroups = pPps->uiNumSliceGroups;
+	iMbNum			 = pFmo->iCountMbNum;
+	// uiNumSliceGroups is used as a modulus below, so zero has to be rejected as well
+	WELS_VERIFY_RETURN_IF( 1, ( NULL == pFmo->pMbAllocMap || iMbNum <= 0 || kiMbWidth <= 0 || 0 == uiNumSliceGroups || uiNumSliceGroups >= MAX_SLICEGROUP_IDS ) )
+		
+	for ( i = 0; i < iMbNum; ++ i )
+	{
+		pFmo->pMbAllocMap[i] = (uint8_t)(((i % kiMbWidth)+(((i / kiMbWidth)*uiNumSliceGroups)>>1)) % uiNumSliceGroups);
+	}
+	
+	return 0; // well here
+}
+
+/*!
+ * \brief	Generate MB allocated map for various type of slice group cases (TYPE 0, .., 6)
+ *
+ *			The map buffer is released and reallocated on every call, so its content is
+ *			rebuilt on every call too. Only map types 0 and 1 are implemented; types 2..6
+ *			and unknown types fail.
+ *
+ * \param	pFmo		fmo context 
+ * \param	kpPps		pps context
+ * \param	kiMbWidth	MB width
+ * \param	kiMbHeight	MB height
+ *
+ * \return	0 - successful; non-zero - failed	
+ */
+static inline int32_t FmoGenerateSliceGroup( PFmo pFmo, const PPps kpPps, const int32_t kiMbWidth, const int32_t kiMbHeight )
+{
+	int32_t iNumMb	= 0;
+	int32_t iErr	= 0;
+
+	// the cases we would not like
+	WELS_VERIFY_RETURN_IF( 1, ( NULL == pFmo || NULL == kpPps ) )
+
+	iNumMb = kiMbWidth * kiMbHeight;
+
+	// a negative count must not reach the allocator either
+	if ( iNumMb <= 0 )
+		return 1;
+
+	WelsFree(pFmo->pMbAllocMap, "_fmo->pMbAllocMap");
+	pFmo->pMbAllocMap	= (uint8_t *)WelsMalloc( iNumMb * sizeof(uint8_t), "_fmo->pMbAllocMap" );
+	WELS_VERIFY_RETURN_IF( 1, (NULL == pFmo->pMbAllocMap) )	// out of memory
+
+	// Start from an all-zero map so that no path, including the failing ones below,
+	// leaves uninitialized bytes behind for FmoMbToSliceGroup()/FmoNextMb() to read.
+	memset( pFmo->pMbAllocMap, 0, iNumMb * sizeof(uint8_t) );
+
+	pFmo->iCountMbNum	= iNumMb;
+
+	if ( kpPps->uiNumSliceGroups < 2 ) // only one slice group, exactly it is single slice based
+	{
+		pFmo->iSliceGroupCount	= 1;
+
+		return 0;
+	}
+
+	// The buffer above is brand new, so the map is regenerated unconditionally. Skipping this
+	// when slice group type and count match the cached values would keep a stale-sized or
+	// empty map after a resolution change.
+	switch ( kpPps->uiSliceGroupMapType )
+	{
+	case 0:
+		iErr = FmoGenerateMbAllocMapType0( pFmo, kpPps );
+		break;
+	case 1:
+		iErr = FmoGenerateMbAllocMapType1( pFmo, kpPps, kiMbWidth );
+		break;
+	case 2:
+	case 3:
+	case 4:
+	case 5:
+	case 6:
+		// Reserve for others slice group type
+		iErr = 1;
+		break;
+	default:
+		return 1;
+	}
+
+	if ( 0 == iErr )	// well now
+	{
+		pFmo->iSliceGroupCount	= kpPps->uiNumSliceGroups;
+		pFmo->iSliceGroupType	= kpPps->uiSliceGroupMapType;
+	}
+
+	return iErr;
+}
+
+/*!
+ * \brief	Initialize Wels Flexible Macroblock Ordering (FMO)
+ *
+ *			Thin public entry point that forwards to FmoGenerateSliceGroup().
+ *
+ * \param	pFmo		Wels fmo to be initialized
+ * \param	pPps		pps argument
+ * \param	kiMbWidth	mb width
+ * \param	kiMbHeight	mb height
+ *
+ * \return	0 - successful; non-zero - failed;
+ */
+int32_t	InitFmo( PFmo pFmo, PPps pPps, const int32_t kiMbWidth, const int32_t kiMbHeight )
+{
+	const int32_t kiRet = FmoGenerateSliceGroup( pFmo, pPps, kiMbWidth, kiMbHeight );
+
+	return kiRet;
+}
+
+
+/*!
+ * \brief	Uninitialize Wels Flexible Macroblock Ordering (FMO) list
+ *
+ *			Walks up to kiCnt entries, releasing the map of every active entry and resetting
+ *			it, and stops early once kiAvail active entries have been released.
+ *
+ * \param	pFmo		Wels base fmo ptr to be uninitialized
+ * \param	kiCnt		count number of PPS per list
+ * \param	kiAvail		count available number of PPS in list
+ *
+ * \return	NONE
+ */
+void_t UninitFmoList( PFmo pFmo, const int32_t kiCnt, const int32_t kiAvail )
+{
+	int32_t iReleased = 0;
+	int32_t iIdx = 0;
+
+	if ( NULL == pFmo || kiAvail <= 0 || kiCnt < kiAvail )
+		return;
+
+	for ( iIdx = 0; iIdx < kiCnt; ++ iIdx )
+	{
+		PFmo pNode = pFmo + iIdx;
+
+		if ( !pNode->bActiveFlag )
+			continue;	// inactive entries are left untouched
+
+		if ( NULL != pNode->pMbAllocMap )
+		{
+			WelsFree( pNode->pMbAllocMap, "pIter->pMbAllocMap" );
+			pNode->pMbAllocMap	= NULL;
+		}
+		pNode->iSliceGroupCount	= 0;
+		pNode->iSliceGroupType	= -1;
+		pNode->iCountMbNum		= 0;
+		pNode->bActiveFlag		= false;
+
+		++ iReleased;
+		if ( iReleased >= kiAvail )
+			break;
+	}
+}
+
+/*!
+ * \brief	detect parameter sets are changed or not
+ *
+ * \param	pFmo				fmo context
+ * \param	kiCountNumMb		(iMbWidth * iMbHeight) in Sps
+ * \param	kiSliceGroupType	slice group type if fmo is exactly enabled
+ * \param	kiSliceGroupCount	slice group count if fmo is exactly enabled
+ *
+ * \return	true - changed or not initialized yet; false - not change at all (also for NULL pFmo)
+ */
+bool_t FmoParamSetsChanged( PFmo pFmo, const int32_t kiCountNumMb, const int32_t kiSliceGroupType, const int32_t kiSliceGroupCount )
+{
+	WELS_VERIFY_RETURN_IF( false, (NULL == pFmo) )
+
+	// an entry that was never activated always counts as changed
+	if ( !pFmo->bActiveFlag )
+		return true;
+
+	if ( pFmo->iCountMbNum != kiCountNumMb )
+		return true;
+
+	if ( pFmo->iSliceGroupType != kiSliceGroupType )
+		return true;
+
+	if ( pFmo->iSliceGroupCount != kiSliceGroupCount )
+		return true;
+
+	return false;
+}
+
+/*!
+ * \brief	update/insert FMO parameter unit
+ *
+ *			Re-initializes the FMO context only when FmoParamSetsChanged() reports a change.
+ *			After a successful InitFmo() an entry that was inactive is marked active and
+ *			counted, provided the counter is still below MAX_PPS_COUNT.
+ *
+ * \param	pFmo			FMO context
+ * \param	pSps			PSps
+ * \param	pPps			PPps
+ * \param	pActiveFmoNum	int32_t* [in/out]
+ *
+ * \return	true - update/insert successfully; false - failed;
+ */
+bool_t FmoParamUpdate( PFmo pFmo, PSps pSps, PPps pPps, int32_t *pActiveFmoNum )
+{
+	const uint32_t kuiWidthInMb	= pSps->iMbWidth;
+	const uint32_t kuiHeightInMb= pSps->iMbHeight;
+
+	// nothing to do while the cached parameters are still valid
+	if ( !FmoParamSetsChanged( pFmo, kuiWidthInMb * kuiHeightInMb, pPps->uiSliceGroupMapType, pPps->uiNumSliceGroups ) )
+		return true;
+
+	if ( InitFmo( pFmo, pPps, kuiWidthInMb, kuiHeightInMb ) )
+		return false;
+
+	if ( !pFmo->bActiveFlag && *pActiveFmoNum < MAX_PPS_COUNT )
+	{
+		pFmo->bActiveFlag	= true;
+		++ (*pActiveFmoNum);
+	}
+
+	return true;
+}
+
+/*!
+ * \brief	Convert kiMbXy to slice group idc correspondingly
+ *
+ * \param	pFmo		Wels fmo context
+ * \param	kiMbXy		MB index to be converted 
+ *
+ * \return	slice group idc - successful; -1 - failed (index out of range or no map allocated);
+ */
+int32_t FmoMbToSliceGroup( PFmo pFmo, const MB_XY_T kiMbXy )
+{
+	const uint8_t* kpMap	= pFmo->pMbAllocMap;
+	const int32_t kiCount	= pFmo->iCountMbNum;
+
+	if ( kiMbXy < 0 )
+		return -1;
+	if ( kiMbXy >= kiCount )
+		return -1;
+	if ( NULL == kpMap )
+		return -1;
+
+	return kpMap[ kiMbXy ];
+}
+
+/*!
+ * \brief	Get successive mb to be processed with given current kiMbXy
+ *
+ *			Scans forward from kiMbXy+1 for the next MB whose map entry equals the slice
+ *			group of kiMbXy.
+ *
+ * \param	pFmo			Wels fmo context
+ * \param	kiMbXy			current MB index
+ *
+ * \return	iNextMb - successful; -1 - failed or no further MB in this slice group;
+ */
+MB_XY_T FmoNextMb( PFmo pFmo, const MB_XY_T kiMbXy )
+{
+	const int32_t kiMbCount		= pFmo->iCountMbNum;
+	const uint8_t* kpMap		= pFmo->pMbAllocMap;
+	const uint8_t kuiGroup		= (uint8_t)FmoMbToSliceGroup( pFmo, kiMbXy );
+	MB_XY_T iCandidate			= kiMbXy;
+
+	// the failure value -1 of FmoMbToSliceGroup() shows up here as its 8-bit truncation
+	if ( kuiGroup == (uint8_t)(-1) )
+		return -1;
+
+	for ( ++ iCandidate; iCandidate < kiMbCount; ++ iCandidate )
+	{
+		if ( kpMap[iCandidate] == kuiGroup )
+			return iCandidate;
+	}
+
+	// -1: No further MB in this slice (could be end of picture)
+	return -1;
+}
+} // namespace WelsDec
\ No newline at end of file
--- /dev/null
+++ b/codec/decoder/core/src/get_intra_predictor.cpp
@@ -1,0 +1,700 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	get_intra_predictor.cpp
+ *
+ * \brief	implementation for get intra predictor about 16x16, 4x4, chroma.
+ *
+ * \date	4/2/2009 Created
+ *			9/14/2009 C level based optimization with high performance gained.
+ *				[const, using ST32/ST64 to replace memset, memcpy and memmove etc.]
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+
+#include "macros.h"
+#include "ls_defines.h"
+#include "get_intra_predictor.h"
+
+namespace WelsDec {
+
+#define I4x4_COUNT 4	// not referenced anywhere in this file
+#define I8x8_COUNT 8	// not referenced anywhere in this file
+#define I16x16_COUNT 16	// byte count passed to memset() for one row in the 16x16 DC predictors
+
+/* 4x4 vertical prediction: the 32-bit word directly above the block is stored into each of its 4 rows. */
+void_t WelsI4x4LumaPredV_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const uint32_t kuiTopRow = LD32(pPred-kiStride);
+	int32_t iRow;
+	for (iRow = 0; iRow < 4; iRow ++) {
+		ST32( pPred+iRow*kiStride, kuiTopRow );
+	}
+}
+
+/* 4x4 horizontal prediction: each row is filled with the pixel immediately to its left. */
+void_t WelsI4x4LumaPredH_c(uint8_t *pPred, const int32_t kiStride)
+{
+	int32_t iRow;
+
+	for (iRow = 0; iRow < 4; iRow ++)
+	{
+		uint8_t *pRow = pPred + iRow*kiStride;
+		// multiplying by 0x01010101 replicates the byte into all four lanes
+		const uint32_t kuiLeft32 = 0x01010101U * pRow[-1];
+
+		ST32( pRow, kuiLeft32 );
+	}
+}
+
+/* 4x4 DC prediction: fills the block with the rounded mean of its 4 top and 4 left neighbours. */
+void_t WelsI4x4LumaPredDc_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const uint8_t *kpTop = pPred - kiStride;
+	int32_t iIdx, iSum = 4;	// 4 is the rounding term for the >>3 below
+	uint32_t uiMean32;
+	for (iIdx = 0; iIdx < 4; iIdx ++)
+		iSum += kpTop[iIdx] + pPred[iIdx*kiStride-1];
+	uiMean32 = 0x01010101U * (uint8_t)(iSum >> 3);
+	for (iIdx = 0; iIdx < 4; iIdx ++) {
+		ST32( pPred+iIdx*kiStride, uiMean32 );
+	}
+}
+
+/* 4x4 DC prediction from the left column only: rounded mean of the 4 left neighbours. */
+void_t WelsI4x4LumaPredDcLeft_c(uint8_t *pPred, const int32_t kiStride)
+{
+	int32_t iIdx, iSum = 2;	// 2 is the rounding term for the >>2 below
+	uint32_t uiMean32;
+	for (iIdx = 0; iIdx < 4; iIdx ++)
+		iSum += pPred[iIdx*kiStride-1];
+	uiMean32 = 0x01010101U * (uint8_t)(iSum >> 2);
+	for (iIdx = 0; iIdx < 4; iIdx ++) {
+		ST32( pPred+iIdx*kiStride, uiMean32 );
+	}
+}
+
+/* 4x4 DC prediction from the top row only: rounded mean of the 4 pixels above the block. */
+void_t WelsI4x4LumaPredDcTop_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const uint8_t *kpTop = pPred - kiStride;
+	const uint8_t kuiMean = (kpTop[0] + kpTop[1] + kpTop[2] + kpTop[3] + 2) >> 2;
+	const uint32_t kuiMean32 = 0x01010101U * kuiMean;
+	int32_t iRow;
+
+	for (iRow = 0; iRow < 4; iRow ++) {
+		ST32( pPred+iRow*kiStride, kuiMean32 );
+	}
+}
+
+/* 4x4 constant fill: every pixel becomes 0x80, no neighbour is read ("NA" presumably = none available). */
+void_t WelsI4x4LumaPredDcNA_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const uint32_t kuiFill32 = 0x80808080U;
+	int32_t iRow;
+	for (iRow = 0; iRow < 4; iRow ++) {
+		ST32( pPred+iRow*kiStride, kuiFill32 );
+	}
+}
+
+/*diagonal down-left prediction of a 4x4 block, computed from the 8 pixels kuiT0..kuiT7 of the row above*/
+void_t WelsI4x4LumaPredDDL_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const int32_t kiStride2	= kiStride<<1;
+	const int32_t kiStride3	= kiStride + kiStride2;
+	/*read the row above: the 4 pixels over the block followed by the next 4 to their right*/
+	uint8_t *ptop			= &pPred[-kiStride];
+	const uint8_t kuiT0		= *ptop;
+	const uint8_t kuiT1		= *(ptop+1);
+	const uint8_t kuiT2		= *(ptop+2);
+	const uint8_t kuiT3		= *(ptop+3);
+	const uint8_t kuiT4		= *(ptop+4);
+	const uint8_t kuiT5		= *(ptop+5);
+	const uint8_t kuiT6		= *(ptop+6);
+	const uint8_t kuiT7		= *(ptop+7);
+	const uint8_t kuiDDL0	= (2 + kuiT0 + kuiT2 + (kuiT1<<1))>>2;	// (T0 + 2*T1 + T2 + 2) >> 2
+	const uint8_t kuiDDL1	= (2 + kuiT1 + kuiT3 + (kuiT2<<1))>>2;	// (T1 + 2*T2 + T3 + 2) >> 2
+	const uint8_t kuiDDL2	= (2 + kuiT2 + kuiT4 + (kuiT3<<1))>>2;	// (T2 + 2*T3 + T4 + 2) >> 2
+	const uint8_t kuiDDL3	= (2 + kuiT3 + kuiT5 + (kuiT4<<1))>>2;	// (T3 + 2*T4 + T5 + 2) >> 2
+	const uint8_t kuiDDL4	= (2 + kuiT4 + kuiT6 + (kuiT5<<1))>>2;	// (T4 + 2*T5 + T6 + 2) >> 2
+	const uint8_t kuiDDL5	= (2 + kuiT5 + kuiT7 + (kuiT6<<1))>>2;	// (T5 + 2*T6 + T7 + 2) >> 2
+	const uint8_t kuiDDL6	= (2 + kuiT6 + kuiT7 + (kuiT7<<1))>>2;	// (T6 + 3*T7 + 2) >> 2
+	const uint8_t kuiList[8]= { kuiDDL0, kuiDDL1, kuiDDL2, kuiDDL3, kuiDDL4, kuiDDL5, kuiDDL6, 0 };	// final 0 is filler
+
+	ST32( pPred          , LD32(kuiList  ) );	// row 0 starts at list entry 0
+	ST32( pPred+kiStride , LD32(kuiList+1) );	// row 1 starts at list entry 1
+	ST32( pPred+kiStride2, LD32(kuiList+2) );	// row 2 starts at list entry 2
+	ST32( pPred+kiStride3, LD32(kuiList+3) );	// row 3 starts at list entry 3
+}
+
+/*diagonal down-left prediction that reads only the 4 pixels directly above the block; kuiT3 stands in for everything to their right*/
+void_t WelsI4x4LumaPredDDLTop_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const int32_t kiStride2	= kiStride<<1;
+	const int32_t kiStride3	= kiStride + kiStride2;
+	/*read the 4 pixels of the row above the block*/
+	uint8_t *ptop			= &pPred[-kiStride];
+	const uint8_t kuiT0		= *ptop;
+	const uint8_t kuiT1		= *(ptop+1);
+	const uint8_t kuiT2		= *(ptop+2);
+	const uint8_t kuiT3		= *(ptop+3);
+	const uint16_t kuiT01	= 1 + kuiT0 + kuiT1;	// pair sums, each already carrying a +1
+	const uint16_t kuiT12	= 1 + kuiT1 + kuiT2;
+	const uint16_t kuiT23	= 1 + kuiT2 + kuiT3;
+	const uint16_t kuiT33	= 1 + (kuiT3 << 1);
+	const uint8_t kuiDLT0	= (kuiT01 + kuiT12) >> 2;	// (T0 + 2*T1 + T2 + 2) >> 2
+	const uint8_t kuiDLT1	= (kuiT12 + kuiT23) >> 2;	// (T1 + 2*T2 + T3 + 2) >> 2
+	const uint8_t kuiDLT2	= (kuiT23 + kuiT33) >> 2;	// (T2 + 3*T3 + 2) >> 2
+	const uint8_t kuiDLT3	= kuiT33 >> 1;			// (2*T3 + 1) >> 1, i.e. T3 itself
+	const uint8_t kuiList[8]= { kuiDLT0, kuiDLT1, kuiDLT2, kuiDLT3, kuiDLT3, kuiDLT3, kuiDLT3 ,kuiDLT3 };
+
+	ST32( pPred,           LD32(kuiList  ) );	// row 0 starts at list entry 0
+	ST32( pPred+kiStride,  LD32(kuiList+1) );	// row 1 starts at list entry 1
+	ST32( pPred+kiStride2, LD32(kuiList+2) );	// row 2 starts at list entry 2
+	ST32( pPred+kiStride3, LD32(kuiList+3) );	// row 3 starts at list entry 3
+}
+
+
+/*diagonal down-right prediction of a 4x4 block from its top-left, 4 left and 4 top neighbours*/
+void_t WelsI4x4LumaPredDDR_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const int32_t kiStride2	= kiStride<<1;
+	const int32_t kiStride3	= kiStride + kiStride2;
+	uint8_t *ptopleft		= &pPred[-(kiStride+1)];
+	uint8_t *pleft			= &pPred[-1];
+	const uint8_t kuiLT		= *ptopleft;
+	/*read the left column (kuiL0..kuiL3) and the top row (kuiT0..kuiT3)*/
+	const uint8_t kuiL0		= *pleft;
+	const uint8_t kuiL1		= *(pleft+kiStride );
+	const uint8_t kuiL2		= *(pleft+kiStride2);
+	const uint8_t kuiL3		= *(pleft+kiStride3);
+	const uint8_t kuiT0		= *(ptopleft+1);
+	const uint8_t kuiT1		= *(ptopleft+2);
+	const uint8_t kuiT2		= *(ptopleft+3);
+	const uint8_t kuiT3		= *(ptopleft+4);
+	const uint16_t kuiTL0	= 1 + kuiLT + kuiL0;	// pair sums, each already carrying a +1
+	const uint16_t kuiLT0	= 1 + kuiLT + kuiT0;
+	const uint16_t kuiT01	= 1 + kuiT0 + kuiT1;
+	const uint16_t kuiT12	= 1 + kuiT1 + kuiT2;
+	const uint16_t kuiT23	= 1 + kuiT2 + kuiT3;
+	const uint16_t kuiL01	= 1 + kuiL0 + kuiL1;
+	const uint16_t kuiL12	= 1 + kuiL1 + kuiL2;
+	const uint16_t kuiL23	= 1 + kuiL2 + kuiL3;
+	const uint8_t kuiDDR0	= (kuiTL0 + kuiLT0) >> 2;	// (L0 + 2*LT + T0 + 2) >> 2
+	const uint8_t kuiDDR1	= (kuiLT0 + kuiT01) >> 2;	// (LT + 2*T0 + T1 + 2) >> 2
+	const uint8_t kuiDDR2	= (kuiT01 + kuiT12) >> 2;	// (T0 + 2*T1 + T2 + 2) >> 2
+	const uint8_t kuiDDR3	= (kuiT12 + kuiT23) >> 2;	// (T1 + 2*T2 + T3 + 2) >> 2
+	const uint8_t kuiDDR4	= (kuiTL0 + kuiL01) >> 2;	// (LT + 2*L0 + L1 + 2) >> 2
+	const uint8_t kuiDDR5	= (kuiL01 + kuiL12) >> 2;	// (L0 + 2*L1 + L2 + 2) >> 2
+	const uint8_t kuiDDR6	= (kuiL12 + kuiL23) >> 2;	// (L1 + 2*L2 + L3 + 2) >> 2
+	const uint8_t kuiList[8]= { kuiDDR6, kuiDDR5, kuiDDR4, kuiDDR0, kuiDDR1, kuiDDR2, kuiDDR3, 0	};	// final 0 is filler
+
+	ST32( pPred          , LD32(kuiList+3) );	// row 0: DDR0 DDR1 DDR2 DDR3
+	ST32( pPred+kiStride , LD32(kuiList+2) );	// row 1: DDR4 DDR0 DDR1 DDR2
+	ST32( pPred+kiStride2, LD32(kuiList+1) );	// row 2: DDR5 DDR4 DDR0 DDR1
+	ST32( pPred+kiStride3, LD32(kuiList  ) );	// row 3: DDR6 DDR5 DDR4 DDR0
+}
+
+
+/*vertical-left prediction of a 4x4 block from the 7 pixels kuiT0..kuiT6 of the row above*/
+void_t WelsI4x4LumaPredVL_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const int32_t kiStride2	= kiStride<<1;
+	const int32_t kiStride3	= kiStride + kiStride2;
+	uint8_t *ptopleft		= &pPred[-(kiStride+1)];
+	/*read the row above; ptopleft+1 is the pixel directly over pPred[0]*/
+	const uint8_t kuiT0		    = *(ptopleft+1);
+	const uint8_t kuiT1		    = *(ptopleft+2);
+	const uint8_t kuiT2		    = *(ptopleft+3);
+	const uint8_t kuiT3		    = *(ptopleft+4);
+	const uint8_t kuiT4		    = *(ptopleft+5);
+	const uint8_t kuiT5		    = *(ptopleft+6);
+	const uint8_t kuiT6		    = *(ptopleft+7);
+	const uint16_t kuiT01		= 1 + kuiT0 + kuiT1;	// pair sums, each already carrying a +1
+	const uint16_t kuiT12		= 1 + kuiT1 + kuiT2;
+	const uint16_t kuiT23		= 1 + kuiT2 + kuiT3;
+	const uint16_t kuiT34		= 1 + kuiT3 + kuiT4;
+	const uint16_t kuiT45		= 1 + kuiT4 + kuiT5;
+	const uint16_t kuiT56		= 1 + kuiT5 + kuiT6;
+	const uint8_t kuiVL0		= kuiT01 >> 1;			// (T0 + T1 + 1) >> 1
+	const uint8_t kuiVL1		= kuiT12 >> 1;			// (T1 + T2 + 1) >> 1
+	const uint8_t kuiVL2		= kuiT23 >> 1;			// (T2 + T3 + 1) >> 1
+	const uint8_t kuiVL3		= kuiT34 >> 1;			// (T3 + T4 + 1) >> 1
+	const uint8_t kuiVL4		= kuiT45 >> 1;			// (T4 + T5 + 1) >> 1
+	const uint8_t kuiVL5		= (kuiT01 + kuiT12) >> 2;	// (T0 + 2*T1 + T2 + 2) >> 2
+	const uint8_t kuiVL6		= (kuiT12 + kuiT23) >> 2;	// (T1 + 2*T2 + T3 + 2) >> 2
+	const uint8_t kuiVL7		= (kuiT23 + kuiT34) >> 2;	// (T2 + 2*T3 + T4 + 2) >> 2
+	const uint8_t kuiVL8		= (kuiT34 + kuiT45) >> 2;	// (T3 + 2*T4 + T5 + 2) >> 2
+	const uint8_t kuiVL9		= (kuiT45 + kuiT56) >> 2;	// (T4 + 2*T5 + T6 + 2) >> 2
+	const uint8_t kuiList[10]	= { kuiVL0, kuiVL1, kuiVL2, kuiVL3, kuiVL4, kuiVL5, kuiVL6, kuiVL7, kuiVL8, kuiVL9 };
+
+	ST32( pPred,           LD32(kuiList  ) );	// row 0: VL0 VL1 VL2 VL3
+	ST32( pPred+kiStride,  LD32(kuiList+5) );	// row 1: VL5 VL6 VL7 VL8
+	ST32( pPred+kiStride2, LD32(kuiList+1) );	// row 2: VL1 VL2 VL3 VL4
+	ST32( pPred+kiStride3, LD32(kuiList+6) );	// row 3: VL6 VL7 VL8 VL9
+}
+
+/*vertical-left prediction that reads only the 4 pixels directly above the block; kuiT3 stands in for everything to their right*/
+void_t WelsI4x4LumaPredVLTop_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const int32_t kiStride2	    = kiStride<<1;
+	const int32_t kiStride3	    = kiStride + kiStride2;
+	uint8_t *ptopleft		    = &pPred[-(kiStride+1)];
+	/*read the row above; ptopleft+1 is the pixel directly over pPred[0]*/
+	const uint8_t kuiT0		    = *(ptopleft+1);
+	const uint8_t kuiT1		    = *(ptopleft+2);
+	const uint8_t kuiT2		    = *(ptopleft+3);
+	const uint8_t kuiT3		    = *(ptopleft+4);
+	const uint16_t kuiT01		= 1 + kuiT0 + kuiT1;	// pair sums, each already carrying a +1
+	const uint16_t kuiT12		= 1 + kuiT1 + kuiT2;
+	const uint16_t kuiT23		= 1 + kuiT2 + kuiT3;
+	const uint16_t kuiT33		= 1 + (kuiT3 << 1);
+	const uint8_t kuiVL0		= kuiT01 >> 1;	// (T0 + T1 + 1) >> 1
+	const uint8_t kuiVL1		= kuiT12 >> 1;	// (T1 + T2 + 1) >> 1
+	const uint8_t kuiVL2		= kuiT23 >> 1;	// (T2 + T3 + 1) >> 1
+	const uint8_t kuiVL3		= kuiT33 >> 1;	// (2*T3 + 1) >> 1, i.e. T3 itself
+	const uint8_t kuiVL4		= (kuiT01 + kuiT12) >> 2;	// (T0 + 2*T1 + T2 + 2) >> 2
+	const uint8_t kuiVL5		= (kuiT12 + kuiT23) >> 2;	// (T1 + 2*T2 + T3 + 2) >> 2
+	const uint8_t kuiVL6		= (kuiT23 + kuiT33) >> 2;	// (T2 + 3*T3 + 2) >> 2
+	const uint8_t kuiVL7		= kuiVL3;	// same value as kuiVL3
+	const uint8_t kuiList[10]	= { kuiVL0, kuiVL1, kuiVL2, kuiVL3, kuiVL3, kuiVL4, kuiVL5, kuiVL6, kuiVL7, kuiVL7 };
+
+	ST32( pPred          , LD32(kuiList  ) );	// row 0: VL0 VL1 VL2 VL3
+	ST32( pPred+kiStride , LD32(kuiList+5) );	// row 1: VL4 VL5 VL6 VL7
+	ST32( pPred+kiStride2, LD32(kuiList+1) );	// row 2: VL1 VL2 VL3 VL3
+	ST32( pPred+kiStride3, LD32(kuiList+6) );	// row 3: VL5 VL6 VL7 VL7
+}
+
+
+/*vertical-right prediction of a 4x4 block from its top-left, 3 left and 4 top neighbours*/
+void_t WelsI4x4LumaPredVR_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const int32_t kiStride2	    = kiStride<<1;
+	const int32_t kiStride3	    = kiStride + kiStride2;
+	const uint8_t kuiLT		    = pPred[-kiStride-1];
+	/*read the left column (kuiL0..kuiL2) and the top row (kuiT0..kuiT3)*/
+	const uint8_t kuiL0		    = pPred[         -1];
+	const uint8_t kuiL1		    = pPred[kiStride -1];
+	const uint8_t kuiL2		    = pPred[kiStride2-1];
+	const uint8_t kuiT0		    = pPred[ -kiStride];
+	const uint8_t kuiT1		    = pPred[1-kiStride];
+	const uint8_t kuiT2		    = pPred[2-kiStride];
+	const uint8_t kuiT3		    = pPred[3-kiStride];
+	const uint8_t kuiVR0		= (1 + kuiLT + kuiT0)>>1;	// 2-tap average of LT, T0
+	const uint8_t kuiVR1		= (1 + kuiT0 + kuiT1)>>1;	// 2-tap average of T0, T1
+	const uint8_t kuiVR2		= (1 + kuiT1 + kuiT2)>>1;	// 2-tap average of T1, T2
+	const uint8_t kuiVR3		= (1 + kuiT2 + kuiT3)>>1;	// 2-tap average of T2, T3
+	const uint8_t kuiVR4		= (2 + kuiL0 + (kuiLT<<1) + kuiT0)>>2;	// 3-tap (1,2,1) over L0, LT, T0
+	const uint8_t kuiVR5		= (2 + kuiLT + (kuiT0<<1) + kuiT1)>>2;	// 3-tap (1,2,1) over LT, T0, T1
+	const uint8_t kuiVR6		= (2 + kuiT0 + (kuiT1<<1) + kuiT2)>>2;	// 3-tap (1,2,1) over T0, T1, T2
+	const uint8_t kuiVR7		= (2 + kuiT1 + (kuiT2<<1) + kuiT3)>>2;	// 3-tap (1,2,1) over T1, T2, T3
+	const uint8_t kuiVR8		= (2 + kuiLT + (kuiL0<<1) + kuiL1)>>2;	// 3-tap (1,2,1) over LT, L0, L1
+	const uint8_t kuiVR9		= (2 + kuiL0 + (kuiL1<<1) + kuiL2)>>2;	// 3-tap (1,2,1) over L0, L1, L2
+	const uint8_t kuiList[10]	= { kuiVR8, kuiVR0, kuiVR1, kuiVR2, kuiVR3, kuiVR9, kuiVR4, kuiVR5, kuiVR6, kuiVR7 };
+
+	ST32( pPred          , LD32(kuiList+1) );	// row 0: VR0 VR1 VR2 VR3
+	ST32( pPred+kiStride , LD32(kuiList+6) );	// row 1: VR4 VR5 VR6 VR7
+	ST32( pPred+kiStride2, LD32(kuiList  ) );	// row 2: VR8 VR0 VR1 VR2
+	ST32( pPred+kiStride3, LD32(kuiList+5) );	// row 3: VR9 VR4 VR5 VR6
+}
+
+/*horizontal-up prediction of a 4x4 block from its 4 left neighbours*/
+void_t WelsI4x4LumaPredHU_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const int32_t kiStride2	    = kiStride<<1;
+	const int32_t kiStride3	    = kiStride + kiStride2;
+	/*read the left column (kuiL0..kuiL3)*/
+	const uint8_t kuiL0		    = pPred[         -1];
+	const uint8_t kuiL1		    = pPred[kiStride -1];
+	const uint8_t kuiL2		    = pPred[kiStride2-1];
+	const uint8_t kuiL3		    = pPred[kiStride3-1];
+	const uint16_t kuiL01		= 1 + kuiL0 + kuiL1;	// pair sums, each already carrying a +1
+	const uint16_t kuiL12		= 1 + kuiL1 + kuiL2;
+	const uint16_t kuiL23		= 1 + kuiL2 + kuiL3;
+	const uint8_t kuiHU0		= kuiL01 >> 1;	// (L0 + L1 + 1) >> 1
+	const uint8_t kuiHU1		= (kuiL01 + kuiL12) >> 2;	// (L0 + 2*L1 + L2 + 2) >> 2
+	const uint8_t kuiHU2		= kuiL12 >> 1;	// (L1 + L2 + 1) >> 1
+	const uint8_t kuiHU3		= (kuiL12 + kuiL23) >> 2;	// (L1 + 2*L2 + L3 + 2) >> 2
+	const uint8_t kuiHU4		= kuiL23 >> 1;	// (L2 + L3 + 1) >> 1
+	const uint8_t kuiHU5		= (1 + kuiL23 + (kuiL3<<1)) >> 2;	// (L2 + 3*L3 + 2) >> 2
+	const uint8_t kuiList[10]	= { kuiHU0, kuiHU1, kuiHU2, kuiHU3, kuiHU4, kuiHU5, kuiL3, kuiL3, kuiL3, kuiL3 };
+
+	ST32( pPred          , LD32(kuiList  ) );	// row 0: HU0 HU1 HU2 HU3
+	ST32( pPred+kiStride , LD32(kuiList+2) );	// row 1: HU2 HU3 HU4 HU5
+	ST32( pPred+kiStride2, LD32(kuiList+4) );	// row 2: HU4 HU5 L3 L3
+	ST32( pPred+kiStride3, LD32(kuiList+6) );	// row 3: L3 L3 L3 L3
+}
+
+/*horizontal-down prediction of a 4x4 block from its top-left, 4 left and 3 top neighbours*/
+void_t WelsI4x4LumaPredHD_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const int32_t kiStride2 	= kiStride<<1;
+	const int32_t kiStride3	    = kiStride + kiStride2;
+	const uint8_t kuiLT		    = pPred[-(kiStride+1)];
+	/*read the left column (kuiL0..kuiL3) and the top row (kuiT0..kuiT2)*/
+	const uint8_t kuiL0		    = pPred[-1          ];
+	const uint8_t kuiL1		    = pPred[-1+kiStride ];
+	const uint8_t kuiL2		    = pPred[-1+kiStride2];
+	const uint8_t kuiL3		    = pPred[-1+kiStride3];
+	const uint8_t kuiT0		    = pPred[-kiStride   ];
+	const uint8_t kuiT1		    = pPred[-kiStride+1 ];
+	const uint8_t kuiT2		    = pPred[-kiStride+2 ];
+	const uint16_t kuiTL0		= 1 + kuiLT + kuiL0;	// pair sums, each already carrying a +1
+	const uint16_t kuiLT0		= 1 + kuiLT + kuiT0;
+	const uint16_t kuiT01		= 1 + kuiT0 + kuiT1;
+	const uint16_t kuiT12		= 1 + kuiT1 + kuiT2;
+	const uint16_t kuiL01		= 1 + kuiL0 + kuiL1;
+	const uint16_t kuiL12		= 1 + kuiL1 + kuiL2;
+	const uint16_t kuiL23		= 1 + kuiL2 + kuiL3;
+	const uint8_t kuiHD0		= kuiTL0 >> 1;	// (LT + L0 + 1) >> 1
+	const uint8_t kuiHD1		= (kuiTL0 + kuiLT0) >> 2;	// (L0 + 2*LT + T0 + 2) >> 2
+	const uint8_t kuiHD2		= (kuiLT0 + kuiT01) >> 2;	// (LT + 2*T0 + T1 + 2) >> 2
+	const uint8_t kuiHD3		= (kuiT01 + kuiT12) >> 2;	// (T0 + 2*T1 + T2 + 2) >> 2
+	const uint8_t kuiHD4		= kuiL01 >> 1;	// (L0 + L1 + 1) >> 1
+	const uint8_t kuiHD5		= (kuiTL0 + kuiL01) >> 2;	// (LT + 2*L0 + L1 + 2) >> 2
+	const uint8_t kuiHD6		= kuiL12 >> 1;	// (L1 + L2 + 1) >> 1
+	const uint8_t kuiHD7		= (kuiL01 + kuiL12) >> 2;	// (L0 + 2*L1 + L2 + 2) >> 2
+	const uint8_t kuiHD8		= kuiL23 >> 1;	// (L2 + L3 + 1) >> 1
+	const uint8_t kuiHD9	    = (kuiL12 + kuiL23) >> 2;	// (L1 + 2*L2 + L3 + 2) >> 2
+	const uint8_t kuiList[10]	= { kuiHD8, kuiHD9, kuiHD6, kuiHD7, kuiHD4, kuiHD5, kuiHD0, kuiHD1, kuiHD2, kuiHD3 };
+
+	ST32( pPred          , LD32(kuiList+6) );	// row 0: HD0 HD1 HD2 HD3
+	ST32( pPred+kiStride , LD32(kuiList+4) );	// row 1: HD4 HD5 HD0 HD1
+	ST32( pPred+kiStride2, LD32(kuiList+2) );	// row 2: HD6 HD7 HD4 HD5
+	ST32( pPred+kiStride3, LD32(kuiList  ) );	// row 3: HD8 HD9 HD6 HD7
+}
+
+/* 8x8 chroma vertical prediction: the 64-bit word directly above the block is stored into all 8 rows. */
+void_t WelsIChromaPredV_c(uint8_t *pPred, const int32_t kiStride)
+{
+	// single 64-bit read of the row above the block
+	const uint64_t kuiTopRow64 = LD64(&pPred[-kiStride]);
+	int32_t iRow;
+
+	// rows are written top to bottom; every row receives the same word
+	for (iRow = 0; iRow < 8; iRow ++)
+	{
+		uint8_t *pRow = pPred + iRow*kiStride;
+
+		ST64( pRow, kuiTopRow64 );
+	}
+}
+
+/* 8x8 chroma horizontal prediction: each row is filled with the pixel immediately to its left. */
+void_t WelsIChromaPredH_c(uint8_t *pPred, const int32_t kiStride)
+{
+	int32_t iRow;
+
+	// rows are handled from the bottom one (7) up to the top one (0)
+	for (iRow = 7; iRow >= 0; iRow --)
+	{
+		uint8_t *pRow = pPred + iRow*kiStride;
+		// multiplying by 0x0101...01 replicates the byte into all eight lanes
+		const uint64_t kuiLeft64 = 0x0101010101010101ULL * pRow[-1];
+
+		ST64( pRow, kuiLeft64 );
+	}
+}
+
+
+/* 8x8 chroma plane prediction: each pixel is base + slopeX*(x-3) + slopeY*(y-3), rounded, >>5 and clipped. */
+void_t WelsIChromaPredPlane_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const uint8_t *kpTop  = pPred - kiStride;
+	const uint8_t *kpLeft = pPred - 1;
+	int32_t iGradH = 0, iGradV = 0;
+	int32_t iBase, iSlopeX, iSlopeY;
+	int32_t iX, iY, iDist;
+
+	// weighted differences of neighbours mirrored around index 3 (distance 4 reaches the top-left pixel)
+	for (iDist = 1; iDist <= 4; iDist ++)
+	{
+		iGradH += iDist * (kpTop[3 + iDist] - kpTop[3 - iDist]);
+		iGradV += iDist * (kpLeft[(3 + iDist)*kiStride] - kpLeft[(3 - iDist)*kiStride]);
+	}
+	iBase   = (kpLeft[7*kiStride] + kpTop[7]) << 4;
+	iSlopeX = (17 * iGradH + 16) >> 5;
+	iSlopeY = (17 * iGradV + 16) >> 5;
+	for (iY = 0; iY < 8; iY ++)
+	{
+		uint8_t *pRow = pPred + iY*kiStride;
+		for (iX = 0; iX < 8; iX ++)
+		{
+			const int32_t kiVal = (iBase + iSlopeX * (iX - 3) + iSlopeY * (iY - 3) + 16) >> 5;
+			pRow[iX] = WELS_CLIP1(kiVal);
+		}
+	}
+}
+
+
+void_t WelsIChromaPredDc_c(uint8_t *pPred, const int32_t kiStride)
+{	// 8x8 chroma DC prediction using top and left neighbours: one mean per 4x4 quadrant
+	const int32_t kiL1		= kiStride-1;	// kiLn = offset of the left neighbour of row n
+	const int32_t kiL2		= kiL1 + kiStride;
+	const int32_t kiL3		= kiL2 + kiStride;
+	const int32_t kiL4		= kiL3 + kiStride;
+	const int32_t kiL5		= kiL4 + kiStride;
+	const int32_t kiL6		= kiL5 + kiStride;
+	const int32_t kiL7		= kiL6 + kiStride;	
+	/*calculate the mean values of the four quadrants*/
+	const uint8_t kuiM1		= ( pPred[-kiStride] + pPred[1-kiStride] + pPred[2-kiStride] + pPred[3-kiStride] +
+								pPred[-1] + pPred[kiL1] + pPred[kiL2] + pPred[kiL3] + 4) >> 3 ;	// top-left: 4 top + 4 left neighbours
+	const uint32_t kuiSum2	= pPred[4-kiStride] + pPred[5-kiStride] + pPred[6-kiStride] + pPred[7-kiStride];	// top neighbours 4..7
+	const uint32_t kuiSum3	= pPred[kiL4] + pPred[kiL5] + pPred[kiL6] + pPred[kiL7];	// left neighbours of rows 4..7
+	const uint8_t kuiM2		= (kuiSum2 + 2) >> 2;	// top-right: top neighbours only
+	const uint8_t kuiM3		= (kuiSum3 + 2) >> 2;	// bottom-left: left neighbours only
+	const uint8_t kuiM4		= (kuiSum2 + kuiSum3 + 4) >> 3;	// bottom-right: both sums together
+	const uint8_t kuiMUP[8]	= {kuiM1, kuiM1, kuiM1, kuiM1, kuiM2, kuiM2, kuiM2, kuiM2};	// byte pattern of rows 0..3
+	const uint8_t kuiMDown[8]	= {kuiM3, kuiM3, kuiM3, kuiM3, kuiM4, kuiM4, kuiM4, kuiM4};	// byte pattern of rows 4..7
+	const uint64_t kuiUP64		= LD64(kuiMUP);
+	const uint64_t kuiDN64		= LD64(kuiMDown);
+
+	ST64( pPred       , kuiUP64 );
+	ST64( pPred+kiL1+1, kuiUP64 );	// kiLn+1 equals n*kiStride, i.e. the first pixel of row n
+	ST64( pPred+kiL2+1, kuiUP64 );
+	ST64( pPred+kiL3+1, kuiUP64 );
+	ST64( pPred+kiL4+1, kuiDN64 );
+	ST64( pPred+kiL5+1, kuiDN64 );
+	ST64( pPred+kiL6+1, kuiDN64 );
+	ST64( pPred+kiL7+1, kuiDN64 );
+}
+
+/* 8x8 chroma DC prediction from the left column only; rows 0..3 and rows 4..7 get separate means. */
+void_t WelsIChromaPredDcLeft_c(uint8_t *pPred, const int32_t kiStride)
+{
+	int32_t iSumUpper = 2;	// 2 is the rounding term for the >>2 below
+	int32_t iSumLower = 2;
+	int32_t iRow;
+	uint64_t uiUpper64, uiLower64;
+
+	// left neighbours of rows 0..3 feed the upper mean, those of rows 4..7 the lower one
+	for (iRow = 0; iRow < 4; iRow ++)
+	{
+		iSumUpper += pPred[iRow*kiStride-1];
+		iSumLower += pPred[(iRow+4)*kiStride-1];
+	}
+	uiUpper64 = 0x0101010101010101ULL * (uint8_t)(iSumUpper >> 2);
+	uiLower64 = 0x0101010101010101ULL * (uint8_t)(iSumLower >> 2);
+
+	// rows are written top to bottom, each with the mean of its half
+	for (iRow = 0; iRow < 8; iRow ++)
+	{
+		const uint64_t kuiFill64 = (iRow < 4) ? uiUpper64 : uiLower64;
+		ST64( pPred+iRow*kiStride, kuiFill64 );
+	}
+}
+
+/* 8x8 chroma DC prediction from the top row only; columns 0..3 and columns 4..7 get separate means. */
+void_t WelsIChromaPredDcTop_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const uint8_t *kpTop = pPred - kiStride;
+	const uint8_t kuiLeftMean  = (kpTop[0] + kpTop[1] + kpTop[2] + kpTop[3] + 2) >> 2;
+	const uint8_t kuiRightMean = (kpTop[4] + kpTop[5] + kpTop[6] + kpTop[7] + 2) >> 2;
+	const uint8_t kuiRowBytes[8] = {kuiLeftMean, kuiLeftMean, kuiLeftMean, kuiLeftMean, kuiRightMean, kuiRightMean, kuiRightMean, kuiRightMean};
+	// the row pattern never changes, so it is loaded once for all rows
+	const uint64_t kuiRow64 = LD64(kuiRowBytes);
+	int32_t iRow;
+
+	// rows are written from the bottom one (7) up to the top one (0)
+	for (iRow = 7; iRow >= 0; iRow --)
+	{
+		ST64( pPred+iRow*kiStride, kuiRow64 );
+	}
+}
+
+/* 8x8 chroma constant fill: every pixel becomes 0x80, no neighbour is read ("NA" presumably = none available). */
+void_t WelsIChromaPredDcNA_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const uint64_t kuiFill64 = 0x8080808080808080ULL;
+	int32_t iRow;
+
+	// rows are written from the bottom one (7) up to the top one (0);
+	// each store covers one complete row
+	for (iRow = 7; iRow >= 0; iRow --)
+	{
+		ST64( pPred+iRow*kiStride, kuiFill64 );
+	}
+}
+
+/* 16x16 vertical prediction: the two 64-bit words directly above the block are stored into all 16 rows. */
+void_t WelsI16x16LumaPredV_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const uint64_t kuiTopLeft64  = LD64(pPred-kiStride);
+	const uint64_t kuiTopRight64 = LD64(pPred-kiStride+8);
+	int32_t iRow;
+
+	// rows are written from the bottom one (15) up to the top one (0)
+	for (iRow = 15; iRow >= 0; iRow --)
+	{
+		uint8_t *pRow = pPred + iRow*kiStride;
+		ST64( pRow  , kuiTopLeft64 );
+		ST64( pRow+8, kuiTopRight64 );
+	}
+}
+
+/* 16x16 horizontal prediction: each row is filled with the pixel immediately to its left. */
+void_t WelsI16x16LumaPredH_c(uint8_t *pPred, const int32_t kiStride)
+{
+	int32_t iRow;
+
+	// rows are handled from the bottom one (15) up to the top one (0)
+	for (iRow = 15; iRow >= 0; iRow --)
+	{
+		uint8_t *pRow = pPred + iRow*kiStride;
+		// multiplying by 0x0101...01 replicates the byte into all eight lanes
+		const uint64_t kuiLeft64 = 0x0101010101010101ULL * pRow[-1];
+
+		ST64( pRow  , kuiLeft64 );
+		ST64( pRow+8, kuiLeft64 );
+	}
+}
+
+/* 16x16 luma plane prediction: each pixel is base + slopeX*(x-7) + slopeY*(y-7), rounded, >>5 and clipped. */
+void_t WelsI16x16LumaPredPlane_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const uint8_t *kpTop  = pPred - kiStride;
+	const uint8_t *kpLeft = pPred - 1;
+	int32_t iGradH = 0, iGradV = 0;
+	int32_t iBase, iSlopeX, iSlopeY;
+	int32_t iX, iY, iDist;
+
+	// weighted differences of neighbours mirrored around index 7 (distance 8 reaches the top-left pixel)
+	for (iDist = 1; iDist <= 8; iDist ++)
+	{
+		iGradH += iDist * (kpTop[7 + iDist] - kpTop[7 - iDist]);
+		iGradV += iDist * (kpLeft[(7 + iDist)*kiStride] - kpLeft[(7 - iDist)*kiStride]);
+	}
+	iBase   = (kpLeft[15*kiStride] + kpTop[15]) << 4;
+	iSlopeX = (5 * iGradH + 32) >> 6;
+	iSlopeY = (5 * iGradV + 32) >> 6;
+	for (iY = 0; iY < 16; iY ++)
+	{
+		uint8_t *pRow = pPred + iY*kiStride;
+		for (iX = 0; iX < 16; iX ++)
+		{
+			const int32_t kiVal = (iBase + iSlopeX * (iX - 7) + iSlopeY * (iY - 7) + 16) >> 5;
+			pRow[iX] = WELS_CLIP1(kiVal);
+		}
+	}
+}
+
+/* 16x16 DC prediction: fills the block with the rounded mean of its 16 top and 16 left neighbours. */
+void_t WelsI16x16LumaPredDc_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const uint8_t *kpTop = pPred - kiStride;
+	int32_t iSum = 16;	// 16 is the rounding term for the >>5 below
+	int32_t iIdx;
+	uint8_t uiMean;
+
+	// accumulate the 16 top and the 16 left neighbours
+	for (iIdx = 0; iIdx < 16; iIdx ++)
+	{
+		iSum += kpTop[iIdx] + pPred[iIdx*kiStride-1];
+	}
+	uiMean = iSum >> 5;
+
+	// rows are filled from the bottom one (15) up to the top one (0)
+	for (iIdx = 15; iIdx >= 0; iIdx --)
+	{
+		uint8_t *pRow = pPred + iIdx*kiStride;
+
+		memset(pRow, uiMean, I16x16_COUNT);
+	}
+}
+
+
+/* 16x16 DC prediction from the top row only: rounded mean of the 16 pixels above the block. */
+void_t WelsI16x16LumaPredDcTop_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const uint8_t *kpTop = pPred - kiStride;
+	int32_t iSum = 8;	// 8 is the rounding term for the >>4 below
+	int32_t iIdx;
+	uint8_t uiMean;
+
+	for (iIdx = 0; iIdx < 16; iIdx ++)
+	{
+		iSum += kpTop[iIdx];
+	}
+	uiMean = iSum >> 4;
+
+	// rows are filled from the bottom one (15) up to the top one (0)
+	for (iIdx = 15; iIdx >= 0; iIdx --)
+	{
+		uint8_t *pRow = pPred + iIdx*kiStride;
+		memset(pRow, uiMean, I16x16_COUNT);
+	}
+}
+
+/* 16x16 DC prediction from the left column only: rounded mean of the 16 left neighbours. */
+void_t WelsI16x16LumaPredDcLeft_c(uint8_t *pPred, const int32_t kiStride)
+{
+	int32_t iSum = 8;	// 8 is the rounding term for the >>4 below
+	int32_t iRow;
+	uint8_t uiMean;
+	uint64_t uiMean64;
+
+	// accumulate the 16 left neighbours
+	for (iRow = 0; iRow < 16; iRow ++)
+	{
+		iSum += pPred[iRow*kiStride-1];
+	}
+
+	uiMean   = iSum >> 4;
+	// multiplying by 0x0101...01 replicates the mean into all eight byte lanes
+	uiMean64 = 0x0101010101010101ULL * uiMean;
+
+	// rows are written from the bottom one (15) up to the top one (0)
+	for (iRow = 15; iRow >= 0; iRow --)
+	{
+		uint8_t *pRow = pPred + iRow*kiStride;
+
+		ST64( pRow  , uiMean64 );
+		ST64( pRow+8, uiMean64 );
+	}
+}
+
+/* 16x16 constant fill: every pixel becomes 0x80, no neighbour is read ("NA" presumably = none available). */
+void_t WelsI16x16LumaPredDcNA_c(uint8_t *pPred, const int32_t kiStride)
+{
+	const uint64_t kuiFill64 = 0x8080808080808080ULL;
+	int32_t iRow;
+
+	// rows are written from the bottom one (15) up to the top one (0)
+	for (iRow = 15; iRow >= 0; iRow --)
+	{
+		uint8_t *pRow = pPred + iRow*kiStride;
+		ST64( pRow  , kuiFill64 );
+		ST64( pRow+8, kuiFill64 );
+	}
+}
+
+} // namespace WelsDec
\ No newline at end of file
--- /dev/null
+++ b/codec/decoder/core/src/manage_dec_ref.cpp
@@ -1,0 +1,575 @@
+/*!
+ * \copy
+ *     Copyright (c)  2008-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ *  manage_dec_ref.cpp
+ *
+ *  Abstract
+ *      Implementation for managing reference picture
+ *
+ *  History
+ *      07/21/2008 Created
+ *
+ *****************************************************************************/
+#include <string.h>
+
+#include "manage_dec_ref.h"
+#include "error_code.h"
+#include "utils.h"
+#include "as264_common.h" // for LTR macro can be delete later
+
+namespace WelsDec {
+
+/* Clears the reference-marking fields of a picture; a NULL picture is ignored. */
+static void_t SetUnRef(PPicture pRef)
+{
+	if( NULL == pRef )
+		return;
+	pRef->bUsedAsRef = false;
+	pRef->bIsLongRef = false;
+	pRef->bRefBaseFlag = 0;
+	pRef->iFramePoc = 0;
+	pRef->iFrameNum = -1;
+	pRef->iLongTermFrameIdx = -1;
+	pRef->iSpsId = -1;
+	pRef->uiQualityId = -1;
+	pRef->uiTemporalId = -1;
+	pRef->uiSpatialId = -1;
+}
+
+// Empties the reference picture lists. According to the original note this is done when
+// 1. an sps arrived, i.e. a new sequence is starting
+// 2. an IDR NAL arrived, i.e. the 1st layer in an IDR AU
+
+void_t WelsResetRefPic(PWelsDecoderContext pCtx)
+{
+	PRefPic pRefPic			= &pCtx->sRefPic;
+	PPicture* ppShortRefs	= pRefPic->pShortRefList[LIST_0];
+	PPicture* ppLongRefs	= pRefPic->pLongRefList[LIST_0];
+	int32_t iIdx;
+
+	pRefPic->uiLongRefCount[0] = pRefPic->uiShortRefCount[0] = 0;
+	pRefPic->uiRefCount[LIST_0]	= 0;
+	for(iIdx = 0; iIdx < MAX_SHORT_REF_COUNT; iIdx++){	// un-mark and empty every occupied short-term slot
+		if ( ppShortRefs[iIdx] == NULL )
+			continue;
+		SetUnRef(ppShortRefs[iIdx]);
+		ppShortRefs[iIdx] = NULL;
+	}
+	pRefPic->uiShortRefCount[LIST_0] = 0;
+	for(iIdx = 0; iIdx < MAX_LONG_REF_COUNT; iIdx++){	// same for the long-term slots
+		if ( ppLongRefs[iIdx] == NULL )
+			continue;
+		SetUnRef(ppLongRefs[iIdx]);
+		ppLongRefs[iIdx] = NULL;
+	}
+	pRefPic->uiLongRefCount[LIST_0] = 0;
+}
+
+/**
+ * fills pRefPic.pRefList[LIST_0]: the list is zeroed, then short-term pictures are
+ * appended, then long-term ones; the number of entries is stored in uiRefCount[LIST_0].
+ * iPoc is not used. Always returns ERR_NONE.
+ */
+int32_t WelsInitRefList(PWelsDecoderContext pCtx, int32_t iPoc)
+{
+	PRefPic pRefPic			= &pCtx->sRefPic;
+	PPicture* ppShortRefs	= pRefPic->pShortRefList[LIST_0];
+	PPicture* ppLongRefs	= pRefPic->pLongRefList[LIST_0];
+	PPicture* ppRefList		= pRefPic->pRefList[LIST_0];
+	const bool_t kbWantBase	= pCtx->pCurDqLayer->bUseRefBasePicFlag;
+	int32_t iCur, iOther, iFilled = 0;
+
+	memset(ppRefList,0,MAX_REF_PIC_COUNT*sizeof(PPicture));
+
+	// short-term: a picture whose bRefBaseFlag equals the wanted flag is always taken;
+	// one with the other flag is taken only when no short-term picture combines the
+	// same iFrameNum with the wanted flag
+	for(iCur=0; iCur<pRefPic->uiShortRefCount[LIST_0]; ++iCur){
+		if( kbWantBase != ppShortRefs[iCur]->bRefBaseFlag ){
+			for(iOther=0; iOther<pRefPic->uiShortRefCount[LIST_0]; ++iOther){
+				if (ppShortRefs[iOther]->bRefBaseFlag == kbWantBase && ppShortRefs[iOther]->iFrameNum == ppShortRefs[iCur]->iFrameNum){
+					break;
+				}
+			}
+			if (iOther != pRefPic->uiShortRefCount[LIST_0]){
+				continue;	// a counterpart with the wanted flag exists
+			}
+		}
+		ppRefList[iFilled++] = ppShortRefs[iCur];
+	}
+
+	// long-term: same rule, with iLongTermFrameIdx as the matching key
+	for(iCur=0; iCur<pRefPic->uiLongRefCount[LIST_0]; ++iCur){
+		if( kbWantBase != ppLongRefs[iCur]->bRefBaseFlag ){
+			for(iOther=0; iOther<pRefPic->uiLongRefCount[LIST_0]; ++iOther){
+				if (ppLongRefs[iOther]->bRefBaseFlag == kbWantBase && ppLongRefs[iOther]->iLongTermFrameIdx == ppLongRefs[iCur]->iLongTermFrameIdx){
+					break;
+				}
+			}
+			if (iOther != pRefPic->uiLongRefCount[LIST_0]){
+				continue;	// a counterpart with the wanted flag exists
+			}
+		}
+		ppRefList[iFilled++] = ppLongRefs[iCur];
+	}
+
+	pRefPic->uiRefCount[LIST_0] = iFilled;
+
+	return ERR_NONE;
+}
+
+/* Applies the LIST_0 reordering commands of the current slice to pRefList[LIST_0]. Each command moves one
+ * picture to position iReorderingIndex; ERR_INFO_REFERENCE_PIC_LOST is returned when that picture is absent. */
+int32_t WelsReorderRefList(PWelsDecoderContext pCtx)
+{
+	PRefPicListReorderSyn pRefPicListReorderSyn = pCtx->pCurDqLayer->pRefPicListReordering;
+	PNalUnitHeaderExt pNalHeaderExt = &pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt;
+	PSliceHeader pSliceHeader = &pCtx->pCurDqLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader;
+	PPicture pPic = NULL;
+	PPicture* ppRefList = pCtx->sRefPic.pRefList[LIST_0];
+	int32_t iRefCount = pCtx->sRefPic.uiRefCount[LIST_0];
+	int32_t iPredFrameNum = pSliceHeader->iFrameNum;
+	int32_t iMaxPicNum = 1<<pSliceHeader->pSps->uiLog2MaxFrameNum;
+	int32_t iAbsDiffPicNum = -1;
+	int32_t iReorderingIndex = 0;
+	int32_t i = 0;
+
+	if(pCtx->eSliceType == I_SLICE || pCtx->eSliceType == SI_SLICE)	{	
+		return ERR_NONE;	// nothing is reordered for I and SI slices
+	}
+	if ( iRefCount <= 0 )
+	{
+		pCtx->iErrorCode = dsNoParamSets; //No any reference for decoding, SHOULD request IDR
+		return ERR_INFO_REFERENCE_PIC_LOST;
+	}
+
+	if (pRefPicListReorderSyn->bRefPicListReorderingFlag[LIST_0]){
+		while (pRefPicListReorderSyn->sReorderingSyn[LIST_0][iReorderingIndex].uiReorderingOfPicNumsIdc != 3)	// idc 3 ends the command list
+		{
+			uint16_t uiReorderingOfPicNumsIdc = pRefPicListReorderSyn->sReorderingSyn[LIST_0][iReorderingIndex].uiReorderingOfPicNumsIdc;
+			if (uiReorderingOfPicNumsIdc <2){	// short-term command: idc 0 subtracts the difference, idc 1 adds it
+				iAbsDiffPicNum = pRefPicListReorderSyn->sReorderingSyn[LIST_0][iReorderingIndex].uiAbsDiffPicNumMinus1 + 1;
+				if (uiReorderingOfPicNumsIdc == 0){	
+					iPredFrameNum -= iAbsDiffPicNum;
+				}else{	
+					iPredFrameNum += iAbsDiffPicNum;	
+				}
+				iPredFrameNum &= iMaxPicNum-1;	// wrap into [0, iMaxPicNum)
+				for( i= iRefCount-1; i>=iReorderingIndex; i--){	// search only the part of the list not yet fixed
+					if (ppRefList[i]->iFrameNum == iPredFrameNum && !ppRefList[i]->bIsLongRef)
+					{
+						if( ( pNalHeaderExt->uiQualityId == ppRefList[i]->uiQualityId ) && ( pSliceHeader->iSpsId != ppRefList[i]->iSpsId ) )//check;
+						{
+							WelsLog( pCtx, WELS_LOG_WARNING, "WelsReorderRefList()::::BASE LAYER::::iSpsId:%d, ref_sps_id:%d\n",pSliceHeader->iSpsId, ppRefList[i]->iSpsId );						
+							pCtx->iErrorCode = dsNoParamSets;	//cross-IDR reference frame selection, SHOULD request IDR.--
+							return ERR_INFO_REFERENCE_PIC_LOST;
+						}else{
+							break;
+						}
+					}
+				}
+			}else if (uiReorderingOfPicNumsIdc == 2){	// long-term command: match on iLongTermFrameIdx
+				for(  i = iRefCount -1; i>=iReorderingIndex; i--){
+					if( ppRefList[i]->bIsLongRef && ppRefList[i]->iLongTermFrameIdx == pRefPicListReorderSyn->sReorderingSyn[LIST_0][iReorderingIndex].uiLongTermPicNum )
+					{
+						if ( ( pNalHeaderExt->uiQualityId == ppRefList[i]->uiQualityId ) && ( pSliceHeader->iSpsId != ppRefList[i]->iSpsId ) )//check;
+						{
+							WelsLog( pCtx, WELS_LOG_WARNING, "WelsReorderRefList()::::BASE LAYER::::iSpsId:%d, ref_sps_id:%d\n",pSliceHeader->iSpsId, ppRefList[i]->iSpsId );						
+							pCtx->iErrorCode = dsNoParamSets;	//cross-IDR reference frame selection, SHOULD request IDR.--
+							return ERR_INFO_REFERENCE_PIC_LOST;
+						}else{
+							break;
+						}
+					}
+				}
+			}
+			// An unsuccessful search leaves i below iReorderingIndex (not necessarily below 0); going on
+			// would hand memmove() a negative element count, which converts to a huge size_t.
+			if (i < iReorderingIndex)	{	
+				return ERR_INFO_REFERENCE_PIC_LOST;
+			}
+			pPic = ppRefList[i];
+			memmove(&ppRefList[1+iReorderingIndex], &ppRefList[iReorderingIndex], (i-iReorderingIndex)*sizeof(PPicture));//confirmed_safe_unsafe_usage
+			ppRefList[iReorderingIndex]= pPic;	// the found picture takes the current reordering position
+			iReorderingIndex++;
+		}
+	}
+	return ERR_NONE;
+}
+
+// Marks the picture being decoded (pCtx->pDec) as a reference picture and
+// updates the reference lists held in pCtx->sRefPic.
+//   pCtx                 - decoder context; its current layer, SPS, access unit and pDec are read.
+//   kbRefBaseMarkingFlag - stored into pDec->bRefBaseFlag.
+// Returns ERR_NONE, or the error produced by the marking step that failed.
+int32_t WelsMarkAsRef(PWelsDecoderContext pCtx, const bool_t kbRefBaseMarkingFlag)
+{
+	PRefPic pRefPic = &pCtx->sRefPic;
+	PRefPicMarking pRefPicMarking = pCtx->pCurDqLayer->pRefPicMarking;
+	PRefBasePicMarking pRefPicBaseMarking = pCtx->pCurDqLayer->pRefPicBaseMarking;
+	PAccessUnit pCurAU = pCtx->pAccessUnitList;
+	bool_t bIsIDRAU = FALSE;
+	int32_t iRet = ERR_NONE;
+	uint32_t uiNalIdx;
+
+	// Storing a base representation is rejected when the SPS allows fewer than two reference frames.
+	if (pCtx->pCurDqLayer->bStoreRefBasePicFlag && (pCtx->pSps->iNumRefFrames < 2))
+	{
+		return ERR_INFO_INVALID_MMCO_REF_NUM_NOT_ENOUGH;
+	}
+
+	pCtx->pDec->bUsedAsRef   = TRUE;
+	pCtx->pDec->uiQualityId  = pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt.uiQualityId;
+	pCtx->pDec->uiTemporalId = pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt.uiTemporalId;
+	pCtx->pDec->bRefBaseFlag = kbRefBaseMarkingFlag;
+
+	// The access unit counts as IDR as soon as one of its NAL units is an IDR slice or carries the IDR flag.
+	for (uiNalIdx = pCurAU->uiStartPos; uiNalIdx <= pCurAU->uiEndPos; uiNalIdx++)
+	{
+		if (pCurAU->pNalUnitsList[uiNalIdx]->sNalHeaderExt.sNalUnitHeader.eNalUnitType == NAL_UNIT_CODED_SLICE_IDR
+			|| pCurAU->pNalUnitsList[uiNalIdx]->sNalHeaderExt.bIdrFlag)
+		{
+			bIsIDRAU = TRUE;
+			break;
+		}
+	}
+
+	if (!bIsIDRAU)
+	{
+		// Base-layer marking commands run first; any failure aborts the marking.
+		if (pRefPicBaseMarking->bAdaptiveRefBasePicMarkingModeFlag)
+		{
+			iRet = MMCOBase(pCtx, pRefPicBaseMarking);
+			if (iRet != ERR_NONE)
+			{
+				return iRet;
+			}
+		}
+
+		if (!pRefPicMarking->bAdaptiveRefPicMarkingModeFlag)
+		{
+			iRet = SlidingWindow(pCtx);
+		}
+		else
+		{
+			iRet = MMCO(pCtx, pRefPicMarking);
+			if (pCtx->bLastHasMmco5)
+			{
+				pCtx->pDec->iFrameNum = 0;
+				pCtx->pDec->iFramePoc = 0;
+			}
+			if (pRefPic->uiLongRefCount[LIST_0] + pRefPic->uiShortRefCount[LIST_0] > pCtx->pSps->iNumRefFrames)
+			{
+				return ERR_INFO_INVALID_MMCO_REF_NUM_OVERFLOW;
+			}
+		}
+	}
+	else if (pRefPicMarking->bLongTermRefFlag)
+	{
+		// IDR picture kept as long-term reference with index 0.
+		pRefPic->iMaxLongTermFrameIdx = 0;
+		AddLongTermToList(pRefPic, pCtx->pDec, 0);
+	}
+	else
+	{
+		pRefPic->iMaxLongTermFrameIdx = -1;
+	}
+
+	// Anything that did not end up long-term goes to the front of the short-term list.
+	if (!pCtx->pDec->bIsLongRef)
+	{
+		AddShortTermToList(pRefPic, pCtx->pDec);
+	}
+
+	return iRet;
+}
+
+// Runs the base-picture marking commands in pRefPicBaseMarking->mmco_base[]
+// one after another until the MMCO_END entry is reached.
+// Returns ERR_NONE, ERR_INFO_INVALID_MMCO_OPCODE_BASE for an opcode above
+// MMCO_LONG2UNUSED, or the first error reported by MMCOProcess().
+static int32_t MMCOBase(PWelsDecoderContext pCtx,PRefBasePicMarking pRefPicBaseMarking)
+{
+	PSps pSps = pCtx->pCurDqLayer->sLayerInfo.pSps;
+	int32_t iCmdIdx = 0;
+
+	while (pRefPicBaseMarking->mmco_base[iCmdIdx].uiMmcoType != MMCO_END)
+	{
+		uint32_t uiMmcoType = pRefPicBaseMarking->mmco_base[iCmdIdx].uiMmcoType;
+		// Frame number addressed by the command, wrapped to the SPS frame_num range.
+		int32_t iShortFrameNum = (pCtx->iFrameNum - pRefPicBaseMarking->mmco_base[iCmdIdx].uiDiffOfPicNums) & ((1<<pSps->uiLog2MaxFrameNum)-1);
+		uint32_t uiLongTermPicNum = pRefPicBaseMarking->mmco_base[iCmdIdx].uiLongTermPicNum;
+		int32_t iCmdRet;
+
+		if (uiMmcoType > MMCO_LONG2UNUSED)
+		{
+			return ERR_INFO_INVALID_MMCO_OPCODE_BASE;
+		}
+
+		iCmdRet = MMCOProcess(pCtx, uiMmcoType, TRUE, iShortFrameNum, uiLongTermPicNum, 0, 0);
+		if (iCmdRet != ERR_NONE)
+		{
+			return iCmdRet;
+		}
+
+		iCmdIdx++;
+	}
+
+	return ERR_NONE;
+}
+
+// Runs the marking commands in pRefPicMarking->sMmcoRef[] one after another
+// until the MMCO_END entry is reached.
+// Returns ERR_NONE, ERR_INFO_INVALID_MMCO_OPCODE_BASE for an opcode above
+// MMCO_LONG, or the first error reported by MMCOProcess().
+static int32_t MMCO(PWelsDecoderContext pCtx,PRefPicMarking pRefPicMarking)
+{
+	PSps pSps = pCtx->pCurDqLayer->sLayerInfo.pSps;
+	int32_t iCmdIdx = 0;
+
+	while (pRefPicMarking->sMmcoRef[iCmdIdx].uiMmcoType != MMCO_END)
+	{
+		uint32_t uiMmcoType = pRefPicMarking->sMmcoRef[iCmdIdx].uiMmcoType;
+		// Frame number addressed by the command, wrapped to the SPS frame_num range.
+		int32_t iShortFrameNum = (pCtx->iFrameNum - pRefPicMarking->sMmcoRef[iCmdIdx].iDiffOfPicNum) & ((1<<pSps->uiLog2MaxFrameNum)-1);
+		uint32_t uiLongTermPicNum = pRefPicMarking->sMmcoRef[iCmdIdx].uiLongTermPicNum;
+		int32_t iLongTermFrameIdx = pRefPicMarking->sMmcoRef[iCmdIdx].iLongTermFrameIdx;
+		int32_t iMaxLongTermFrameIdx = pRefPicMarking->sMmcoRef[iCmdIdx].iMaxLongTermFrameIdx;
+		int32_t iCmdRet;
+
+		if (uiMmcoType > MMCO_LONG)
+		{
+			return ERR_INFO_INVALID_MMCO_OPCODE_BASE;
+		}
+
+		iCmdRet = MMCOProcess(pCtx, uiMmcoType, FALSE, iShortFrameNum, uiLongTermPicNum, iLongTermFrameIdx, iMaxLongTermFrameIdx);
+		if (iCmdRet != ERR_NONE)
+		{
+			return iCmdRet;
+		}
+
+		iCmdIdx++;
+	}
+
+	return ERR_NONE;
+}
+// Executes one memory management control operation on pCtx->sRefPic.
+//   uiMmcoType           - operation selector (MMCO_* value); unknown values are ignored.
+//   bRefBasePic          - cast to ERemoveFlag for the two "mark unused" operations.
+//   iShortFrameNum       - frame number of the short-term picture addressed.
+//   uiLongTermPicNum     - long-term picture addressed by MMCO_LONG2UNUSED.
+//   iLongTermFrameIdx    - long-term index to assign (MMCO_SHORT2LONG / MMCO_LONG).
+//   iMaxLongTermFrameIdx - new upper bound for long-term indices (MMCO_SET_MAX_LONG).
+// Returns ERR_NONE, ERR_INFO_INVALID_MMCO_LONG_TERM_IDX_EXCEED_MAX when the
+// requested long-term index is above the current maximum, or the result of
+// AddLongTermToList() for MMCO_LONG.
+static int32_t MMCOProcess( PWelsDecoderContext pCtx,uint32_t uiMmcoType,bool_t bRefBasePic,
+                           int32_t iShortFrameNum,uint32_t uiLongTermPicNum ,int32_t iLongTermFrameIdx,int32_t iMaxLongTermFrameIdx )
+{
+	PRefPic pRefPic = &pCtx->sRefPic;
+	int32_t i = 0;
+	int32_t iRet = ERR_NONE;
+
+	switch (uiMmcoType)
+	{
+	case MMCO_SHORT2UNUSED:
+		WelsDelShortFromListSetUnref(pRefPic,iShortFrameNum,(ERemoveFlag) bRefBasePic);
+		break;
+	case MMCO_LONG2UNUSED:
+		WelsDelLongFromListSetUnref(pRefPic,uiLongTermPicNum,(ERemoveFlag) bRefBasePic);
+		break;
+	case MMCO_SHORT2LONG:
+		if(iLongTermFrameIdx > pRefPic->iMaxLongTermFrameIdx){
+			return ERR_INFO_INVALID_MMCO_LONG_TERM_IDX_EXCEED_MAX;
+		}
+		// Take the picture (target and base variants) out of the short-term list and
+		// free the long-term slot it is about to occupy.
+		WelsDelShortFromList(pRefPic,iShortFrameNum,REMOVE_TARGET);
+		WelsDelLongFromListSetUnref(pRefPic,iLongTermFrameIdx,REMOVE_TARGET);
+
+		WelsDelShortFromList(pRefPic,iShortFrameNum,REMOVE_BASE);
+		WelsDelLongFromListSetUnref(pRefPic,iLongTermFrameIdx,REMOVE_BASE);
+#ifdef LONG_TERM_REF
+		pCtx->bCurAuContainLtrMarkSeFlag = true;
+		pCtx->iFrameNumOfAuMarkedLtr      = iShortFrameNum;
+		WelsLog( pCtx, WELS_LOG_INFO, "ex_mark_avc():::MMCO_SHORT2LONG:::LTR marking....iFrameNum: %d\n", pCtx->iFrameNumOfAuMarkedLtr );
+#endif
+
+		MarkAsLongTerm(pRefPic,iShortFrameNum,iLongTermFrameIdx);
+		break;
+	case MMCO_SET_MAX_LONG:
+		pRefPic->iMaxLongTermFrameIdx = iMaxLongTermFrameIdx;
+		// Drop every long-term picture whose index is above the new maximum.
+		// A successful removal shifts the following entries down by one, so the
+		// same slot must be examined again instead of advancing the index.
+		i = 0;
+		while (i < pRefPic->uiLongRefCount[LIST_0]) {
+			PPicture pLong = pRefPic->pLongRefList[LIST_0][i];
+			if (pLong->iLongTermFrameIdx > pRefPic->iMaxLongTermFrameIdx
+				&& WelsDelLongFromListSetUnref(pRefPic,pLong->iLongTermFrameIdx,REMOVE_BASE_FIRST) != NULL) {
+				continue;
+			}
+			i++;
+		}
+		break;
+	case MMCO_RESET:
+		WelsResetRefPic(pCtx);
+		pCtx->bLastHasMmco5 = true;
+		break;
+	case MMCO_LONG:
+		if(iLongTermFrameIdx > pRefPic->iMaxLongTermFrameIdx){
+			return ERR_INFO_INVALID_MMCO_LONG_TERM_IDX_EXCEED_MAX;
+		}
+#ifdef LONG_TERM_REF
+		pCtx->bCurAuContainLtrMarkSeFlag = true;
+		pCtx->iFrameNumOfAuMarkedLtr      = pCtx->iFrameNum;
+		WelsLog( pCtx, WELS_LOG_INFO, "ex_mark_avc():::MMCO_LONG:::LTR marking....iFrameNum: %d\n", pCtx->iFrameNum );
+#endif
+		// Free the requested long-term slot, then store the current picture in it.
+		WelsDelLongFromListSetUnref(pRefPic,iLongTermFrameIdx,REMOVE_TARGET);
+		WelsDelLongFromListSetUnref(pRefPic,iLongTermFrameIdx,REMOVE_BASE);
+		iRet = AddLongTermToList(pRefPic,pCtx->pDec,iLongTermFrameIdx);
+		break;
+	default :
+		break;
+	}
+
+	return iRet;
+}
+
+// Sliding-window marking: once the short-term plus long-term reference count
+// has reached the SPS limit, removes the short-term picture whose frame number
+// equals that of the last short-term list entry and marks it unreferenced.
+// Returns ERR_NONE, or ERR_INFO_INVALID_MMCO_REF_NUM_OVERFLOW when the removal
+// finds nothing.
+static int32_t SlidingWindow( PWelsDecoderContext pCtx )
+{
+	PRefPic pRefPic = &pCtx->sRefPic;
+	PPicture pRemoved = NULL;
+	int32_t iLast = 0;
+
+	// Still room for another reference: nothing to evict.
+	if (pRefPic->uiShortRefCount[LIST_0] + pRefPic->uiLongRefCount[LIST_0] < pCtx->pSps->iNumRefFrames)
+	{
+		return ERR_NONE;
+	}
+
+	// No short-term picture available to evict.
+	iLast = pRefPic->uiShortRefCount[LIST_0] - 1;
+	if (iLast < 0)
+	{
+		return ERR_NONE;
+	}
+
+	pRemoved = WelsDelShortFromList(pRefPic, pRefPic->pShortRefList[LIST_0][iLast]->iFrameNum, REMOVE_BASE_FIRST);
+	if (pRemoved == NULL)
+	{
+		return ERR_INFO_INVALID_MMCO_REF_NUM_OVERFLOW;
+	}
+
+	SetUnRef(pRemoved);
+	return ERR_NONE;
+}
+
+// Removes from the LIST_0 short-term list the first picture whose iFrameNum
+// equals iFrameNum and which is accepted by eRemoveFlag:
+//   REMOVE_TARGET     - only a picture with bRefBaseFlag cleared,
+//   REMOVE_BASE       - only a picture with bRefBaseFlag set,
+//   REMOVE_BASE_FIRST - the first picture with that frame number.
+// The removed picture gets bUsedAsRef cleared and the list is compacted.
+// Returns the removed picture, or NULL when nothing matched.
+static PPicture WelsDelShortFromList(PRefPic pRefPic, int32_t iFrameNum, ERemoveFlag eRemoveFlag)
+{
+	int32_t iIdx;
+
+	for (iIdx = 0; iIdx < pRefPic->uiShortRefCount[LIST_0]; iIdx++)
+	{
+		PPicture pCandidate = pRefPic->pShortRefList[LIST_0][iIdx];
+		int32_t iTailCount;
+
+		if (pCandidate->iFrameNum != iFrameNum)
+		{
+			continue;
+		}
+		if (!(   ( eRemoveFlag == REMOVE_TARGET && !pCandidate->bRefBaseFlag )
+			  || ( eRemoveFlag == REMOVE_BASE && pCandidate->bRefBaseFlag )
+			  || ( eRemoveFlag == REMOVE_BASE_FIRST ) ))
+		{
+			continue;
+		}
+
+		// Number of entries located after the one being removed.
+		iTailCount = pRefPic->uiShortRefCount[LIST_0] - iIdx - 1;
+		pCandidate->bUsedAsRef = false;
+		pRefPic->pShortRefList[LIST_0][iIdx] = NULL;
+		if (iTailCount > 0)
+		{
+			memmove(&pRefPic->pShortRefList[LIST_0][iIdx], &pRefPic->pShortRefList[LIST_0][iIdx+1], iTailCount * sizeof(PPicture));//confirmed_safe_unsafe_usage
+		}
+		pRefPic->uiShortRefCount[LIST_0]--;
+		// Clear the slot vacated at the end of the list.
+		pRefPic->pShortRefList[LIST_0][pRefPic->uiShortRefCount[0]] = NULL;
+		return pCandidate;
+	}
+
+	return NULL;
+}
+
+// Same as WelsDelShortFromList(), and additionally passes the removed picture
+// to SetUnRef(). Returns the removed picture, or NULL when nothing matched.
+static PPicture WelsDelShortFromListSetUnref(PRefPic pRefPic, int32_t iFrameNum, ERemoveFlag eRemoveFlag)
+{
+	PPicture pRemoved = WelsDelShortFromList(pRefPic, iFrameNum, eRemoveFlag);
+
+	if (pRemoved != NULL)
+	{
+		SetUnRef(pRemoved);
+	}
+	return pRemoved;
+}
+
+// Removes from the LIST_0 long-term list the first picture whose
+// iLongTermFrameIdx equals uiLongTermFrameIdx and which is accepted by
+// eRemoveFlag:
+//   REMOVE_TARGET     - only a picture with bRefBaseFlag cleared,
+//   REMOVE_BASE       - only a picture with bRefBaseFlag set,
+//   REMOVE_BASE_FIRST - the first picture with that index (same rule as
+//                       WelsDelShortFromList(); previously this flag never
+//                       matched, so such requests removed nothing).
+// The removed picture gets bUsedAsRef and bIsLongRef cleared and the list is
+// compacted. Returns the removed picture, or NULL when nothing matched.
+static PPicture WelsDelLongFromList(PRefPic pRefPic, uint32_t uiLongTermFrameIdx, ERemoveFlag eRemoveFlag)
+{
+	PPicture pPic = NULL;
+	int32_t i = 0;
+	for ( i = 0;i<pRefPic->uiLongRefCount[LIST_0];i++)
+	{
+		pPic = pRefPic->pLongRefList[LIST_0][i];
+		if ( pPic->iLongTermFrameIdx == (int32_t)uiLongTermFrameIdx)
+		{
+			if(   ( (eRemoveFlag == REMOVE_TARGET) && !(pPic->bRefBaseFlag) )
+				||( (eRemoveFlag == REMOVE_BASE) && pPic->bRefBaseFlag )
+				||( eRemoveFlag == REMOVE_BASE_FIRST ) )
+			{
+				// Number of entries located after the one being removed.
+				int32_t iMoveSize = pRefPic->uiLongRefCount[LIST_0] - i - 1;
+				pPic->bUsedAsRef = FALSE;
+				pPic->bIsLongRef = FALSE;
+				if (iMoveSize > 0){
+					memmove(&pRefPic->pLongRefList[LIST_0][i], &pRefPic->pLongRefList[LIST_0][i+1], iMoveSize * sizeof(PPicture));//confirmed_safe_unsafe_usage
+				}
+				pRefPic->uiLongRefCount[LIST_0]--;
+				// Clear the slot vacated at the end of the list.
+				pRefPic->pLongRefList[LIST_0][pRefPic->uiLongRefCount[LIST_0]] = NULL;
+				return pPic;
+			}
+		}
+	}
+	return NULL;
+}
+
+// Same as WelsDelLongFromList(), and additionally passes the removed picture
+// to SetUnRef(). Returns the removed picture, or NULL when nothing matched.
+static PPicture WelsDelLongFromListSetUnref(PRefPic pRefPic, uint32_t uiLongTermFrameIdx, ERemoveFlag eRemoveFlag)
+{
+	PPicture pRemoved = WelsDelLongFromList(pRefPic, uiLongTermFrameIdx, eRemoveFlag);
+
+	if (pRemoved != NULL)
+	{
+		SetUnRef(pRemoved);
+	}
+	return pRemoved;
+}
+
+// Flags pPic as a short-term reference (bUsedAsRef set, bIsLongRef cleared,
+// iLongTermFrameIdx = -1) and inserts it at the front of the LIST_0 short-term
+// list, shifting the existing entries back by one. Always returns ERR_NONE.
+static int32_t AddShortTermToList(PRefPic pRefPic,PPicture pPic)
+{
+	pPic->iLongTermFrameIdx = -1;
+	pPic->bIsLongRef = FALSE;
+	pPic->bUsedAsRef = TRUE;
+
+	// Make room at index 0 when the list is not empty.
+	if (pRefPic->uiShortRefCount[LIST_0] > 0)
+	{
+		memmove(&pRefPic->pShortRefList[LIST_0][1], &pRefPic->pShortRefList[LIST_0][0],
+				pRefPic->uiShortRefCount[LIST_0]*sizeof(PPicture));//confirmed_safe_unsafe_usage
+	}
+
+	pRefPic->pShortRefList[LIST_0][0] = pPic;
+	pRefPic->uiShortRefCount[LIST_0]++;
+
+	return ERR_NONE;
+}
+
+// Flags pPic as a long-term reference with index iLongTermFrameIdx and inserts
+// it into the LIST_0 long-term list in front of the first entry whose
+// iLongTermFrameIdx is greater (at the end when there is none).
+// Returns ERR_NONE, or ERR_INFO_REF_COUNT_OVERFLOW when the stored count is
+// neither zero nor positive.
+static int32_t AddLongTermToList(PRefPic pRefPic,PPicture pPic, int32_t iLongTermFrameIdx)
+{
+	int32_t iInsertPos = 0;
+
+	pPic->bUsedAsRef = TRUE;
+	pPic->bIsLongRef = TRUE;
+	pPic->iLongTermFrameIdx = iLongTermFrameIdx;
+
+	if (pRefPic->uiLongRefCount[LIST_0] == 0)
+	{
+		// Empty list: the picture becomes the only entry.
+		pRefPic->pLongRefList[LIST_0][pRefPic->uiLongRefCount[LIST_0]] = pPic;
+	}
+	else if (pRefPic->uiLongRefCount[LIST_0] > 0)
+	{
+		// Skip every entry whose index is not greater than the new one.
+		while (iInsertPos < pRefPic->uiLongRefCount[LIST_0]
+			   && !(pRefPic->pLongRefList[LIST_0][iInsertPos]->iLongTermFrameIdx > pPic->iLongTermFrameIdx))
+		{
+			iInsertPos++;
+		}
+		// Shift the remaining entries back by one and drop the picture into the gap.
+		memmove(&pRefPic->pLongRefList[LIST_0][iInsertPos+1], &pRefPic->pLongRefList[LIST_0][iInsertPos],
+				(pRefPic->uiLongRefCount[LIST_0]-iInsertPos)*sizeof(PPicture));//confirmed_safe_unsafe_usage
+		pRefPic->pLongRefList[LIST_0][iInsertPos] = pPic;
+	}
+	else
+	{
+		return ERR_INFO_REF_COUNT_OVERFLOW;
+	}
+
+	pRefPic->uiLongRefCount[LIST_0]++;
+	return ERR_NONE;
+}
+
+// Moves the short-term picture(s) with frame number iFrameNum to the long-term
+// list under index iLongTermFrameIdx. Any long-term picture already using that
+// index is removed and unreferenced first.
+// Returns ERR_INFO_INVALID_REF_MARKING when no target (non-base) short-term
+// picture with that frame number exists; otherwise the result of the last
+// AddLongTermToList() call.
+static int32_t AssignLongTermIdx(PRefPic pRefPic,int32_t iFrameNum,int32_t iLongTermFrameIdx )
+{
+	PPicture pTarget = NULL;
+	PPicture pBase = NULL;
+	int32_t iRet = ERR_NONE;
+
+	// Free the long-term slot for both the target and the base representation.
+	WelsDelLongFromListSetUnref(pRefPic, iLongTermFrameIdx, REMOVE_TARGET);
+	WelsDelLongFromListSetUnref(pRefPic, iLongTermFrameIdx, REMOVE_BASE);
+
+	pTarget = WelsDelShortFromList(pRefPic, iFrameNum, REMOVE_TARGET);
+	if (pTarget == NULL)
+	{
+		return ERR_INFO_INVALID_REF_MARKING;
+	}
+	iRet = AddLongTermToList(pRefPic, pTarget, iLongTermFrameIdx);
+
+	// The base representation is optional.
+	pBase = WelsDelShortFromList(pRefPic, iFrameNum, REMOVE_BASE);
+	if (pBase != NULL)
+	{
+		iRet = AddLongTermToList(pRefPic, pBase, iLongTermFrameIdx);
+	}
+
+	return iRet;
+}
+
+// Removes (and unreferences) any long-term picture using iLongTermFrameIdx,
+// then adds every picture of pRefList[LIST_0] that has frame number iFrameNum
+// and is not yet long-term to the long-term list under that index.
+// Returns the result of the last AddLongTermToList() call, or ERR_NONE when no
+// picture matched.
+static int32_t MarkAsLongTerm( PRefPic pRefPic,int32_t iFrameNum, int32_t iLongTermFrameIdx )
+{
+	int32_t iRet = ERR_NONE;
+	int32_t iIdx;
+
+	WelsDelLongFromListSetUnref(pRefPic, iLongTermFrameIdx, REMOVE_TARGET);
+	WelsDelLongFromListSetUnref(pRefPic, iLongTermFrameIdx, REMOVE_BASE);
+
+	for (iIdx = 0; iIdx < pRefPic->uiRefCount[LIST_0]; iIdx++)
+	{
+		PPicture pCandidate = pRefPic->pRefList[LIST_0][iIdx];
+
+		if (pCandidate->bIsLongRef || pCandidate->iFrameNum != iFrameNum)
+		{
+			continue;
+		}
+		iRet = AddLongTermToList(pRefPic, pCandidate, iLongTermFrameIdx);
+	}
+
+	return iRet;
+}
+
+} // namespace WelsDec
\ No newline at end of file
--- /dev/null
+++ b/codec/decoder/core/src/mc.cpp
@@ -1,0 +1,757 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	mc.cpp
+ *
+ * \brief	Interfaces implementation for motion compensation
+ *
+ * \date	03/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "as264_common.h"
+#include "mc.h"
+
+#include "cpu_core.h"
+
+namespace WelsDec {
+
+/*------------------weight for chroma fraction pixel interpolation------------------*/
+// Bilinear weights for every fractional chroma position (dx, dy in 0..7):
+//   iA = (8 - dx) * (8 - dy)   weight of the top-left sample
+//   iB = dx * (8 - dy)         weight of the top-right sample
+//   iC = (8 - dx) * dy         weight of the bottom-left sample
+//   iD = dx * dy               weight of the bottom-right sample
+// The four weights of an entry always add up to 64.
+// Indexing is g_kuiABCD[dy][dx][0..3] = { iA, iB, iC, iD }.
+static const uint8_t g_kuiABCD[8][8][4] =	//g_kA[dy][dx], g_kB[dy][dx], g_kC[dy][dx], g_kD[dy][dx]
+{
+	{	// dy = 0
+		{64, 0, 0, 0},{56, 8, 0, 0},{48, 16, 0, 0},{40, 24, 0, 0},
+		{32, 32, 0, 0},{24, 40, 0, 0},{16, 48, 0, 0},{8, 56, 0, 0}
+	},
+	{	// dy = 1
+		{56, 0, 8, 0},{49, 7, 7, 1},{42, 14, 6, 2},{35, 21, 5, 3},
+		{28, 28, 4, 4},{21, 35, 3, 5},{14, 42, 2, 6},{7, 49, 1, 7}
+	},
+	{	// dy = 2
+		{48, 0, 16, 0},{42, 6, 14, 2},{36, 12, 12, 4},{30, 18, 10, 6},
+		{24, 24, 8, 8},{18, 30, 6, 10},{12, 36, 4, 12},{6, 42, 2, 14}
+	},
+	{	// dy = 3
+		{40, 0, 24, 0},{35, 5, 21, 3},{30, 10, 18, 6},{25, 15, 15, 9},
+		{20, 20, 12, 12},{15, 25, 9, 15},{10, 30, 6, 18},{5, 35, 3, 21}
+	},
+	{	// dy = 4
+		{32, 0, 32, 0},{28, 4, 28, 4},{24, 8, 24, 8},{20, 12, 20, 12},
+		{16, 16, 16, 16},{12, 20, 12, 20},{8, 24, 8, 24},{4, 28, 4, 28}
+	},
+	{	// dy = 5
+		{24, 0, 40, 0},{21, 3, 35, 5},{18, 6, 30, 10},{15, 9, 25, 15},
+		{12, 12, 20, 20},{9, 15, 15, 25},{6, 18, 10, 30},{3, 21, 5, 35}
+	},
+	{	// dy = 6
+		{16, 0, 48, 0},{14, 2, 42, 6},{12, 4, 36, 12},{10, 6, 30, 18},
+		{8, 8, 24, 24},{6, 10, 18, 30},{4, 12, 12, 36},{2, 14, 6, 42}
+	},
+	{	// dy = 7
+		{8, 0, 56, 0},{7, 1, 49, 7},{6, 2, 42, 14},{5, 3, 35, 21},
+		{4, 4, 28, 28},{3, 5, 21, 35},{2, 6, 14, 42},{1, 7, 7, 49}
+	}
+};
+
+// Common signature of the luma interpolation routines stored in the dispatch
+// table of McLuma_c(): source pointer/stride, destination pointer/stride,
+// block width and block height.
+typedef void_t (*PWelsMcWidthHeightFunc)(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight);
+
+//***************************************************************************//
+//                          C code implementation                            //
+//***************************************************************************//
+// Copies a block that is 2 bytes wide and iHeight rows high from pSrc to pDst,
+// one 16-bit load/store per row. A width of 2 only occurs for chroma.
+static inline void_t McCopyWidthEq2_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	int32_t iRowsLeft;
+	for (iRowsLeft = iHeight; iRowsLeft > 0; iRowsLeft--)
+	{
+		ST16(pDst, LD16(pSrc));
+		pSrc += iSrcStride;
+		pDst += iDstStride;
+	}
+}
+
+// Copies a block that is 4 bytes wide and iHeight rows high from pSrc to pDst,
+// one 32-bit load/store per row.
+static inline void_t McCopyWidthEq4_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	int32_t iRowsLeft;
+	for (iRowsLeft = iHeight; iRowsLeft > 0; iRowsLeft--)
+	{
+		ST32(pDst, LD32(pSrc));
+		pSrc += iSrcStride;
+		pDst += iDstStride;
+	}
+}
+
+// Copies a block that is 8 bytes wide and iHeight rows high from pSrc to pDst,
+// one 64-bit load/store per row.
+static inline void_t McCopyWidthEq8_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	int32_t iRowsLeft;
+	for (iRowsLeft = iHeight; iRowsLeft > 0; iRowsLeft--)
+	{
+		ST64(pDst, LD64(pSrc));
+		pSrc += iSrcStride;
+		pDst += iDstStride;
+	}
+}
+
+// Copies a block that is 16 bytes wide and iHeight rows high from pSrc to pDst,
+// two 64-bit load/stores per row.
+static inline void_t McCopyWidthEq16_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	int32_t iRowsLeft;
+	for (iRowsLeft = iHeight; iRowsLeft > 0; iRowsLeft--)
+	{
+		ST64(pDst,     LD64(pSrc));
+		ST64(pDst + 8, LD64(pSrc + 8));
+		pSrc += iSrcStride;
+		pDst += iDstStride;
+	}
+}
+
+//--------------------Luma sample MC------------------//
+
+// Applies the 6-tap filter (1, -5, 20, 20, -5, 1) to the six 16-bit values
+// pSrc[-2] .. pSrc[3] and returns the unscaled, unrounded sum.
+// The inputs are signed intermediate values and may be negative, so the taps
+// are applied with multiplications: left-shifting a negative value is
+// undefined behaviour.
+static inline int32_t HorFilterInput16bit_c(int16_t* pSrc)
+{
+	const int32_t kiOuter  = pSrc[-2] + pSrc[3];	// taps with weight   1
+	const int32_t kiMiddle = pSrc[-1] + pSrc[2];	// taps with weight  -5
+	const int32_t kiInner  = pSrc[ 0] + pSrc[1];	// taps with weight  20
+
+	return kiOuter - 5 * kiMiddle + 20 * kiInner;
+}
+// Applies the 6-tap filter (1, -5, 20, 20, -5, 1) to six 8-bit samples spaced
+// kiOffset bytes apart, from pSrc - 2*kiOffset to pSrc + 3*kiOffset, and
+// returns the unscaled, unrounded sum (which can be negative).
+// Horizontal filtering: kiOffset = 1; vertical filtering: kiOffset = iSrcStride.
+// The sum is formed in signed arithmetic so that a negative result does not
+// depend on unsigned wrap-around followed by an unsigned-to-signed conversion,
+// and the sample offsets are formed by multiplication so that a negative
+// stride is not left-shifted.
+static inline int32_t FilterInput8bitWithStride_c(uint8_t* pSrc, const int32_t kiOffset)
+{
+	const int32_t kiOffset2 = 2 * kiOffset;
+	const int32_t kiOffset3 = 3 * kiOffset;
+	const int32_t kiOuter   = *(pSrc - kiOffset2) + *(pSrc + kiOffset3);	// taps with weight   1
+	const int32_t kiMiddle  = *(pSrc - kiOffset ) + *(pSrc + kiOffset2);	// taps with weight  -5
+	const int32_t kiInner   = *(pSrc            ) + *(pSrc + kiOffset );	// taps with weight  20
+
+	return kiOuter - 5 * kiMiddle + 20 * kiInner;
+}
+
+// Writes the rounded average (a + b + 1) >> 1 of two iWidth x iHeight byte
+// blocks, pSrcA and pSrcB, to pDst. Each block has its own stride.
+static inline void_t PixelAvg_c(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, int32_t iSrcAStride,
+										uint8_t* pSrcB, int32_t iSrcBStride, int32_t iWidth, int32_t iHeight)
+{
+	int32_t iRow, iCol;
+	for (iRow = 0; iRow < iHeight; iRow++)
+	{
+		uint8_t* pOut = pDst  + iRow * iDstStride;
+		uint8_t* pInA = pSrcA + iRow * iSrcAStride;
+		uint8_t* pInB = pSrcB + iRow * iSrcBStride;
+		for (iCol = 0; iCol < iWidth; iCol++)
+		{
+			pOut[iCol] = (pInA[iCol] + pInB[iCol] + 1) >> 1;
+		}
+	}
+}
+// Copies an iWidth x iHeight block by dispatching to the fixed-width copy
+// routine for widths 16, 8 and 4; every other width uses the 2-byte routine.
+static inline void_t McCopy_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	switch (iWidth)
+	{
+	case 16:
+		McCopyWidthEq16_c(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		break;
+	case 8:
+		McCopyWidthEq8_c(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		break;
+	case 4:
+		McCopyWidthEq4_c(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		break;
+	default: //here iWidth == 2
+		McCopyWidthEq2_c(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		break;
+	}
+}
+
+// Horizontal 6-tap interpolation: every output sample is the filter sum of
+// the six horizontally neighbouring source samples, rounded with +16, shifted
+// right by 5 and clipped by WELS_CLIP1.
+static inline void_t McHorVer20_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	int32_t iRow, iCol;
+	for (iRow = 0; iRow < iHeight; iRow++, pSrc += iSrcStride, pDst += iDstStride)
+	{
+		for (iCol = 0; iCol < iWidth; iCol++)
+		{
+			pDst[iCol] = WELS_CLIP1((FilterInput8bitWithStride_c(pSrc+iCol,1)+16)>>5);
+		}
+	}
+}
+
+// Vertical 6-tap interpolation: every output sample is the filter sum of the
+// six vertically neighbouring source samples (spaced iSrcStride apart),
+// rounded with +16, shifted right by 5 and clipped by WELS_CLIP1.
+static inline void_t McHorVer02_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	int32_t iRow, iCol;
+	for (iRow = 0; iRow < iHeight; iRow++, pSrc += iSrcStride, pDst += iDstStride)
+	{
+		for (iCol = 0; iCol < iWidth; iCol++)
+		{
+			pDst[iCol] = WELS_CLIP1((FilterInput8bitWithStride_c(pSrc+iCol, iSrcStride)+16)>>5);
+		}
+	}
+}
+
+// Two-pass 6-tap interpolation. For each row the vertical filter is applied
+// to iWidth + 5 columns starting two samples left of the block and kept
+// unrounded in 16-bit precision. The horizontal filter then runs over those
+// intermediates and the result is rounded with +512, shifted right by 10 and
+// clipped by WELS_CLIP1.
+// The intermediate row holds 16 + 5 values, so iWidth must not exceed 16.
+static inline void_t McHorVer22_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	int16_t iVerTap[16+5] = {0}; //16
+	int32_t iRow, iTapIdx, iCol;
+
+	for (iRow = 0; iRow < iHeight; iRow++, pSrc += iSrcStride, pDst += iDstStride)
+	{
+		// Vertical pass; includes 2 extra columns on the left and 3 on the right.
+		for (iTapIdx = 0; iTapIdx < iWidth + 5; iTapIdx++)
+		{
+			iVerTap[iTapIdx] = FilterInput8bitWithStride_c(pSrc-2+iTapIdx, iSrcStride);
+		}
+		// Horizontal pass centred on the block columns.
+		for (iCol = 0; iCol < iWidth; iCol++)
+		{
+			pDst[iCol] = WELS_CLIP1((HorFilterInput16bit_c(&iVerTap[2+iCol])+512)>>10);
+		}
+	}
+}
+
+/////////////////////luma MC////////////////////////// 
+// Averages the source block with its vertically filtered version
+// (McHorVer02_c at pSrc). The temporary block uses a stride of 16.
+static inline void_t McHorVer01_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	const int32_t kiTmpStride = 16;
+	uint8_t uiVerHalf[256] = { 0 };
+
+	McHorVer02_c(pSrc, iSrcStride, uiVerHalf, kiTmpStride, iWidth, iHeight);
+	PixelAvg_c(pDst, iDstStride, pSrc, iSrcStride, uiVerHalf, kiTmpStride, iWidth, iHeight);
+}
+// Averages the source block one row below pSrc with the vertically filtered
+// version of the block at pSrc (McHorVer02_c). The temporary block uses a
+// stride of 16.
+static inline void_t McHorVer03_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	const int32_t kiTmpStride = 16;
+	uint8_t uiVerHalf[256] = { 0 };
+
+	McHorVer02_c(pSrc, iSrcStride, uiVerHalf, kiTmpStride, iWidth, iHeight);
+	PixelAvg_c(pDst, iDstStride, pSrc+iSrcStride, iSrcStride, uiVerHalf, kiTmpStride, iWidth, iHeight);
+}
+// Averages the source block with its horizontally filtered version
+// (McHorVer20_c at pSrc). The temporary block uses a stride of 16.
+static inline void_t McHorVer10_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	const int32_t kiTmpStride = 16;
+	uint8_t uiHorHalf[256] = { 0 };
+
+	McHorVer20_c(pSrc, iSrcStride, uiHorHalf, kiTmpStride, iWidth, iHeight);
+	PixelAvg_c(pDst, iDstStride, pSrc, iSrcStride, uiHorHalf, kiTmpStride, iWidth, iHeight);
+}
+// Averages the horizontally filtered block (McHorVer20_c at pSrc) with the
+// vertically filtered block (McHorVer02_c at pSrc). Temporaries use a stride
+// of 16.
+static inline void_t McHorVer11_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	const int32_t kiTmpStride = 16;
+	uint8_t uiHorHalf[256] = { 0 };
+	uint8_t uiVerHalf[256] = { 0 };
+
+	McHorVer20_c(pSrc, iSrcStride, uiHorHalf, kiTmpStride, iWidth, iHeight);
+	McHorVer02_c(pSrc, iSrcStride, uiVerHalf, kiTmpStride, iWidth, iHeight);
+	PixelAvg_c(pDst, iDstStride, uiHorHalf, kiTmpStride, uiVerHalf, kiTmpStride, iWidth, iHeight);
+}
+// Averages the vertically filtered block (McHorVer02_c at pSrc) with the
+// two-pass filtered block (McHorVer22_c at pSrc). Temporaries use a stride
+// of 16.
+static inline void_t McHorVer12_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	const int32_t kiTmpStride = 16;
+	uint8_t uiVerHalf[256] = { 0 };
+	uint8_t uiCentre[256]  = { 0 };
+
+	McHorVer02_c(pSrc, iSrcStride, uiVerHalf, kiTmpStride, iWidth, iHeight);
+	McHorVer22_c(pSrc, iSrcStride, uiCentre, kiTmpStride, iWidth, iHeight);
+	PixelAvg_c(pDst, iDstStride, uiVerHalf, kiTmpStride, uiCentre, kiTmpStride, iWidth, iHeight);
+}
+// Averages the horizontally filtered block taken one row below pSrc
+// (McHorVer20_c at pSrc + iSrcStride) with the vertically filtered block
+// (McHorVer02_c at pSrc). Temporaries use a stride of 16.
+static inline void_t McHorVer13_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	const int32_t kiTmpStride = 16;
+	uint8_t uiHorHalf[256] = { 0 };
+	uint8_t uiVerHalf[256] = { 0 };
+
+	McHorVer20_c(pSrc+iSrcStride, iSrcStride, uiHorHalf, kiTmpStride, iWidth, iHeight);
+	McHorVer02_c(pSrc, iSrcStride, uiVerHalf, kiTmpStride, iWidth, iHeight);
+	PixelAvg_c(pDst, iDstStride, uiHorHalf, kiTmpStride, uiVerHalf, kiTmpStride, iWidth, iHeight);
+}
+// Averages the horizontally filtered block (McHorVer20_c at pSrc) with the
+// two-pass filtered block (McHorVer22_c at pSrc). Temporaries use a stride
+// of 16.
+static inline void_t McHorVer21_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	const int32_t kiTmpStride = 16;
+	uint8_t uiHorHalf[256] = { 0 };
+	uint8_t uiCentre[256]  = { 0 };
+
+	McHorVer20_c(pSrc, iSrcStride, uiHorHalf, kiTmpStride, iWidth, iHeight);
+	McHorVer22_c(pSrc, iSrcStride, uiCentre, kiTmpStride, iWidth, iHeight);
+	PixelAvg_c(pDst, iDstStride, uiHorHalf, kiTmpStride, uiCentre, kiTmpStride, iWidth, iHeight);
+}
+// Averages the horizontally filtered block taken one row below pSrc
+// (McHorVer20_c at pSrc + iSrcStride) with the two-pass filtered block
+// (McHorVer22_c at pSrc). Temporaries use a stride of 16.
+static inline void_t McHorVer23_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	const int32_t kiTmpStride = 16;
+	uint8_t uiHorHalf[256] = { 0 };
+	uint8_t uiCentre[256]  = { 0 };
+
+	McHorVer20_c(pSrc+iSrcStride, iSrcStride, uiHorHalf, kiTmpStride, iWidth, iHeight);
+	McHorVer22_c(pSrc, iSrcStride, uiCentre, kiTmpStride, iWidth, iHeight);
+	PixelAvg_c(pDst, iDstStride, uiHorHalf, kiTmpStride, uiCentre, kiTmpStride, iWidth, iHeight);
+}
+// Averages the source block one column right of pSrc with the horizontally
+// filtered block (McHorVer20_c at pSrc). The temporary block uses a stride
+// of 16.
+static inline void_t McHorVer30_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	const int32_t kiTmpStride = 16;
+	uint8_t uiHorHalf[256] = { 0 };
+
+	McHorVer20_c(pSrc, iSrcStride, uiHorHalf, kiTmpStride, iWidth, iHeight);
+	PixelAvg_c(pDst, iDstStride, pSrc+1, iSrcStride, uiHorHalf, kiTmpStride, iWidth, iHeight);
+}
+// Averages the horizontally filtered block (McHorVer20_c at pSrc) with the
+// vertically filtered block taken one column right of pSrc (McHorVer02_c at
+// pSrc + 1). Temporaries use a stride of 16.
+static inline void_t McHorVer31_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	const int32_t kiTmpStride = 16;
+	uint8_t uiHorHalf[256] = { 0 };
+	uint8_t uiVerHalf[256] = { 0 };
+
+	McHorVer20_c(pSrc, iSrcStride, uiHorHalf, kiTmpStride, iWidth, iHeight);
+	McHorVer02_c(pSrc+1, iSrcStride, uiVerHalf, kiTmpStride, iWidth, iHeight);
+	PixelAvg_c(pDst, iDstStride, uiHorHalf, kiTmpStride, uiVerHalf, kiTmpStride, iWidth, iHeight);
+}
+// Averages the vertically filtered block taken one column right of pSrc
+// (McHorVer02_c at pSrc + 1) with the two-pass filtered block (McHorVer22_c
+// at pSrc). Temporaries use a stride of 16.
+static inline void_t McHorVer32_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	const int32_t kiTmpStride = 16;
+	uint8_t uiVerHalf[256] = { 0 };
+	uint8_t uiCentre[256]  = { 0 };
+
+	McHorVer02_c(pSrc+1, iSrcStride, uiVerHalf, kiTmpStride, iWidth, iHeight);
+	McHorVer22_c(pSrc, iSrcStride, uiCentre, kiTmpStride, iWidth, iHeight);
+	PixelAvg_c(pDst, iDstStride, uiVerHalf, kiTmpStride, uiCentre, kiTmpStride, iWidth, iHeight);
+}
+// Averages the horizontally filtered block taken one row below pSrc
+// (McHorVer20_c at pSrc + iSrcStride) with the vertically filtered block
+// taken one column right of pSrc (McHorVer02_c at pSrc + 1). Temporaries use
+// a stride of 16.
+static inline void_t McHorVer33_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	const int32_t kiTmpStride = 16;
+	uint8_t uiHorHalf[256] = { 0 };
+	uint8_t uiVerHalf[256] = { 0 };
+
+	McHorVer20_c(pSrc+iSrcStride, iSrcStride, uiHorHalf, kiTmpStride, iWidth, iHeight);
+	McHorVer02_c(pSrc+1, iSrcStride, uiVerHalf, kiTmpStride, iWidth, iHeight);
+	PixelAvg_c(pDst, iDstStride, uiHorHalf, kiTmpStride, uiVerHalf, kiTmpStride, iWidth, iHeight);
+}
+
+// Luma motion compensation entry point (plain C version).
+// pSrc must already include the integer part of the motion vector; the two
+// low bits of iMvX and iMvY select one of the 16 interpolation routines.
+void_t McLuma_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+			      int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
+				  //pSrc has been added the offset of mv
+{
+	// Dispatch table indexed as [fractional x][fractional y].
+	PWelsMcWidthHeightFunc pWelsMcFunc[4][4] =
+	{
+		{McCopy_c,      McHorVer01_c, McHorVer02_c, McHorVer03_c},
+		{McHorVer10_c,  McHorVer11_c, McHorVer12_c, McHorVer13_c},
+		{McHorVer20_c,  McHorVer21_c, McHorVer22_c, McHorVer23_c},
+		{McHorVer30_c,  McHorVer31_c, McHorVer32_c, McHorVer33_c},
+	};
+	const int32_t kiFracX = iMvX & 0x03;
+	const int32_t kiFracY = iMvY & 0x03;
+
+	pWelsMcFunc[kiFracX][kiFracY](pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+}
+
+// Bilinear chroma interpolation for a fractional motion vector.
+// The low three bits of iMvX / iMvY select the weights iA..iD from g_kuiABCD;
+// each output sample is
+//   (iA*src[j] + iB*src[j+1] + iC*next[j] + iD*next[j+1] + 32) >> 6
+// where "next" is the source row below. One extra column and one extra row of
+// source samples are therefore read.
+// The weights are fetched byte by byte: reading the four uint8_t entries
+// through a uint32_t pointer would depend on host byte order and alignment.
+static inline void_t McChromaWithFragMv_c( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight )
+{
+	int32_t i, j;
+	int32_t iA, iB, iC, iD;
+	uint8_t* pSrcNext = pSrc + iSrcStride;
+	const uint8_t* pkWeights = g_kuiABCD[iMvY&0x07][iMvX&0x07];
+	iA = pkWeights[0];
+	iB = pkWeights[1];
+	iC = pkWeights[2];
+	iD = pkWeights[3];
+	for (i = 0; i < iHeight; i++)
+	{
+		for (j = 0; j < iWidth; j++)
+		{
+			pDst[j] = (iA * pSrc[j] + iB * pSrc[j+1] + iC * pSrcNext[j] + iD * pSrcNext[j+1] + 32) >> 6;
+		}
+		pDst     += iDstStride;
+		pSrc      = pSrcNext;
+		pSrcNext += iSrcStride;
+	}
+}
+
+// Chroma motion compensation entry point (plain C version).
+// pSrc must already include the integer part of the motion vector. When the
+// low three bits of both iMvX and iMvY are zero the block is copied,
+// otherwise it is interpolated bilinearly.
+void_t McChroma_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+			        int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
+					//pSrc has been added the offset of mv
+{
+	// Both fractional parts are zero exactly when their OR has no low bits set.
+	if (((iMvX | iMvY) & 0x07) != 0)
+	{
+		McChromaWithFragMv_c(pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
+	}
+	else
+	{
+		McCopy_c(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+	}
+}
+
+#if defined(X86_ASM)
+//***************************************************************************//
+//                       SSE2 implement                          //
+//***************************************************************************//
+// Two-pass interpolation of an 8-wide block using the SSE2 helpers: first
+// McHorVer22Width8HorFirst_sse2 is run over iHeight + 5 rows starting at
+// pSrc - 2 and writes into the temporary iTap (stride argument 16), then
+// McHorVer22VerLast_sse2 turns iTap into the 8 x iHeight block at pDst.
+static inline void_t McHorVer22WidthEq8_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
+{
+	// NOTE(review): presumably declares a 16-byte aligned int16_t iTap[21][8]; the
+	// 21 rows bound the usable iHeight to 16 - confirm against the macro definition.
+	ENFORCE_STACK_ALIGN_2D(int16_t, iTap, 21, 8, 16)
+	McHorVer22Width8HorFirst_sse2(pSrc-2, iSrcStride, (uint8_t *)iTap,16,iHeight+5);
+	McHorVer22VerLast_sse2((uint8_t *)iTap,16, pDst, iDstStride, 8, iHeight);
+}
+
+// Handles a 16-wide block by running the 8-wide McHorVer02WidthEq8_sse2 on
+// the left half and then on the right half.
+static inline void_t McHorVer02WidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
+{
+	int32_t iHalf;
+	for (iHalf = 0; iHalf < 2; iHalf++)
+	{
+		McHorVer02WidthEq8_sse2( pSrc + 8 * iHalf, iSrcStride, pDst + 8 * iHalf, iDstStride, iHeight );
+	}
+}
+
+// Handles a 16-wide block by running the 8-wide McHorVer22WidthEq8_sse2 on
+// the left half and then on the right half.
+static inline void_t McHorVer22WidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
+{
+	int32_t iHalf;
+	for (iHalf = 0; iHalf < 2; iHalf++)
+	{
+		McHorVer22WidthEq8_sse2( pSrc + 8 * iHalf, iSrcStride, pDst + 8 * iHalf, iDstStride, iHeight );
+	}
+}
+
+// Copies an iWidth x iHeight block, dispatching on the width: 16 -> SSE2
+// routine, 8 and 4 -> MMX routines, anything else -> the C 2-byte routine.
+static inline void_t McCopy_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	switch (iWidth)
+	{
+	case 16:
+		McCopyWidthEq16_sse2(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		break;
+	case 8:
+		McCopyWidthEq8_mmx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		break;
+	case 4:
+		McCopyWidthEq4_mmx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		break;
+	default:
+		McCopyWidthEq2_c(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		break;
+	}
+}
+
+// Width dispatcher for the "20" interpolation: 16 and 8 use the SSE2
+// routines, every other width uses the 4-wide MMX routine.
+static inline void_t McHorVer20_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	switch (iWidth)
+	{
+	case 16:
+		McHorVer20WidthEq16_sse2(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		break;
+	case 8:
+		McHorVer20WidthEq8_sse2(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		break;
+	default:
+		McHorVer20WidthEq4_mmx(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		break;
+	}
+}
+
+// Width dispatcher for the "02" interpolation: 16 and 8 use the SSE2
+// routines, every other width falls back to the C routine with a width of 4.
+static inline void_t McHorVer02_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	switch (iWidth)
+	{
+	case 16:
+		McHorVer02WidthEq16_sse2(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		break;
+	case 8:
+		McHorVer02WidthEq8_sse2(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		break;
+	default:
+		McHorVer02_c(pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
+		break;
+	}
+}
+
+// Width dispatcher for the "22" interpolation: 16 and 8 use the SSE2
+// routines, every other width falls back to the C routine with a width of 4.
+static inline void_t McHorVer22_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	switch (iWidth)
+	{
+	case 16:
+		McHorVer22WidthEq16_sse2(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		break;
+	case 8:
+		McHorVer22WidthEq8_sse2(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		break;
+	default:
+		McHorVer22_c(pSrc, iSrcStride, pDst, iDstStride, 4, iHeight);
+		break;
+	}
+}
+
+// Averages the source block with its "02"-filtered version, choosing the
+// SIMD routines by width (16, 8, otherwise 4). The filtered block is written
+// to the temporary pTmp with a stride of 16.
+static inline void_t McHorVer01_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	FORCE_STACK_ALIGN_1D( uint8_t, pTmp, 256, 16 );
+	switch (iWidth)
+	{
+	case 16:
+		McHorVer02WidthEq16_sse2(pSrc, iSrcStride, pTmp, 16, iHeight);
+		PixelAvgWidthEq16_sse2(pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+		break;
+	case 8:
+		McHorVer02WidthEq8_sse2(pSrc, iSrcStride, pTmp, 16, iHeight);
+		PixelAvgWidthEq8_mmx(pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+		break;
+	default:
+		McHorVer02_c(pSrc, iSrcStride, pTmp, 16, 4, iHeight);
+		PixelAvgWidthEq4_mmx(pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+		break;
+	}
+}
+// Averages the source block one row below pSrc with the "02"-filtered version
+// of the block at pSrc, choosing the SIMD routines by width (16, 8, otherwise
+// 4). The filtered block is written to the temporary pTmp with a stride of 16.
+static inline void_t McHorVer03_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	FORCE_STACK_ALIGN_1D( uint8_t, pTmp, 256, 16 );
+	switch (iWidth)
+	{
+	case 16:
+		McHorVer02WidthEq16_sse2(pSrc, iSrcStride, pTmp, 16, iHeight);
+		PixelAvgWidthEq16_sse2(pDst, iDstStride, pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
+		break;
+	case 8:
+		McHorVer02WidthEq8_sse2(pSrc, iSrcStride, pTmp, 16, iHeight);
+		PixelAvgWidthEq8_mmx(pDst, iDstStride, pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
+		break;
+	default:
+		McHorVer02_c(pSrc, iSrcStride, pTmp, 16, 4, iHeight);
+		PixelAvgWidthEq4_mmx(pDst, iDstStride, pSrc+iSrcStride, iSrcStride, pTmp, 16, iHeight);
+		break;
+	}
+}
+// Averages the source block with its "20"-filtered version, choosing the
+// SIMD routines by width (16, 8, otherwise 4). The filtered block is written
+// to the temporary pTmp with a stride of 16.
+static inline void_t McHorVer10_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	FORCE_STACK_ALIGN_1D( uint8_t, pTmp, 256, 16 );
+	switch (iWidth)
+	{
+	case 16:
+		McHorVer20WidthEq16_sse2(pSrc, iSrcStride, pTmp, 16, iHeight);
+		PixelAvgWidthEq16_sse2(pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+		break;
+	case 8:
+		McHorVer20WidthEq8_sse2(pSrc, iSrcStride, pTmp, 16, iHeight);
+		PixelAvgWidthEq8_mmx(pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+		break;
+	default:
+		McHorVer20WidthEq4_mmx(pSrc, iSrcStride, pTmp, 16, iHeight);
+		PixelAvgWidthEq4_mmx(pDst, iDstStride, pSrc, iSrcStride, pTmp, 16, iHeight);
+		break;
+	}
+}
+// Averages the "20"-filtered block with the "02"-filtered block (both taken
+// at pSrc), choosing the SIMD routines by width (16, 8, otherwise 4). Both
+// temporaries use a stride of 16.
+static inline void_t McHorVer11_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	FORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
+	FORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
+	switch (iWidth)
+	{
+	case 16:
+		McHorVer20WidthEq16_sse2(pSrc, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer02WidthEq16_sse2(pSrc, iSrcStride, pVerTmp, 16, iHeight);
+		PixelAvgWidthEq16_sse2(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+		break;
+	case 8:
+		McHorVer20WidthEq8_sse2(pSrc, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer02WidthEq8_sse2(pSrc, iSrcStride, pVerTmp, 16, iHeight);
+		PixelAvgWidthEq8_mmx(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+		break;
+	default:
+		McHorVer20WidthEq4_mmx(pSrc, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer02_c(pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
+		PixelAvgWidthEq4_mmx(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+		break;
+	}
+}
+/*
+ * Luma interpolation routine stored at slot [1][2] of the McLuma_sse2 dispatch table.
+ * Fills two scratch blocks (McHorVer02 and McHorVer22, both from pSrc) and writes
+ * their PixelAvg merge to pDst.
+ * iWidth 16 and 8 have dedicated kernels; every other width takes the 4-wide path.
+ */
+static inline void_t McHorVer12_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	FORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
+	FORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
+
+	switch (iWidth)
+	{
+	case 16:
+		McHorVer02WidthEq16_sse2(pSrc, iSrcStride, pVerTmp, 16, iHeight);
+		McHorVer22WidthEq16_sse2(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+		PixelAvgWidthEq16_sse2(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+		break;
+	case 8:
+		McHorVer02WidthEq8_sse2(pSrc, iSrcStride, pVerTmp, 16, iHeight);
+		McHorVer22WidthEq8_sse2(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+		PixelAvgWidthEq8_mmx(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+		break;
+	default:	// 4-wide path: both filter passes use the C kernels
+		McHorVer02_c(pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
+		McHorVer22_c(pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
+		PixelAvgWidthEq4_mmx(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+		break;
+	}
+}
+/*
+ * Luma interpolation routine stored at slot [1][3] of the McLuma_sse2 dispatch table.
+ * Fills two scratch blocks (McHorVer20 from the row below pSrc, McHorVer02 from pSrc)
+ * and writes their PixelAvg merge to pDst.
+ * iWidth 16 and 8 have dedicated kernels; every other width takes the 4-wide path.
+ */
+static inline void_t McHorVer13_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	FORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
+	FORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
+
+	switch (iWidth)
+	{
+	case 16:
+		McHorVer20WidthEq16_sse2(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer02WidthEq16_sse2(pSrc, iSrcStride, pVerTmp, 16, iHeight);
+		PixelAvgWidthEq16_sse2(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+		break;
+	case 8:
+		McHorVer20WidthEq8_sse2(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer02WidthEq8_sse2(pSrc, iSrcStride, pVerTmp, 16, iHeight);
+		PixelAvgWidthEq8_mmx(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+		break;
+	default:	// 4-wide path: vertical pass uses the C kernel
+		McHorVer20WidthEq4_mmx(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer02_c(pSrc, iSrcStride, pVerTmp, 16, 4, iHeight);
+		PixelAvgWidthEq4_mmx(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+		break;
+	}
+}
+/*
+ * Luma interpolation routine stored at slot [2][1] of the McLuma_sse2 dispatch table.
+ * Fills two scratch blocks (McHorVer20 and McHorVer22, both from pSrc) and writes
+ * their PixelAvg merge to pDst.
+ * iWidth 16 and 8 have dedicated kernels; every other width takes the 4-wide path.
+ */
+static inline void_t McHorVer21_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	FORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
+	FORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
+
+	switch (iWidth)
+	{
+	case 16:
+		McHorVer20WidthEq16_sse2(pSrc, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer22WidthEq16_sse2(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+		PixelAvgWidthEq16_sse2(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+		break;
+	case 8:
+		McHorVer20WidthEq8_sse2(pSrc, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer22WidthEq8_sse2(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+		PixelAvgWidthEq8_mmx(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+		break;
+	default:	// 4-wide path: the McHorVer22 pass uses the C kernel
+		McHorVer20WidthEq4_mmx(pSrc, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer22_c(pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
+		PixelAvgWidthEq4_mmx(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+		break;
+	}
+}
+/*
+ * Luma interpolation routine stored at slot [2][3] of the McLuma_sse2 dispatch table.
+ * Fills two scratch blocks (McHorVer20 from the row below pSrc, McHorVer22 from pSrc)
+ * and writes their PixelAvg merge to pDst.
+ * iWidth 16 and 8 have dedicated kernels; every other width takes the 4-wide path.
+ */
+static inline void_t McHorVer23_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	FORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
+	FORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
+
+	switch (iWidth)
+	{
+	case 16:
+		McHorVer20WidthEq16_sse2(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer22WidthEq16_sse2(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+		PixelAvgWidthEq16_sse2(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+		break;
+	case 8:
+		McHorVer20WidthEq8_sse2(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer22WidthEq8_sse2(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+		PixelAvgWidthEq8_mmx(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+		break;
+	default:	// 4-wide path: the McHorVer22 pass uses the C kernel
+		McHorVer20WidthEq4_mmx(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer22_c(pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
+		PixelAvgWidthEq4_mmx(pDst, iDstStride, pHorTmp, 16, pCtrTmp, 16, iHeight);
+		break;
+	}
+}
+/*
+ * Luma interpolation routine stored at slot [3][0] of the McLuma_sse2 dispatch table.
+ * The McHorVer20 kernel writes into a 16-byte-stride scratch block, which PixelAvg then
+ * merges with the source shifted right by one sample.
+ * iWidth 16 and 8 have dedicated kernels; every other width takes the 4-wide path.
+ */
+static inline void_t McHorVer30_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	FORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
+
+	switch (iWidth)
+	{
+	case 16:
+		McHorVer20WidthEq16_sse2(pSrc, iSrcStride, pHorTmp, 16, iHeight);
+		PixelAvgWidthEq16_sse2(pDst, iDstStride, pSrc+1, iSrcStride, pHorTmp, 16, iHeight);
+		break;
+	case 8:
+		McHorVer20WidthEq8_sse2(pSrc, iSrcStride, pHorTmp, 16, iHeight);
+		PixelAvgWidthEq8_mmx(pDst, iDstStride, pSrc+1, iSrcStride, pHorTmp, 16, iHeight);
+		break;
+	default:	// 4-wide path
+		McHorVer20WidthEq4_mmx(pSrc, iSrcStride, pHorTmp, 16, iHeight);
+		PixelAvgWidthEq4_mmx(pDst, iDstStride, pSrc+1, iSrcStride, pHorTmp, 16, iHeight);
+		break;
+	}
+}
+/*
+ * Luma interpolation routine stored at slot [3][1] of the McLuma_sse2 dispatch table.
+ * Fills two scratch blocks (McHorVer20 from pSrc, McHorVer02 from one sample to the
+ * right of pSrc) and writes their PixelAvg merge to pDst.
+ * iWidth 16 and 8 have dedicated kernels; every other width takes the 4-wide path.
+ */
+static inline void_t McHorVer31_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	FORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
+	FORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
+
+	switch (iWidth)
+	{
+	case 16:
+		McHorVer20WidthEq16_sse2(pSrc, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer02WidthEq16_sse2(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
+		PixelAvgWidthEq16_sse2(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+		break;
+	case 8:
+		McHorVer20WidthEq8_sse2(pSrc, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer02WidthEq8_sse2(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
+		PixelAvgWidthEq8_mmx(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+		break;
+	default:	// 4-wide path: vertical pass uses the C kernel
+		McHorVer20WidthEq4_mmx(pSrc, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer02_c(pSrc+1, iSrcStride, pVerTmp, 16, 4, iHeight);
+		PixelAvgWidthEq4_mmx(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+		break;
+	}
+}
+/*
+ * Luma interpolation routine stored at slot [3][2] of the McLuma_sse2 dispatch table.
+ * Fills two scratch blocks (McHorVer02 from one sample to the right of pSrc,
+ * McHorVer22 from pSrc) and writes their PixelAvg merge to pDst.
+ * iWidth 16 and 8 have dedicated kernels; every other width takes the 4-wide path.
+ */
+static inline void_t McHorVer32_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	FORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
+	FORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
+
+	switch (iWidth)
+	{
+	case 16:
+		McHorVer02WidthEq16_sse2(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
+		McHorVer22WidthEq16_sse2(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+		PixelAvgWidthEq16_sse2(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+		break;
+	case 8:
+		McHorVer02WidthEq8_sse2(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
+		McHorVer22WidthEq8_sse2(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
+		PixelAvgWidthEq8_mmx(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+		break;
+	default:	// 4-wide path: both filter passes use the C kernels
+		McHorVer02_c(pSrc+1, iSrcStride, pVerTmp, 16, 4, iHeight);
+		McHorVer22_c(pSrc, iSrcStride, pCtrTmp, 16, 4, iHeight);
+		PixelAvgWidthEq4_mmx(pDst, iDstStride, pVerTmp, 16, pCtrTmp, 16, iHeight);
+		break;
+	}
+}
+/*
+ * Luma interpolation routine stored at slot [3][3] of the McLuma_sse2 dispatch table.
+ * Fills two scratch blocks (McHorVer20 from the row below pSrc, McHorVer02 from one
+ * sample to the right of pSrc) and writes their PixelAvg merge to pDst.
+ * iWidth 16 and 8 have dedicated kernels; every other width takes the 4-wide path.
+ */
+static inline void_t McHorVer33_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	FORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
+	FORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
+
+	switch (iWidth)
+	{
+	case 16:
+		McHorVer20WidthEq16_sse2(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer02WidthEq16_sse2(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
+		PixelAvgWidthEq16_sse2(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+		break;
+	case 8:
+		McHorVer20WidthEq8_sse2(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer02WidthEq8_sse2(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
+		PixelAvgWidthEq8_mmx(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+		break;
+	default:	// 4-wide path: vertical pass uses the C kernel
+		McHorVer20WidthEq4_mmx(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
+		McHorVer02_c(pSrc+1, iSrcStride, pVerTmp, 16, 4, iHeight);
+		PixelAvgWidthEq4_mmx(pDst, iDstStride, pHorTmp, 16, pVerTmp, 16, iHeight);
+		break;
+	}
+}
+
+/*
+ * SSE2 luma motion-compensation entry point.
+ * Selects one of 16 interpolation routines from the low two bits of iMvX and iMvY
+ * and forwards the block parameters to it unchanged.
+ * Per the original note, pSrc has already had the motion-vector offset added.
+ */
+void_t McLuma_sse2(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+				  int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
+{
+	// Static so the table is built once rather than re-initialised on every call
+	// (same storage class as the table in McChroma_sse2). Indexed as [x][y].
+	static const PWelsMcWidthHeightFunc kpWelsMcFuncs[4][4] =
+	{
+		{McCopy_sse2,     McHorVer01_sse2, McHorVer02_sse2, McHorVer03_sse2},
+		{McHorVer10_sse2, McHorVer11_sse2, McHorVer12_sse2, McHorVer13_sse2},
+		{McHorVer20_sse2, McHorVer21_sse2, McHorVer22_sse2, McHorVer23_sse2},
+		{McHorVer30_sse2, McHorVer31_sse2, McHorVer32_sse2, McHorVer33_sse2},
+	};
+
+	kpWelsMcFuncs[iMvX&0x03][iMvY&0x03](pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+}
+
+/*
+ * SSE2/MMX chroma motion-compensation entry point.
+ * The low three bits of iMvX and iMvY form the fractional offset:
+ *   - both zero: plain block copy;
+ *   - iWidth == 2: handled by the C routine McChromaWithFragMv_c;
+ *   - otherwise: the kernel at index iWidth>>3 (0 for width 4, 1 for width 8) is run
+ *     with the g_kuiABCD entry for that fractional offset.
+ */
+void_t McChroma_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
+					   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight )
+{
+	static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] =
+	{
+		McChromaWidthEq4_mmx,
+		McChromaWidthEq8_sse2
+	};
+	const int32_t kiFracX = iMvX&0x07;
+	const int32_t kiFracY = iMvY&0x07;
+
+	if (0 == kiFracX && 0 == kiFracY)
+	{
+		McCopy_sse2(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+		return;
+	}
+
+	if (2 == iWidth)
+	{
+		McChromaWithFragMv_c(pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
+		return;
+	}
+
+	kpMcChromaWidthFuncs[iWidth>>3](pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiFracY][kiFracX], iHeight);
+}
+
+
+#endif //X86_ASM
+
+/*
+ * Fill the motion-compensation function table.
+ * The C implementations are always installed first; when built with X86_ASM and
+ * iCpu has the WELS_CPU_SSE2 bit set, they are replaced by the SSE2 entry points.
+ */
+void_t InitMcFunc(SMcFunc *pMcFunc, int32_t iCpu)
+{
+	// portable defaults
+	pMcFunc->pMcLumaFunc   = McLuma_c;
+	pMcFunc->pMcChromaFunc = McChroma_c;
+
+#if defined (X86_ASM)
+	if ( 0 == ( iCpu & WELS_CPU_SSE2 ) )
+		return;
+
+	pMcFunc->pMcLumaFunc   = McLuma_sse2;
+	pMcFunc->pMcChromaFunc = McChroma_sse2;
+#endif //(X86_ASM)
+}
+
+} // namespace WelsDec
\ No newline at end of file
--- /dev/null
+++ b/codec/decoder/core/src/mem_align.cpp
@@ -1,0 +1,115 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "mem_align.h"
+#include "crt_util_safe_x.h"
+
+namespace WelsDec {
+
+//#define MEMORY_CHECK
+#ifdef MEMORY_CHECK
+
+WelsFileHandle * pMemCheckMalloc = NULL;
+WelsFileHandle * pMemCheckFree = NULL; 
+
+int32_t iCountMalloc = 0;
+#endif
+//
+
+/////////////////////////////////////////////////////////////////////////////////
+#define ALIGNBYTES (16)
+/////////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Allocate kuiSize zero-filled bytes whose address is a multiple of 16.
+ *
+ * The underlying malloc() block is over-allocated by (alignment - 1) bytes plus room
+ * for two bookkeeping fields stored immediately in front of the returned pointer:
+ *   [aligned - sizeof(void*)]                    original malloc() pointer (read by WelsFree)
+ *   [aligned - sizeof(void*) - sizeof(int32_t)]  requested size
+ *
+ * kpTag is only used for the optional MEMORY_CHECK log.
+ * Returns NULL when the request is too large to represent or malloc() fails.
+ * The result must be released with WelsFree().
+ */
+void_t * WelsMalloc( const uint32_t kuiSize, const str_t *kpTag )
+{
+	const int32_t kiSizeVoidPtr	= sizeof( void_t ** );
+	const int32_t kiSizeInt		= sizeof( int32_t );
+#ifdef HAVE_CACHE_LINE_ALIGN
+	const int32_t kiAlignBytes	= ALIGNBYTES - 1;
+#else
+	const int32_t kiAlignBytes	= 15;
+#endif// HAVE_CACHE_LINE_ALIGN
+	const uint32_t kuiTotalSize	= kuiSize + kiAlignBytes + kiSizeVoidPtr + kiSizeInt;
+	uint8_t* pBuf		= NULL;
+	uint8_t* pAlignBuf	= NULL;
+
+	// Unsigned wrap-around: the padded size no longer covers the request, so an
+	// allocation of that size would be written past its end below.
+	if ( kuiTotalSize < kuiSize )
+		return NULL;
+
+	pBuf = (uint8_t *) malloc( kuiTotalSize );
+
+#ifdef MEMORY_CHECK	
+	if( pMemCheckMalloc == NULL ){
+		pMemCheckMalloc = WelsFopen(".\\mem_check_malloc.txt", "at+");
+		pMemCheckFree   = WelsFopen(".\\mem_check_free.txt", "at+");
+	}
+
+	if ( kpTag != NULL && pMemCheckMalloc != NULL )
+	{
+		// %p / %u: the arguments are a pointer and an unsigned total, not ints
+		fprintf( pMemCheckMalloc, "%p, size: %u       , malloc %s\n", (void_t *)pBuf, (unsigned int)kuiTotalSize, kpTag );
+		fflush( pMemCheckMalloc );
+	}
+#endif	
+
+	if ( NULL == pBuf )
+		return NULL;
+
+	// to fill zero values
+	memset( pBuf, 0, kuiTotalSize );
+
+	// Skip past the worst-case padding and header, then round down to the alignment.
+	// The address is converted through size_t (pointer-sized) so that the cast is
+	// valid on 64-bit targets; only the low bits are needed.
+	pAlignBuf = pBuf + kiAlignBytes + kiSizeVoidPtr + kiSizeInt;
+	pAlignBuf -= ( (size_t) pAlignBuf ) & kiAlignBytes;
+	*( (void_t **) ( pAlignBuf - kiSizeVoidPtr ) ) = pBuf;
+	*( (int32_t *) ( pAlignBuf - (kiSizeVoidPtr + kiSizeInt) ) ) = kuiSize;
+
+	return (pAlignBuf);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Release a block obtained from WelsMalloc().
+ * The original malloc() pointer is read from the slot directly in front of pPtr and
+ * passed to free(). A NULL pPtr is ignored. kpTag is only used for the optional
+ * MEMORY_CHECK log.
+ */
+void_t WelsFree( void_t* pPtr, const str_t *kpTag )
+{
+	if( pPtr )
+	{
+#ifdef MEMORY_CHECK			
+		if ( NULL != pMemCheckFree && kpTag != NULL )
+		{				
+			// %p: the argument is a pointer, which "%x" would misread on 64-bit targets
+			fprintf( pMemCheckFree, "%p, free %s\n", (void_t *)(*( ( ( void_t **) pPtr ) - 1 )), kpTag );
+			fflush( pMemCheckFree );
+		}	
+#endif
+		free( *( ( ( void_t **) pPtr ) - 1 ) );
+	}
+}
+
+/////////////////////////////////////////////////////////////////////////////
+} // namespace WelsDec
--- /dev/null
+++ b/codec/decoder/core/src/memmgr_nal_unit.cpp
@@ -1,0 +1,153 @@
+/*!
+ * \copy
+ *     Copyright (c)  2008-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ *  memmgr_nal_unit.cpp
+ *
+ *  Abstract
+ *      memory manager utils for NAL Unit list available
+ *
+ *  History
+ *      07/10/2008 Created
+ *
+ *****************************************************************************/
+#include "memmgr_nal_unit.h"
+#include "utils.h"
+#include "mem_align.h"
+
+namespace WelsDec {
+
+/*
+ * Allocate an access unit able to hold kuiSize NAL units.
+ *
+ * Everything lives in a single WelsMalloc() block laid out as
+ *   [SAccessUnit][kuiSize x PNalUnit pointer table][kuiSize x SNalUnit]
+ * and every pointer-table entry is pointed at its SNalUnit slot.
+ * If *ppAu already refers to a list, that list is freed first.
+ *
+ * Returns 0 on success; 1 when ppAu is NULL, kuiSize is 0, the total size does not
+ * fit in 32 bits, or the allocation fails.
+ */
+int32_t MemInitNalList(PAccessUnit *ppAu, const uint32_t kuiSize){
+	uint32_t uiIdx = 0;
+	uint8_t *pBase = NULL, *pPtr = NULL;
+	const uint32_t kuiSizeAu = sizeof(SAccessUnit);
+	const uint32_t kuiSizeNalUnitPtr= kuiSize*sizeof(PNalUnit);
+	const uint32_t kuiSizeNalUnit = sizeof(SNalUnit);
+	const uint32_t kuiCountSize = (kuiSizeAu + kuiSizeNalUnitPtr + kuiSize * kuiSizeNalUnit) * sizeof(uint8_t);
+	// Largest unit count whose total byte size still fits in uint32_t; beyond it the
+	// products above have wrapped and kuiCountSize would under-allocate.
+	const uint32_t kuiSizePerUnit = sizeof(PNalUnit) + kuiSizeNalUnit;
+	const uint32_t kuiMaxUnits = ( (uint32_t)(-1) - kuiSizeAu ) / kuiSizePerUnit;
+	
+	if (ppAu == NULL || kuiSize == 0 || kuiSize > kuiMaxUnits)
+		return 1;
+
+	if ( *ppAu != NULL ){
+		MemFreeNalList(ppAu);
+	}
+
+	pBase = (uint8_t *)WelsMalloc( kuiCountSize, "Access Unit" );
+	if ( pBase == NULL )
+		return 1;
+	pPtr = pBase;
+	*ppAu = (PAccessUnit)pPtr;
+	pPtr += kuiSizeAu;
+	(*ppAu)->pNalUnitsList	= (PNalUnit*)pPtr;	
+	pPtr += kuiSizeNalUnitPtr;
+	// kuiSize >= 1 is guaranteed above, so the do/while body is always valid
+	do {
+		(*ppAu)->pNalUnitsList[uiIdx] = (PNalUnit)pPtr;
+		pPtr += kuiSizeNalUnit;
+		++ uiIdx;
+	} while(uiIdx < kuiSize);
+
+	(*ppAu)->uiCountUnitsNum	= kuiSize;
+	(*ppAu)->uiAvailUnitsNum	= 0;
+	(*ppAu)->uiActualUnitsNum	= 0;
+	(*ppAu)->uiEndPos		    = 0;
+	(*ppAu)->bCompletedAuFlag	= false;	
+
+	return 0;
+}
+
+/*
+ * Free the access unit referenced by *ppAu and reset *ppAu to NULL.
+ * A NULL ppAu or a NULL *ppAu is a no-op. Always returns 0.
+ */
+int32_t MemFreeNalList(PAccessUnit *ppAu)
+{
+	if (NULL == ppAu || NULL == *ppAu)
+		return 0;
+
+	WelsFree(*ppAu, "Access Unit");
+	*ppAu = NULL;
+	return 0;
+}
+
+
+/*
+ * Replace *ppAu with a larger list of kiExpSize NAL units.
+ * The first kiOrgSize SNalUnit entries and the access unit's counters are copied to
+ * the new list, the old list is freed, and *ppAu is redirected to the new one.
+ * Returns 0 on success; 1 when kiExpSize is not larger than kiOrgSize or the new
+ * list cannot be allocated (in which case *ppAu is left untouched).
+ */
+int32_t ExpandNalUnitList(PAccessUnit *ppAu, const int32_t kiOrgSize, const int32_t kiExpSize)
+{
+	PAccessUnit pNewAu = NULL;
+	PAccessUnit pOldAu = NULL;
+	int32_t iUnit = 0;
+
+	if ( kiExpSize <= kiOrgSize )
+		return 1;
+
+	if ( 0 != MemInitNalList( &pNewAu, kiExpSize ) )	// request the enlarged list
+		return 1;
+
+	pOldAu = *ppAu;
+
+	// entry 0 is always copied, matching the original do/while form
+	do
+	{
+		memcpy(pNewAu->pNalUnitsList[iUnit], pOldAu->pNalUnitsList[iUnit], sizeof(SNalUnit) );//confirmed_safe_unsafe_usage
+		++ iUnit;
+	}while(iUnit < kiOrgSize);
+
+	pNewAu->uiCountUnitsNum	= kiExpSize;
+	pNewAu->uiAvailUnitsNum	= pOldAu->uiAvailUnitsNum;
+	pNewAu->uiActualUnitsNum	= pOldAu->uiActualUnitsNum;
+	pNewAu->uiEndPos		= pOldAu->uiEndPos;
+	pNewAu->bCompletedAuFlag	= pOldAu->bCompletedAuFlag;
+
+	MemFreeNalList( ppAu );	// release the old list
+	*ppAu = pNewAu;
+	return 0;
+}
+
+/*
+ *	MemGetNextNal
+ *	Hand out the next free NAL unit slot of the access unit, zero-filled.
+ *	When all slots are already in use, the list is first expanded by
+ *	MAX_NAL_UNIT_NUM_IN_AU/2 entries (which may replace *ppAu).
+ *	Returns NULL if that expansion fails.
+ */
+PNalUnit MemGetNextNal(PAccessUnit *ppAu){	
+	PAccessUnit pCurAu = *ppAu;
+	PNalUnit pNext = NULL;
+	
+	if (pCurAu->uiAvailUnitsNum >= pCurAu->uiCountUnitsNum)	// list is full, grow it
+	{
+		const uint32_t kuiNewCount = pCurAu->uiCountUnitsNum + (MAX_NAL_UNIT_NUM_IN_AU>>1);
+		if ( 0 != ExpandNalUnitList(ppAu, pCurAu->uiCountUnitsNum, kuiNewCount) )
+			return NULL;	// out of memory
+		pCurAu = *ppAu;	// the list has been reallocated
+	}
+
+	pNext = pCurAu->pNalUnitsList[pCurAu->uiAvailUnitsNum];
+	++ pCurAu->uiAvailUnitsNum;	// ready for next nal position
+
+	memset(pNext, 0, sizeof(SNalUnit));	// Please do not remove this for cache intend!!
+	
+	return pNext;
+}
+
+} // namespace WelsDec
\ No newline at end of file
--- /dev/null
+++ b/codec/decoder/core/src/mv_pred.cpp
@@ -1,0 +1,251 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	mv_pred.cpp
+ *
+ * \brief	Get MV predictor and update motion vector of mb cache
+ *
+ * \date	05/22/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "mv_pred.h"
+#include "ls_defines.h"
+#include "mb_cache.h"
+
+namespace WelsDec {
+
+/*
+ * Basic motion-vector predictor for a partition of width iPartWidth (4, 2 or 1,
+ * in cache columns).
+ *
+ * Neighbours are taken from the 6-column cache: left (-1), top (-6), top-right
+ * (top + iPartWidth) and top-left (top - 1). When the top-right reference is
+ * REF_NOT_AVAIL, the top-left neighbour is used as the diagonal candidate instead.
+ *
+ * Result written to iMVP:
+ *   - left MV, when top and diagonal are both REF_NOT_AVAIL and the left reference
+ *     is >= REF_NOT_IN_LIST;
+ *   - the MV of the only neighbour whose reference equals iRef, when exactly one matches;
+ *   - otherwise the component-wise median of the three candidates.
+ * Only list 0 of the cache is read.
+ */
+void_t PredMv(int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
+			 int32_t iPartIdx, int32_t iPartWidth, int8_t iRef, int16_t iMVP[2])
+{
+	const uint8_t kuiCurIdx		= g_kuiCache30ScanIdx[iPartIdx];
+	const uint8_t kuiLeftIdx	= kuiCurIdx - 1;
+	const uint8_t kuiTopIdx		= kuiCurIdx - 6;
+	const uint8_t kuiRightTopIdx= kuiTopIdx + iPartWidth;
+	const uint8_t kuiLeftTopIdx	= kuiTopIdx - 1;
+
+	const int8_t kiLeftRef		= iRefIndex[0][kuiLeftIdx];
+	const int8_t kiTopRef		= iRefIndex[0][kuiTopIdx];
+	const int8_t kiRightTopRef	= iRefIndex[0][kuiRightTopIdx];
+	const int8_t kiLeftTopRef	= iRefIndex[0][kuiLeftTopIdx];
+	int8_t iDiagRef				= kiRightTopRef;
+	int8_t iHits				= 0;
+
+	int16_t iAMV[2], iBMV[2], iCMV[2];	// A = left, B = top, C = diagonal
+
+	*(int32_t*)iAMV = INTD32(iMotionVector[0][kuiLeftIdx]);
+	*(int32_t*)iBMV = INTD32(iMotionVector[0][kuiTopIdx]);
+	*(int32_t*)iCMV = INTD32(iMotionVector[0][kuiRightTopIdx]);
+
+	if (REF_NOT_AVAIL == iDiagRef)
+	{
+		// top-right missing: use top-left as the diagonal candidate
+		iDiagRef = kiLeftTopRef;
+		*(int32_t*)iCMV = INTD32(iMotionVector[0][kuiLeftTopIdx]);
+	}
+
+	if (REF_NOT_AVAIL == kiTopRef && REF_NOT_AVAIL == iDiagRef && kiLeftRef >= REF_NOT_IN_LIST)
+	{
+		ST32(iMVP, LD32(iAMV));
+		return;
+	}
+
+	iHits = (iRef == kiLeftRef) + (iRef == kiTopRef) + (iRef == iDiagRef);
+
+	if (1 != iHits)
+	{
+		iMVP[0] = WelsMedian(iAMV[0], iBMV[0], iCMV[0]);
+		iMVP[1] = WelsMedian(iAMV[1], iBMV[1], iCMV[1]);
+		return;
+	}
+
+	// exactly one neighbour shares the reference: take its vector
+	if (iRef == kiLeftRef)
+	{
+		ST32(iMVP, LD32(iAMV));
+	}
+	else if (iRef == kiTopRef)
+	{
+		ST32(iMVP, LD32(iBMV));
+	}
+	else
+	{
+		ST32(iMVP, LD32(iCMV));
+	}
+}
+/*
+ * Motion-vector predictor for an 8x16 partition.
+ * For iPartIdx == 0 the left neighbour (cache index 6) is tried first; for any other
+ * iPartIdx the diagonal neighbour is tried first (cache index 5, top-right, or
+ * index 2, top-left, when index 5 is REF_NOT_AVAIL). If that neighbour's reference
+ * equals iRef its vector is the prediction; otherwise PredMv() decides with a
+ * partition width of 2.
+ */
+void_t PredInter8x16Mv(int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
+						int32_t iPartIdx, int8_t iRef, int16_t iMVP[2])
+{
+	if (0 == iPartIdx)
+	{
+		if (iRef == iRefIndex[0][6])	// left neighbour
+		{
+			ST32( iMVP, LD32(&iMotionVector[0][6][0]) );
+			return;
+		}
+	}
+	else // second partition
+	{
+		int8_t iDiagIdx = 5;	// top-right
+		int8_t iDiagRef = iRefIndex[0][5];
+		if (REF_NOT_AVAIL == iDiagRef)
+		{
+			iDiagIdx = 2;	// top-left for 8*8 block(index 1)
+			iDiagRef = iRefIndex[0][2];
+		}
+		if (iRef == iDiagRef)
+		{
+			ST32( iMVP, LD32(&iMotionVector[0][iDiagIdx][0]) );
+			return;
+		}
+	}
+
+	PredMv(iMotionVector, iRefIndex, iPartIdx, 2, iRef, iMVP);
+}
+/*
+ * Motion-vector predictor for a 16x8 partition.
+ * For iPartIdx == 0 the top neighbour (cache index 1) is tried first; for any other
+ * iPartIdx the left neighbour at cache index 18 is tried first. If that neighbour's
+ * reference equals iRef its vector is the prediction; otherwise PredMv() decides
+ * with a partition width of 4.
+ */
+void_t PredInter16x8Mv(int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
+						int32_t iPartIdx, int8_t iRef, int16_t iMVP[2])
+{
+	// preferred neighbour: top (1) for the first partition, left (18) for the other
+	const int32_t kiNeighborIdx = (0 == iPartIdx) ? 1 : 18;
+
+	if (iRef == iRefIndex[0][kiNeighborIdx])
+	{
+		ST32(iMVP, LD32(&iMotionVector[0][kiNeighborIdx][0]));
+		return;
+	}
+
+	PredMv(iMotionVector, iRefIndex, iPartIdx, 4, iRef, iMVP);
+}
+
+//update iMVs and iRefIndex of the current MB in the layer, only for P_16*16 (SKIP inclusive)
+//Every 4x4 entry of the macroblock receives the same reference index and motion vector.
+/* can be further optimized */
+void_t UpdateP16x16MotionInfo( PDqLayer pCurDqLayer, int8_t iRef, int16_t iMVs[2])
+{
+	// Two copies of the reference index packed into 16 bits so one ST16 fills two
+	// entries. Going through uint8_t avoids left-shifting a negative value and stops
+	// sign extension of the low copy from overwriting the high byte.
+	const int16_t kiRef2		= ((uint8_t)iRef << 8) | (uint8_t)iRef;
+	const int32_t kiMV32		= LD32(iMVs);	
+	int32_t i;	
+	int32_t iMbXy = pCurDqLayer->iMbXyIndex;
+	
+	// one iteration per group of four scan positions: entries idx, idx+1, idx+4, idx+5
+	for (i = 0; i < 16; i+=4) 
+	{
+		//mb
+		const uint8_t kuiScan4Idx = g_kuiScan4[i];
+		const uint8_t kuiScan4IdxPlus4= 4 + kuiScan4Idx;
+
+ 		ST16( &pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4Idx ], kiRef2 );
+		ST16( &pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4IdxPlus4], kiRef2 );
+	
+		ST32( pCurDqLayer->pMv[0][iMbXy][  kuiScan4Idx ], kiMV32 );
+		ST32( pCurDqLayer->pMv[0][iMbXy][1+kuiScan4Idx ], kiMV32 );
+		ST32( pCurDqLayer->pMv[0][iMbXy][  kuiScan4IdxPlus4], kiMV32 );
+		ST32( pCurDqLayer->pMv[0][iMbXy][1+kuiScan4IdxPlus4], kiMV32 );
+	}
+}
+
+//update iRefIndex and iMVs of both Mb and Mb_cache, only for P16x8
+//Two steps of four scan positions starting at iPartIdx are filled with the same
+//reference index and motion vector.
+/*need further optimization, mb_cache not work */
+void_t UpdateP16x8MotionInfo(PDqLayer pCurDqLayer, int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
+							  int32_t iPartIdx, int8_t iRef, int16_t iMVs[2])
+{
+	// Two copies of the reference index packed into 16 bits so one ST16 fills two
+	// entries. Going through uint8_t avoids left-shifting a negative value and stops
+	// sign extension of the low copy from overwriting the high byte.
+	const int16_t kiRef2 = ((uint8_t)iRef << 8) | (uint8_t)iRef;
+	const int32_t kiMV32 = LD32(iMVs);
+	int32_t i;	
+	int32_t iMbXy = pCurDqLayer->iMbXyIndex;
+	for (i = 0; i < 2; i++, iPartIdx+=4) 
+	{
+		const uint8_t kuiScan4Idx      = g_kuiScan4[iPartIdx];
+		const uint8_t kuiScan4IdxPlus4 = 4 + kuiScan4Idx;
+		const uint8_t kuiCacheIdx      = g_kuiCache30ScanIdx[iPartIdx];
+		const uint8_t kuiCacheIdxPlus6 = 6 + kuiCacheIdx;	// next row of the 6-column cache
+
+		//mb
+		ST16( &pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4Idx ], kiRef2 );
+		ST16( &pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4IdxPlus4], kiRef2 );
+		ST32( pCurDqLayer->pMv[0][iMbXy][  kuiScan4Idx ], kiMV32 );
+		ST32( pCurDqLayer->pMv[0][iMbXy][1+kuiScan4Idx ], kiMV32 );
+		ST32( pCurDqLayer->pMv[0][iMbXy][  kuiScan4IdxPlus4], kiMV32 );
+		ST32( pCurDqLayer->pMv[0][iMbXy][1+kuiScan4IdxPlus4], kiMV32 );
+		//cache
+		ST16( &iRefIndex[0][kuiCacheIdx ], kiRef2 );
+		ST16( &iRefIndex[0][kuiCacheIdxPlus6], kiRef2 );
+		ST32( iMotionVector[0][  kuiCacheIdx ], kiMV32 );
+		ST32( iMotionVector[0][1+kuiCacheIdx ], kiMV32 );
+		ST32( iMotionVector[0][  kuiCacheIdxPlus6], kiMV32 );
+		ST32( iMotionVector[0][1+kuiCacheIdxPlus6], kiMV32 );
+	}	
+}
+//update iRefIndex and iMVs of both Mb and Mb_cache, only for P8x16
+//Two steps of eight scan positions starting at iPartIdx are filled with the same
+//reference index and motion vector.
+void_t UpdateP8x16MotionInfo(PDqLayer pCurDqLayer, int16_t iMotionVector[LIST_A][30][MV_A], int8_t iRefIndex[LIST_A][30], 
+							  int32_t iPartIdx, int8_t iRef, int16_t iMVs[2])
+{
+	// Two copies of the reference index packed into 16 bits so one ST16 fills two
+	// entries. Going through uint8_t avoids left-shifting a negative value and stops
+	// sign extension of the low copy from overwriting the high byte.
+	const int16_t kiRef2 = ((uint8_t)iRef << 8) | (uint8_t)iRef;
+	const int32_t kiMV32 = LD32(iMVs);
+	int32_t i;
+	int32_t iMbXy = pCurDqLayer->iMbXyIndex;
+	
+	for (i = 0; i < 2; i++, iPartIdx+=8) 
+	{
+		const uint8_t kuiScan4Idx = g_kuiScan4[iPartIdx];
+		const uint8_t kuiCacheIdx = g_kuiCache30ScanIdx[iPartIdx];
+		const uint8_t kuiScan4IdxPlus4= 4 + kuiScan4Idx;
+		const uint8_t kuiCacheIdxPlus6= 6 + kuiCacheIdx;	// next row of the 6-column cache
+
+		//mb
+		ST16( &pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4Idx ], kiRef2 );
+		ST16( &pCurDqLayer->pRefIndex[0][iMbXy][kuiScan4IdxPlus4], kiRef2 );
+		ST32( pCurDqLayer->pMv[0][iMbXy][  kuiScan4Idx ], kiMV32 );
+		ST32( pCurDqLayer->pMv[0][iMbXy][1+kuiScan4Idx ], kiMV32 );
+		ST32( pCurDqLayer->pMv[0][iMbXy][  kuiScan4IdxPlus4], kiMV32 );
+		ST32( pCurDqLayer->pMv[0][iMbXy][1+kuiScan4IdxPlus4], kiMV32 );
+		//cache
+		ST16( &iRefIndex[0][kuiCacheIdx ], kiRef2 );
+		ST16( &iRefIndex[0][kuiCacheIdxPlus6], kiRef2 );
+		ST32( iMotionVector[0][  kuiCacheIdx ], kiMV32 );
+		ST32( iMotionVector[0][1+kuiCacheIdx ], kiMV32 );
+		ST32( iMotionVector[0][  kuiCacheIdxPlus6], kiMV32 );
+		ST32( iMotionVector[0][1+kuiCacheIdxPlus6], kiMV32 );
+	}	
+}
+
+} // namespace WelsDec
\ No newline at end of file
--- /dev/null
+++ b/codec/decoder/core/src/parse_mb_syn_cavlc.cpp
@@ -1,0 +1,1627 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	parse_mb_syn_cavlc.cpp
+ *
+ * \brief	Interfaces implementation for parsing the syntax of MB
+ *
+ * \date	03/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include <string.h>
+
+#include "parse_mb_syn_cavlc.h"
+#include "error_code.h"
+#include "dec_golomb.h"
+#include "macros.h"
+#include "vlc_decoder.h"
+#include "bit_stream.h"
+#include "ls_defines.h"
+#include "mv_pred.h" 
+#include "decode_slice.h"
+
+namespace WelsDec {
+
+/*!
+ * \brief	Derive the motion vector predictor of a P_Skip macroblock from its
+ *			left (A), top (B) and diagonal (C: top-right, else top-left) neighbours.
+ *
+ * \param	pCurLayer	current layer; iMbXyIndex / iMbX / iMbY select the macroblock
+ * \param	iMvp		output, receives the predicted motion vector
+ *
+ * A neighbour counts as available only when it lies inside the picture and has
+ * the same pSliceIdc value as the current macroblock.
+ * The predictor is zero when the left or top neighbour is unavailable, or when
+ * either of them uses reference 0 with a zero motion vector. Otherwise it is the
+ * vector of the single neighbour using reference 0, or the component-wise median.
+ */
+void_t PredPSkipMvFromNeighbor( PDqLayer pCurLayer, int16_t iMvp[2] )
+{
+	const int32_t kiCurXy       = pCurLayer->iMbXyIndex;
+	const int32_t kiCurX        = pCurLayer->iMbX;
+	const int32_t kiCurY        = pCurLayer->iMbY;
+	const int32_t kiCurSliceIdc = pCurLayer->pSliceIdc[kiCurXy];
+	const int32_t kiLeftXy      = kiCurXy - 1;
+	const int32_t kiTopXy       = kiCurXy - pCurLayer->iMbWidth;
+	const int32_t kiLeftTopXy   = kiTopXy - 1;
+	const int32_t kiRightTopXy  = kiTopXy + 1;
+
+	// picture-border tests come first so pSliceIdc is only read for neighbours inside the picture
+	const bool_t bLeftAvail     = (kiCurX != 0) && (pCurLayer->pSliceIdc[kiLeftXy] == kiCurSliceIdc);
+	const bool_t bTopAvail      = (kiCurY != 0) && (pCurLayer->pSliceIdc[kiTopXy] == kiCurSliceIdc);
+	const bool_t bLeftTopAvail  = (kiCurY != 0) && (kiCurX != 0)
+									&& (pCurLayer->pSliceIdc[kiLeftTopXy] == kiCurSliceIdc);
+	const bool_t bRightTopAvail = (kiCurY != 0) && (kiCurX != (pCurLayer->iMbWidth-1))
+									&& (pCurLayer->pSliceIdc[kiRightTopXy] == kiCurSliceIdc);
+
+	const int32_t kiLeftType     = (bLeftAvail     ? pCurLayer->pMbType[kiLeftXy]     : 0);
+	const int32_t kiTopType      = (bTopAvail      ? pCurLayer->pMbType[kiTopXy]      : 0);
+	const int32_t kiLeftTopType  = (bLeftTopAvail  ? pCurLayer->pMbType[kiLeftTopXy]  : 0);
+	const int32_t kiRightTopType = (bRightTopAvail ? pCurLayer->pMbType[kiRightTopXy] : 0);
+
+	int8_t iLeftRef, iTopRef, iRightTopRef, iLeftTopRef, iDiagonalRef, iMatchRef;
+	int16_t iMvA[2], iMvB[2], iMvC[2], iMvD[2];
+
+	/*left: block 3 of the left macroblock*/
+	if (bLeftAvail && IS_INTER(kiLeftType))
+	{
+		ST32(iMvA, LD32(pCurLayer->pMv[0][kiLeftXy][3]));
+		iLeftRef = pCurLayer->pRefIndex[0][kiLeftXy][3];
+	}
+	else
+	{
+		ST32(iMvA, 0);
+		//unavailable, or available but intra
+		iLeftRef = bLeftAvail ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+	}
+	if (REF_NOT_AVAIL == iLeftRef || (0 == iLeftRef && 0 == LD32(iMvA)))
+	{
+		ST32( iMvp, 0 );
+		return;
+	}
+
+	/*top: block 12 of the top macroblock*/
+	if (bTopAvail && IS_INTER(kiTopType))
+	{
+		ST32( iMvB, LD32(pCurLayer->pMv[0][kiTopXy][12]) );
+		iTopRef = pCurLayer->pRefIndex[0][kiTopXy][12];
+	}
+	else
+	{
+		ST32( iMvB, 0 );
+		iTopRef = bTopAvail ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+	}
+	if (REF_NOT_AVAIL == iTopRef || (0 == iTopRef && 0 == LD32(iMvB)))
+	{
+		ST32( iMvp, 0 );
+		return;
+	}
+
+	/*right_top: block 12 of the top-right macroblock*/
+	if (bRightTopAvail && IS_INTER(kiRightTopType))
+	{
+		ST32(iMvC, LD32(pCurLayer->pMv[0][kiRightTopXy][12]));
+		iRightTopRef = pCurLayer->pRefIndex[0][kiRightTopXy][12];
+	}
+	else
+	{
+		ST32(iMvC, 0);
+		iRightTopRef = bRightTopAvail ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+	}
+
+	/*left_top: block 15 of the top-left macroblock*/
+	if (bLeftTopAvail && IS_INTER(kiLeftTopType))
+	{
+		ST32(iMvD, LD32(pCurLayer->pMv[0][kiLeftTopXy][15]));
+		iLeftTopRef = pCurLayer->pRefIndex[0][kiLeftTopXy][15];
+	}
+	else
+	{
+		ST32(iMvD, 0);
+		iLeftTopRef = bLeftTopAvail ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+	}
+
+	// the diagonal candidate is top-right; top-left stands in when top-right is unavailable
+	iDiagonalRef = iRightTopRef;
+	if (REF_NOT_AVAIL == iDiagonalRef)
+	{
+		iDiagonalRef = iLeftTopRef;
+		ST32(iMvC, LD32(iMvD));
+	}
+
+	// Both early returns above guarantee iLeftRef and iTopRef differ from
+	// REF_NOT_AVAIL here, so no "only the left neighbour exists" case remains.
+
+	iMatchRef = (0 == iLeftRef) + (0 == iTopRef) + (0 == iDiagonalRef);
+	if (1 == iMatchRef)
+	{
+		// exactly one neighbour refers to reference 0: take its vector
+		if (0 == iLeftRef)
+		{
+			ST32(iMvp, LD32(iMvA));
+		}
+		else if (0 == iTopRef)
+		{
+			ST32(iMvp, LD32(iMvB));
+		}
+		else
+		{
+			ST32(iMvp, LD32(iMvC));
+		}
+	}
+	else
+	{
+		iMvp[0] = WelsMedian(iMvA[0], iMvB[0], iMvC[0]);
+		iMvp[1] = WelsMedian(iMvA[1], iMvB[1], iMvC[1]);
+	}
+}
+
+/*!
+ * \brief	Record, for the current macroblock, which of the left, top, top-left and
+ *			top-right neighbours are available and what their macroblock types are.
+ *
+ * \param	pNeighAvail	output; the four i*Avail flags and the four i*Type fields are written
+ * \param	pCurLayer	current layer; iMbXyIndex / iMbX / iMbY select the macroblock
+ *
+ * A neighbour is available when it lies inside the picture and has the same
+ * pSliceIdc value as the current macroblock. The type of an unavailable
+ * neighbour is reported as 0.
+ */
+void_t GetNeighborAvailMbType( PNeighAvail pNeighAvail, PDqLayer pCurLayer )
+{
+	const int32_t kiMbXy        = pCurLayer->iMbXyIndex;
+	const int32_t kiSliceIdc    = pCurLayer->pSliceIdc[kiMbXy];
+	const int32_t kiLeftXy      = kiMbXy - 1;
+	const int32_t kiTopXy       = kiMbXy - pCurLayer->iMbWidth;
+	const int32_t kiLeftTopXy   = kiTopXy - 1;
+	const int32_t kiRightTopXy  = kiTopXy + 1;
+	const int32_t kiNotFirstCol = (pCurLayer->iMbX != 0);
+	const int32_t kiNotFirstRow = (pCurLayer->iMbY != 0);
+	const int32_t kiNotLastCol  = (pCurLayer->iMbX != (pCurLayer->iMbWidth-1));
+
+	// border tests are evaluated first, so pSliceIdc is only read for positions inside the picture
+	pNeighAvail->iLeftAvail     = kiNotFirstCol
+									&& (pCurLayer->pSliceIdc[kiLeftXy] == kiSliceIdc);
+	pNeighAvail->iTopAvail      = kiNotFirstRow
+									&& (pCurLayer->pSliceIdc[kiTopXy] == kiSliceIdc);
+	pNeighAvail->iLeftTopAvail  = kiNotFirstRow && kiNotFirstCol
+									&& (pCurLayer->pSliceIdc[kiLeftTopXy] == kiSliceIdc);
+	pNeighAvail->iRightTopAvail = kiNotFirstRow && kiNotLastCol
+									&& (pCurLayer->pSliceIdc[kiRightTopXy] == kiSliceIdc);
+
+	// macroblock type of each neighbour, 0 when it may not be used
+	pNeighAvail->iLeftType     = pNeighAvail->iLeftAvail     ? pCurLayer->pMbType[kiLeftXy]     : 0;
+	pNeighAvail->iTopType      = pNeighAvail->iTopAvail      ? pCurLayer->pMbType[kiTopXy]      : 0;
+	pNeighAvail->iLeftTopType  = pNeighAvail->iLeftTopAvail  ? pCurLayer->pMbType[kiLeftTopXy]  : 0;
+	pNeighAvail->iRightTopType = pNeighAvail->iRightTopAvail ? pCurLayer->pMbType[kiRightTopXy] : 0;
+}
+/*!
+ * \brief	Refresh pNeighAvail and copy the non-zero coefficient counts of the top
+ *			and left neighbours into the border entries of the count cache.
+ *			Used regardless of slice type and of the constrained intra prediction flag.
+ *
+ * \param	pNeighAvail		output of GetNeighborAvailMbType(), filled here
+ * \param	pNonZeroCount	count cache, 8 entries per row; border entries are written
+ * \param	pCurLayer		current layer
+ *
+ * Entries belonging to an unavailable neighbour are set to 0xFF.
+ */
+void_t WelsFillCacheNonZeroCount(PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, PDqLayer pCurLayer) //no matter slice type, intra_pred_constrained_flag
+{
+	const int32_t kiMbXy = pCurLayer->iMbXyIndex;
+
+	GetNeighborAvailMbType( pNeighAvail, pCurLayer );
+
+	//top row of the cache
+	if (!pNeighAvail->iTopAvail)
+	{
+		ST32(&pNonZeroCount[1], 0xFFFFFFFFU);
+		pNonZeroCount[0] = pNonZeroCount[5] = pNonZeroCount[29] = 0xFF;
+		ST16(&pNonZeroCount[6], 0xFFFF);
+		ST16(&pNonZeroCount[30], 0xFFFF);
+	}
+	else
+	{
+		const int32_t kiTopXy = kiMbXy - pCurLayer->iMbWidth;
+
+		ST32(&pNonZeroCount[1], LD32(&pCurLayer->pNzc[kiTopXy][12]));
+		pNonZeroCount[0] = pNonZeroCount[5] = pNonZeroCount[29] = 0;
+		ST16(&pNonZeroCount[6], LD16(&pCurLayer->pNzc[kiTopXy][20]));
+		ST16(&pNonZeroCount[30], LD16(&pCurLayer->pNzc[kiTopXy][22]));
+	}
+
+	//left column of the cache
+	if (!pNeighAvail->iLeftAvail)
+	{
+		//unavailable (the original -1 stored into uint8_t)
+		pNonZeroCount[8]  = 0xFF;
+		pNonZeroCount[16] = 0xFF;
+		pNonZeroCount[24] = 0xFF;
+		pNonZeroCount[32] = 0xFF;
+
+		pNonZeroCount[13] = 0xFF;
+		pNonZeroCount[21] = 0xFF;
+
+		pNonZeroCount[37] = 0xFF;
+		pNonZeroCount[45] = 0xFF;
+	}
+	else
+	{
+		const int32_t kiLeftXy = kiMbXy - 1;
+
+		pNonZeroCount[8]  = pCurLayer->pNzc[kiLeftXy][3];
+		pNonZeroCount[16] = pCurLayer->pNzc[kiLeftXy][7];
+		pNonZeroCount[24] = pCurLayer->pNzc[kiLeftXy][11];
+		pNonZeroCount[32] = pCurLayer->pNzc[kiLeftXy][15];
+
+		pNonZeroCount[13] = pCurLayer->pNzc[kiLeftXy][17];
+		pNonZeroCount[21] = pCurLayer->pNzc[kiLeftXy][21];
+		pNonZeroCount[37] = pCurLayer->pNzc[kiLeftXy][19];
+		pNonZeroCount[45] = pCurLayer->pNzc[kiLeftXy][23];
+	}
+}
+/*!
+ * \brief	Fill the non-zero-count cache and the border of the intra 4x4 prediction
+ *			mode cache. Used regardless of slice type.
+ *
+ * \param	pNeighAvail		neighbour availability, refreshed by WelsFillCacheNonZeroCount()
+ * \param	pNonZeroCount	non-zero-count cache to fill
+ * \param	pIntraPredMode	prediction mode cache, 8 entries per row; entries 1..4 and 8/16/24/32 are written
+ * \param	pCurLayer		current layer
+ *
+ * A neighbour that is not intra 4x4 contributes mode 2 only when its type is
+ * intra 16x16 or intra PCM; any other neighbour contributes -1.
+ */
+void_t WelsFillCacheConstrain1Intra4x4(PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode, PDqLayer pCurLayer) //no matter slice type
+{
+	const int32_t kiMbXy = pCurLayer->iMbXyIndex;
+
+	//stuff non_zero_coeff_count from pNeighAvail(left and top)
+	WelsFillCacheNonZeroCount( pNeighAvail, pNonZeroCount, pCurLayer );
+
+	//top border of the mode cache
+	if (pNeighAvail->iTopAvail && IS_INTRA4x4(pNeighAvail->iTopType))
+	{
+		const int32_t kiTopXy = kiMbXy - pCurLayer->iMbWidth;
+		ST32(pIntraPredMode+1, LD32(&pCurLayer->pIntraPredMode[kiTopXy][0]));
+	}
+	else
+	{
+		const int32_t kiTopFill =
+			( IS_INTRA16x16( pNeighAvail->iTopType ) || ( MB_TYPE_INTRA_PCM == pNeighAvail->iTopType ) )
+			? 0x02020202 : 0xffffffff;
+		ST32(pIntraPredMode+1, kiTopFill);
+	}
+
+	//left border of the mode cache
+	if (pNeighAvail->iLeftAvail && IS_INTRA4x4(pNeighAvail->iLeftType))
+	{
+		const int32_t kiLeftXy = kiMbXy - 1;
+		pIntraPredMode[ 8] = pCurLayer->pIntraPredMode[kiLeftXy][4];
+		pIntraPredMode[16] = pCurLayer->pIntraPredMode[kiLeftXy][5];
+		pIntraPredMode[24] = pCurLayer->pIntraPredMode[kiLeftXy][6];
+		pIntraPredMode[32] = pCurLayer->pIntraPredMode[kiLeftXy][3];
+	}
+	else
+	{
+		const int8_t kiLeftFill =
+			( IS_INTRA16x16( pNeighAvail->iLeftType ) || ( MB_TYPE_INTRA_PCM == pNeighAvail->iLeftType ) )
+			? 2 : -1;
+		pIntraPredMode[ 8] = kiLeftFill;
+		pIntraPredMode[16] = kiLeftFill;
+		pIntraPredMode[24] = kiLeftFill;
+		pIntraPredMode[32] = kiLeftFill;
+	}
+}
+
+/*!
+ * \brief	Fill the non-zero-count cache and the border of the intra 4x4 prediction
+ *			mode cache. Used regardless of slice type.
+ *
+ * \param	pNeighAvail		neighbour availability, refreshed by WelsFillCacheNonZeroCount()
+ * \param	pNonZeroCount	non-zero-count cache to fill
+ * \param	pIntraPredMode	prediction mode cache, 8 entries per row; entries 1..4 and 8/16/24/32 are written
+ * \param	pCurLayer		current layer
+ *
+ * A neighbour that is not intra 4x4 contributes mode 2 whenever it is
+ * available, whatever its type; an unavailable neighbour contributes -1.
+ */
+void_t WelsFillCacheConstrain0Intra4x4(PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, int8_t* pIntraPredMode, PDqLayer pCurLayer) //no matter slice type
+{
+	const int32_t kiMbXy = pCurLayer->iMbXyIndex;
+
+	//stuff non_zero_coeff_count from pNeighAvail(left and top)
+	WelsFillCacheNonZeroCount( pNeighAvail, pNonZeroCount, pCurLayer );
+
+	//top border of the mode cache
+	if (pNeighAvail->iTopAvail && IS_INTRA4x4(pNeighAvail->iTopType))
+	{
+		const int32_t kiTopXy = kiMbXy - pCurLayer->iMbWidth;
+		ST32(pIntraPredMode + 1, LD32(&pCurLayer->pIntraPredMode[kiTopXy][0]));
+	}
+	else
+	{
+		const int32_t kiTopFill = pNeighAvail->iTopAvail ? 0x02020202 : 0xffffffff;
+		ST32(pIntraPredMode + 1, kiTopFill);
+	}
+
+	//left border of the mode cache
+	if (pNeighAvail->iLeftAvail && IS_INTRA4x4(pNeighAvail->iLeftType))
+	{
+		const int32_t kiLeftXy = kiMbXy - 1;
+		pIntraPredMode[ 8] = pCurLayer->pIntraPredMode[kiLeftXy][4];
+		pIntraPredMode[16] = pCurLayer->pIntraPredMode[kiLeftXy][5];
+		pIntraPredMode[24] = pCurLayer->pIntraPredMode[kiLeftXy][6];
+		pIntraPredMode[32] = pCurLayer->pIntraPredMode[kiLeftXy][3];
+	}
+	else
+	{
+		const int8_t kiLeftFill = pNeighAvail->iLeftAvail ? 2 : -1;
+		pIntraPredMode[ 8] = kiLeftFill;
+		pIntraPredMode[16] = kiLeftFill;
+		pIntraPredMode[24] = kiLeftFill;
+		pIntraPredMode[32] = kiLeftFill;
+	}
+}
+
+/*!
+ * \brief	Fill the non-zero-count cache and the list-0 motion vector and reference
+ *			index caches with the values of the neighbouring macroblocks.
+ *
+ * \param	pNeighAvail		neighbour availability, refreshed by WelsFillCacheNonZeroCount()
+ * \param	pNonZeroCount	non-zero-count cache to fill
+ * \param	iMvArray		motion vector cache, 6 entries per row; list 0 border entries are written
+ * \param	iRefIdxArray	reference index cache, same layout as iMvArray
+ * \param	pCurLayer		current layer
+ *
+ * For a neighbour that is not an inter macroblock the vector is zeroed and the
+ * reference index becomes REF_NOT_AVAIL (neighbour unavailable) or
+ * REF_NOT_IN_LIST (neighbour available but not inter).
+ */
+void_t WelsFillCacheInter(PNeighAvail pNeighAvail, uint8_t* pNonZeroCount, 
+						  int16_t iMvArray[LIST_A][30][MV_A], int8_t iRefIdxArray[LIST_A][30], PDqLayer pCurLayer)
+{
+	const int32_t kiMbXy = pCurLayer->iMbXyIndex;
+	const uint8_t kuiFixedUnavail[5] = { 9, 21, 11, 17, 23 };
+	int32_t k;
+
+	//stuff non_zero_coeff_count from pNeighAvail(left and top)
+	WelsFillCacheNonZeroCount( pNeighAvail, pNonZeroCount, pCurLayer );
+
+	//left: cache entries 6,12,18,24 take blocks 3,7,11,15 of the left macroblock
+	if (pNeighAvail->iLeftAvail && IS_INTER(pNeighAvail->iLeftType))
+	{
+		const int32_t kiLeftXy = kiMbXy - 1;
+		for (k = 0; k < 4; ++k)
+		{
+			const int32_t kiCachePos = 6 * (k + 1);
+			const int32_t kiBlk4     = 4 * k + 3;
+			ST32(iMvArray[0][kiCachePos], LD32(pCurLayer->pMv[0][kiLeftXy][kiBlk4]));
+			iRefIdxArray[0][kiCachePos] = pCurLayer->pRefIndex[0][kiLeftXy][kiBlk4];
+		}
+	}
+	else
+	{
+		const int8_t kiLeftRef = pNeighAvail->iLeftAvail ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+		for (k = 0; k < 4; ++k)
+		{
+			const int32_t kiCachePos = 6 * (k + 1);
+			ST32(iMvArray[0][kiCachePos], 0);
+			iRefIdxArray[0][kiCachePos] = kiLeftRef;
+		}
+	}
+
+	//left-top: cache entry 0 takes block 15 of the top-left macroblock
+	if (pNeighAvail->iLeftTopAvail && IS_INTER(pNeighAvail->iLeftTopType))
+	{
+		const int32_t kiLeftTopXy = kiMbXy - 1 - pCurLayer->iMbWidth;
+		ST32(iMvArray[0][0], LD32(pCurLayer->pMv[0][kiLeftTopXy][15]));
+		iRefIdxArray[0][0] = pCurLayer->pRefIndex[0][kiLeftTopXy][15];
+	}
+	else
+	{
+		ST32(iMvArray[0][0], 0);
+		iRefIdxArray[0][0] = pNeighAvail->iLeftTopAvail ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+	}
+
+	//top: cache entries 1..4 take blocks 12..15 of the top macroblock
+	if (pNeighAvail->iTopAvail && IS_INTER(pNeighAvail->iTopType))
+	{
+		const int32_t kiTopXy = kiMbXy - pCurLayer->iMbWidth;
+		ST64(iMvArray[0][1], LD64(pCurLayer->pMv[0][kiTopXy][12]));
+		ST64(iMvArray[0][3], LD64(pCurLayer->pMv[0][kiTopXy][14]));
+		ST32(&iRefIdxArray[0][1], LD32(&pCurLayer->pRefIndex[0][kiTopXy][12]));
+	}
+	else
+	{
+		const int8_t kiTopRef = pNeighAvail->iTopAvail ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+		ST64(iMvArray[0][1], 0);
+		ST64(iMvArray[0][3], 0);
+		for (k = 1; k <= 4; ++k)
+		{
+			iRefIdxArray[0][k] = kiTopRef;
+		}
+	}
+
+	//right-top: cache entry 5 takes block 12 of the top-right macroblock
+	if (pNeighAvail->iRightTopAvail && IS_INTER(pNeighAvail->iRightTopType))
+	{
+		const int32_t kiRightTopXy = kiMbXy + 1- pCurLayer->iMbWidth;
+		ST32(iMvArray[0][5], LD32(pCurLayer->pMv[0][kiRightTopXy][12]));
+		iRefIdxArray[0][5] = pCurLayer->pRefIndex[0][kiRightTopXy][12];
+	}
+	else
+	{
+		ST32(iMvArray[0][5], 0);
+		iRefIdxArray[0][5] = pNeighAvail->iRightTopAvail ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+	}
+
+	//right-top 4*4 block unavailable
+	for (k = 0; k < 5; ++k)
+	{
+		const int32_t kiCachePos = kuiFixedUnavail[k];
+		ST32(iMvArray[0][kiCachePos], 0);
+		iRefIdxArray[0][kiCachePos] = REF_NOT_AVAIL;
+	}
+}
+
+/*!
+ * \brief	Predict the intra 4x4 mode of one block from the modes stored above and
+ *			to the left of it in the mode cache.
+ *
+ * \param	pIntraPredMode	mode cache, 8 entries per row
+ * \param	iIdx4			block index, mapped to a cache position through g_kuiScan8
+ *
+ * \return	2 when either neighbouring mode is -1, otherwise the smaller of the two modes
+ */
+int32_t PredIntra4x4Mode(int8_t* pIntraPredMode, int32_t iIdx4)
+{
+	const int32_t kiCachePos  = g_kuiScan8[iIdx4];
+	const int8_t  kiAboveMode = pIntraPredMode[kiCachePos - 8];
+	const int8_t  kiLeftMode  = pIntraPredMode[kiCachePos - 1];
+
+	if (-1 == kiLeftMode || -1 == kiAboveMode)
+	{
+		return 2;
+	}
+	return WELS_MIN(kiLeftMode, kiAboveMode);
+}
+
+// largest prediction mode id accepted from the bitstream for each intra type
+#define MAX_PRED_MODE_ID_I16x16  3
+#define MAX_PRED_MODE_ID_CHROMA  3
+#define MAX_PRED_MODE_ID_I4x4    8
+
+// CHECK_xxx_MODE(a, b, c, d)
+//   a: prediction mode, also used as index into the matching g_ksXxxPredInfo table
+//   b, c, d: left / top / left-top availability of the block
+// Evaluates to non-zero when the table entry carries mode a and each availability
+// value is at least what the entry requires. The macros are plain expressions
+// (no trailing ';') so they can be used in conditions as well as initializers.
+// Note: a is evaluated several times, so pass an expression without side effects.
+#define CHECK_I16_MODE(a, b, c, d)                               \
+	(((a) == g_ksI16PredInfo[(a)].iPredMode) &&                  \
+	 ((b) >= g_ksI16PredInfo[(a)].iLeftAvail) &&                 \
+	 ((c) >= g_ksI16PredInfo[(a)].iTopAvail) &&                  \
+	 ((d) >= g_ksI16PredInfo[(a)].iLeftTopAvail))
+#define CHECK_CHROMA_MODE(a, b, c, d)                            \
+	(((a) == g_ksChromaPredInfo[(a)].iPredMode) &&               \
+	 ((b) >= g_ksChromaPredInfo[(a)].iLeftAvail) &&              \
+	 ((c) >= g_ksChromaPredInfo[(a)].iTopAvail) &&               \
+	 ((d) >= g_ksChromaPredInfo[(a)].iLeftTopAvail))
+#define CHECK_I4_MODE(a, b, c, d)                                \
+	(((a) == g_ksI4PredInfo[(a)].iPredMode) &&                   \
+	 ((b) >= g_ksI4PredInfo[(a)].iLeftAvail) &&                  \
+	 ((c) >= g_ksI4PredInfo[(a)].iTopAvail) &&                   \
+	 ((d) >= g_ksI4PredInfo[(a)].iLeftTopAvail))
+
+
+/*!
+ * \brief	Validate an intra 16x16 prediction mode against the available neighbouring
+ *			samples and replace plain DC by the DC variant that matches them.
+ *
+ * \param	uiSampleAvail	availability bits: 0x04 left, 0x02 left-top, 0x01 top
+ * \param	pMode			in: parsed mode; out: possibly replaced by I16_PRED_DC_L,
+ *							I16_PRED_DC_T or I16_PRED_DC_128
+ *
+ * \return	0 on success, ERR_INFO_INVALID_I16x16_PRED_MODE when the mode is outside
+ *			0..MAX_PRED_MODE_ID_I16x16 or needs samples that are not available
+ */
+int32_t CheckIntra16x16PredMode(uint8_t uiSampleAvail, int8_t* pMode)
+{
+	int32_t iLeftAvail     = uiSampleAvail & 0x04;
+	int32_t bLeftTopAvail  = uiSampleAvail & 0x02;
+	int32_t iTopAvail      = uiSampleAvail & 0x01;
+
+	// *pMode is signed and is used as a table index below, so negative values
+	// must be rejected as well as values above the maximum
+	if (*pMode < 0 || *pMode > MAX_PRED_MODE_ID_I16x16)
+	{
+		return ERR_INFO_INVALID_I16x16_PRED_MODE;
+	}
+
+	if (I16_PRED_DC == *pMode) 
+	{
+		if (iLeftAvail && iTopAvail) 
+		{
+			return 0;
+		}
+		else if (iLeftAvail) 
+		{
+			*pMode = I16_PRED_DC_L;
+		}
+		else if (iTopAvail) 
+		{
+			*pMode = I16_PRED_DC_T;
+		}
+		else
+		{
+			*pMode = I16_PRED_DC_128;
+		}
+	}
+	else 
+	{
+		bool_t bModeAvail = CHECK_I16_MODE(*pMode, iLeftAvail, iTopAvail, bLeftTopAvail);
+		if (0 == bModeAvail) 
+		{
+			return ERR_INFO_INVALID_I16x16_PRED_MODE;
+		}
+	}
+	return 0;
+}
+
+
+/*!
+ * \brief	Validate an intra chroma prediction mode against the available neighbouring
+ *			samples and replace plain DC by the DC variant that matches them.
+ *
+ * \param	uiSampleAvail	availability bits: 0x04 left, 0x02 left-top, 0x01 top
+ * \param	pMode			in: parsed mode; out: possibly replaced by C_PRED_DC_L,
+ *							C_PRED_DC_T or C_PRED_DC_128
+ *
+ * \return	0 on success, ERR_INFO_INVALID_I_CHROMA_PRED_MODE when the mode is outside
+ *			0..MAX_PRED_MODE_ID_CHROMA or needs samples that are not available
+ */
+int32_t CheckIntraChromaPredMode(uint8_t uiSampleAvail, int8_t* pMode)
+{
+	int32_t iLeftAvail     = uiSampleAvail & 0x04;
+	int32_t bLeftTopAvail  = uiSampleAvail & 0x02;
+	int32_t iTopAvail      = uiSampleAvail & 0x01;
+
+	// *pMode is signed and is used as a table index below, so negative values
+	// must be rejected as well as values above the maximum
+	if (*pMode < 0 || *pMode > MAX_PRED_MODE_ID_CHROMA)
+	{
+		return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+	}
+
+	if (C_PRED_DC == *pMode) 
+	{
+		if (iLeftAvail && iTopAvail) 
+		{
+			return 0;
+		}
+		else if (iLeftAvail) 
+		{
+			*pMode = C_PRED_DC_L;
+		}
+		else if (iTopAvail) 
+		{
+			*pMode = C_PRED_DC_T;
+		}
+		else
+		{
+			*pMode = C_PRED_DC_128;
+		}
+	}
+	else 
+	{
+		bool_t bModeAvail = CHECK_CHROMA_MODE(*pMode, iLeftAvail, iTopAvail, bLeftTopAvail);
+		if (0 == bModeAvail) 
+		{
+			return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+		}
+	}
+	return 0;
+}
+
+/*!
+ * \brief	Validate the intra 4x4 prediction mode of one block against the available
+ *			neighbouring samples and return the mode variant to use.
+ *
+ * \param	pSampleAvail	availability map, 6 entries per row, indexed through g_kuiCache30ScanIdx
+ * \param	pMode			parsed prediction mode (not modified)
+ * \param	iIndex			block index inside the macroblock
+ *
+ * \return	the mode to use (DC, DDL and VL may be replaced by a variant that fits the
+ *			available samples), or -1 when the mode is outside 0..MAX_PRED_MODE_ID_I4x4
+ *			or needs samples that are not available
+ */
+int32_t CheckIntra4x4PredMode(int32_t* pSampleAvail, int8_t* pMode, int32_t iIndex)
+{
+	int8_t iIdx = g_kuiCache30ScanIdx[iIndex];
+	int32_t iLeftAvail     = pSampleAvail[iIdx-1];
+	int32_t iTopAvail      = pSampleAvail[iIdx-6];
+	int32_t bLeftTopAvail  = pSampleAvail[iIdx-7];	
+	int32_t bRightTopAvail = pSampleAvail[iIdx-5];
+
+	int8_t iFinalMode;
+
+	// *pMode is signed and is used as a table index below, so negative values
+	// must be rejected as well as values above the maximum
+	if (*pMode < 0 || *pMode > MAX_PRED_MODE_ID_I4x4) 
+	{
+		return -1;
+	}
+
+	if (I4_PRED_DC == *pMode) 
+	{
+		if (iLeftAvail && iTopAvail) 
+		{
+			return *pMode;
+		}
+		else if (iLeftAvail) 
+		{
+			iFinalMode = I4_PRED_DC_L;
+		}
+		else if (iTopAvail) 
+		{
+			iFinalMode = I4_PRED_DC_T;
+		}
+		else
+		{
+			iFinalMode = I4_PRED_DC_128;
+		}
+	}
+	else
+	{
+		bool_t bModeAvail = CHECK_I4_MODE(*pMode, iLeftAvail, iTopAvail, bLeftTopAvail);
+		if (0 == bModeAvail) 
+		{
+			return -1;
+		}
+
+		iFinalMode = *pMode;
+
+		//if right-top unavailable, modify mode DDL and VL (padding rightmost pixel of top)  
+		if (I4_PRED_DDL == iFinalMode && 0 == bRightTopAvail)
+		{
+			iFinalMode = I4_PRED_DDL_TOP;
+		}
+		else if (I4_PRED_VL == iFinalMode && 0 == bRightTopAvail)
+		{
+			iFinalMode = I4_PRED_VL_TOP;
+		}		
+	}		
+	return iFinalMode;
+}
+
+/*!
+ * \brief	Set pBs->iIndex, the bit position used by the CAVLC parser, from the
+ *			reader's byte pointer and its iLeftBits counter.
+ *
+ * \param	pBs	bit string reader; only iIndex is written
+ */
+void_t BsStartCavlc( PBitStringAux pBs )
+{
+	// bits between pCurBuf and the position the reader has really reached
+	const int32_t kiBitsAhead = 16 - pBs->iLeftBits;
+
+	pBs->iIndex = ((pBs->pCurBuf - pBs->pStartBuf) << 3) - kiBitsAhead;
+}
+/*!
+ * \brief	Re-synchronize the bit string reader with pBs->iIndex after CAVLC parsing:
+ *			reload uiCurBits from the byte containing bit iIndex and reset
+ *			pCurBuf and iLeftBits accordingly.
+ *
+ * \param	pBs	bit string reader; pCurBuf, uiCurBits and iLeftBits are written
+ *
+ * Four bytes starting at pStartBuf + (iIndex >> 3) are read.
+ */
+void_t BsEndCavlc( PBitStringAux pBs )
+{
+	const int32_t kiBitOffset = pBs->iIndex & 0x07;
+
+	pBs->pCurBuf   = pBs->pStartBuf + (pBs->iIndex>>3);
+	// assemble the big-endian word in uint32_t: shifting a byte >= 0x80 into
+	// bit 31 of a signed int would be undefined behaviour
+	pBs->uiCurBits = ( ((uint32_t)pBs->pCurBuf[0] << 24)
+					 | ((uint32_t)pBs->pCurBuf[1] << 16)
+					 | ((uint32_t)pBs->pCurBuf[2] << 8)
+					 |  (uint32_t)pBs->pCurBuf[3] ) << kiBitOffset;
+	pBs->pCurBuf  += 4;
+	pBs->iLeftBits = -16 + kiBitOffset;
+}
+
+
+/*!
+ * \brief	Decode one coeff_token by table lookup on the leading bits of the bit cache.
+ *
+ * \param	uiTotalCoeff	output, entry [1] of g_kuiVlcTrailingOneTotalCoeffTable for the decoded token
+ * \param	uiTrailingOnes	output, entry [0] of the same table row
+ * \param	pBitsCache		bit cache; the consumed bits are removed with POP_BUFFER
+ * \param	pVlcTable		VLC lookup tables
+ * \param	bChromaDc		non-zero selects the chroma DC token table
+ * \param	nC				table selector for the other block types, mapped through g_kuiNcMapTable
+ *
+ * \return	number of bits consumed
+ */
+static int32_t CavlcGetTrailingOnesAndTotalCoeff(uint8_t &uiTotalCoeff, uint8_t &uiTrailingOnes, SReadBitsCache *pBitsCache, SVlcTable* pVlcTable, bool_t bChromaDc, int8_t nC)
+{
+	const uint8_t *kpVlcTableMoreBitsCountList[3] = {g_kuiVlcTableMoreBitsCount0, g_kuiVlcTableMoreBitsCount1, g_kuiVlcTableMoreBitsCount2}; 
+    int32_t iUsedBits = 0;
+	int32_t iIndexVlc, iIndexValue, iNcMapIdx;
+	uint32_t uiCount;
+	uint32_t uiValue;
+
+    if (bChromaDc)
+	{		
+		// the top 8 cache bits index the table directly: [0] = token index, [1] = code length
+		uiValue        = pBitsCache->uiCache32Bit >> 24;
+		iIndexVlc      = pVlcTable->kpChromaCoeffTokenVlcTable[uiValue][0];
+		uiCount        = pVlcTable->kpChromaCoeffTokenVlcTable[uiValue][1];
+		POP_BUFFER(pBitsCache, uiCount);
+		iUsedBits     += uiCount;
+		uiTrailingOnes = g_kuiVlcTrailingOneTotalCoeffTable[iIndexVlc][0];
+		uiTotalCoeff   = g_kuiVlcTrailingOneTotalCoeffTable[iIndexVlc][1];
+	}
+	else //luma
+	{
+		iNcMapIdx = g_kuiNcMapTable[nC];
+		if ( iNcMapIdx<= 2 )
+		{
+			uiValue = pBitsCache->uiCache32Bit >> 24;
+			if ( uiValue < g_kuiVlcTableNeedMoreBitsThread[iNcMapIdx] )
+			{					
+				// two-stage lookup: the first 8 bits are consumed and select a
+				// sub-table, which is then indexed by a further group of bits
+				POP_BUFFER(pBitsCache, 8);
+				iUsedBits  += 8;
+				iIndexValue = pBitsCache->uiCache32Bit >> ( 32 - kpVlcTableMoreBitsCountList[iNcMapIdx][uiValue]);
+				iIndexVlc   = pVlcTable->kpCoeffTokenVlcTable[iNcMapIdx+1][uiValue][iIndexValue][0];
+				uiCount     = pVlcTable->kpCoeffTokenVlcTable[iNcMapIdx+1][uiValue][iIndexValue][1];						
+				POP_BUFFER(pBitsCache, uiCount);
+				iUsedBits  += uiCount;
+			}
+			else
+			{
+				// single-stage lookup on the first 8 bits
+				iIndexVlc  = pVlcTable->kpCoeffTokenVlcTable[0][iNcMapIdx][uiValue][0];
+				uiCount    = pVlcTable->kpCoeffTokenVlcTable[0][iNcMapIdx][uiValue][1];						
+				// NOTE(review): the value computed on the next line is not read again in this function
+				uiValue    = pBitsCache->uiCache32Bit >> (32 - uiCount);
+				POP_BUFFER(pBitsCache, uiCount);
+				iUsedBits += uiCount;
+			}
+		}
+		else
+		{
+			// this table uses a fixed code length of 6 bits
+			uiValue    = pBitsCache->uiCache32Bit >> (32 - 6);			
+			POP_BUFFER(pBitsCache, 6);
+			iUsedBits += 6;
+			iIndexVlc  = pVlcTable->kpCoeffTokenVlcTable[0][3][uiValue][0];  //differ
+		}		
+		uiTrailingOnes= g_kuiVlcTrailingOneTotalCoeffTable[iIndexVlc][0];
+		uiTotalCoeff  = g_kuiVlcTrailingOneTotalCoeffTable[iIndexVlc][1];
+	}
+
+	return iUsedBits;
+}
+
+/*!
+ * \brief	Decode the uiTotalCoeff coefficient levels of one block: first the
+ *			trailing +/-1 values, then the remaining levels as prefix + suffix codes.
+ *
+ * \param	iLevel			output, iLevel[0 .. uiTotalCoeff-1] in decoding order
+ * \param	pBitsCache		bit cache; the consumed bits are removed with POP_BUFFER
+ * \param	uiTotalCoeff	number of levels to decode
+ * \param	uiTrailingOnes	number of leading entries that are +1 or -1
+ *
+ * \return	number of bits consumed; 0 when the cache still holds too few bits for a
+ *			level suffix after SHIFT_BUFFER (levels decoded so far stay in iLevel)
+ */
+static int32_t CavlcGetLevelVal(int32_t iLevel[16], SReadBitsCache *pBitsCache, uint8_t uiTotalCoeff, uint8_t uiTrailingOnes)
+{
+    int32_t i, iUsedBits = 0;
+    int32_t iSuffixLength, iSuffixLengthSize, iLevelPrefix, iPrefixBits, iLevelCode, iThreshold;
+    uint32_t uiCache32Bit;
+	// trailing ones: cache bit (31 - i) is the sign, set -> -1, clear -> +1
+	for (i = 0; i < uiTrailingOnes; i++) 
+	{		
+		iLevel[i] = 1 - ((pBitsCache->uiCache32Bit >> (30 - i)) & 0x02);
+	}		
+	POP_BUFFER(pBitsCache, uiTrailingOnes);
+	iUsedBits += uiTrailingOnes;
+		
+	// the suffix length starts at 1 only for more than 10 coefficients with fewer than 3 trailing ones
+	iSuffixLength = (uiTotalCoeff > 10 && uiTrailingOnes < 3);
+	
+	for (; i < uiTotalCoeff; i++) 
+	{		
+		// presumably SHIFT_BUFFER refills the cache from pBuf -- TODO confirm against its definition
+		if(pBitsCache->uiRemainBits <= 16)		SHIFT_BUFFER(pBitsCache);
+#ifdef WIN32
+        uiCache32Bit = pBitsCache->uiCache32Bit;
+		WELS_GET_PREFIX_BITS(uiCache32Bit,iPrefixBits);
+#else
+		iPrefixBits = GetPrefixBits(pBitsCache->uiCache32Bit);
+#endif
+		POP_BUFFER(pBitsCache, iPrefixBits);
+		iUsedBits   += iPrefixBits;
+		iLevelPrefix = iPrefixBits - 1;
+
+		iLevelCode = (WELS_MIN(15, iLevelPrefix)) << iSuffixLength; //differ
+		iSuffixLengthSize = iSuffixLength;	
+
+		// long prefixes switch to escape coding with a wider suffix
+		if (iLevelPrefix >= 14) 
+		{	
+			if (14 == iLevelPrefix && 0 == iSuffixLength)
+				iSuffixLengthSize = 4;
+			else if (15 == iLevelPrefix)
+				iSuffixLengthSize = 12;
+			else if(iLevelPrefix > 15)
+				iLevelCode += (1 << (iLevelPrefix - 3)) - 4096;
+
+			if (iLevelPrefix >= 15 && iSuffixLength == 0) 
+				iLevelCode += 15;
+		}
+
+		if(iSuffixLengthSize > 0) 
+		{
+			if(pBitsCache->uiRemainBits <= iSuffixLengthSize) SHIFT_BUFFER(pBitsCache);	
+			// still not enough bits: give up; the caller only sees "0 bits used"
+			if(pBitsCache->uiRemainBits <= iSuffixLengthSize) 
+			return 0;
+			iLevelCode += (pBitsCache->uiCache32Bit >> (32 - iSuffixLengthSize)); 
+			POP_BUFFER(pBitsCache, iSuffixLengthSize);
+			iUsedBits  += iSuffixLengthSize;
+		}
+
+		// the first level after fewer than 3 trailing ones is offset by 2
+		iLevelCode += ((i == uiTrailingOnes) && (uiTrailingOnes < 3)) << 1;
+		// magnitude is (iLevelCode + 2) / 2; an odd iLevelCode makes the level negative
+		iLevel[i]   = ((iLevelCode + 2) >> 1);
+		iLevel[i]  -= (iLevel[i] << 1) & (-(iLevelCode & 0x01));
+
+		// adapt the suffix length: at least 1 from now on, one more (up to 6)
+		// when the magnitude exceeds 3 << (iSuffixLength - 1)
+		iSuffixLength += !iSuffixLength;
+		iThreshold     = 3 << ( iSuffixLength - 1 );
+		iSuffixLength += ((iLevel[i] > iThreshold) || (iLevel[i] < -iThreshold)) && (iSuffixLength < 6);	
+	}
+
+	return iUsedBits;
+}
+
+/*!
+ * \brief	Decode total_zeros by table lookup.
+ *
+ * \param	iZerosLeft		output, the decoded value; NOT written when the function returns early
+ * \param	pBitsCache		bit cache; the consumed bits are removed with POP_BUFFER
+ * \param	uiTotalCoeff	number of coefficients in the block, selects the table row (must be >= 1)
+ * \param	pVlcTable		VLC lookup tables
+ * \param	bChromaDc		non-zero selects the chroma DC tables
+ *
+ * \return	number of bits consumed; 0 when the cache holds too few bits even after SHIFT_BUFFER
+ */
+static int32_t CavlcGetTotalZeros(int32_t &iZerosLeft, SReadBitsCache *pBitsCache, uint8_t uiTotalCoeff, SVlcTable* pVlcTable, bool_t bChromaDc)
+{
+	int32_t iCount, iUsedBits = 0;
+	const uint8_t *kpBitNumMap;
+	uint32_t uiValue;
+
+	int32_t iTotalZeroVlcIdx;
+	uint8_t uiTableType;
+	//chroma_dc (0 < uiTotalCoeff < 4); others (chroma_ac or luma: 0 < uiTotalCoeff < 16)
+
+	if ( bChromaDc )
+	{
+		iTotalZeroVlcIdx = uiTotalCoeff;
+		kpBitNumMap = g_kuiTotalZerosBitNumChromaMap;
+		uiTableType = bChromaDc;
+	} 
+	else
+	{
+		iTotalZeroVlcIdx = uiTotalCoeff;
+		kpBitNumMap = g_kuiTotalZerosBitNumMap;
+		uiTableType = 0;
+	}
+
+	// number of bits to peek for this table row
+	iCount = kpBitNumMap[iTotalZeroVlcIdx-1];
+	if(pBitsCache->uiRemainBits < iCount) SHIFT_BUFFER(pBitsCache);// if uiRemainBits+16 still smaller than iCount?? potential bug
+	// early exit leaves iZerosLeft untouched
+	if(pBitsCache->uiRemainBits < iCount) 
+		return 0;
+	uiValue    = pBitsCache->uiCache32Bit >> ( 32 - iCount );
+	// table entry: [1] = real code length, [0] = decoded total_zeros
+	iCount     = pVlcTable->kpTotalZerosTable[uiTableType][iTotalZeroVlcIdx-1][uiValue][1];
+	POP_BUFFER(pBitsCache, iCount);
+	iUsedBits += iCount;
+	iZerosLeft = pVlcTable->kpTotalZerosTable[uiTableType][iTotalZeroVlcIdx-1][uiValue][0];
+
+	return iUsedBits;
+}
+/*!
+ * \brief	Decode the run_before value of each coefficient except the last, which
+ *			receives whatever zeros remain.
+ *
+ * \param	iRun			output, iRun[i] is the run of coefficient i; entries that are not
+ *							reached keep the value the caller initialized them with
+ * \param	pBitsCache		bit cache; the consumed bits are removed with POP_BUFFER
+ * \param	uiTotalCoeff	number of coefficients in the block
+ * \param	pVlcTable		VLC lookup tables
+ * \param	iZerosLeft		zeros still to distribute (total_zeros on entry)
+ *
+ * \return	number of bits consumed; 0 when the cache holds too few bits even after SHIFT_BUFFER
+ */
+static int32_t	CavlcGetRunBefore(int32_t iRun[16], SReadBitsCache *pBitsCache, uint8_t uiTotalCoeff, SVlcTable* pVlcTable, int32_t iZerosLeft)
+{
+    int32_t i, iUsedBits = 0;
+	uint32_t uiCount, uiValue, uiCache32Bit, iPrefixBits;
+	
+	for (i = 0; i < uiTotalCoeff-1; i++) 
+	{
+		if (iZerosLeft > 0) 
+		{			
+			// number of bits to peek for the current iZerosLeft
+			uiCount = g_kuiZeroLeftBitNumMap[iZerosLeft];
+			if(pBitsCache->uiRemainBits < uiCount ) SHIFT_BUFFER(pBitsCache);
+			if(pBitsCache->uiRemainBits < uiCount) 
+			return 0;
+			uiValue = pBitsCache->uiCache32Bit >> ( 32 - uiCount );
+			if ( iZerosLeft < 7 )
+			{
+				// table entry: [1] = real code length, [0] = run
+				uiCount = pVlcTable->kpZeroTable[iZerosLeft-1][uiValue][1];
+				POP_BUFFER(pBitsCache, uiCount);
+				iUsedBits += uiCount;
+				iRun[i] = pVlcTable->kpZeroTable[iZerosLeft-1][uiValue][0];
+			}
+			else
+			{
+				// 7 or more zeros left: the peeked bits are always consumed in full
+				POP_BUFFER(pBitsCache, uiCount);
+				iUsedBits += uiCount;
+				if ( pVlcTable->kpZeroTable[6][uiValue][0] < 7 )
+				{		
+					iRun[i] = pVlcTable->kpZeroTable[6][uiValue][0];
+				}
+				else
+				{
+					// longer runs are coded as a further prefix; run = prefix bits + 6
+					if(pBitsCache->uiRemainBits < 16) SHIFT_BUFFER(pBitsCache);
+#ifdef WIN32
+					uiCache32Bit = pBitsCache->uiCache32Bit;
+					WELS_GET_PREFIX_BITS(uiCache32Bit, iPrefixBits);
+#else
+					iPrefixBits = GetPrefixBits(pBitsCache->uiCache32Bit);
+#endif
+					iRun[i] = iPrefixBits + 6;
+					POP_BUFFER(pBitsCache, iPrefixBits);
+					iUsedBits += iPrefixBits;
+				}
+			}			
+		}
+		else
+		{
+			// no zeros left to distribute: the remaining runs are not written
+			return iUsedBits;
+		}
+		
+		// NOTE(review): nothing here stops iZerosLeft from going negative when a decoded run exceeds it
+		iZerosLeft -= iRun[i];
+	}
+
+	// the last coefficient takes all remaining zeros
+	iRun[uiTotalCoeff-1] = iZerosLeft;
+
+	return iUsedBits;
+}
+
+/*!
+ * \brief	Parse one CAVLC coded residual block and store its coefficients in pTCoeff.
+ *
+ * \param	pVlcTable			VLC lookup tables
+ * \param	pNonZeroCountCache	non-zero-count cache, 8 entries per row; read for the left and top
+ *								neighbours and, except for DC blocks, updated with this block's count
+ * \param	pBs					bit string; parsing starts at pBs->iIndex, which is advanced by the bits used
+ * \param	iIndex				block index, mapped to a cache position through g_kuiCacheNzcScanIdx
+ * \param	iMaxNumCoeff		number of coefficients the block can hold
+ * \param	kpZigzagTable		scan table mapping coefficient order to positions in pTCoeff
+ * \param	iResidualProperty	block kind; CHROMA_DC and I16_LUMA_DC are handled specially
+ * \param	pTCoeff				output coefficients
+ * \param	iMbMode				coefficients of blocks other than I16_LUMA_DC are only stored for BASE_MB
+ * \param	uiQp				quantization parameter, selects the row of g_kuiDequantCoeff
+ * \param	pCtx				decoder context (not used here)
+ *
+ * \return	0 on success; -1 when the coefficient token is out of range;
+ *			ERR_INFO_CAVLC_INVALID_ZERO_LEFT when the zero runs do not fit the block
+ */
+int32_t WelsResidualBlockCavlc(SVlcTable* pVlcTable, uint8_t* pNonZeroCountCache, PBitStringAux pBs, int32_t iIndex, int32_t iMaxNumCoeff, 
+									 const uint8_t *kpZigzagTable, int32_t iResidualProperty, int16_t *pTCoeff, int32_t iMbMode, uint8_t uiQp, PWelsDecoderContext pCtx)
+{
+	int32_t iLevel[16];
+	int32_t iRun[16] = {0};
+	int32_t iZerosLeft = 0;	// CavlcGetTotalZeros() leaves it untouched when it runs out of bits
+	int32_t iCoeffNum  = -1;
+	int32_t iUsedBits  = 0;
+	int32_t i;
+	const uint16_t *kpDequantCoeff = g_kuiDequantCoeff[uiQp];
+	const int32_t kiCurNonZeroCacheIdx = g_kuiCacheNzcScanIdx[iIndex];
+	const int32_t kiCurIdx  = pBs->iIndex;
+	const bool_t  bChromaDc = (CHROMA_DC == iResidualProperty);
+	int8_t nA, nB, nC;
+	uint8_t uiTotalCoeff, uiTrailingOnes;
+	uint8_t *pBuf = ((uint8_t *)pBs->pStartBuf) + (kiCurIdx >> 3);
+	SReadBitsCache sReadBitsCache;
+
+	// load 4 bytes big-endian and drop the bits already consumed in the first byte;
+	// assembled in uint32_t because shifting into the sign bit of an int is undefined
+	sReadBitsCache.uiCache32Bit = ( ((uint32_t)pBuf[0] << 24) | ((uint32_t)pBuf[1] << 16)
+								  | ((uint32_t)pBuf[2] << 8)  |  (uint32_t)pBuf[3] ) << (kiCurIdx & 0x07);
+	sReadBitsCache.uiRemainBits = 32 - (kiCurIdx & 0x07);
+	sReadBitsCache.pBuf = pBuf;
+
+	// counts of the left and top neighbours select the coeff_token table
+	nA = pNonZeroCountCache[kiCurNonZeroCacheIdx-1];
+	nB = pNonZeroCountCache[kiCurNonZeroCacheIdx-8];
+	WELS_NON_ZERO_COUNT_AVERAGE( nC, nA, nB );
+
+	iUsedBits += CavlcGetTrailingOnesAndTotalCoeff(uiTotalCoeff, uiTrailingOnes, &sReadBitsCache, pVlcTable, bChromaDc, nC);
+
+	// DC blocks do not update the count cache
+	if ( iResidualProperty != CHROMA_DC && iResidualProperty != I16_LUMA_DC)
+	{
+		pNonZeroCountCache[kiCurNonZeroCacheIdx] = uiTotalCoeff;
+	}
+	if (0 == uiTotalCoeff)
+	{
+		pBs->iIndex += iUsedBits;
+		return 0;
+	}
+	if ( uiTrailingOnes > 3 || uiTotalCoeff > 16 ) //check uiTrailingOnes and uiTotalCoeff
+	{
+		return -1;
+	}
+	iUsedBits += CavlcGetLevelVal(iLevel, &sReadBitsCache, uiTotalCoeff, uiTrailingOnes);
+
+	// total_zeros is only coded when the block is not completely filled
+	if (uiTotalCoeff < iMaxNumCoeff)
+	{
+		iUsedBits += CavlcGetTotalZeros(iZerosLeft, &sReadBitsCache, uiTotalCoeff, pVlcTable, bChromaDc);
+	}
+
+	// coefficients plus zeros must fit into the block, otherwise the scan
+	// position below would leave kpZigzagTable / pTCoeff
+	if (iZerosLeft < 0 || (iZerosLeft + uiTotalCoeff) > iMaxNumCoeff)
+	{
+		return ERR_INFO_CAVLC_INVALID_ZERO_LEFT;
+	}
+	iUsedBits += CavlcGetRunBefore(iRun, &sReadBitsCache, uiTotalCoeff, pVlcTable, iZerosLeft);
+
+	pBs->iIndex += iUsedBits;
+
+	// only I16_LUMA_DC is stored for every macroblock mode; the other kinds need BASE_MB
+	if (iResidualProperty != I16_LUMA_DC && iMbMode != BASE_MB)
+	{
+		return 0;
+	}
+
+	// levels were decoded from the highest scan position downwards, so walk them backwards
+	for (i = uiTotalCoeff-1; i >= 0; --i)
+	{
+		int32_t j;
+		iCoeffNum += iRun[i] + 1;
+		// individual runs are not validated by CavlcGetRunBefore(); reject positions outside the block
+		if (iCoeffNum < 0 || iCoeffNum >= iMaxNumCoeff)
+		{
+			return ERR_INFO_CAVLC_INVALID_ZERO_LEFT;
+		}
+		j = kpZigzagTable[ iCoeffNum ];
+		if (I16_LUMA_DC == iResidualProperty)
+		{
+			//DC coefficent, only call in Intra_16x16, base_mode_flag = 0: stored without scaling
+			pTCoeff[j] = iLevel[i];
+		}
+		else if (bChromaDc)
+		{
+			//chroma dc scaling process, is kpDequantCoeff[0]? LevelScale(qPdc%6,0,0))<<(qPdc/6-6), the transform is done at construction.
+			pTCoeff[j] = iLevel[i]*kpDequantCoeff[0];
+		}
+		else
+		{
+			pTCoeff[j] = iLevel[i]*kpDequantCoeff[j & 0x07];
+		}
+	}
+
+	return 0;
+}
+
+/*!
+ * \brief	Parse the 16 Intra4x4 luma prediction modes and the chroma prediction mode of the current MB.
+ *			A neighbour counts as available whenever its pNeighAvail flag is set; the neighbour's
+ *			MB type is not examined in this variant.
+ *
+ * \param	pNeighAvail		availability flags of the left, top-left, top and top-right neighbours
+ * \param	pIntraPredMode	prediction mode cache: read through PredIntra4x4Mode() and updated per 4x4 block
+ * \param	pBs				bit stream the syntax elements are read from
+ * \param	pCurDqLayer		layer whose per-MB arrays (indexed by iMbXyIndex) receive the parsed modes
+ *
+ * \return	0 on success; ERR_INFO_INVALID_I4x4_PRED_MODE or ERR_INFO_INVALID_I_CHROMA_PRED_MODE otherwise
+ */
+int32_t ParseIntra4x4ModeConstrain0(PNeighAvail pNeighAvail, int8_t* pIntraPredMode, PBitStringAux pBs, PDqLayer pCurDqLayer)
+{
+	// availability map with a row stride of 6: [0] top-left, [1..4] top, [5] top-right, [6/12/18/24] left column
+	int32_t iSampleAvail[5*6] = { 0 }; //initialize as 0
+	int32_t iMbXy = pCurDqLayer->iMbXyIndex;
+	int32_t iFinalMode, i;
+	uint32_t uiChromaPredMode;
+
+	uint8_t uiNeighAvail = 0;
+
+	if ( pNeighAvail->iLeftAvail )  //left
+	{
+		iSampleAvail[ 6] =
+		iSampleAvail[12] =
+		iSampleAvail[18] =
+		iSampleAvail[24] = 1;
+	}
+	if ( pNeighAvail->iLeftTopAvail ) //top_left
+	{
+		iSampleAvail[0] = 1;
+	}
+	if ( pNeighAvail->iTopAvail ) //top
+	{
+		iSampleAvail[1] =
+		iSampleAvail[2] =
+		iSampleAvail[3] =
+		iSampleAvail[4] = 1;
+	}
+	if ( pNeighAvail->iRightTopAvail ) //top_right
+	{
+		iSampleAvail[5] = 1;
+	}
+
+	// bit 2: left, bit 1: top-left, bit 0: top
+	uiNeighAvail = (iSampleAvail[6]<<2) | (iSampleAvail[0]<<1) | (iSampleAvail[1]);
+
+	for(i = 0; i < 16; i++)
+	{
+		const int32_t kiPrevIntra4x4PredMode = BsGetOneBit(pBs);//1bit
+		const int32_t kiPredMode = PredIntra4x4Mode(pIntraPredMode, i);
+
+		int8_t iBestMode;
+		if (kiPrevIntra4x4PredMode)
+		{
+			// flag set: the predicted mode is used as is
+			iBestMode = kiPredMode;
+		}
+		else //kPrevIntra4x4PredMode == 0
+		{
+			// the 3-bit remainder skips over the predicted mode
+			const int32_t kiRemIntra4x4PredMode = BsGetBits(pBs, 3);//3bits
+			if (kiRemIntra4x4PredMode < kiPredMode)
+			{
+				iBestMode = kiRemIntra4x4PredMode;
+			}
+			else
+			{
+				iBestMode = kiRemIntra4x4PredMode + 1;
+			}
+		}
+
+		iFinalMode = CheckIntra4x4PredMode(&iSampleAvail[0], &iBestMode, i);
+		if (iFinalMode < 0)
+		{
+			return ERR_INFO_INVALID_I4x4_PRED_MODE;
+		}
+
+		pCurDqLayer->pIntra4x4FinalMode[iMbXy][g_kuiScan4[i]] = iFinalMode;
+
+		pIntraPredMode[g_kuiScan8[i]] = iBestMode;
+
+		// the block just parsed becomes an available neighbour for the following blocks
+		iSampleAvail[g_kuiCache30ScanIdx[i]] = 1;
+	}
+	// keep the bottom row and the right column of the mode cache in the per-MB storage
+	ST32(&pCurDqLayer->pIntraPredMode[iMbXy][0], LD32(&pIntraPredMode[1 + 8 * 4]));
+	pCurDqLayer->pIntraPredMode[iMbXy][4] = pIntraPredMode[4 + 8 * 1];
+	pCurDqLayer->pIntraPredMode[iMbXy][5] = pIntraPredMode[4 + 8 * 2];
+	pCurDqLayer->pIntraPredMode[iMbXy][6] = pIntraPredMode[4 + 8 * 3];
+
+	// Range-check the decoded value at full width before it is stored: intra_chroma_pred_mode is only
+	// defined for 0..3, and a store into a narrower element could wrap an oversized code into a legal mode.
+	uiChromaPredMode = BsGetUe(pBs);
+	if (uiChromaPredMode > 3)
+	{
+		return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+	}
+	pCurDqLayer->pChromaPredMode[iMbXy] = uiChromaPredMode;
+	if (CheckIntraChromaPredMode(uiNeighAvail, &pCurDqLayer->pChromaPredMode[iMbXy]))
+	{
+		return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+	}
+
+	return 0;
+}
+
+/*!
+ * \brief	Parse the 16 Intra4x4 luma prediction modes and the chroma prediction mode of the current MB.
+ *			In this variant a neighbour only counts as available when its pNeighAvail flag is set
+ *			AND its MB type satisfies IS_INTRA().
+ *
+ * \param	pNeighAvail		availability flags and MB types of the left, top-left, top and top-right neighbours
+ * \param	pIntraPredMode	prediction mode cache: read through PredIntra4x4Mode() and updated per 4x4 block
+ * \param	pBs				bit stream the syntax elements are read from
+ * \param	pCurDqLayer		layer whose per-MB arrays (indexed by iMbXyIndex) receive the parsed modes
+ *
+ * \return	0 on success; ERR_INFO_INVALID_I4x4_PRED_MODE or ERR_INFO_INVALID_I_CHROMA_PRED_MODE otherwise
+ */
+int32_t ParseIntra4x4ModeConstrain1(PNeighAvail pNeighAvail, int8_t* pIntraPredMode, PBitStringAux pBs, PDqLayer pCurDqLayer)
+{
+	// availability map with a row stride of 6: [0] top-left, [1..4] top, [5] top-right, [6/12/18/24] left column
+	int32_t iSampleAvail[5*6] = { 0 }; //initialize as 0
+	int32_t iMbXy = pCurDqLayer->iMbXyIndex;
+	int32_t iFinalMode, i;
+	uint32_t uiChromaPredMode;
+
+	uint8_t uiNeighAvail = 0;
+
+	if ( pNeighAvail->iLeftAvail && IS_INTRA( pNeighAvail->iLeftType ) )  //left
+	{
+		iSampleAvail[ 6] =
+		iSampleAvail[12] =
+		iSampleAvail[18] =
+		iSampleAvail[24] = 1;
+	}
+	if ( pNeighAvail->iLeftTopAvail && IS_INTRA( pNeighAvail->iLeftTopType ) ) //top_left
+	{
+		iSampleAvail[0] = 1;
+	}
+	if ( pNeighAvail->iTopAvail && IS_INTRA( pNeighAvail->iTopType ) ) //top
+	{
+		iSampleAvail[1] =
+		iSampleAvail[2] =
+		iSampleAvail[3] =
+		iSampleAvail[4] = 1;
+	}
+	if ( pNeighAvail->iRightTopAvail && IS_INTRA( pNeighAvail->iRightTopType ) ) //top_right
+	{
+		iSampleAvail[5] = 1;
+	}
+
+	// bit 2: left, bit 1: top-left, bit 0: top
+	uiNeighAvail = (iSampleAvail[6]<<2) | (iSampleAvail[0]<<1) | (iSampleAvail[1]);
+
+	for(i = 0; i < 16; i++)
+	{
+		const int32_t kiPrevIntra4x4PredMode = BsGetOneBit(pBs);//1bit
+		const int32_t kiPredMode = PredIntra4x4Mode(pIntraPredMode, i);
+
+		int8_t iBestMode;
+		if (kiPrevIntra4x4PredMode)
+		{
+			// flag set: the predicted mode is used as is
+			iBestMode = kiPredMode;
+		}
+		else //kPrevIntra4x4PredMode == 0
+		{
+			// the 3-bit remainder skips over the predicted mode
+			const int32_t kiRemIntra4x4PredMode = BsGetBits(pBs, 3);//3bits
+			if (kiRemIntra4x4PredMode < kiPredMode)
+			{
+				iBestMode = kiRemIntra4x4PredMode;
+			}
+			else
+			{
+				iBestMode = kiRemIntra4x4PredMode + 1;
+			}
+		}
+
+		iFinalMode = CheckIntra4x4PredMode(&iSampleAvail[0], &iBestMode, i);
+		if (iFinalMode < 0)
+		{
+			return ERR_INFO_INVALID_I4x4_PRED_MODE;
+		}
+
+		pCurDqLayer->pIntra4x4FinalMode[iMbXy][g_kuiScan4[i]] = iFinalMode;
+
+		pIntraPredMode[g_kuiScan8[i]] = iBestMode;
+
+		// the block just parsed becomes an available neighbour for the following blocks
+		iSampleAvail[g_kuiCache30ScanIdx[i]] = 1;
+	}
+	// keep the bottom row and the right column of the mode cache in the per-MB storage
+	ST32(&pCurDqLayer->pIntraPredMode[iMbXy][0], LD32(&pIntraPredMode[1 + 8 * 4]));
+	pCurDqLayer->pIntraPredMode[iMbXy][4] = pIntraPredMode[4 + 8 * 1];
+	pCurDqLayer->pIntraPredMode[iMbXy][5] = pIntraPredMode[4 + 8 * 2];
+	pCurDqLayer->pIntraPredMode[iMbXy][6] = pIntraPredMode[4 + 8 * 3];
+
+	// Range-check the decoded value at full width before it is stored: intra_chroma_pred_mode is only
+	// defined for 0..3, and a store into a narrower element could wrap an oversized code into a legal mode.
+	uiChromaPredMode = BsGetUe(pBs);
+	if (uiChromaPredMode > 3)
+	{
+		return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+	}
+	pCurDqLayer->pChromaPredMode[iMbXy] = uiChromaPredMode;
+	if (CheckIntraChromaPredMode(uiNeighAvail, &pCurDqLayer->pChromaPredMode[iMbXy]))
+	{
+		return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+	}
+
+	return 0;
+}
+
+/*!
+ * \brief	Validate the Intra16x16 luma mode stored in pIntraPredMode[iMbXy][7] against the neighbour
+ *			availability, then parse and validate the chroma prediction mode.
+ *			A neighbour counts as available whenever its pNeighAvail flag is set; the neighbour's
+ *			MB type is not examined in this variant.
+ *
+ * \param	pNeighAvail	availability flags of the left, top-left and top neighbours
+ * \param	pBs			bit stream the chroma prediction mode is read from
+ * \param	pCurDqLayer	layer whose per-MB mode arrays (indexed by iMbXyIndex) are checked / filled
+ *
+ * \return	0 on success; ERR_INFO_INVALID_I16x16_PRED_MODE or ERR_INFO_INVALID_I_CHROMA_PRED_MODE otherwise
+ */
+int32_t ParseIntra16x16ModeConstrain0(PNeighAvail pNeighAvail, PBitStringAux pBs, PDqLayer pCurDqLayer)
+{
+	int32_t iMbXy = pCurDqLayer->iMbXyIndex;
+	uint8_t uiNeighAvail = 0; //0x07 = 0 1 1 1, means left, top-left, top avail or not. (1: avail, 0: unavail)
+	uint32_t uiChromaPredMode;
+
+	if ( pNeighAvail->iLeftAvail )
+	{
+		uiNeighAvail = (1<<2);
+	}
+	if ( pNeighAvail->iLeftTopAvail )
+	{
+		uiNeighAvail |= (1<<1);
+	}
+	if ( pNeighAvail->iTopAvail )
+	{
+		uiNeighAvail |= 1;
+	}
+
+	if (CheckIntra16x16PredMode(uiNeighAvail, &pCurDqLayer->pIntraPredMode[iMbXy][7])) //invalid iPredMode, must stop decoding
+	{
+		return ERR_INFO_INVALID_I16x16_PRED_MODE;
+	}
+
+	// Range-check the decoded value at full width before it is stored: intra_chroma_pred_mode is only
+	// defined for 0..3, and a store into a narrower element could wrap an oversized code into a legal mode.
+	uiChromaPredMode = BsGetUe(pBs);
+	if (uiChromaPredMode > 3)
+	{
+		return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+	}
+	pCurDqLayer->pChromaPredMode[iMbXy] = uiChromaPredMode;
+
+	if (CheckIntraChromaPredMode(uiNeighAvail, &pCurDqLayer->pChromaPredMode[iMbXy]))
+	{
+		return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+	}
+
+	return 0;
+}
+
+/*!
+ * \brief	Validate the Intra16x16 luma mode stored in pIntraPredMode[iMbXy][7] against the neighbour
+ *			availability, then parse and validate the chroma prediction mode.
+ *			In this variant a neighbour only counts as available when its pNeighAvail flag is set
+ *			AND its MB type satisfies IS_INTRA().
+ *
+ * \param	pNeighAvail	availability flags and MB types of the left, top-left and top neighbours
+ * \param	pBs			bit stream the chroma prediction mode is read from
+ * \param	pCurDqLayer	layer whose per-MB mode arrays (indexed by iMbXyIndex) are checked / filled
+ *
+ * \return	0 on success; ERR_INFO_INVALID_I16x16_PRED_MODE or ERR_INFO_INVALID_I_CHROMA_PRED_MODE otherwise
+ */
+int32_t ParseIntra16x16ModeConstrain1(PNeighAvail pNeighAvail, PBitStringAux pBs, PDqLayer pCurDqLayer)
+{
+	int32_t iMbXy = pCurDqLayer->iMbXyIndex;
+	uint8_t uiNeighAvail = 0; //0x07 = 0 1 1 1, means left, top-left, top avail or not. (1: avail, 0: unavail)
+	uint32_t uiChromaPredMode;
+
+	if ( pNeighAvail->iLeftAvail && IS_INTRA( pNeighAvail->iLeftType ) )
+	{
+		uiNeighAvail = (1<<2);
+	}
+	if ( pNeighAvail->iLeftTopAvail && IS_INTRA( pNeighAvail->iLeftTopType ) )
+	{
+		uiNeighAvail |= (1<<1);
+	}
+	if ( pNeighAvail->iTopAvail && IS_INTRA( pNeighAvail->iTopType ) )
+	{
+		uiNeighAvail |= 1;
+	}
+
+	if (CheckIntra16x16PredMode(uiNeighAvail, &pCurDqLayer->pIntraPredMode[iMbXy][7])) //invalid iPredMode, must stop decoding
+	{
+		return ERR_INFO_INVALID_I16x16_PRED_MODE;
+	}
+
+	// Range-check the decoded value at full width before it is stored: intra_chroma_pred_mode is only
+	// defined for 0..3, and a store into a narrower element could wrap an oversized code into a legal mode.
+	uiChromaPredMode = BsGetUe(pBs);
+	if (uiChromaPredMode > 3)
+	{
+		return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+	}
+	pCurDqLayer->pChromaPredMode[iMbXy] = uiChromaPredMode;
+
+	if (CheckIntraChromaPredMode(uiNeighAvail, &pCurDqLayer->pChromaPredMode[iMbXy]))
+	{
+		return ERR_INFO_INVALID_I_CHROMA_PRED_MODE;
+	}
+
+	return 0;
+}
+
+/*!
+ * \brief	Parse the reference indices and motion vector differences of an inter macroblock and
+ *			store the resulting motion data for the current MB.
+ *
+ * \param	pCtx			decoder context; supplies the current layer and slice header, also used for logging
+ * \param	iMvArray		motion vector cache handed to the MV predictors; for sub-partitioned MBs the
+ *							decoded vectors are written back into it here
+ * \param	iRefIdxArray	reference index cache handed to the MV predictors; for sub-partitioned MBs the
+ *							parsed indices are written back into it here
+ * \param	pBs				bit stream the syntax elements are read from
+ *
+ * \return	0 on success; ERR_INFO_INVALID_REF_INDEX or ERR_INFO_INVALID_SUB_MB_TYPE on invalid syntax;
+ *			GENERATE_ERROR_NO(ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP) when a motion prediction flag is set
+ */
+int32_t ParseInterInfo(PWelsDecoderContext pCtx, int16_t iMvArray[LIST_A][30][MV_A], int8_t iRefIdxArray[LIST_A][30], PBitStringAux pBs)
+{
+	PSlice pSlice				= &pCtx->pCurDqLayer->sLayerInfo.sSliceInLayer;
+	PSliceHeader pSliceHeader	= &pSlice->sSliceHeaderExt.sSliceHeader;
+	int32_t iNumRefFrames		= pSliceHeader->pSps->iNumRefFrames;
+	int32_t iRefCount[2];
+	PDqLayer pCurDqLayer = pCtx->pCurDqLayer;
+	int32_t i, j;
+	int32_t iMbXy = pCurDqLayer->iMbXyIndex;
+	int32_t iMotionPredFlag[4];
+	int16_t iMv[2] = {0};
+
+	// every partition starts from the slice default; overwritten below when the flags are sent per MB
+	iMotionPredFlag[0] = iMotionPredFlag[1] = iMotionPredFlag[2] = iMotionPredFlag[3] = pSlice->sSliceHeaderExt.bDefaultMotionPredFlag;
+	iRefCount[0] = pSliceHeader->uiRefCount[0];
+	iRefCount[1] = pSliceHeader->uiRefCount[1];
+
+	switch( pCurDqLayer->pMbType[iMbXy] )
+	{
+	case MB_TYPE_16x16:
+		{
+			int8_t iRefIdx = 0;
+			if(pSlice->sSliceHeaderExt.bAdaptiveMotionPredFlag)
+			{
+				iMotionPredFlag[0] = BsGetOneBit(pBs);
+			}
+			if (iMotionPredFlag[0] == 0)
+			{
+				// range-check at full width: narrowing to int8_t first could wrap an invalid index into range
+				const int32_t kiParsedRefIdx = BsGetTe0(pBs, iRefCount[0]);
+				if (kiParsedRefIdx < 0 || kiParsedRefIdx >= iNumRefFrames) //error ref_idx
+				{
+					return ERR_INFO_INVALID_REF_INDEX;
+				}
+				iRefIdx = (int8_t)kiParsedRefIdx;
+			}
+			else
+			{
+				WelsLog( pCtx, WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. \n" );
+				return GENERATE_ERROR_NO(ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
+			}
+			PredMv(iMvArray, iRefIdxArray, 0, 4, iRefIdx, iMv);
+
+			// predictor + parsed difference
+			iMv[0] += BsGetSe(pBs);
+			iMv[1] += BsGetSe(pBs);
+
+			UpdateP16x16MotionInfo(pCurDqLayer, iRefIdx, iMv);
+		}
+		break;
+	case MB_TYPE_16x8:
+		{
+			int32_t iRefIdx[2];
+			for (i = 0; i < 2; i++)
+			{
+				if(pSlice->sSliceHeaderExt.bAdaptiveMotionPredFlag)
+				{
+					iMotionPredFlag[i] = BsGetOneBit(pBs);
+				}
+			}
+
+			for (i = 0; i < 2; i++)
+			{
+				if( iMotionPredFlag[i] )
+				{
+					WelsLog( pCtx, WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. \n" );
+					return GENERATE_ERROR_NO(ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
+				}
+				iRefIdx[i] = BsGetTe0(pBs, iRefCount[0]);
+				if (iRefIdx[i] < 0 || iRefIdx[i] >= iNumRefFrames) //error ref_idx
+				{
+					return ERR_INFO_INVALID_REF_INDEX;
+				}
+			}
+			for (i = 0; i < 2; i++)
+			{
+				PredInter16x8Mv(iMvArray, iRefIdxArray, i<<3, iRefIdx[i], iMv);
+
+				iMv[0] += BsGetSe(pBs);
+				iMv[1] += BsGetSe(pBs);
+
+				UpdateP16x8MotionInfo(pCurDqLayer, iMvArray, iRefIdxArray, i<<3, iRefIdx[i], iMv);
+			}
+		}
+		break;
+	case MB_TYPE_8x16:
+		{
+			int32_t iRefIdx[2];
+			for (i = 0; i < 2; i++)
+			{
+				if(pSlice->sSliceHeaderExt.bAdaptiveMotionPredFlag)
+				{
+					iMotionPredFlag[i] = BsGetOneBit(pBs);
+				}
+			}
+
+			for (i = 0; i < 2; i++)
+			{
+				if (iMotionPredFlag[i] == 0)
+				{
+					iRefIdx[i] = BsGetTe0(pBs, iRefCount[0]);
+					if (iRefIdx[i] < 0 || iRefIdx[i] >= iNumRefFrames) //error ref_idx
+					{
+						return ERR_INFO_INVALID_REF_INDEX;
+					}
+				}
+				else
+				{
+					WelsLog( pCtx, WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. \n" );
+					return GENERATE_ERROR_NO(ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
+				}
+
+			}
+			for (i = 0; i < 2; i++)
+			{
+				PredInter8x16Mv( iMvArray, iRefIdxArray, i<<2, iRefIdx[i], iMv);
+
+				iMv[0] += BsGetSe(pBs);
+				iMv[1] += BsGetSe(pBs);
+
+				UpdateP8x16MotionInfo(pCurDqLayer, iMvArray, iRefIdxArray, i<<2, iRefIdx[i], iMv);
+			}
+		}
+		break;
+	case MB_TYPE_8x8:
+	case MB_TYPE_8x8_REF0:
+		{
+			int8_t iRefIdx[4] = {0}, iSubPartCount[4], iPartWidth[4];
+			uint32_t uiSubMbType;
+
+			if ( MB_TYPE_8x8_REF0 == pCurDqLayer->pMbType[iMbXy])
+			{
+				iRefCount[0]	=
+				iRefCount[1]	= 1;
+			}
+
+			//uiSubMbType, partition
+			for (i = 0; i < 4; i++)
+			{
+				uiSubMbType = BsGetUe(pBs);
+				if (uiSubMbType >= 4) //invalid uiSubMbType
+				{
+					return ERR_INFO_INVALID_SUB_MB_TYPE;
+				}
+				pCurDqLayer->pSubMbType[iMbXy][i] = g_ksInterSubMbTypeInfo[uiSubMbType].iType;
+				iSubPartCount[i] = g_ksInterSubMbTypeInfo[uiSubMbType].iPartCount;
+				iPartWidth[i] = g_ksInterSubMbTypeInfo[uiSubMbType].iPartWidth;
+			}
+
+			if(pSlice->sSliceHeaderExt.bAdaptiveMotionPredFlag)
+			{
+				for(i=0; i<4; i++)
+				{
+					iMotionPredFlag[i] = BsGetOneBit(pBs);
+				}
+			}
+
+			//iRefIdxArray
+			if (MB_TYPE_8x8_REF0 == pCurDqLayer->pMbType[iMbXy])
+			{
+				// no ref_idx is parsed for this MB type: all 16 entries are forced to 0
+				memset(pCurDqLayer->pRefIndex[0][iMbXy], 0, 16);
+			}
+			else
+			{
+				for (i = 0; i < 4; i++)
+				{
+					int16_t iIndex8 = i << 2;
+					uint8_t uiScan4Idx = g_kuiScan4[iIndex8];
+
+					if (iMotionPredFlag[i] == 0)
+					{
+						// range-check at full width: narrowing to int8_t first could wrap an invalid index into range
+						const int32_t kiParsedRefIdx = BsGetTe0(pBs, iRefCount[0]);
+						if (kiParsedRefIdx < 0 || kiParsedRefIdx >= iNumRefFrames) //error ref_idx
+						{
+							return ERR_INFO_INVALID_REF_INDEX;
+						}
+						iRefIdx[i] = (int8_t)kiParsedRefIdx;
+
+						// one index per 8x8 block, replicated over its four 4x4 entries
+						pCurDqLayer->pRefIndex[0][iMbXy][uiScan4Idx  ] = pCurDqLayer->pRefIndex[0][iMbXy][uiScan4Idx+1] =
+						pCurDqLayer->pRefIndex[0][iMbXy][uiScan4Idx+4] = pCurDqLayer->pRefIndex[0][iMbXy][uiScan4Idx+5] = iRefIdx[i];
+					}
+					else
+					{
+						WelsLog( pCtx, WELS_LOG_WARNING, "inter parse: iMotionPredFlag = 1 not supported. \n" );
+						return GENERATE_ERROR_NO(ERR_LEVEL_MB_DATA, ERR_INFO_UNSUPPORTED_ILP);
+					}
+				}
+			}
+
+			//gain mv and update mv cache
+			for (i = 0; i < 4; i++)
+			{
+				int8_t iPartCount = iSubPartCount[i];
+				uint32_t uiSubMbType = pCurDqLayer->pSubMbType[iMbXy][i];
+				int16_t iMv[2], iPartIdx, iBlockWidth = iPartWidth[i], iIdx = i << 2;
+				uint8_t uiScan4Idx, uiCacheIdx;
+
+				uint8_t uiIdx4Cache = g_kuiCache30ScanIdx[iIdx];
+
+				// the reference index cache uses a row stride of 6 (entries +0, +1, +6, +7 form one 8x8 block)
+				iRefIdxArray[0][uiIdx4Cache  ] = iRefIdxArray[0][uiIdx4Cache+1] =
+				iRefIdxArray[0][uiIdx4Cache+6] = iRefIdxArray[0][uiIdx4Cache+7] = iRefIdx[i];
+
+				for (j = 0; j < iPartCount; j++)
+				{
+					iPartIdx = iIdx + j * iBlockWidth;
+					uiScan4Idx = g_kuiScan4[iPartIdx];
+					uiCacheIdx = g_kuiCache30ScanIdx[iPartIdx];
+					PredMv(iMvArray, iRefIdxArray, iPartIdx, iBlockWidth, iRefIdx[i], iMv);
+
+					iMv[0] += BsGetSe(pBs);
+					iMv[1] += BsGetSe(pBs);
+
+					// replicate the vector over every 4x4 entry the sub-partition covers,
+					// both in the layer storage (row stride 4) and in the MV cache (row stride 6)
+					if (SUB_MB_TYPE_8x8 == uiSubMbType)
+					{
+						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx], LD32(iMv));
+						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx+1], LD32(iMv));
+						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx+4], LD32(iMv));
+						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx+5], LD32(iMv));
+						ST32(iMvArray[0][uiCacheIdx  ], LD32(iMv));
+						ST32(iMvArray[0][uiCacheIdx+1], LD32(iMv));
+						ST32(iMvArray[0][uiCacheIdx+6], LD32(iMv));
+						ST32(iMvArray[0][uiCacheIdx+7], LD32(iMv));
+					}
+					else if (SUB_MB_TYPE_8x4 == uiSubMbType)
+					{
+						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx  ], LD32(iMv));
+						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx+1], LD32(iMv));
+						ST32(iMvArray[0][uiCacheIdx  ], LD32(iMv));
+						ST32(iMvArray[0][uiCacheIdx+1], LD32(iMv));
+					}
+					else if (SUB_MB_TYPE_4x8 == uiSubMbType)
+					{
+						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx  ], LD32(iMv));
+						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx+4], LD32(iMv));
+						ST32(iMvArray[0][uiCacheIdx  ], LD32(iMv));
+						ST32(iMvArray[0][uiCacheIdx+6], LD32(iMv));
+					}
+					else //SUB_MB_TYPE_4x4 == uiSubMbType
+					{
+						ST32(pCurDqLayer->pMv[0][iMbXy][uiScan4Idx  ], LD32(iMv));
+						ST32(iMvArray[0][uiCacheIdx  ], LD32(iMv));
+					}
+				}
+			}
+		}
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+} // namespace WelsDec
\ No newline at end of file
--- /dev/null
+++ b/codec/decoder/core/src/pic_queue.cpp
@@ -1,0 +1,166 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	pic_queue.c
+ *
+ * \brief	Recycled picture queue implementation
+ *
+ * \date	03/13/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include "pic_queue.h"
+#include "wels_const.h"
+#include "utils.h"
+#include "macros.h"
+#include "decoder_context.h"
+#include "codec_def.h"
+#include "mem_align.h"
+
+namespace WelsDec {
+
+void_t FreePicture( PPicture pPic );
+
+
+///////////////////////////////////Recycled queue management for pictures///////////////////////////////////
+/*	 ______________________________________
+  -->| P0 | P1 | P2 | P3 | P4 | .. | Pn-1 |-->
+	 -------------------------------------- 
+ *
+ *	How does it work?
+ *	node <- next; ++ next;
+ *
+*/
+
+
+
+/*!
+ * \brief	Create a picture descriptor and, when the decoder runs in SW_MODE, one contiguous
+ *			buffer holding the padded Y, Cb and Cr planes.
+ *
+ * \param	pCtx		decoder context; only iDecoderMode is consulted
+ * \param	kiPicWidth	picture width in pixels (without padding)
+ * \param	kiPicHeight	picture height in pixels (without padding)
+ *
+ * \return	the new picture, or NULL when an allocation fails
+ */
+PPicture AllocPicture( PWelsDecoderContext pCtx, const int32_t kiPicWidth, const int32_t kiPicHeight )
+{
+	PPicture pNewPic = (PPicture) WelsMalloc( sizeof(SPicture), "PPicture" );
+	WELS_VERIFY_RETURN_IF( NULL, NULL == pNewPic );
+
+	memset(pNewPic, 0, sizeof(SPicture) );
+
+	// plane dimensions: padding on both sides, then rounded up to the resolution alignment
+	const int32_t kiPaddedWidth		= WELS_ALIGN(kiPicWidth + (PADDING_LENGTH<<1), PICTURE_RESOLUTION_ALIGNMENT);
+	const int32_t kiPaddedHeight	= WELS_ALIGN(kiPicHeight + (PADDING_LENGTH<<1), PICTURE_RESOLUTION_ALIGNMENT);
+	const int32_t kiChromaWidth		= kiPaddedWidth >> 1;
+	const int32_t kiChromaHeight	= kiPaddedHeight >> 1;
+
+	const int32_t kiLumaBytes	= kiPaddedWidth * kiPaddedHeight;
+	const int32_t kiChromaBytes	= kiChromaWidth * kiChromaHeight;
+
+	if (SW_MODE == pCtx->iDecoderMode)
+	{
+		// single allocation: luma plane followed by Cb and Cr
+		pNewPic->pBuffer[0] = static_cast<uint8_t*> (WelsMalloc( kiLumaBytes + (kiChromaBytes << 1), "_pic->buffer[0]" ) );
+		WELS_VERIFY_RETURN_PROC_IF( NULL, NULL == pNewPic->pBuffer[0], FreePicture(pNewPic) );
+
+		pNewPic->iLinesize[0] = kiPaddedWidth;
+		pNewPic->iLinesize[1] = pNewPic->iLinesize[2] = kiChromaWidth;
+
+		pNewPic->pBuffer[1] = pNewPic->pBuffer[0] + kiLumaBytes;
+		pNewPic->pBuffer[2] = pNewPic->pBuffer[1] + kiChromaBytes;
+
+		// pData[] skips the padding border: PADDING_LENGTH rows and columns for luma, half of that for chroma
+		pNewPic->pData[0] = pNewPic->pBuffer[0] + (1+pNewPic->iLinesize[0]) * PADDING_LENGTH;
+		pNewPic->pData[1] = pNewPic->pBuffer[1] + ( ((1+pNewPic->iLinesize[1]) * PADDING_LENGTH) >> 1 );
+		pNewPic->pData[2] = pNewPic->pBuffer[2] + ( ((1+pNewPic->iLinesize[2]) * PADDING_LENGTH) >> 1 );
+	}
+
+	pNewPic->iPlanes		= 3;	// yv12 in default
+	pNewPic->iWidthInPixel	= kiPicWidth;
+	pNewPic->iHeightInPixel	= kiPicHeight;
+	pNewPic->iFrameNum		= -1;
+	pNewPic->bAvailableFlag	= true;
+
+	return pNewPic;
+}
+
+/*!
+ * \brief	Release a picture: first the plane buffer referenced by pBuffer[0] (if any),
+ *			then the descriptor itself. A NULL picture is ignored.
+ *
+ * \param	pPic	picture to release, may be NULL
+ */
+void_t FreePicture( PPicture pPic )
+{
+	if ( NULL == pPic )
+	{
+		return;
+	}
+
+	if ( pPic->pBuffer[0] )
+	{
+		WelsFree( pPic->pBuffer[0], "pPic->pBuffer[0]" );
+	}
+
+	WelsFree( pPic, "pPic" );
+}
+/*!
+ * \brief	Pick the next reusable picture from the recycling queue.
+ *			The search starts right behind iCurrentIdx, then wraps around to slot 0 and runs up to
+ *			and including slot iCurrentIdx, so the current slot is reconsidered last instead of
+ *			being skipped. A slot qualifies when it is non-NULL, has bAvailableFlag set and
+ *			bUsedAsRef clear.
+ *
+ * \param	pPicBuf	picture queue; on success iCurrentIdx is moved to the returned slot
+ *
+ * \return	the selected picture, or NULL when the queue has no capacity or no slot qualifies
+ *			(iCurrentIdx is left unchanged in that case)
+ */
+PPicture PrefetchPic( PPicBuff pPicBuf )
+{
+	int32_t iPicIdx = 0;
+	PPicture pPic  = NULL;
+
+	if (pPicBuf->iCapacity == 0)
+	{
+		return NULL;
+	}
+
+	// first pass: slots behind the current one
+	for ( iPicIdx = pPicBuf->iCurrentIdx+1; iPicIdx<pPicBuf->iCapacity ;++iPicIdx)
+	{
+		pPic = pPicBuf->ppPic[iPicIdx];
+		if (pPic !=NULL && pPic->bAvailableFlag && !pPic->bUsedAsRef)
+		{
+			pPicBuf->iCurrentIdx = iPicIdx;
+			return pPic;
+		}
+	}
+
+	// second pass: wrap around; the bound is inclusive so that slot iCurrentIdx is examined too,
+	// and capped by the capacity so the scan can never leave the array
+	for ( iPicIdx = 0 ; iPicIdx<=pPicBuf->iCurrentIdx && iPicIdx<pPicBuf->iCapacity ;++iPicIdx)
+	{
+		pPic = pPicBuf->ppPic[iPicIdx];
+		if (pPic !=NULL && pPic->bAvailableFlag && !pPic->bUsedAsRef)
+		{
+			pPicBuf->iCurrentIdx = iPicIdx;
+			return pPic;
+		}
+	}
+
+	return NULL;
+}
+
+} // namespace WelsDec
\ No newline at end of file
--- /dev/null
+++ b/codec/decoder/core/src/rec_mb.cpp
@@ -1,0 +1,589 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	rec_mb.c
+ *
+ * \brief	implementation for all macroblock decoding process after mb syntax parsing and residual decoding with cavlc.
+ *
+ * \date	3/18/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <memory.h>
+
+#include "macros.h"
+
+#include "rec_mb.h"
+#include "parse_mb_syn_cavlc.h"
+#include "get_intra_predictor.h"
+#include "decode_mb_aux.h"
+#include "decode_slice.h"
+
+namespace WelsDec {
+
+/*!
+ * \brief	Copy the strides of the picture being decoded into the layer and, when bOutput is set,
+ *			point pPred[0..2] at the current macroblock inside that picture.
+ *
+ * \param	pCtx		decoder context; pDec is the picture being decoded
+ * \param	bOutput		when false only the strides are updated
+ * \param	pCurLayer	layer to update; iMbX / iMbY select the macroblock
+ */
+void_t WelsFillRecNeededMbInfo(PWelsDecoderContext pCtx, bool_t bOutput, PDqLayer pCurLayer)
+{
+	PPicture pDecPic = pCtx->pDec;
+	const int32_t kiStrideY		= pDecPic->iLinesize[0];
+	const int32_t kiStrideUV	= pDecPic->iLinesize[1];
+	const int32_t kiMbX = pCurLayer->iMbX;
+	const int32_t kiMbY = pCurLayer->iMbY;
+
+	pCurLayer->iLumaStride		= kiStrideY;
+	pCurLayer->iChromaStride	= kiStrideUV;
+
+	if (!bOutput)
+	{
+		return;
+	}
+
+	// a macroblock spans 16 luma and 8 chroma samples in each direction
+	const int32_t kiLumaOffset		= (kiMbY * kiStrideY + kiMbX)<<4;
+	const int32_t kiChromaOffset	= (kiMbY * kiStrideUV + kiMbX)<<3;
+
+	pCurLayer->pPred[0] = pDecPic->pData[0] + kiLumaOffset;
+	pCurLayer->pPred[1] = pDecPic->pData[1] + kiChromaOffset;
+	pCurLayer->pPred[2] = pDecPic->pData[2] + kiChromaOffset;
+}
+
+/*!
+ * \brief	Reconstruct an Intra4x4 macroblock: luma first, then chroma.
+ *			The helpers' return codes are not inspected.
+ *
+ * \return	ERR_NONE
+ */
+int32_t RecI4x4Mb(int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer)
+{
+	(void_t) RecI4x4Luma(iMBXY, pCtx, pScoeffLevel, pDqLayer);
+	(void_t) RecI4x4Chroma(iMBXY, pCtx, pScoeffLevel, pDqLayer);
+
+	return ERR_NONE;
+}
+
+/*!
+ * \brief	Reconstruct the luma plane of an Intra4x4 macroblock: for each of the sixteen 4x4 blocks,
+ *			run the intra predictor selected by the block's final mode and, when the block has
+ *			non-zero coefficients, add the inverse-transformed residual.
+ *
+ * \param	iMBXY			macroblock index into the per-MB arrays of pDqLayer
+ * \param	pCtx			decoder context providing the predictor table, IDCT function and block offsets
+ * \param	pScoeffLevel	coefficients, 16 values per 4x4 block
+ * \param	pDqLayer		layer providing pPred[0], the luma stride, the final modes and pNzc
+ *
+ * \return	ERR_NONE
+ */
+int32_t RecI4x4Luma(int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer)
+{
+	uint8_t *pLumaBase				= pDqLayer->pPred[0];
+	const int32_t kiStride			= pDqLayer->iLumaStride;
+	int32_t *pBlkOffset				= pCtx->iDecBlockOffsetArray;
+	PGetIntraPredFunc *pPredTable	= pCtx->pGetI4x4LumaPredFunc;
+	int8_t *pFinalMode				= pDqLayer->pIntra4x4FinalMode[iMBXY];
+	PIdctResAddPredFunc pIdctAdd	= pCtx->pIdctResAddPredFunc;
+
+	for (int32_t iBlk = 0; iBlk < 16; ++iBlk)
+	{
+		uint8_t *pBlkPred		= pLumaBase + pBlkOffset[iBlk];
+		const uint8_t kuiMode	= pFinalMode[g_kuiScan4[iBlk]];
+
+		// prediction is written in place into the destination block
+		pPredTable[kuiMode](pBlkPred, kiStride);
+
+		if ( !pDqLayer->pNzc[iMBXY][g_kuiMbNonZeroCountIdx[iBlk]] )
+		{
+			continue;	// no residual for this block
+		}
+
+		pIdctAdd(pBlkPred, kiStride, pScoeffLevel + (iBlk<<4));
+	}
+
+	return ERR_NONE;
+}
+
+
+/*!
+ * \brief	Reconstruct both chroma planes of an intra macroblock: run the chroma predictor selected
+ *			by pChromaPredMode[iMBXY] on pPred[1] and pPred[2], then hand over to RecChroma().
+ *
+ * \param	iMBXY			macroblock index into the per-MB arrays of pDqLayer
+ * \param	pCtx			decoder context providing the chroma predictor table and the stride
+ * \param	pScoeffLevel	coefficients, forwarded to RecChroma()
+ * \param	pDqLayer		layer providing pPred[1], pPred[2] and the chroma prediction mode
+ *
+ * \return	ERR_NONE
+ */
+int32_t RecI4x4Chroma(int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer)
+{
+	const int32_t kiStride	= pCtx->pCurDqLayer->iCsStride[1];
+	const int8_t kiMode		= pDqLayer->pChromaPredMode[iMBXY];
+
+	// the same predictor serves both chroma planes
+	PGetIntraPredFunc pPredict = pCtx->pGetIChromaPredFunc[kiMode];
+
+	pPredict(pDqLayer->pPred[1], kiStride);
+	pPredict(pDqLayer->pPred[2], kiStride);
+
+	RecChroma(iMBXY, pCtx, pScoeffLevel, pDqLayer);
+
+	return ERR_NONE;
+}
+
+
+/*!
+ * \brief	Reconstruct an Intra16x16 macroblock: predict the whole luma block with the mode stored
+ *			in pIntraPredMode[iMBXY][7], add the residual of every 4x4 block that has non-zero
+ *			coefficients or a non-zero first coefficient, then predict and reconstruct chroma.
+ *
+ * \param	iMBXY			macroblock index into the per-MB arrays of pDqLayer
+ * \param	pCtx			decoder context providing predictor tables, IDCT function, block offsets
+ * \param	pScoeffLevel	coefficients, 16 values per 4x4 luma block; also forwarded to RecChroma()
+ * \param	pDqLayer		layer providing pPred[], the luma stride, the modes and pNzc
+ *
+ * \return	ERR_NONE
+ */
+int32_t RecI16x16Mb(int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer)
+{
+	const int8_t kiLumaMode		= pDqLayer->pIntraPredMode[iMBXY][7];
+	const int8_t kiChromaMode	= pDqLayer->pChromaPredMode[iMBXY];
+	const int32_t kiStrideUV	= pCtx->pCurDqLayer->iCsStride[1];
+	const int32_t kiStrideY		= pDqLayer->iLumaStride;
+	int32_t *pBlkOffset			= pCtx->iDecBlockOffsetArray;
+	uint8_t *pLumaBase			= pDqLayer->pPred[0];
+	PIdctResAddPredFunc pIdctAdd = pCtx->pIdctResAddPredFunc;
+
+	// luma: one 16x16 prediction, then per-4x4 residual
+	pCtx->pGetI16x16LumaPredFunc[kiLumaMode](pLumaBase, kiStrideY);
+
+	for (int32_t iBlk = 0; iBlk < 16; ++iBlk)
+	{
+		int16_t *pBlkCoeff = pScoeffLevel + (iBlk<<4);
+
+		// the first coefficient is tested separately from the block's non-zero count
+		if ( pDqLayer->pNzc[iMBXY][g_kuiMbNonZeroCountIdx[iBlk]] || pBlkCoeff[0] )
+		{
+			pIdctAdd(pLumaBase + pBlkOffset[iBlk], kiStrideY, pBlkCoeff);
+		}
+	}
+
+	// chroma: the same predictor serves Cb and Cr
+	PGetIntraPredFunc pChromaPredict = pCtx->pGetIChromaPredFunc[kiChromaMode];
+	pChromaPredict(pDqLayer->pPred[1], kiStrideUV);
+	pChromaPredict(pDqLayer->pPred[2], kiStrideUV);
+	RecChroma(iMBXY, pCtx, pScoeffLevel,pDqLayer);
+
+	return ERR_NONE;
+}
+
+// Working set for motion compensation of one block: where the prediction is written,
+// which reference planes are read, and the strides / picture size needed to address them.
+typedef struct TagMCRefMember {
+	// destination (prediction) pointers for the Y, U and V planes
+	uint8_t* pDstY;
+	uint8_t* pDstU;
+	uint8_t* pDstV;
+
+	// reference picture plane pointers (set from the reference picture's pData[] in GetRefPic)
+	uint8_t* pSrcY;
+	uint8_t* pSrcU;
+	uint8_t* pSrcV;
+
+	// strides of the reference planes
+	int32_t iSrcLineLuma;
+	int32_t iSrcLineChroma;
+
+	// strides of the destination planes
+	int32_t iDstLineLuma;
+	int32_t iDstLineChroma;
+
+	// luma picture size, used for the boundary checks in BaseMC
+	int32_t iPicWidth;
+	int32_t iPicHeight;
+}sMCRefMember;
+// Look up the LIST_0 reference picture addressed by pRefIdxList[iIndex] and copy its
+// plane pointers and strides into pMCRefMem. The destination fields are left untouched.
+static inline void_t GetRefPic(sMCRefMember* pMCRefMem, PWelsDecoderContext pCtx, int8_t* pRefIdxList, int32_t iIndex)
+{
+	const int8_t kiRefIdx	= pRefIdxList[iIndex];
+	PPicture pRef			= pCtx->sRefPic.pRefList[LIST_0][kiRefIdx];
+
+	pMCRefMem->pSrcY = pRef->pData[0];
+	pMCRefMem->pSrcU = pRef->pData[1];
+	pMCRefMem->pSrcV = pRef->pData[2];
+
+	pMCRefMem->iSrcLineLuma		= pRef->iLinesize[0];
+	pMCRefMem->iSrcLineChroma	= pRef->iLinesize[1];
+}
+
+
+#ifndef MC_FLOW_SIMPLE_JUDGE
+#define MC_FLOW_SIMPLE_JUDGE 1
+#endif //MC_FLOW_SIMPLE_JUDGE
+/*
+ * Motion-compensate one luma block and the two matching chroma blocks.
+ *
+ * pMCRefMem	reference / destination plane pointers, strides and picture size
+ * iXOffset		x position of the block in luma samples (scaled by 4 and added to iMVs[0])
+ * iYOffset		y position of the block in luma samples (scaled by 4 and added to iMVs[1])
+ * pMCFunc		supplies pMcLumaFunc and pMcChromaFunc
+ * iBlkWidth	luma block width; chroma uses half of it
+ * iBlkHeight	luma block height; chroma uses half of it
+ * iMVs			motion vector {x, y}; quarter-pixel units according to the inline comments below
+ *
+ * When the referenced area fails the range test below, the source samples are first gathered into the
+ * local buffer uiExpandBuf through FillBufForMc() (row stride 21) and interpolation runs on that copy;
+ * otherwise interpolation reads the reference planes directly. Chroma follows the luma decision.
+ */
+static inline void_t BaseMC(sMCRefMember* pMCRefMem, int32_t iXOffset, int32_t iYOffset, SMcFunc* pMCFunc,
+						   int32_t iBlkWidth, int32_t iBlkHeight, int16_t iMVs[2])
+{		
+	// how far outside the picture a block may reach before the copy path is taken
+	int32_t iExpandWidth = PADDING_LENGTH;
+	int32_t	iExpandHeight = PADDING_LENGTH;
+	
+
+	// integer part of the motion vector and the plane offsets it corresponds to
+	int16_t iMVX = iMVs[0] >> 2;
+	int16_t iMVY = iMVs[1] >> 2;
+	int32_t iMVOffsetLuma = iMVX + iMVY * pMCRefMem->iSrcLineLuma;
+	int32_t iMVOffsetChroma = (iMVX>>1) + (iMVY>>1) * pMCRefMem->iSrcLineChroma;
+
+	int32_t iFullMVx = (iXOffset << 2) + iMVs[0]; //quarter pixel
+	int32_t iFullMVy = (iYOffset << 2) + iMVs[1];
+	int32_t iIntMVx = iFullMVx >> 2;//integer pixel
+	int32_t iIntMVy = iFullMVy >> 2;
+
+	// offsets of the block position itself (without the motion vector)
+	int32_t iSrcPixOffsetLuma = iXOffset + iYOffset * pMCRefMem->iSrcLineLuma;
+	int32_t iSrcPixOffsetChroma = (iXOffset>>1) + (iYOffset>>1) * pMCRefMem->iSrcLineChroma;
+
+	int32_t iBlkWidthChroma = iBlkWidth >> 1;
+	int32_t iBlkHeightChroma = iBlkHeight >> 1;
+	int32_t iPicWidthChroma = pMCRefMem->iPicWidth >> 1;
+	int32_t iPicHeightChroma = pMCRefMem->iPicHeight >> 1;
+
+	//the offset only for luma padding if MV violation as there was 5-tap (-2, -1, 0, 1, 2) filter for luma (horizon and vertical)
+	int32_t iPadOffset = 2 + (pMCRefMem->iSrcLineLuma << 1); //(-2, -2) pixel location as the starting point
+
+    uint8_t* pSrcY = pMCRefMem->pSrcY + iSrcPixOffsetLuma;
+	uint8_t* pSrcU = pMCRefMem->pSrcU + iSrcPixOffsetChroma;
+    uint8_t* pSrcV = pMCRefMem->pSrcV + iSrcPixOffsetChroma;
+	uint8_t* pDstY = pMCRefMem->pDstY;
+	uint8_t* pDstU = pMCRefMem->pDstU;
+	uint8_t* pDstV = pMCRefMem->pDstV;
+	bool_t bExpand = false;
+
+	// scratch area for the copy path; addressed with a fixed row stride of 21 below
+	FORCE_STACK_ALIGN_1D( uint8_t, uiExpandBuf, (PADDING_LENGTH+6)*(PADDING_LENGTH+6), 16 );
+	
+	// the permitted overshoot shrinks by 3 when any of the low three bits of the full MV is set
+	// NOTE(review): presumably to leave room for the interpolation taps -- confirm
+	if (iFullMVx & 0x07)
+	{
+		iExpandWidth -= 3;
+	}		
+	if (iFullMVy & 0x07)
+	{
+		iExpandHeight -= 3;
+	}
+
+	// MC_FLOW_SIMPLE_JUDGE is defined right above this function unless the build already defines it,
+	// so the first condition is the one that gets compiled
+#ifdef MC_FLOW_SIMPLE_JUDGE
+	if (iIntMVx < -iExpandWidth || 
+		iIntMVy < -iExpandHeight || 
+		iIntMVx + iBlkWidth > pMCRefMem->iPicWidth - 1 + iExpandWidth || 
+		iIntMVy + iBlkHeight > pMCRefMem->iPicHeight - 1 + iExpandHeight)
+#else
+	if (iIntMVx < -iExpandWidth || 
+		iIntMVy < -iExpandHeight || 
+		iIntMVx + PADDING_LENGTH > pMCRefMem->iPicWidth + iExpandWidth || 
+		iIntMVy + PADDING_LENGTH > pMCRefMem->iPicHeight + iExpandHeight)
+#endif
+	{
+		// copy path: gather (iBlkWidth+5) x (iBlkHeight+5) luma samples starting at (-2, -2) of the block
+		FillBufForMc(uiExpandBuf, 21, pSrcY, pMCRefMem->iSrcLineLuma, iMVOffsetLuma-iPadOffset, 
+			            iBlkWidth+5, iBlkHeight+5, iIntMVx-2, iIntMVy-2, pMCRefMem->iPicWidth, pMCRefMem->iPicHeight);
+		pMCFunc->pMcLumaFunc(uiExpandBuf+44, 21, pDstY, pMCRefMem->iDstLineLuma, iFullMVx, iFullMVy, iBlkWidth, iBlkHeight);//44=2+2*21
+		bExpand = true;
+	}
+	else
+	{
+		// direct path: interpolate straight from the reference plane
+		pSrcY += iMVOffsetLuma;
+		pMCFunc->pMcLumaFunc(pSrcY, pMCRefMem->iSrcLineLuma, pDstY, pMCRefMem->iDstLineLuma, iFullMVx, iFullMVy, iBlkWidth, iBlkHeight);
+	}
+
+	if (bExpand)
+	{
+		// chroma copy path: one extra row and column per plane, the scratch buffer is reused for U then V
+		FillBufForMc(uiExpandBuf, 21, pSrcU, pMCRefMem->iSrcLineChroma, iMVOffsetChroma, iBlkWidthChroma+1, iBlkHeightChroma+1, iFullMVx>>3, iFullMVy>>3, iPicWidthChroma, iPicHeightChroma);
+		pMCFunc->pMcChromaFunc(uiExpandBuf, 21, pDstU, pMCRefMem->iDstLineChroma, iFullMVx, iFullMVy, iBlkWidthChroma, iBlkHeightChroma);
+		
+		FillBufForMc(uiExpandBuf, 21, pSrcV, pMCRefMem->iSrcLineChroma, iMVOffsetChroma, iBlkWidthChroma+1, iBlkHeightChroma+1, iFullMVx>>3, iFullMVy>>3, iPicWidthChroma, iPicHeightChroma);
+		pMCFunc->pMcChromaFunc(uiExpandBuf, 21, pDstV, pMCRefMem->iDstLineChroma, iFullMVx, iFullMVy, iBlkWidthChroma, iBlkHeightChroma);
+	}
+	else
+	{
+		pSrcU += iMVOffsetChroma;
+		pSrcV += iMVOffsetChroma;
+		pMCFunc->pMcChromaFunc(pSrcU, pMCRefMem->iSrcLineChroma, pDstU, pMCRefMem->iDstLineChroma, iFullMVx, iFullMVy, iBlkWidthChroma, iBlkHeightChroma);
+		pMCFunc->pMcChromaFunc(pSrcV, pMCRefMem->iSrcLineChroma, pDstV, pMCRefMem->iDstLineChroma, iFullMVx, iFullMVy, iBlkWidthChroma, iBlkHeightChroma);
+	}
+}
+
+/*!
+ * \brief	build the inter prediction of the current macroblock
+ *
+ * The macroblock is pCurDqLayer->iMbXyIndex of pCtx->pCurDqLayer. Depending on its
+ * type it is split into 16x16, 16x8, 8x16 or 8x8 partitions (8x8 partitions may be
+ * split again down to 4x4). For every partition the motion vector is read from
+ * pMv[0], the reference picture is selected with GetRefPic() and BaseMC() writes the
+ * motion compensated block. Macroblock types not listed in the switch are ignored.
+ *
+ * \param	pPredY	destination of the luma prediction (top-left of the macroblock)
+ * \param	pPredCb	destination of the Cb prediction
+ * \param	pPredCr	destination of the Cr prediction
+ * \param	pCtx	decoder context; supplies the current layer, the MC functions and
+ *					the destination strides (pDec->iLinesize[0] / [1])
+ */
+void_t GetInterPred(uint8_t *pPredY, uint8_t *pPredCb, uint8_t *pPredCr, PWelsDecoderContext pCtx)
+{
+	sMCRefMember pMCRefMem;
+	PDqLayer pCurDqLayer = pCtx->pCurDqLayer;
+	SMcFunc* pMCFunc = &pCtx->sMcFunc;
+
+	int32_t iMBXY = pCurDqLayer->iMbXyIndex;
+
+	int16_t iMVs[2] = {0};
+ 	
+	int32_t iMBType = pCurDqLayer->pMbType[iMBXY];
+
+	// position of the macroblock in luma samples (16 samples per macroblock)
+	int32_t iMBOffsetX = pCurDqLayer->iMbX << 4;
+ 	int32_t iMBOffsetY = pCurDqLayer->iMbY << 4;
+
+	int32_t iDstLineLuma   = pCtx->pDec->iLinesize[0];
+	int32_t iDstLineChroma = pCtx->pDec->iLinesize[1];
+	
+	int32_t iBlk8X, iBlk8Y, iBlk4X, iBlk4Y, i, j, iIIdx, iJIdx;
+
+	// picture size derived from the macroblock counts of the slice header
+	pMCRefMem.iPicWidth = (pCurDqLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.iMbWidth<<4);
+	pMCRefMem.iPicHeight = (pCurDqLayer->sLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.iMbHeight<<4);
+
+	pMCRefMem.pDstY = pPredY;
+	pMCRefMem.pDstU = pPredCb;
+	pMCRefMem.pDstV = pPredCr;
+
+	pMCRefMem.iDstLineLuma   = iDstLineLuma;
+	pMCRefMem.iDstLineChroma = iDstLineChroma;
+	switch(iMBType)
+	{
+	// skipped macroblocks take the same single-partition path as 16x16
+	case MB_TYPE_SKIP:
+ 	case MB_TYPE_16x16:
+		iMVs[0] = pCurDqLayer->pMv[0][iMBXY][0][0];
+		iMVs[1] = pCurDqLayer->pMv[0][iMBXY][0][1];
+		GetRefPic( &pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], 0 );
+		BaseMC(&pMCRefMem, iMBOffsetX, iMBOffsetY, pMCFunc, 16, 16, iMVs);
+		break;
+	case MB_TYPE_16x8:
+		// upper 16x8 partition: motion data at index 0
+		iMVs[0] = pCurDqLayer->pMv[0][iMBXY][0][0];
+		iMVs[1] = pCurDqLayer->pMv[0][iMBXY][0][1];
+		GetRefPic( &pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], 0 );
+		BaseMC(&pMCRefMem, iMBOffsetX, iMBOffsetY, pMCFunc, 16, 8, iMVs);
+
+		// lower 16x8 partition: motion data at index 8, destination 8 luma / 4 chroma rows down
+		iMVs[0] = pCurDqLayer->pMv[0][iMBXY][8][0];
+		iMVs[1] = pCurDqLayer->pMv[0][iMBXY][8][1];
+		GetRefPic( &pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], 8 );
+		pMCRefMem.pDstY = pPredY  + (iDstLineLuma << 3);
+		pMCRefMem.pDstU = pPredCb + (iDstLineChroma << 2);
+		pMCRefMem.pDstV = pPredCr + (iDstLineChroma << 2);
+		BaseMC(&pMCRefMem, iMBOffsetX, iMBOffsetY+8, pMCFunc, 16, 8, iMVs);
+		break;
+	case MB_TYPE_8x16:
+		// left 8x16 partition: motion data at index 0
+		iMVs[0] = pCurDqLayer->pMv[0][iMBXY][0][0];
+		iMVs[1] = pCurDqLayer->pMv[0][iMBXY][0][1];
+		GetRefPic( &pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], 0 );
+		BaseMC(&pMCRefMem, iMBOffsetX, iMBOffsetY, pMCFunc, 8, 16, iMVs);
+
+		// right 8x16 partition: motion data at index 2, destination 8 luma / 4 chroma columns right
+		iMVs[0] = pCurDqLayer->pMv[0][iMBXY][2][0];
+		iMVs[1] = pCurDqLayer->pMv[0][iMBXY][2][1];
+		GetRefPic( &pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], 2 );
+		pMCRefMem.pDstY = pPredY + 8;
+		pMCRefMem.pDstU = pPredCb + 4;
+		pMCRefMem.pDstV = pPredCr + 4;
+		BaseMC(&pMCRefMem, iMBOffsetX+8, iMBOffsetY, pMCFunc, 8, 16, iMVs);
+		break;
+	case MB_TYPE_8x8:
+	case MB_TYPE_8x8_REF0:
+		{
+			uint32_t iSubMBType;
+			int32_t iXOffset, iYOffset;
+			uint8_t *pDstY, *pDstU, *pDstV;
+			// one pass per 8x8 partition, visited left-to-right, top-to-bottom
+			for (i = 0; i < 4; i++)
+			{
+				iSubMBType = pCurDqLayer->pSubMbType[iMBXY][i];
+				iBlk8X = (i&1) << 3;
+				iBlk8Y = (i>>1) << 3;				
+				iXOffset = iMBOffsetX + iBlk8X;
+				iYOffset = iMBOffsetY + iBlk8Y;
+
+				// motion data index of the partition's first 4x4 block: 0, 2, 8, 10
+				iIIdx = ((i>>1)<<3) +((i&1)<<1);
+				GetRefPic( &pMCRefMem, pCtx, pCurDqLayer->pRefIndex[0][iMBXY], iIIdx );
+
+				// destination of this 8x8 partition; the sub-partition cases below offset from here
+				pDstY = pPredY + iBlk8X + iBlk8Y * iDstLineLuma;
+				pDstU = pPredCb + (iBlk8X >> 1) + (iBlk8Y >> 1) * iDstLineChroma;
+				pDstV = pPredCr + (iBlk8X >> 1) + (iBlk8Y >> 1) * iDstLineChroma;
+				pMCRefMem.pDstY = pDstY;
+				pMCRefMem.pDstU = pDstU;
+				pMCRefMem.pDstV = pDstV;
+				switch(iSubMBType)
+				{
+				case SUB_MB_TYPE_8x8:
+					iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx][0];
+					iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx][1];
+					BaseMC( &pMCRefMem, iXOffset, iYOffset, pMCFunc, 8, 8, iMVs );					
+					break;					
+				case SUB_MB_TYPE_8x4:
+					iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx][0];
+					iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx][1];
+					BaseMC(&pMCRefMem, iXOffset, iYOffset, pMCFunc, 8, 4, iMVs);
+
+					// lower 8x4 half: motion data 4 entries further, 4 luma / 2 chroma rows down
+					iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx+4][0];
+					iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx+4][1];
+					pMCRefMem.pDstY += (iDstLineLuma << 2);
+					pMCRefMem.pDstU += (iDstLineChroma << 1);
+					pMCRefMem.pDstV += (iDstLineChroma << 1);
+					BaseMC(&pMCRefMem, iXOffset, iYOffset+4, pMCFunc, 8, 4, iMVs);
+					break;
+				case SUB_MB_TYPE_4x8:
+					iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx][0];
+					iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx][1];
+					BaseMC(&pMCRefMem, iXOffset, iYOffset, pMCFunc, 4, 8, iMVs);
+
+					// right 4x8 half: next motion data entry, 4 luma / 2 chroma columns right
+					iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx+1][0];
+					iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx+1][1];
+					pMCRefMem.pDstY += 4;
+					pMCRefMem.pDstU += 2;
+					pMCRefMem.pDstV += 2;
+					BaseMC(&pMCRefMem, iXOffset+4, iYOffset, pMCFunc, 4, 8, iMVs);
+					break;
+				case SUB_MB_TYPE_4x4:
+					{
+						for (j = 0; j < 4; j++)
+						{
+							int32_t iUVLineStride;
+							// motion data offset of 4x4 block j inside the partition: 0, 1, 4, 5
+							iJIdx = ((j>>1)<<2) + (j&1);
+
+							iBlk4X = (j&1) << 2;
+							iBlk4Y = (j>>1) << 2;
+
+							// despite its name this is the chroma offset of the 4x4 block, not a stride
+							iUVLineStride = (iBlk4X >> 1) + (iBlk4Y >> 1) * iDstLineChroma; 
+							pMCRefMem.pDstY = pDstY + iBlk4X + iBlk4Y * iDstLineLuma;							
+							pMCRefMem.pDstU = pDstU + iUVLineStride;  
+							pMCRefMem.pDstV = pDstV + iUVLineStride;
+
+							iMVs[0] = pCurDqLayer->pMv[0][iMBXY][iIIdx+iJIdx][0];
+							iMVs[1] = pCurDqLayer->pMv[0][iMBXY][iIIdx+iJIdx][1];
+							BaseMC(&pMCRefMem, iXOffset+iBlk4X, iYOffset+iBlk4Y, pMCFunc, 4, 4, iMVs);
+						}
+					}
+					break;
+				default:
+					break;
+				}
+			}
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+/*!
+ * \brief	add the chroma residual of one macroblock onto its prediction
+ *
+ * Nothing is done unless the chroma part of the coded block pattern
+ * (pCbp[iMBXY] >> 4) is 1 or 2. In that case WelsChromaDcIdct() is run on both
+ * chroma coefficient areas, and every 4x4 block that has a non-zero count entry or a
+ * non-zero first coefficient is handed to pCtx->pIdctResAddPredFunc.
+ *
+ * \param	iMBXY			macroblock index used for the pCbp and pNzc look-ups
+ * \param	pCtx			decoder context (IDCT function, block offsets, chroma stride)
+ * \param	pScoeffLevel	coefficient buffer; chroma areas start at offsets 256 and 320
+ * \param	pDqLayer		layer supplying pCbp, pNzc and the planes pPred[1] / pPred[2]
+ * \return	ERR_NONE in all cases
+ */
+int32_t RecChroma(int32_t iMBXY, PWelsDecoderContext pCtx, int16_t *pScoeffLevel, PDqLayer pDqLayer)
+{
+	const int32_t kiStrideC = pCtx->pCurDqLayer->iCsStride[1];
+	PIdctResAddPredFunc pfAddResidual = pCtx->pIdctResAddPredFunc;
+	const uint8_t kuiChromaCbp = pDqLayer->pCbp[iMBXY] >> 4;
+
+	if ( 1 != kuiChromaCbp && 2 != kuiChromaCbp )
+	{
+		return ERR_NONE;
+	}
+
+	// each chroma area holds 4 blocks of 16 coefficients: 256 = first area, 320 = 256 + 64 = second area
+	WelsChromaDcIdct( pScoeffLevel + 256 );
+	WelsChromaDcIdct( pScoeffLevel + 320 );
+
+	for ( uint8_t uiPlane = 0; uiPlane < 2; ++uiPlane )
+	{
+		int16_t *pPlaneCoeff = pScoeffLevel + 256 + (uiPlane << 6);
+		uint8_t *pPlanePred = pDqLayer->pPred[uiPlane+1];
+		int32_t *pOffsets = ( 0 == uiPlane ) ? &pCtx->iDecBlockOffsetArray[16] : &pCtx->iDecBlockOffsetArray[20];
+
+		// one chroma plane is reconstructed as four 4x4 blocks
+		for ( uint8_t uiBlk = 0; uiBlk < 4; ++uiBlk )
+		{
+			int16_t *pBlkCoeff = &pPlaneCoeff[uiBlk<<4];
+			uint8_t *pBlkPred = pPlanePred + pOffsets[uiBlk];
+
+			if ( pDqLayer->pNzc[iMBXY][g_kuiMbNonZeroCountIdx[16+(uiPlane<<2)+uiBlk]] || pBlkCoeff[0] )
+			{
+				pfAddResidual(pBlkPred, kiStrideC, pBlkCoeff);
+			}
+		}
+	}
+
+	return ERR_NONE;
+}
+
+/*!
+ * \brief	build an iBlockWidth x iBlockHeight reference block in pBuf for a block
+ *			that lies partly or completely outside the picture
+ *
+ * The block position is first clamped so that at least one row and one column
+ * overlap the picture. The overlapping part is copied from pSrc; the remaining rows
+ * repeat the first/last copied row and the remaining columns repeat the first/last
+ * copied pixel of each row.
+ *
+ * \param	pBuf			destination buffer
+ * \param	iBufStride		distance in bytes between two rows of pBuf
+ * \param	pSrc			source plane pointer
+ * \param	iSrcStride		distance in bytes between two rows of pSrc
+ * \param	iSrcOffset		offset added to pSrc for the (unclamped) block position
+ * \param	iBlockWidth		width of the block to produce
+ * \param	iBlockHeight	height of the block to produce
+ * \param	iSrcX			horizontal block position in the picture (may be outside)
+ * \param	iSrcY			vertical block position in the picture (may be outside)
+ * \param	iPicWidth		picture width
+ * \param	iPicHeight		picture height
+ */
+void_t FillBufForMc(uint8_t *pBuf, int32_t iBufStride, uint8_t *pSrc, int32_t iSrcStride, int32_t iSrcOffset, 
+					 int32_t iBlockWidth, int32_t iBlockHeight, int32_t iSrcX, int32_t iSrcY, int32_t iPicWidth, int32_t iPicHeight)
+{
+	int32_t iRow;
+	int32_t iRowBegin, iColBegin, iRowEnd, iColEnd;
+	int32_t iValidCols, iLeftCols, iRightCols;
+	int32_t iSrcShift = 0;
+	uint8_t *pEdgeRow;
+
+	// clamp vertically, remembering how far the source pointer has to move
+	if( iSrcY >= iPicHeight )
+	{
+		iSrcShift += ( iPicHeight - 1 - iSrcY ) * iSrcStride;
+		iSrcY = iPicHeight - 1;
+	}
+	else if( iSrcY <= -iBlockHeight )
+	{
+		iSrcShift += ( 1 - iBlockHeight - iSrcY ) * iSrcStride;
+		iSrcY = 1 - iBlockHeight;
+	}
+
+	// clamp horizontally
+	if( iSrcX >= iPicWidth )
+	{
+		iSrcShift += ( iPicWidth - 1 - iSrcX );
+		iSrcX = iPicWidth - 1;
+	}
+	else if( iSrcX <= -iBlockWidth )
+	{
+		iSrcShift += ( 1 - iBlockWidth - iSrcX );
+		iSrcX = 1 - iBlockWidth;
+	}
+
+	iSrcShift += iSrcOffset;
+
+#define MAX(a,b) ((a) > (b) ? (a) : (b))	
+#define MIN(a,b) ((a) > (b) ? (b) : (a))
+
+	// [iRowBegin, iRowEnd) x [iColBegin, iColEnd) is the part of the block inside the picture
+	iRowBegin = MAX(0, -iSrcY);
+	iColBegin = MAX(0, -iSrcX);
+	iRowEnd = MIN(iBlockHeight, iPicHeight - iSrcY);
+	iColEnd = MIN(iBlockWidth, iPicWidth - iSrcX);
+
+	iValidCols = iColEnd - iColBegin;
+	iLeftCols = iColBegin;
+	iRightCols = iBlockWidth - iColEnd;
+
+	// copy existing part
+	for( iRow = iRowBegin; iRow < iRowEnd; iRow++ )
+	{
+		memcpy( pBuf + (iColBegin + iRow * iBufStride), pSrc + iSrcShift + (iColBegin + iRow * iSrcStride), iValidCols );
+	}
+
+	// top: repeat the first copied row
+	pEdgeRow = pBuf + iColBegin + iRowBegin * iBufStride;
+	for( iRow = 0; iRow < iRowBegin; iRow++ )
+	{
+		memcpy( pBuf + iColBegin + iRow * iBufStride, pEdgeRow, iValidCols );
+	}
+
+	// bottom: repeat the last copied row
+	pEdgeRow = pBuf + iColBegin + ( iRowEnd - 1 ) * iBufStride;
+	for( iRow = iRowEnd; iRow < iBlockHeight; iRow++ )
+	{
+		memcpy( pBuf + iColBegin + iRow * iBufStride, pEdgeRow, iValidCols );
+	}
+
+	// left and right: repeat the outermost valid pixel of every row
+	for( iRow = 0; iRow < iBlockHeight; iRow++ )
+	{
+		uint8_t *pLine = pBuf + iRow * iBufStride;
+
+		memset( pLine, pLine[iColBegin], iLeftCols );
+		memset( pLine + iColEnd, pLine[iColEnd - 1], iRightCols );
+	}
+}
+
+} // namespace WelsDec
--- /dev/null
+++ b/codec/decoder/core/src/utils.cpp
@@ -1,0 +1,307 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	utils.cpp
+ *
+ * \brief	common tool/function utilization
+ *
+ * \date	03/10/2009 Created
+ *
+ *************************************************************************************
+ */
+ 
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#if defined(WIN32)
+#include <windows.h>
+#include <sys/types.h>
+#include <sys/timeb.h>
+#else
+#include <sys/time.h>
+#include <sys/timeb.h>
+#endif
+
+#include "utils.h"
+#include "macros.h"
+#include "wels_const.h"
+#include "cpu_core.h"
+#include "decoder_context.h"
+#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross platforms
+#include "mem_align.h"
+
+namespace WelsDec {
+
+// cache line size (default 16; presumably in bytes -- TODO confirm against the users of this value)
+uint32_t g_uiCacheLineSize	= 16;
+// log callback that WelsLog() calls through; NULL until a callback is assigned elsewhere
+PWelsLogCallbackFunc g_pLog	= NULL;
+
+
+
+/*!
+ * \brief	forward a printf-style message to the installed log callback
+ * \param	pPtr	decoder context (cast to PWelsDecoderContext) supplying the trace handle
+ * \param	iLevel	log level, passed to the callback unchanged
+ * \param	kpFmt	printf-style format string, followed by its arguments
+ * \note	the message is dropped when no callback is installed or pPtr is NULL
+ */
+void_t WelsLog(void_t *pPtr, int32_t iLevel, const char *kpFmt, ...)
+{
+	va_list pVl;
+
+	PWelsDecoderContext pCtx  = (PWelsDecoderContext)pPtr;
+
+	// g_pLog is initialised to NULL, so logging before a callback is registered
+	// (or without a context) must not dereference either pointer
+	if( NULL == g_pLog || NULL == pCtx ){
+		return;
+	}
+
+	va_start(pVl, kpFmt);
+	g_pLog(pCtx->pTraceHandle, iLevel, kpFmt, pVl);
+	va_end(pVl);
+}
+
+
+#if  defined(WIN32)
+
+#if  defined(_MSC_VER) && (_MSC_VER>=1500)
+
+// Bounded formatted print for _MSC_VER >= 1500 builds.
+// Formats into pBuffer (at most iSizeOfBuffer bytes, truncating via _TRUNCATE) and
+// returns the result of vsnprintf_s() unchanged.
+int32_t WelsSnprintf(str_t * pBuffer,  int32_t iSizeOfBuffer, const str_t * kpFormat, ...)
+{
+	va_list pVarArgs;
+
+	va_start(pVarArgs, kpFormat);
+	const int32_t kiWritten = vsnprintf_s(pBuffer, iSizeOfBuffer, _TRUNCATE, kpFormat, pVarArgs);
+	va_end(pVarArgs);
+
+	return kiWritten;
+}
+
+// Copies up to iCount characters of kpSrc into pDest (capacity iSizeInBytes) with
+// strncpy_s(); the status returned by strncpy_s() is not inspected.
+// Always returns pDest.
+str_t* WelsStrncpy(str_t * pDest, int32_t iSizeInBytes, const str_t * kpSrc, int32_t iCount)
+{
+	(void)strncpy_s(pDest, iSizeInBytes, kpSrc, iCount);
+	return pDest;
+}
+
+// Length of kpStr, scanning at most iMaxlen characters (strnlen_s wrapper).
+int32_t WelsStrnlen(const str_t * kpStr,  int32_t iMaxlen)
+{
+	const int32_t kiLen = strnlen_s(kpStr, iMaxlen);
+	return kiLen;
+}
+
+// va_list flavour of the formatted print: forwards to vsprintf_s() with the buffer
+// capacity iSizeOfBuffer and returns its result.
+int32_t WelsVsprintf(str_t * pBuffer, int32_t iSizeOfBuffer, const str_t * kpFormat, va_list pArgPtr)
+{
+	const int32_t kiWritten = vsprintf_s(pBuffer, iSizeOfBuffer, kpFormat, pArgPtr);
+	return kiWritten;
+}
+
+// Opens kpFilename in mode kpMode with fopen_s().
+// Returns the opened handle, or NULL when fopen_s() reports an error.
+WelsFileHandle* WelsFopen(const str_t * kpFilename,  const str_t * kpMode)
+{
+	WelsFileHandle* pFile = NULL;
+	const bool kbOpened = ( 0 == fopen_s(&pFile, kpFilename, kpMode) );
+
+	return kbOpened ? pFile : NULL;
+}
+
+// Closes pFp and returns the result of fclose().
+int32_t WelsFclose(WelsFileHandle* pFp)
+{
+	const int32_t kiRet = fclose(pFp);
+	return kiRet;
+}
+
+// Fills pTp with the current time via _ftime_s() and returns its status code.
+int32_t WelsGetTimeOfDay(SWelsTime * pTp)
+{
+	const int32_t kiStatus = _ftime_s(pTp);
+	return kiStatus;
+}
+
+/*!
+ * \brief	format the time stored in kpTp as local time
+ * \param	pBuffer		destination buffer
+ * \param	iSize		capacity of pBuffer
+ * \param	kpFormat	strftime() format string
+ * \param	kpTp		time to format; only its .time member is read
+ * \return	result of strftime(), or 0 when the time cannot be converted to local time
+ */
+int32_t WelsStrftime(str_t * pBuffer, int32_t iSize, const str_t * kpFormat, const SWelsTime * kpTp)
+{
+	struct tm   sTimeNow;
+
+	// localtime_s() leaves sTimeNow unusable on failure; do not format garbage
+	if( localtime_s(&sTimeNow, &kpTp->time) != 0 ){
+		return 0;
+	}
+
+	return strftime(pBuffer, iSize, kpFormat, &sTimeNow);
+}
+
+#else 
+
+/*!
+ * \brief	bounded formatted print for WIN32 builds without the secure CRT
+ * \param	pBuffer			destination buffer
+ * \param	iSizeOfBuffer	capacity of pBuffer including the terminating NUL
+ * \param	kpFormat		printf-style format string, followed by its arguments
+ * \return	number of characters written, or -1 when the output was truncated or the
+ *			buffer is unusable (same convention as the _TRUNCATE based variant)
+ */
+int32_t WelsSnprintf(str_t * pBuffer,  int32_t iSizeOfBuffer, const str_t * kpFormat, ...)
+{
+	va_list pArgPtr;
+	int32_t iRc;
+
+	if( NULL == pBuffer || iSizeOfBuffer <= 0 ){
+		return -1;
+	}
+
+	va_start(pArgPtr, kpFormat);
+
+	// vsprintf() ignored iSizeOfBuffer and could overrun pBuffer; _vsnprintf() honours
+	// the limit but does not terminate on truncation, so one byte is reserved for the NUL
+	iRc = _vsnprintf(pBuffer, iSizeOfBuffer - 1, kpFormat, pArgPtr);
+	pBuffer[iSizeOfBuffer - 1] = '\0';
+
+	va_end(pArgPtr);
+
+	return iRc;
+}
+
+// Plain strncpy() wrapper: copies at most iCount characters of kpSrc into pDest and
+// returns pDest. iSizeInBytes is not used here, and as with strncpy() the result is
+// not NUL terminated when kpSrc holds iCount or more characters.
+str_t* WelsStrncpy(str_t * pDest, int32_t iSizeInBytes, const str_t * kpSrc, int32_t iCount)
+{
+	return strncpy(pDest, kpSrc, iCount);//confirmed_safe_unsafe_usage
+}
+
+/*!
+ * \brief	length of a string, scanning at most iMaxlen characters
+ * \param	kpStr	string to measure
+ * \param	iMaxlen	maximum number of characters to examine
+ * \return	number of characters before the terminating NUL, capped at iMaxlen
+ *			(0 when iMaxlen is not positive)
+ */
+int32_t WelsStrnlen(const str_t * kpStr,  int32_t iMaxlen)
+{
+	int32_t iLen = 0;
+
+	// strlen() ignored iMaxlen and would read past an unterminated buffer
+	while( iLen < iMaxlen && kpStr[iLen] != '\0' ){
+		++iLen;
+	}
+
+	return iLen;
+}
+
+/*!
+ * \brief	bounded va_list formatted print for WIN32 builds without the secure CRT
+ * \param	pBuffer			destination buffer
+ * \param	iSizeOfBuffer	capacity of pBuffer including the terminating NUL
+ * \param	kpFormat		printf-style format string
+ * \param	pArgPtr			arguments for kpFormat
+ * \return	number of characters written, or -1 when the output was truncated or the
+ *			buffer is unusable
+ */
+int32_t WelsVsprintf(str_t * pBuffer, int32_t iSizeOfBuffer, const str_t * kpFormat, va_list pArgPtr)
+{
+	int32_t iRc;
+
+	if( NULL == pBuffer || iSizeOfBuffer <= 0 ){
+		return -1;
+	}
+
+	// vsprintf() ignored iSizeOfBuffer; _vsnprintf() honours the limit but does not
+	// terminate on truncation, so one byte is reserved for the NUL
+	iRc = _vsnprintf(pBuffer, iSizeOfBuffer - 1, kpFormat, pArgPtr);
+	pBuffer[iSizeOfBuffer - 1] = '\0';
+
+	return iRc;
+}
+
+
+// Opens kpFilename in mode kpMode; returns whatever fopen() returns (NULL on failure).
+WelsFileHandle* WelsFopen(const str_t * kpFilename,  const str_t * kpMode)
+{
+	WelsFileHandle* pFile = fopen(kpFilename, kpMode);
+	return pFile;
+}
+
+// Closes pFp and returns the result of fclose().
+int32_t WelsFclose(WelsFileHandle* pFp)
+{
+	const int32_t kiRet = fclose(pFp);
+	return kiRet;
+}
+
+/*!
+ * \brief	fill pTp with the current time via _ftime()
+ * \param	pTp	receives the current time
+ * \return	always 0; _ftime() has no result to report
+ */
+int32_t WelsGetTimeOfDay(SWelsTime * pTp)
+{
+	// _ftime() is declared as returning void, so its "result" cannot be returned
+	_ftime(pTp);
+
+	return 0;
+}
+
+/*!
+ * \brief	format the time stored in kpTp as local time
+ * \param	pBuffer		destination buffer
+ * \param	iSize		capacity of pBuffer
+ * \param	kpFormat	strftime() format string
+ * \param	kpTp		time to format; only its .time member is read
+ * \return	result of strftime(), or 0 when the time cannot be converted to local time
+ */
+int32_t WelsStrftime(str_t * pBuffer, int32_t iSize, const str_t * kpFormat, const SWelsTime * kpTp)
+{
+	struct tm  * pTnow;
+
+	pTnow = localtime(&kpTp->time);
+
+	// localtime() returns NULL for values it cannot convert; strftime() must not see it
+	if( NULL == pTnow ){
+		return 0;
+	}
+
+	return strftime(pBuffer, iSize, kpFormat, pTnow);
+}
+
+
+#endif // _MSC_VER
+
+#else  //GCC
+
+// Bounded formatted print for non-WIN32 builds.
+// Formats into pBuffer (capacity iSizeOfBuffer) and returns the result of vsnprintf().
+int32_t WelsSnprintf(str_t * pBuffer,  int32_t iSizeOfBuffer, const str_t * kpFormat, ...)
+{
+	va_list pVarArgs;
+
+	va_start(pVarArgs, kpFormat);
+	const int32_t kiWritten = vsnprintf(pBuffer, iSizeOfBuffer, kpFormat, pVarArgs);
+	va_end(pVarArgs);
+
+	return kiWritten;
+}
+
+// Plain strncpy() wrapper: copies at most iCount characters of kpSrc into pDest and
+// returns pDest. iSizeInBytes is not used here, and as with strncpy() the result is
+// not NUL terminated when kpSrc holds iCount or more characters.
+str_t* WelsStrncpy(str_t * pDest, int32_t iSizeInBytes, const str_t * kpSrc, int32_t iCount)
+{
+	strncpy(pDest, kpSrc, iCount);//confirmed_safe_unsafe_usage
+
+	return pDest;
+}
+
+#if !defined(MACOS) && !defined(UNIX) && !defined(APPLE_IOS)
+// Length of kpStr, scanning at most iMaxlen characters (strnlen wrapper).
+int32_t WelsStrnlen(const str_t * kpStr,  int32_t iMaxlen)
+{
+	const int32_t kiLen = strnlen(kpStr, iMaxlen);//confirmed_safe_unsafe_usage
+	return kiLen;
+}
+#else
+/*!
+ * \brief	length of a string, scanning at most iMaxlen characters
+ *
+ * Used on platforms where string.h provides no strnlen, so the bounded scan is
+ * implemented here.
+ *
+ * \param	kpString	string to measure
+ * \param	iMaxlen		maximum number of characters to examine
+ * \return	number of characters before the terminating NUL, capped at iMaxlen
+ *			(0 when iMaxlen is not positive)
+ */
+int32_t WelsStrnlen(const str_t *kpString, int32_t iMaxlen)
+{
+	const str_t *kpSrc = kpString;
+
+	// the previous body referred to an undeclared identifier (pString) and, in its
+	// disabled branch, returned from inside the loop on the first iteration
+	while( iMaxlen-- > 0 && *kpSrc != '\0' ){
+		++kpSrc;
+	}
+
+	return (int32_t)(kpSrc - kpString);
+}
+#endif
+
+/*!
+ * \brief	bounded va_list formatted print for non-WIN32 builds
+ * \param	pBuffer			destination buffer
+ * \param	iSizeOfBuffer	capacity of pBuffer including the terminating NUL
+ * \param	kpFormat		printf-style format string
+ * \param	pArgPtr			arguments for kpFormat
+ * \return	result of vsnprintf(): the length the complete output would have had
+ */
+int32_t WelsVsprintf(str_t * pBuffer, int32_t iSizeOfBuffer, const str_t * kpFormat, va_list pArgPtr)
+{
+	// vsprintf() ignored iSizeOfBuffer and could overrun pBuffer; use the bounded
+	// call, as WelsSnprintf() of this platform branch already does
+	return vsnprintf(pBuffer, iSizeOfBuffer, kpFormat, pArgPtr);
+}
+
+// Opens kpFilename in mode kpMode; returns whatever fopen() returns (NULL on failure).
+WelsFileHandle* WelsFopen(const str_t * kpFilename,  const str_t * kpMode)
+{
+	WelsFileHandle* pFile = fopen(kpFilename, kpMode);
+	return pFile;
+}
+
+// Closes pFp and returns the result of fclose().
+int32_t WelsFclose(WelsFileHandle  * pFp)
+{
+	const int32_t kiRet = fclose(pFp);
+	return kiRet;
+}
+
+/*!
+ * \brief	read the current time via gettimeofday()
+ * \param	pTp	receives the seconds in .time and the milliseconds in .millitm
+ * \return	0 on success, -1 when gettimeofday() fails
+ */
+int32_t WelsGetTimeOfDay(SWelsTime * pTp)
+{
+        struct timeval  sTv;
+
+        if( gettimeofday(&sTv, NULL) ){
+             return -1;
+        }
+
+        pTp->time = sTv.tv_sec;
+        // divide first, then narrow: a cast binds tighter than '/', so the former
+        // (uint16_t)sTv.tv_usec/1000 cut the microseconds down to 16 bits before dividing
+        pTp->millitm = (uint16_t)(sTv.tv_usec/1000);
+
+        return 0;
+}
+
+/*!
+ * \brief	format the time stored in kpTp as local time
+ * \param	pBuffer		destination buffer
+ * \param	iSize		capacity of pBuffer
+ * \param	kpFormat	strftime() format string
+ * \param	kpTp		time to format; only its .time member is read
+ * \return	result of strftime(), or 0 when the time cannot be converted to local time
+ */
+int32_t WelsStrftime(str_t * pBuffer, int32_t iSize, const str_t * kpFormat, const SWelsTime * kpTp)
+{
+	struct tm  * pTnow;
+
+	pTnow = localtime(&kpTp->time);
+
+	// localtime() returns NULL for values it cannot convert; strftime() must not see it
+	if( NULL == pTnow ){
+		return 0;
+	}
+
+	return strftime(pBuffer, iSize, kpFormat, pTnow);
+}
+
+#endif
+
+
+// Writes iCount items of iSize bytes each from kpBuffer to pFp.
+// Returns the item count reported by fwrite().
+int32_t WelsFwrite(const void_t * kpBuffer, int32_t iSize, int32_t iCount, WelsFileHandle* pFp)
+{
+	const int32_t kiItems = fwrite(kpBuffer, iSize, iCount, pFp);
+	return kiItems;
+}
+
+// Returns the millitm member of kpTp.
+uint16_t WelsGetMillsecond(const SWelsTime * kpTp)
+{
+	const uint16_t kuiMillis = kpTp->millitm;
+	return kuiMillis;
+}
+
+// Flushes pFp and returns the result of fflush().
+int32_t WelsFflush(WelsFileHandle* pFp)
+{
+	const int32_t kiRet = fflush(pFp);
+	return kiRet;
+}
+
+} // namespace WelsDec
\ No newline at end of file
--- /dev/null
+++ b/codec/decoder/plus/inc/StdAfx.h
@@ -1,0 +1,56 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+// stdafx.h : include file for standard system include files,
+//  or project specific include files that are used frequently, but
+//      are changed infrequently
+//
+
+#if !defined(AFX_STDAFX_H__695FA8A5_BDC4_4206_8B7D_EE290F91E766__INCLUDED_)
+#define AFX_STDAFX_H__695FA8A5_BDC4_4206_8B7D_EE290F91E766__INCLUDED_
+
+#if _MSC_VER > 1000
+#pragma once
+#endif // _MSC_VER > 1000
+
+
+// Insert your headers here
+#define WIN32_LEAN_AND_MEAN		// Exclude rarely-used stuff from Windows headers
+
+#include <windows.h>
+
+// TODO: reference additional headers your program requires here
+
+//{{AFX_INSERT_LOCATION}}
+// Microsoft Visual C++ will insert additional declarations immediately before the previous line.
+
+#endif // !defined(AFX_STDAFX_H__695FA8A5_BDC4_4206_8B7D_EE290F91E766__INCLUDED_)
--- /dev/null
+++ b/codec/decoder/plus/inc/welsCodecTrace.h
@@ -1,0 +1,172 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef WELS_CODEC_TRACE
+#define WELS_CODEC_TRACE
+
+
+#include "typedefs.h"
+
+//using namespace WelsDec;
+namespace WelsDec {
+
+// Signature of the variadic trace functions stored in CWelsCodecTrace.
+// The non-WIN32 form takes an additional leading kpDllName argument.
+#ifdef WIN32
+typedef int ( *CM_WELS_TRACE)( const char* kpFormat, ...);
+#else
+typedef int ( *CM_WELS_TRACE)( const char* kpDllName, const char* kpFormat, ...);
+#endif
+
+
+// Kind of trace object requested from CreateWelsTrace().
+// NOTE(review): the values presumably map to CWelsCodecTrace, CWelsTraceFile and
+// CWelsTraceWinDgb respectively -- confirm against the CreateWelsTrace() implementation.
+typedef  enum {
+	Wels_Trace_Type     = 0,
+	Wels_Trace_Type_File    = 1,
+	Wels_Trace_Type_WinDgb  = 2,
+} EWelsTraceType;
+
+/*!
+ * Abstract trace interface.
+ *
+ * Implementations provide SetTraceLevel() and Trace(). The two static helpers take
+ * the trace object as an untyped pointer and forward to its Trace() method; a NULL
+ * object is silently ignored.
+ */
+class  IWelsTrace
+{
+public:
+	// Log level bit flags; WELS_LOG_DEFAULT enables error, warning, info and debug.
+	// MAX_LOG_SIZE is presumably the maximum length of one formatted message -- TODO confirm.
+	enum {
+		WELS_LOG_QUIET     = 0,
+		WELS_LOG_ERROR     = 1 << 0,
+		WELS_LOG_WARNING   = 1 << 1,
+		WELS_LOG_INFO      = 1 << 2,
+		WELS_LOG_DEBUG     = 1 << 3,
+		WELS_LOG_RESV      = 1 << 4,
+		WELS_LOG_DEFAULT   = WELS_LOG_ERROR | WELS_LOG_WARNING | WELS_LOG_INFO | WELS_LOG_DEBUG,
+
+		MAX_LOG_SIZE       = 1024
+	};
+
+	virtual ~IWelsTrace() {}
+
+	virtual int32_t  SetTraceLevel(int32_t iLevel) = 0;
+	virtual int32_t  Trace(const int32_t kLevel, const str_t * kpFormat,  va_list pVl) = 0;
+
+	// va_list entry point: forwards to pObject->Trace() when pObject is not NULL
+	static void_t  WelsTrace(void_t* pObject, const int32_t kLevel, const str_t * kpFormat, va_list pVl)
+	{
+		IWelsTrace* pSink = (IWelsTrace*)(pObject);
+
+		if( !pSink ){
+			return;
+		}
+
+		pSink->Trace(kLevel, kpFormat, pVl);
+	}
+
+	// variadic entry point: packs the arguments into a va_list and forwards to Trace()
+	static void_t WelsVTrace(void_t *pObject, const int32_t kLevel, const str_t *kpFormat, ...)
+	{
+		IWelsTrace* pSink = (IWelsTrace*)(pObject);
+
+		if( !pSink ){
+			return;
+		}
+
+		va_list  pArgs;
+
+		va_start(pArgs, kpFormat);
+		pSink->Trace(kLevel, kpFormat, pArgs);
+		va_end(pArgs);
+	}
+};
+
+/*!
+ * Common base of the concrete trace classes.
+ *
+ * Declares SetTraceLevel() and Trace() (defined outside this header) and leaves
+ * WriteString() to the derived class. The level starts out as WELS_LOG_DEFAULT.
+ */
+class CWelsTraceBase : public IWelsTrace
+{
+public:
+	virtual int32_t  SetTraceLevel(int32_t iLevel);
+	virtual int32_t  Trace(const int32_t kLevel, const str_t * kpFormat,  va_list pVl);
+
+	// output hook implemented by each concrete trace class
+	virtual int32_t  WriteString(int32_t iLevel, const str_t * pStr) = 0;
+
+protected:
+	// only derived classes may construct the base
+	CWelsTraceBase() : m_iLevel(WELS_LOG_DEFAULT) {}
+
+private:
+	int32_t   m_iLevel;
+};
+
+/*!
+ * Trace class holding a file handle (m_pTraceFile).
+ * The file name defaults to "wels_decoder_trace.txt"; presumably messages are written
+ * to that file -- the implementation is outside this header.
+ */
+class CWelsTraceFile : public CWelsTraceBase
+{
+public:
+	CWelsTraceFile(const str_t  * filename = (const str_t *)"wels_decoder_trace.txt");
+	virtual ~CWelsTraceFile();
+
+	virtual int32_t  WriteString(int32_t iLevel, const str_t * pStr);
+
+private:
+	WelsFileHandle* m_pTraceFile;
+};
+
+#ifdef  WIN32
+/*!
+ * WIN32-only trace class without state of its own; presumably WriteString() sends
+ * the message to the Windows debugger output -- the implementation is outside this header.
+ */
+class CWelsTraceWinDgb : public CWelsTraceBase
+{
+public:
+	CWelsTraceWinDgb() {}
+	virtual ~CWelsTraceWinDgb() {}
+
+	virtual int32_t  WriteString(int32_t iLevel, const str_t * pStr);
+};
+#endif
+
+/*!
+ * Trace class holding a module handle and one CM_WELS_TRACE function pointer per
+ * level (debug, info, warning, error). Presumably LoadWelsTraceModule() resolves the
+ * pointers from an external trace module and UnloadWelsTraceModule() releases it --
+ * the implementation is outside this header.
+ */
+class CWelsCodecTrace : public CWelsTraceBase
+{
+public:
+	CWelsCodecTrace() ;
+	virtual ~CWelsCodecTrace();
+
+	virtual int32_t  WriteString(int32_t iLevel, const str_t * pStr);
+
+protected:
+	int32_t  LoadWelsTraceModule();
+	int32_t  UnloadWelsTraceModule();
+
+private:
+	void_t  * m_hTraceHandle;
+
+	// one trace function per log level
+	CM_WELS_TRACE m_fpDebugTrace;
+	CM_WELS_TRACE m_fpInfoTrace;
+	CM_WELS_TRACE m_fpWarnTrace;
+	CM_WELS_TRACE m_fpErrorTrace;
+};
+
+
+// Factory for trace objects of the requested eType; pParam is optional (defaults to NULL).
+// NOTE(review): the meaning of pParam and the ownership of the returned object are not
+// visible in this header -- confirm against the implementation.
+IWelsTrace  * CreateWelsTrace(EWelsTraceType  eType,  void_t * pParam = NULL);
+
+} // namespace WelsDec
+
+#endif //WELS_CODEC_TRACE
--- /dev/null
+++ b/codec/decoder/plus/inc/welsDecoderExt.h
@@ -1,0 +1,121 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ *  WelsDecoderExt.h
+ *
+ *  Abstract
+ *      Cisco OpenH264 decoder extension utilization interface
+ *
+ *  History
+ *      3/12/2009 Created
+ *
+ *
+ *************************************************************************/
+#if !defined(AFX_WELSH264DECODER_H__D9FAA1D1_5403_47E1_8E27_78F11EE65F02__INCLUDED_)
+#define AFX_WELSH264DECODER_H__D9FAA1D1_5403_47E1_8E27_78F11EE65F02__INCLUDED_
+
+#include "codec_api.h"
+#include "codec_app_def.h"
+#include "decoder_context.h"
+#include "welsCodecTrace.h"
+
+#if _MSC_VER > 1000
+#pragma once
+#endif // _MSC_VER > 1000
+
+
+class ISVCDecoder;
+
+namespace WelsDec {
+
+//#define OUTPUT_BIT_STREAM  ////for test to output bitstream
+
+// ISVCDecoder implementation of the decoder; owns a decoder context (m_pDecContext)
+// and a trace object (m_pTrace).
+class CWelsDecoder : public ISVCDecoder  
+{
+public:
+	CWelsDecoder(void_t);
+	virtual ~CWelsDecoder();
+
+	virtual long Initialize(void_t* pParam, const INIT_TYPE keInitType);
+	virtual long Unintialize();		
+	
+	/***************************************************************************
+	*	Description:
+	*		Decompress one frame, and output RGB24 or YV12 decoded stream and its length.
+	*	Input parameters:
+	*       Parameter		TYPE			       Description
+	*       kpSrc           const unsigned char*   the h264 stream to decode
+	*       kiSrcLen        const int              the length of the h264 stream
+	*       ppDst           unsigned char**        buffer pointer(s) of decoded data
+	*       pStride         int*                   presumably the line stride(s) of the output -- TODO confirm
+	*       iWidth, iHeight int&                   presumably the decoded picture size -- TODO confirm
+	*
+	*	return: if decode frame success return 0, otherwise corresponding error returned.
+	***************************************************************************/
+	virtual DECODING_STATE DecodeFrame(	const unsigned char* kpSrc,
+		                                const int kiSrcLen,	
+		                                unsigned char** ppDst,
+		                                int* pStride,
+		                                int& iWidth,
+		                                int& iHeight	);
+
+	// Same operation with the output description carried in pDstInfo
+	// (information provided to API including width, height, SW/HW option, etc).
+	virtual DECODING_STATE DecodeFrame(	const unsigned char* kpSrc,
+											const int kiSrcLen,	
+											void_t ** ppDst,
+											SBufferInfo* pDstInfo);
+	// Variant writing into a caller supplied buffer pDst with stride iDstStride.
+	// NOTE(review): the direction of iDstLen / color_format is not visible here -- confirm.
+	virtual DECODING_STATE DecodeFrameEx( const unsigned char * kpSrc,
+		                                  const int kiSrcLen,
+		                                  unsigned char * pDst,
+										  int iDstStride,
+		                                  int & iDstLen,
+		                                  int & iWidth,
+		                                  int & iHeight,
+		                                  int & color_format);
+
+    virtual long SetOption(DECODER_OPTION eOptID, void_t* pOption);
+	virtual long GetOption(DECODER_OPTION eOptID, void_t* pOption);
+
+private:	
+	PWelsDecoderContext 				m_pDecContext;
+	IWelsTrace							*m_pTrace;
+	
+	void_t InitDecoder( void_t );
+	void_t UninitDecoder( void_t );
+	
+	// bitstream dump files, only present when OUTPUT_BIT_STREAM is defined (test aid)
+#ifdef OUTPUT_BIT_STREAM
+	WelsFileHandle* m_pFBS;
+	WelsFileHandle* m_pFBSSize;
+#endif//OUTPUT_BIT_STREAM
+	
+};
+
+} // namespace WelsDec
+
+#endif // !defined(AFX_WELSH264DECODER_H__D9FAA1D1_5403_47E1_8E27_78F11EE65F02__INCLUDED_)
--- /dev/null
+++ b/codec/decoder/plus/res/resource.h
@@ -1,0 +1,15 @@
+//{{NO_DEPENDENCIES}}
+// Microsoft Developer Studio generated include file.
+// Used by welsdec.rc
+//
+
+// Next default values for new objects
+// 
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE        101
+#define _APS_NEXT_COMMAND_VALUE         40001
+#define _APS_NEXT_CONTROL_VALUE         1000
+#define _APS_NEXT_SYMED_VALUE           101
+#endif
+#endif
binary files /dev/null b/codec/decoder/plus/res/welsdec.aps differ
--- /dev/null
+++ b/codec/decoder/plus/res/welsdec.rc
@@ -1,0 +1,115 @@
+// Microsoft Visual C++ generated resource script.
+//
+#include "resource.h"
+
+#define APSTUDIO_READONLY_SYMBOLS
+/////////////////////////////////////////////////////////////////////////////
+//
+// Generated from the TEXTINCLUDE 2 resource.
+//
+#include "afxres.h"
+
+/////////////////////////////////////////////////////////////////////////////
+#undef APSTUDIO_READONLY_SYMBOLS
+
+/////////////////////////////////////////////////////////////////////////////
+// Chinese (P.R.C.) resources
+
+#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_CHS)
+#ifdef _WIN32
+LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED
+#pragma code_page(936)
+#endif //_WIN32
+
+#ifdef APSTUDIO_INVOKED
+/////////////////////////////////////////////////////////////////////////////
+//
+// TEXTINCLUDE
+//
+
+1 TEXTINCLUDE 
+BEGIN
+    "resource.h\0"
+END
+
+2 TEXTINCLUDE 
+BEGIN
+    "#include ""afxres.h""\r\n"
+    "\0"
+END
+
+3 TEXTINCLUDE 
+BEGIN
+    "\r\n"
+    "\0"
+END
+
+#endif    // APSTUDIO_INVOKED
+
+#endif    // Chinese (P.R.C.) resources
+/////////////////////////////////////////////////////////////////////////////
+
+
+/////////////////////////////////////////////////////////////////////////////
+// English (U.S.) resources
+
+#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU)
+#ifdef _WIN32
+LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US
+#pragma code_page(1252)
+#endif //_WIN32
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Version
+//
+
+VS_VERSION_INFO VERSIONINFO
+ FILEVERSION 0,0,0,0
+ PRODUCTVERSION 0,0,0,0
+ FILEFLAGSMASK 0x3fL
+#ifdef _DEBUG
+ FILEFLAGS 0x1L
+#else
+ FILEFLAGS 0x0L
+#endif
+ FILEOS 0x40004L
+ FILETYPE 0x2L
+ FILESUBTYPE 0x0L
+BEGIN
+    BLOCK "StringFileInfo"
+    BEGIN
+        BLOCK "040904b0"
+        BEGIN
+            VALUE "Comments", "Cisco OpenH264 decoder"
+            VALUE "CompanyName", "Cisco system"
+            VALUE "FileDescription", "Cisco OpenH264 decoder"
+            VALUE "FileVersion", "0, 0, 0, 0"
+            VALUE "InternalName", "welsdec.dll"
+            VALUE "LegalCopyright", "� 2011-2015 Cisco and/or its affiliates. All rights reserved."
+            VALUE "OriginalFilename", "welsdec.dll"
+            VALUE "ProductName", "Cisco OpenH264 decoder"
+            VALUE "ProductVersion", "0, 0, 0, 0"
+        END
+    END
+    BLOCK "VarFileInfo"
+    BEGIN
+        VALUE "Translation", 0x409, 1200
+    END
+END
+
+#endif    // English (U.S.) resources
+/////////////////////////////////////////////////////////////////////////////
+
+
+
+#ifndef APSTUDIO_INVOKED
+/////////////////////////////////////////////////////////////////////////////
+//
+// Generated from the TEXTINCLUDE 3 resource.
+//
+
+
+/////////////////////////////////////////////////////////////////////////////
+#endif    // not APSTUDIO_INVOKED
+
--- /dev/null
+++ b/codec/decoder/plus/src/StdAfx.cpp
@@ -1,0 +1,40 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+// stdafx.cpp : source file that includes just the standard includes
+//	WelsDecPlus.pch will be the pre-compiled header
+//	stdafx.obj will contain the pre-compiled type information
+
+#include "stdafx.h"
+
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
--- /dev/null
+++ b/codec/decoder/plus/src/welsCodecTrace.cpp
@@ -1,0 +1,419 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef WIN32
+#include <windows.h>
+#include <tchar.h>
+#endif
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+
+#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross platforms
+#include "utils.h"
+
+#include "welsCodecTrace.h"
+#include "utils.h"
+#if defined LINUX || defined SOLARIS || defined UNIX || defined MACOS //LINUX/SOLARIS/UNIX
+#include <dlfcn.h>
+#endif
+
+#if defined(MACOS)
+#include <carbon/carbon.h>
+#include <CoreFoundation/CFBundle.h>
+#endif//MACOS
+
+//using namespace WelsDec;
+
+namespace WelsDec {
+
+#ifdef MACOS
+/*!
+ * \brief	Create a CFBundle reference for a bundle located relative to this binary (Mac OS)
+ *
+ * The path of the image containing this code is queried with dladdr(), its last
+ * four path components are stripped, and lpszbundle is appended to what remains.
+ *
+ * \param	lpszbundle	bundle name to append, e.g. "welstrace.bundle"
+ *
+ * \return	bundle reference owned by the caller (release with FreeLibrary), or NULL
+ *			when the path cannot be resolved or the bundle cannot be created
+ */
+static CFBundleRef LoadLibrary(const char* lpszbundle)
+{
+	if(NULL == lpszbundle)
+		return NULL;
+
+	// 1.get bundle path
+	char cBundlePath[PATH_MAX];
+	memset(cBundlePath, 0, PATH_MAX);
+	
+	Dl_info 	dlInfo;
+	static int  sDummy;
+	// dladdr() returns 0 on failure; dlInfo must not be read in that case
+	if(0 == dladdr((void_t*)&sDummy, &dlInfo) || NULL == dlInfo.dli_fname)
+		return NULL;
+	
+	strlcpy(cBundlePath, dlInfo.dli_fname, PATH_MAX);
+	
+	// strip the last four path components; give up if the path is shallower than that
+	char * pPath = NULL;
+	for(int i = 4; i > 0; i--)
+	{
+		pPath = strrchr(cBundlePath,'/');//confirmed_safe_unsafe_usage
+		if(pPath)
+		{
+			*pPath = 0;
+		}
+		else
+		{
+			break;
+		}
+	}
+	if(NULL == pPath)
+		return NULL;
+	
+	strlcat(cBundlePath, "/", PATH_MAX);
+	strlcat(cBundlePath, lpszbundle, PATH_MAX);
+	
+	FSRef bundlePath;
+	OSStatus iStatus = FSPathMakeRef((unsigned char*)cBundlePath, &bundlePath, NULL);
+	if(noErr != iStatus)
+		return NULL;
+	
+	CFURLRef bundleURL = CFURLCreateFromFSRef(kCFAllocatorSystemDefault, &bundlePath);
+	if(NULL == bundleURL)
+		return NULL;
+	
+	// 2.get bundle ref (the bundle executable is not explicitly loaded here)
+	CFBundleRef bundleRef = CFBundleCreate(kCFAllocatorSystemDefault, bundleURL);
+	CFRelease(bundleURL);
+	
+	return bundleRef;
+}
+
+/*!
+ * \brief	Drop the reference held on a bundle obtained from LoadLibrary (Mac OS)
+ *
+ * \param	bundle	bundle reference, may be NULL
+ *
+ * \return	always TRUE
+ */
+static Boolean FreeLibrary(CFBundleRef bundle)
+{
+	// only the reference is released; the executable is not explicitly unloaded
+	if(bundle != NULL)
+		CFRelease(bundle);
+
+	return TRUE;
+}
+
+/*!
+ * \brief	Look up an exported function by name in a bundle (Mac OS)
+ *
+ * \param	bundle			bundle reference returned by LoadLibrary
+ * \param	lpszprocname	C string naming the function
+ *
+ * \return	function address, or NULL if an argument is NULL, the name cannot be
+ *			converted to a CFString, or the lookup returns NULL
+ */
+static void_t* GetProcessAddress(CFBundleRef bundle, const char* lpszprocname)
+{
+	if(NULL == bundle || NULL == lpszprocname)
+		return NULL;
+	
+	CFStringRef cfprocname = CFStringCreateWithCString(NULL,lpszprocname,CFStringGetSystemEncoding());
+	// CFRelease() must not be handed NULL, so bail out if the string was not created
+	if(NULL == cfprocname)
+		return NULL;
+
+	void_t *processAddress = CFBundleGetFunctionPointerForName(bundle,cfprocname);
+	CFRelease(cfprocname);
+	
+	return processAddress;
+}
+#endif
+
+
+
+/*!
+ * \brief	Store the trace level; Trace() ANDs each message level against it
+ *
+ * \param	iLevel	new level value
+ *
+ * \return	always 0
+ */
+int32_t  CWelsTraceBase::SetTraceLevel(int iLevel)
+{
+	this->m_iLevel = iLevel;
+	return 0;
+}
+
+/*!
+ * \brief	Format one message, prefix it with "[DECODER]: " and pass it to WriteString()
+ *
+ * Nothing is emitted unless kLevel shares at least one bit with the stored level.
+ *
+ * \param	kLevel		level of this message
+ * \param	kpFormat	printf-style format string
+ * \param	pVl			arguments matching kpFormat
+ *
+ * \return	always 0
+ */
+int32_t  CWelsTraceBase::Trace(const int kLevel, const str_t *kpFormat, va_list pVl)
+{
+	if( NULL == kpFormat || 0 == (kLevel & m_iLevel) )
+		return 0;
+
+	str_t chBuf[MAX_LOG_SIZE] = {0};
+	const int32_t kLen	= WelsStrnlen((const str_t *)"[DECODER]: ", MAX_LOG_SIZE);
+
+	WelsStrncpy(chBuf, MAX_LOG_SIZE, (const str_t *)"[DECODER]: ", kLen);
+	WelsVsprintf((chBuf + kLen),  MAX_LOG_SIZE - kLen, (const str_t *)kpFormat, pVl);
+	// guarantee termination even if the formatter filled the buffer completely
+	chBuf[MAX_LOG_SIZE - 1] = '\0';
+
+	WriteString(kLevel, chBuf);
+
+	return 0;
+}
+
+/*!
+ * \brief	Open the trace file with mode "wt"; the handle may be NULL if opening fails
+ *
+ * \param	pFileName	path of the trace file
+ */
+CWelsTraceFile::CWelsTraceFile(const str_t * pFileName)
+{
+	const str_t * const kpOpenMode = (const str_t *)"wt";
+
+	m_pTraceFile = WelsFopen(pFileName, kpOpenMode);
+}
+
+/*!
+ * \brief	Close the trace file if one was opened
+ */
+CWelsTraceFile::~CWelsTraceFile()
+{
+	if( NULL == m_pTraceFile )
+		return;
+
+	WelsFclose(m_pTraceFile);
+	m_pTraceFile = NULL;
+}
+
+/*!
+ * \brief	Append one message followed by "\n" to the trace file and flush it
+ *
+ * \param	iLevel	message level (not used by this sink)
+ * \param	pStr	message text; at most MAX_LOG_SIZE characters are measured
+ *
+ * \return	sum of the two WelsFwrite() results, or 0 when no file is open
+ */
+int32_t CWelsTraceFile::WriteString(int32_t iLevel, const str_t * pStr)
+{
+	static const str_t kchNewLine[16] = "\n";
+
+	if( NULL == m_pTraceFile )
+		return 0;
+
+	int iWritten = WelsFwrite(pStr, 1, WelsStrnlen(pStr, MAX_LOG_SIZE), m_pTraceFile);
+	iWritten += WelsFwrite(kchNewLine, 1, WelsStrnlen(kchNewLine,  16), m_pTraceFile);
+	WelsFflush(m_pTraceFile);
+
+	return iWritten;
+}
+
+
+#ifdef WIN32
+
+/*!
+ * \brief	Send one message to the Windows debugger output
+ *
+ * \param	iLevel	message level (not used by this sink)
+ * \param	pStr	message text
+ *
+ * \return	length of pStr, measured up to MAX_LOG_SIZE characters
+ */
+int32_t CWelsTraceWinDgb::WriteString(int32_t iLevel, const str_t * pStr)
+{
+	OutputDebugStringA(pStr);
+
+	const int32_t kiLen = WelsStrnlen(pStr, MAX_LOG_SIZE);
+	return kiLen;
+}
+
+#endif
+
+/*!
+ * \brief	Clear the module handle and the four trace entry points, then try to
+ *			load the external trace module (the load result is not checked here)
+ */
+CWelsCodecTrace::CWelsCodecTrace()
+{
+	m_fpDebugTrace = m_fpInfoTrace = m_fpWarnTrace = m_fpErrorTrace = NULL;
+	m_hTraceHandle = NULL;
+
+	LoadWelsTraceModule();
+}
+
+/*!
+ * \brief	Release the external trace module, if any was loaded
+ */
+CWelsCodecTrace::~CWelsCodecTrace()
+{
+	(void)UnloadWelsTraceModule();
+}
+
+/*!
+ * \brief	Load the external trace module and resolve its four trace entry points
+ *
+ * WIN32: loads "welstrace.dll", re-opens it by its full module path and resolves
+ *        WELSDEBUGA / WELSINFOA / WELSWARNA / WELSERRORA.
+ * MACOS: loads "welstrace.bundle" and resolves WELSDEBUG2 / WELSINFO2 / WELSWARN2 / WELSERROR2.
+ * LINUX/SOLARIS/UNIX: loads "libwelstrace.so" from the directory of the image that
+ *        contains this code and resolves the same *2 symbols; failures are appended
+ *        to /tmp/trace.txt.
+ *
+ * \return	0 on success (or when no platform branch is compiled in), -1 on failure.
+ *			On MACOS a missing bundle is not reported as an error.
+ */
+int32_t  CWelsCodecTrace::LoadWelsTraceModule()
+{	
+#if defined WIN32	
+	HMODULE hHandle = ::LoadLibrary("welstrace.dll");
+//	HMODULE handle = ::LoadLibrary("contrace.dll");  // for c7 trace
+	if ( NULL == hHandle )
+		return -1;
+
+	CHAR chPath[ _MAX_PATH]= {0};
+	GetModuleFileName( (HMODULE)hHandle, chPath, _MAX_PATH);
+
+	m_hTraceHandle = ::LoadLibrary(chPath);
+	
+	OutputDebugStringA(chPath);
+	if( m_hTraceHandle) {
+		m_fpDebugTrace = ( CM_WELS_TRACE)::GetProcAddress( ( HMODULE)m_hTraceHandle, "WELSDEBUGA");
+		m_fpInfoTrace = ( CM_WELS_TRACE)::GetProcAddress( ( HMODULE)m_hTraceHandle, "WELSINFOA");
+		m_fpWarnTrace = ( CM_WELS_TRACE)::GetProcAddress( ( HMODULE)m_hTraceHandle, "WELSWARNA");
+		m_fpErrorTrace = ( CM_WELS_TRACE)::GetProcAddress( ( HMODULE)m_hTraceHandle, "WELSERRORA");
+	}
+
+	// the first handle was only needed to discover the module path
+	::FreeLibrary(hHandle);
+	hHandle = NULL;
+#elif defined MACOS
+	m_hTraceHandle = LoadLibrary("welstrace.bundle");
+	if(m_hTraceHandle) {
+		m_fpDebugTrace = ( CM_WELS_TRACE)GetProcessAddress( (CFBundleRef)m_hTraceHandle, "WELSDEBUG2");
+		m_fpInfoTrace = ( CM_WELS_TRACE)GetProcessAddress( (CFBundleRef)m_hTraceHandle, "WELSINFO2");
+		m_fpWarnTrace = ( CM_WELS_TRACE)GetProcessAddress( (CFBundleRef)m_hTraceHandle, "WELSWARN2");
+		m_fpErrorTrace = ( CM_WELS_TRACE)GetProcessAddress( (CFBundleRef)m_hTraceHandle, "WELSERROR2");
+	}
+#elif defined LINUX || defined SOLARIS || defined UNIX
+	str_t chPath[255]= {0};
+	Dl_info		sDlInfo;
+	static int	iMmTPAddress;
+
+	// dladdr() returns 0 on failure; sDlInfo must not be read in that case
+	if (0 == dladdr( &iMmTPAddress, &sDlInfo) || NULL == sDlInfo.dli_fname)
+		return -1;
+
+	// copy at most 254 characters so the zero-initialised buffer stays terminated
+	WelsStrncpy(chPath, 255, (const str_t*)sDlInfo.dli_fname, WelsStrnlen((const str_t*)sDlInfo.dli_fname, 254));
+	str_t* p = strrchr(chPath, '/');//confirmed_safe_unsafe_usage
+	if ( NULL == p )
+		return -1;
+
+	// replace "/<this image>" with "/libwelstrace.so"; measure the whole name (16 chars)
+	const str_t* kpTraceName = (const str_t*)"/libwelstrace.so";
+	const int iLenTraceName = WelsStrnlen(kpTraceName, 255);
+	const int iCurPos = p - chPath;
+	if ( iCurPos + iLenTraceName >= 255 )
+		return -1;
+	WelsStrncpy(p, 255-iCurPos, kpTraceName, iLenTraceName );
+	// cut off whatever remains of a longer original file name
+	p[iLenTraceName] = '\0';
+
+	m_hTraceHandle = dlopen( chPath, RTLD_LAZY);
+	if (m_hTraceHandle == NULL)
+	{
+		WelsFileHandle* fp = WelsFopen((const str_t*)"/tmp/trace.txt", (const str_t*)"a");
+		if(fp)
+		{
+			fprintf(fp, "welsCodecTrace::welsCodecTrace ===> dlopen %s fail, %s\n", chPath, dlerror());
+			WelsFclose(fp);
+		}
+		return -1;
+	}
+
+	m_fpDebugTrace = ( CM_WELS_TRACE)dlsym( m_hTraceHandle, "WELSDEBUG2");
+	m_fpInfoTrace = ( CM_WELS_TRACE)dlsym( m_hTraceHandle, "WELSINFO2");
+	m_fpWarnTrace = ( CM_WELS_TRACE)dlsym( m_hTraceHandle, "WELSWARN2");
+	m_fpErrorTrace = ( CM_WELS_TRACE)dlsym( m_hTraceHandle, "WELSERROR2");
+	if(m_fpDebugTrace == NULL)
+	{
+		WelsFileHandle* fp = WelsFopen((const str_t*)"/tmp/trace.txt", (const str_t*)"a");
+		if(fp)
+		{
+			// write to the log file that was just opened, not to stdout
+			fprintf(fp, "welsCodecTrace::welsCodecTrace ===> dlsym failed (WELSDEBUG2) , dlerror = %s\n", dlerror());
+			WelsFclose(fp);
+		}
+		return -1;
+	}
+#endif
+	return 0;
+}
+
+/*!
+ * \brief	Release the trace module handle (if any) and forget all entry points
+ *
+ * \return	always 0
+ */
+int32_t  CWelsCodecTrace::UnloadWelsTraceModule()
+{
+	if( NULL != m_hTraceHandle ) {
+#if defined WIN32
+		::FreeLibrary( ( HMODULE)m_hTraceHandle);
+#elif defined MACOS
+		FreeLibrary( (CFBundleRef)m_hTraceHandle);
+#elif defined LINUX || defined SOLARIS || defined UNIX
+		::dlclose( m_hTraceHandle);
+#endif
+	}
+
+	// the entry points pointed into the module, so they are cleared with it
+	m_fpErrorTrace = NULL;
+	m_fpWarnTrace = NULL;
+	m_fpInfoTrace = NULL;
+	m_fpDebugTrace = NULL;
+	m_hTraceHandle = NULL;
+
+	return 0;
+}
+
+/*!
+ * \brief	Forward one formatted message to the external trace module
+ *
+ * The message is dropped when no module is loaded or when the entry point for
+ * its level was not resolved.  On WIN32 the entry points take (format, ...);
+ * elsewhere they take (module name, format, ...) and "CODEC" is passed as name.
+ *
+ * \param	iLevel	one of WELS_LOG_ERROR / WARNING / INFO / DEBUG; other values go to the info sink
+ * \param	pStr	message text
+ *
+ * \return	always 0
+ */
+int32_t  CWelsCodecTrace::WriteString(int32_t iLevel, const str_t * pStr)
+{
+	if( m_hTraceHandle )
+	{
+#ifdef WIN32
+		switch(iLevel)
+		{
+		case WELS_LOG_ERROR:
+			if(m_fpErrorTrace)
+				m_fpErrorTrace("%s", pStr);
+			break;
+		case WELS_LOG_WARNING:
+			if(m_fpWarnTrace)
+				m_fpWarnTrace("%s", pStr);
+			break;
+		case WELS_LOG_INFO:
+			if(m_fpInfoTrace)
+				m_fpInfoTrace("%s", pStr);
+			break;
+		case WELS_LOG_DEBUG:
+			if(m_fpDebugTrace)
+				m_fpDebugTrace("%s", pStr);
+			break;
+		default:
+			// test the pointer that is actually called
+			if(m_fpInfoTrace)
+				m_fpInfoTrace("%s", pStr);
+			break;
+		}
+#else
+		switch(iLevel)
+		{
+		case WELS_LOG_ERROR:
+			if(m_fpErrorTrace)
+				m_fpErrorTrace("CODEC", "%s", pStr);
+			break;
+		case WELS_LOG_WARNING:
+			if(m_fpWarnTrace)
+				m_fpWarnTrace("CODEC", "%s",  pStr);
+			break;
+		case WELS_LOG_INFO:
+			if(m_fpInfoTrace)
+				m_fpInfoTrace("CODEC", "%s",  pStr);
+			break;
+		case WELS_LOG_DEBUG:
+			// NOTE(review): debug messages go to the info sink on non-Windows builds - confirm intended
+			if(m_fpInfoTrace)
+				m_fpInfoTrace("CODEC", "%s",  pStr);
+			break;
+		default:
+			if(m_fpInfoTrace)
+				m_fpInfoTrace("CODEC", "%s",  pStr);
+			break;
+		}
+#endif
+	}
+
+	return 0;
+}
+
+
+/*!
+ * \brief	Factory for the trace sinks defined in this file
+ *
+ * \param	eType	kind of sink to create
+ * \param	pParam	not used by this implementation
+ *
+ * \return	newly allocated sink owned by the caller, or NULL for an unknown type
+ */
+IWelsTrace  * CreateWelsTrace(EWelsTraceType  eType,  void_t * pParam)
+{
+	if( Wels_Trace_Type == eType )
+		return new CWelsCodecTrace();
+
+	if( Wels_Trace_Type_File == eType )
+		return new CWelsTraceFile();
+
+#ifdef WIN32
+	if( Wels_Trace_Type_WinDgb == eType )
+		return new CWelsTraceWinDgb();
+#endif
+
+	return NULL;
+}
+
+} // namespace WelsDec
\ No newline at end of file
--- /dev/null
+++ b/codec/decoder/plus/src/welsDecoderExt.cpp
@@ -1,0 +1,532 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ *  welsDecoderExt.cpp
+ *
+ *  Abstract
+ *      Cisco OpenH264 decoder extension utilization
+ *
+ *  History
+ *      3/12/2009 Created
+ *
+ *
+ ************************************************************************/
+//#include <assert.h>
+#include "welsDecoderExt.h"
+#include "welsCodecTrace.h"
+#include "codec_def.h"
+#include "typedefs.h"
+#include "mem_align.h"
+#include "utils.h"
+
+//#include "macros.h"
+#include "decoder.h"
+
+extern "C" {
+#include "decoder_core.h"
+#include "manage_dec_ref.h"
+}
+#include "error_code.h"
+#include "crt_util_safe_x.h"	// Safe CRT routines like util for cross platforms
+#include <time.h>
+#if defined(WIN32) /*&& defined(_DEBUG)*/
+
+#include <windows.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <sys/types.h>
+#include <sys/timeb.h>
+#else
+#include <sys/time.h>
+#endif
+
+namespace WelsDec {
+
+//////////////////////////////////////////////////////////////////////
+// Construction/Destruction
+//////////////////////////////////////////////////////////////////////
+
+/***************************************************************************
+*	Description:
+*			class CWelsDecoder constructor function, do initialization	and    
+*       alloc memory required
+*
+*	Input parameters: none
+*
+*	return: none
+***************************************************************************/
+// Creates the trace sink and logs entry.  The decoder context itself is left
+// NULL here.  With OUTPUT_BIT_STREAM defined, two dump files are opened in "wb"
+// mode: "bs_0x<this>_<timestamp>.<ms>.264" and "size_0x<this>_<timestamp>.<ms>.len".
+CWelsDecoder::CWelsDecoder(void_t)
+:	m_pDecContext( NULL ),
+	m_pTrace( NULL )
+{
+#ifdef OUTPUT_BIT_STREAM
+	str_t chFileName[1024] = { 0 };  //for .264
+	int iBufUsed = 0;
+	int iBufLeft = 1023;
+
+	str_t chFileNameSize[1024] = { 0 }; //for .len
+	int iBufUsedSize = 0;
+	int iBufLeftSize = 1023;
+#endif//OUTPUT_BIT_STREAM 
+
+	m_pTrace = CreateWelsTrace(Wels_Trace_Type);	
+
+	IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO,"CWelsDecoder::CWelsDecoder() entry");
+   
+	
+#ifdef OUTPUT_BIT_STREAM
+    SWelsTime sCurTime;
+
+	WelsGetTimeOfDay(&sCurTime);	
+	
+	// file name prefix: object address
+	iBufUsed      += WelsSnprintf(chFileName,  iBufLeft,  "bs_0x%p_", (void_t*)this);
+	iBufUsedSize += WelsSnprintf(chFileNameSize, iBufLeftSize, "size_0x%p_", (void_t*)this);
+
+	// NOTE(review): iBufUsed is cumulative, so each "iBufLeft -= iBufUsed" below
+	// subtracts earlier pieces again and under-counts the space left - confirm intended.
+	iBufLeft -= iBufUsed;
+	if ( iBufLeft > iBufUsed )
+	{
+		// append the timestamp
+		iBufUsed += WelsStrftime(&chFileName[iBufUsed], iBufLeft, "%y%m%d%H%M%S", &sCurTime);
+		iBufLeft -= iBufUsed;
+	}
+
+	iBufLeftSize -= iBufUsedSize;
+	if ( iBufLeftSize> iBufUsedSize )
+	{	
+		iBufUsedSize += WelsStrftime(&chFileNameSize[iBufUsedSize], iBufLeftSize, "%y%m%d%H%M%S", &sCurTime);
+		iBufLeftSize -= iBufUsedSize;
+	}
+
+	if ( iBufLeft > iBufUsed )
+	{
+		// append milliseconds and the extension
+		iBufUsed += WelsSnprintf(&chFileName[iBufUsed], iBufLeft, ".%03.3u.264", WelsGetMillsecond(&sCurTime));
+		iBufLeft -= iBufUsed;
+	}
+
+	if ( iBufLeftSize > iBufUsedSize )
+	{
+        iBufUsedSize += WelsSnprintf(&chFileNameSize[iBufUsedSize], iBufLeftSize, ".%03.3u.len", WelsGetMillsecond(&sCurTime));
+		iBufLeftSize -= iBufUsedSize;
+	}
+	
+
+	// either handle may end up NULL; users of m_pFBS / m_pFBSSize check before writing
+	m_pFBS = WelsFopen(chFileName, "wb");
+	m_pFBSSize = WelsFopen(chFileNameSize, "wb");	
+#endif//OUTPUT_BIT_STREAM
+		
+}
+
+/***************************************************************************
+*	Description:
+*			class CWelsDecoder destructor function, destroy allocced memory
+*       
+*	Input parameters: none
+*
+*	return: none
+***************************************************************************/
+// Tears the object down in order: decoder context, optional bit-stream dump
+// files, and finally the trace sink (last, because the earlier steps log through it).
+CWelsDecoder::~CWelsDecoder()
+{
+	IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::~CWelsDecoder()");
+
+	UninitDecoder();
+
+#ifdef OUTPUT_BIT_STREAM
+	if ( NULL != m_pFBS )
+	{
+		WelsFclose( m_pFBS );
+		m_pFBS = NULL;
+	}
+	if ( NULL != m_pFBSSize )
+	{
+		WelsFclose( m_pFBSSize );
+		m_pFBSSize = NULL;
+	}
+#endif//OUTPUT_BIT_STREAM
+
+	if ( m_pTrace != NULL )
+	{
+		delete m_pTrace;
+		m_pTrace = NULL;
+	}
+}
+
+/*
+ *	Initialize
+ *	Allocates and opens the decoder context, then applies the caller's parameters.
+ *	@param pParam:		decoding parameters, must not be NULL
+ *	@param keInitType:	only INIT_TYPE_PARAMETER_BASED is accepted
+ *	@return:	cmResultSuccess on success, cmInitParaError for invalid arguments,
+ *				ERR_MALLOC_FAILED when the decoder context could not be allocated.
+ */
+long CWelsDecoder::Initialize(void_t* pParam, const INIT_TYPE keInitType)
+{
+	if ( pParam == NULL || keInitType != INIT_TYPE_PARAMETER_BASED ){
+		IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::Initialize(), invalid input argument.");
+		return cmInitParaError;
+	}
+
+	// H.264 decoder initialization,including memory allocation,then open it ready to decode
+	InitDecoder();
+
+	// InitDecoder() cannot report failure itself, so check the context it should have created
+	if ( NULL == m_pDecContext ){
+		IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::Initialize(), decoder context allocation failed.");
+		return ERR_MALLOC_FAILED;
+	}
+
+	DecoderConfigParam( m_pDecContext, pParam );
+	
+	return cmResultSuccess;
+}
+
+/*
+ *	Unintialize
+ *	Releases the decoder context if one exists.
+ *	@return:	always ERR_NONE
+ */
+long CWelsDecoder::Unintialize()
+{
+	this->UninitDecoder();
+	return ERR_NONE;
+}
+
+// Closes the decoder and frees its context; does nothing when no context exists.
+void_t CWelsDecoder::UninitDecoder( void_t )
+{
+	if ( m_pDecContext != NULL )
+	{
+		IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO, "into CWelsDecoder::uninit_decoder()..");
+
+		WelsEndDecoder( m_pDecContext );
+
+		// the context structure itself is released here, after the decoder is closed
+		WelsFree( m_pDecContext, "m_pDecContext" );
+		m_pDecContext	= NULL;
+
+		IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO, "left CWelsDecoder::uninit_decoder()..");
+	}
+}
+
+// NOTE: returning nothing is not suitable here; failures need to be reported to the upper layer.
+// Allocates the decoder context and opens the decoder on it.
+// On allocation failure m_pDecContext is left NULL; callers must check it,
+// since this function has no return value.
+void_t CWelsDecoder::InitDecoder( void_t )
+{
+	IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::init_decoder()..");	
+
+	// release a context left over from an earlier initialization instead of leaking it
+	if ( NULL != m_pDecContext )
+		UninitDecoder();
+
+	m_pDecContext	= (PWelsDecoderContext)WelsMalloc( sizeof(SWelsDecoderContext), "m_pDecContext" );
+	if ( NULL == m_pDecContext )
+	{
+		IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::init_decoder().. context allocation failed");
+		return;
+	}
+	
+	WelsInitDecoder( m_pDecContext, m_pTrace, IWelsTrace::WelsTrace );
+
+	IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::init_decoder().. left");
+}
+
+/*
+ * Set Option	
+ */
+/*
+ *	SetOption
+ *	@param eOptID:	option to set
+ *	@param pOption:	points to an int holding the new value, must not be NULL
+ *	@return:	cmResultSuccess (or the result of DecoderSetCsp for
+ *				DECODER_OPTION_DATAFORMAT) when the option is handled,
+ *				dsInitialOptExpected when the decoder is not initialized,
+ *				cmInitParaError for a NULL value or an unsupported option.
+ */
+long CWelsDecoder::SetOption(DECODER_OPTION eOptID, void_t* pOption)
+{
+	int iVal = 0;
+	
+	if ( m_pDecContext == NULL )
+		return dsInitialOptExpected;
+
+	// every supported option reads an int through pOption
+	if ( pOption == NULL )
+		return cmInitParaError;
+
+	iVal = *((int*)pOption);
+	
+	if ( eOptID == DECODER_OPTION_DATAFORMAT ) // Set color space of decoding output frame
+	{		
+		return DecoderSetCsp( m_pDecContext, iVal );
+	}
+	else if ( eOptID == DECODER_OPTION_END_OF_STREAM ) // Indicate bit-stream of the final frame to be decoded
+	{
+		// boolean value for whether enabled End Of Stream flag
+		m_pDecContext->bEndOfStreamFlag	= iVal ? true : false;
+		
+		return cmResultSuccess;
+	}
+	else if ( eOptID == DECODER_OPTION_MODE)
+	{
+		m_pDecContext->iSetMode = iVal;
+		if(iVal == SW_MODE)
+		{
+			m_pDecContext->iDecoderOutputProperty = BUFFER_HOST;
+		}
+		else
+		{
+#if !defined(__APPLE__)
+			m_pDecContext->iDecoderOutputProperty = BUFFER_DEVICE;
+#else
+			m_pDecContext->iDecoderOutputProperty = BUFFER_HOST;//BUFFER_HOST;//BUFFER_DEVICE;
+#endif
+		}
+		
+		return cmResultSuccess;
+	}
+	else if ( eOptID == DECODER_OPTION_OUTPUT_PROPERTY)
+	{
+		// in SW_MODE the output property stays as set by DECODER_OPTION_MODE
+		if( m_pDecContext->iSetMode != SW_MODE)	
+			m_pDecContext->iDecoderOutputProperty = iVal;
+
+		// the option was recognised and handled, so do not fall through to the error return
+		return cmResultSuccess;
+	}
+
+	return cmInitParaError;
+}
+
+/*
+ *	Get Option
+ */
+/*
+ *	GetOption
+ *	@param eOptID:	option to query
+ *	@param pOption:	receives the value as an int, must not be NULL
+ *	@return:	cmResultSuccess when the option is known, cmInitExpected when the
+ *				decoder is not initialized, cmInitParaError otherwise.
+ */
+long CWelsDecoder::GetOption(DECODER_OPTION eOptID, void_t* pOption)
+{
+	if ( NULL == m_pDecContext )
+		return cmInitExpected;
+	
+	if ( NULL == pOption )
+		return cmInitParaError;
+
+	int* const kpiOut = (int*)pOption;
+
+	switch ( eOptID )
+	{
+	case DECODER_OPTION_DATAFORMAT:
+		*kpiOut = m_pDecContext->iOutputColorFormat;
+		return cmResultSuccess;
+
+	case DECODER_OPTION_END_OF_STREAM:
+		*kpiOut = m_pDecContext->bEndOfStreamFlag;
+		return cmResultSuccess;
+
+#ifdef LONG_TERM_REF
+	case DECODER_OPTION_IDR_PIC_ID:
+		*kpiOut = m_pDecContext->uiCurIdrPicId;
+		return cmResultSuccess;
+
+	case DECODER_OPTION_FRAME_NUM:
+		*kpiOut = m_pDecContext->iFrameNum;
+		return cmResultSuccess;
+
+	case DECODER_OPTION_LTR_MARKING_FLAG:
+		*kpiOut = m_pDecContext->bCurAuContainLtrMarkSeFlag;
+		return cmResultSuccess;
+
+	case DECODER_OPTION_LTR_MARKED_FRAME_NUM:
+		*kpiOut = m_pDecContext->iFrameNumOfAuMarkedLtr;
+		return cmResultSuccess;
+#endif
+
+	case DECODER_OPTION_VCL_NAL:		//feedback whether or not have VCL NAL in current AU
+		*kpiOut = m_pDecContext->iFeedbackVclNalInAu;
+		return cmResultSuccess;
+
+	case DECODER_OPTION_TEMPORAL_ID:	//if have VCL NAL in current AU, then feedback the temporal ID
+		*kpiOut = m_pDecContext->iFeedbackTidInAu;
+		return cmResultSuccess;
+
+	case DECODER_OPTION_MODE:
+		*kpiOut = m_pDecContext->iSetMode;
+		return cmResultSuccess;
+
+	case DECODER_OPTION_DEVICE_INFO:
+		// accepted, but nothing is written to pOption
+		return cmResultSuccess;
+
+	default:
+		return cmInitParaError;
+	}
+}
+
+/*
+ *	DecodeFrame
+ *	Decodes one chunk of bit-stream.  An empty input (kiSrcLen <= 0 or kpSrc == NULL)
+ *	is treated as the end-of-stream signal.
+ *	@param kpSrc:		input bit-stream, may be NULL to signal end of stream
+ *	@param kiSrcLen:	length of kpSrc in bytes
+ *	@param ppDst:		array of three plane pointers, reset to NULL before decoding
+ *	@param pDstInfo:	output buffer description, zeroed before decoding
+ *	@return:	dsErrorFree on success, dsInitialOptExpected when the decoder has not
+ *				been initialized, otherwise the decoder's error code.
+ */
+DECODING_STATE CWelsDecoder::DecodeFrame(	const unsigned char* kpSrc,
+											const int kiSrcLen,	
+											void_t ** ppDst,
+											SBufferInfo* pDstInfo)
+{
+	// Initialize() may not have been called, or may have failed
+	if ( NULL == m_pDecContext )
+	{
+		IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO, "CWelsDecoder::DecodeFrame(), decoder is not initialized.");
+		return dsInitialOptExpected;
+	}
+
+	if ( kiSrcLen > 0 && kpSrc != NULL )
+	{		
+#ifdef OUTPUT_BIT_STREAM
+		if ( m_pFBS )
+		{
+			WelsFwrite( kpSrc, sizeof(unsigned char), kiSrcLen, m_pFBS );
+			WelsFflush( m_pFBS );
+		}
+		if ( m_pFBSSize )
+		{
+			WelsFwrite( &kiSrcLen, sizeof(int), 1, m_pFBSSize );
+			WelsFflush( m_pFBSSize );
+		}
+#endif//OUTPUT_BIT_STREAM
+		m_pDecContext->bEndOfStreamFlag = false;
+	}
+	else  
+	{   //For application MODE, the error detection should be added for safe.
+		//But for CONSOLE MODE, when decoding LAST AU, kiSrcLen==0 && kpSrc==NULL. 
+		m_pDecContext->bEndOfStreamFlag = true;
+	}
+		
+	ppDst[0] = ppDst[1] = ppDst[2] = NULL;
+	m_pDecContext->iErrorCode             = dsErrorFree; //initialize at the starting of AU decoding.
+	m_pDecContext->iFeedbackVclNalInAu = FEEDBACK_UNKNOWN_NAL; //initialize
+	memset(pDstInfo,0,sizeof(SBufferInfo));
+	pDstInfo->eBufferProperty = (EBufferProperty)m_pDecContext->iDecoderOutputProperty;
+
+#ifdef LONG_TERM_REF
+	m_pDecContext->bReferenceLostAtT0Flag       = false; //initialize for LTR
+	m_pDecContext->bCurAuContainLtrMarkSeFlag = false;
+	m_pDecContext->iFrameNumOfAuMarkedLtr      = 0;
+	m_pDecContext->iFrameNum                       = -1; //initialize
+#endif
+
+	m_pDecContext->iFeedbackTidInAu             = -1; //initialize
+	
+	WelsDecodeBs( m_pDecContext, kpSrc, kiSrcLen, (unsigned char**)ppDst, pDstInfo); //iErrorCode has been modified in this function
+	
+	pDstInfo->eWorkMode = (EDecodeMode)m_pDecContext->iDecoderMode;
+
+	if ( m_pDecContext->iErrorCode )
+	{		
+		ENalUnitType eNalType = NAL_UNIT_UNSPEC_0;	//for NBR, IDR frames are expected to decode as followed if error decoding an IDR currently		
+
+		eNalType	= m_pDecContext->sCurNalHead.eNalUnitType;
+		
+		//for AVC bitstream (excluding AVC with temporal scalability, including TP), as long as error occur, SHOULD notify upper layer key frame loss.
+		if ( (IS_PARAM_SETS_NALS(eNalType) || NAL_UNIT_CODED_SLICE_IDR == eNalType) ||
+			(VIDEO_BITSTREAM_AVC == m_pDecContext->eVideoType) )
+		{
+#ifdef LONG_TERM_REF
+			m_pDecContext->bParamSetsLostFlag = true;
+#else
+			m_pDecContext->bReferenceLostAtT0Flag = true;
+#endif
+			ResetParameterSetsState( m_pDecContext ); //initial SPS&PPS ready flag
+		}		
+
+		IWelsTrace::WelsVTrace(m_pTrace, IWelsTrace::WELS_LOG_INFO, "decode failed, failure type:%d \n", m_pDecContext->iErrorCode);
+		return (DECODING_STATE)m_pDecContext->iErrorCode;
+	}
+
+	return dsErrorFree;
+}
+
+/*
+ *	DecodeFrame (plain-argument variant)
+ *	Wraps the SBufferInfo based DecodeFrame() for callers that pass stride, width
+ *	and height separately.  The outputs are updated only when decoding succeeds.
+ *	@param kpSrc:		input bit-stream
+ *	@param kiSrcLen:	length of kpSrc in bytes
+ *	@param ppDst:		array of three plane pointers filled by the decoder
+ *	@param pStride:		array of at least two strides (in/out)
+ *	@param iWidth:		picture width (in/out)
+ *	@param iHeight:		picture height (in/out)
+ *	@return:	state returned by the wrapped DecodeFrame()
+ */
+DECODING_STATE CWelsDecoder::DecodeFrame(	const unsigned char* kpSrc,
+										   const int kiSrcLen,	
+										   unsigned char** ppDst,
+										   int* pStride,
+										   int& iWidth,
+										   int& iHeight )
+{
+	SBufferInfo sInfo;
+
+	memset(&sInfo, 0, sizeof(SBufferInfo));
+	sInfo.eBufferProperty = BUFFER_HOST;
+	sInfo.UsrData.sSystemBuffer.iWidth = iWidth;
+	sInfo.UsrData.sSystemBuffer.iHeight = iHeight;
+	sInfo.UsrData.sSystemBuffer.iStride[0] = pStride[0];
+	sInfo.UsrData.sSystemBuffer.iStride[1] = pStride[1];
+
+	const DECODING_STATE keState = DecodeFrame(kpSrc, kiSrcLen, (void_t **)ppDst, &sInfo);
+	if (dsErrorFree != keState)
+		return keState;
+
+	// hand the decoded geometry back to the caller
+	pStride[0] = sInfo.UsrData.sSystemBuffer.iStride[0];
+	pStride[1] = sInfo.UsrData.sSystemBuffer.iStride[1];
+	iWidth     = sInfo.UsrData.sSystemBuffer.iWidth;
+	iHeight    = sInfo.UsrData.sSystemBuffer.iHeight;
+
+	return keState;
+}
+
+/*
+ *	DecodeFrameEx
+ *	Placeholder: no decoding is performed and no argument is read or written.
+ *	@return:	always dsErrorFree
+ */
+DECODING_STATE CWelsDecoder::DecodeFrameEx(const unsigned char * kpSrc,
+		                                  const int kiSrcLen,
+		                                  unsigned char * pDst,
+										  int iDstStride,
+		                                  int & iDstLen,
+		                                  int & iWidth,
+		                                  int & iHeight,
+		                                  int & iColorFormat	)
+{
+	return dsErrorFree;
+}
+
+
+} // namespace WelsDec
+
+
+using namespace WelsDec;
+
+/* WINAPI is indeed in prefix due to sync to application layer callings!! */
+
+/*
+*	CreateDecoder
+*	@return:	success in return 0, otherwise failed.
+*/
+long CreateDecoder( ISVCDecoder** ppDecoder )
+{
+	// the new decoder is returned through ppDecoder, so the slot itself must exist
+	if ( ppDecoder == NULL )
+		return ERR_INVALID_PARAMETERS;
+
+	ISVCDecoder* pNewDecoder = new CWelsDecoder();
+	*ppDecoder	= pNewDecoder;
+
+	if ( pNewDecoder == NULL )
+		return ERR_MALLOC_FAILED;
+
+	return ERR_NONE;
+}
+
+/*
+*	DestroyDecoder
+*/
+void_t DestroyDecoder( ISVCDecoder* pDecoder )
+{	
+	// nothing to do for a NULL decoder
+	if ( pDecoder == NULL )
+		return;
+
+	CWelsDecoder* pImpl = (CWelsDecoder *)pDecoder;
+	delete pImpl;
+}
--- /dev/null
+++ b/codec/decoder/plus/src/wels_dec_export.def
@@ -1,0 +1,3 @@
+EXPORTS
+    CreateDecoder
+    DestroyDecoder
\ No newline at end of file
--- /dev/null
+++ b/codec/encoder/core/asm/asm_inc.asm
@@ -1,0 +1,235 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  asm_inc.asm
+;*
+;*  Abstract
+;*      macro and constant
+;*
+;*  History
+;*      8/5/2009 Created
+;*
+;*
+;*************************************************************************/
+;***********************************************************************
+; Options, for DEBUG
+;***********************************************************************
+
+; MOVDQ: 128-bit move used by including files; currently the aligned form (movdqa).
+; Change the condition to 0 to switch every user to the unaligned form (movdqu).
+%if 1 
+	%define MOVDQ movdqa
+%else
+	%define MOVDQ movdqu
+%endif
+
+; WELSEMMS: expands to emms; change the condition to 0 to make it expand to nothing.
+%if 1
+	%define WELSEMMS	emms
+%else
+	%define WELSEMMS
+%endif
+
+BITS 32
+
+;***********************************************************************
+; Macros 
+;***********************************************************************
+
+; WELS_EXTERN name
+; Exports a symbol. When PREFIX is defined the exported symbol is _name and
+; later uses of name in the file are redefined to _name; otherwise name is
+; exported unchanged.
+%macro WELS_EXTERN 1
+	%ifdef PREFIX
+		global _%1
+		%define %1 _%1
+	%else
+		global %1
+	%endif
+%endmacro
+
+; WELS_AbsW reg, tmp
+; reg = per-word max(reg, -reg), i.e. absolute value of each signed word.
+; tmp is clobbered (left holding 0 - reg).
+%macro WELS_AbsW 2
+	pxor        %2, %2
+    psubw       %2, %1
+    pmaxsw      %1, %2
+%endmacro 	
+
+; MMX_XSwap suffix, a, b, t
+; Interleaves a with b using punpckl<suffix>/punpckh<suffix>:
+; a receives the low-half interleave, t the high-half interleave; b is unchanged.
+%macro MMX_XSwap  4
+    movq		%4, %2
+    punpckh%1   %4, %3
+    punpckl%1   %2, %3
+%endmacro
+
+; MMX_Trans4x4W r1, r2, r3, r4, r5
+; 4x4 transpose of words built from four MMX_XSwap steps.
+; in:  rows in r1..r4 (r5 is a temporary)
+; out: results are left in r1, r4, r5, r3; r2 is clobbered
+%macro MMX_Trans4x4W 5
+    MMX_XSwap wd, %1, %2, %5
+    MMX_XSwap wd, %3, %4, %2
+    MMX_XSwap dq, %1, %3, %4
+    MMX_XSwap dq, %5, %2, %3
+%endmacro
+
+; SSE2_XSawp suffix, a, b, t   (building block for the transpose macros below)
+; Interleaves a with b using punpckl<suffix>/punpckh<suffix>:
+; a receives the low-half interleave, t the high-half interleave; b is unchanged.
+%macro SSE2_XSawp 4
+    movdqa      %4, %2
+    punpckl%1   %2, %3
+    punpckh%1   %4, %3
+%endmacro
+
+; SSE2_Trans4x4D r1, r2, r3, r4, r5
+; 4x4 transpose of dwords built from four SSE2_XSawp steps.
+; in:  rows in r1..r4 (r5 is a temporary)
+; out: results are left in r1, r4, r5, r3; r2 is clobbered
+%macro SSE2_Trans4x4D 5
+    SSE2_XSawp dq,  %1, %2, %5
+    SSE2_XSawp dq,  %3, %4, %2
+    SSE2_XSawp qdq, %1, %3, %4
+    SSE2_XSawp qdq, %5, %2, %3
+%endmacro
+
+; SSE2_TransTwo4x4W r1, r2, r3, r4, r5
+; Word transpose built from six SSE2_XSawp steps (wd, dq, then qdq).
+; in:  r1..r4 (r5 is a temporary)
+; out: results are left in r1, r2, r4, r5; r3 is clobbered
+%macro SSE2_TransTwo4x4W 5
+    SSE2_XSawp wd,  %1, %2, %5
+    SSE2_XSawp wd,  %3, %4, %2
+    SSE2_XSawp dq,  %1, %3, %4
+    SSE2_XSawp dq,  %5, %2, %3
+    SSE2_XSawp qdq, %1, %5, %2
+    SSE2_XSawp qdq, %4, %3, %5
+%endmacro
+
+; SSE2_TransTwo8x8B m1, m2, m3, m4, m5, m6, m7, m8, spill
+; Byte transpose done in four interleave passes (bw, wd, dq, qdq).
+;in:  m1, m2, m3, m4, m5, m6, m7, m8
+;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+; The ninth argument is used as spill storage: it is written and read back with
+; movdqa several times, because each pass needs one register more than is free.
+%macro SSE2_TransTwo8x8B 9
+	movdqa	%9,	%8
+	; pass 1: interleave bytes
+	SSE2_XSawp bw,  %1, %2, %8
+	SSE2_XSawp bw,  %3, %4, %2
+	SSE2_XSawp bw,  %5, %6, %4
+	movdqa	%6, %9
+	movdqa	%9, %4
+	SSE2_XSawp bw,  %7, %6, %4
+	
+	; pass 2: interleave words
+	SSE2_XSawp wd,  %1, %3, %6	
+	SSE2_XSawp wd,  %8, %2, %3
+	SSE2_XSawp wd,  %5, %7, %2
+	movdqa	%7, %9
+	movdqa	%9, %3	
+	SSE2_XSawp wd,  %7, %4, %3
+	
+	; pass 3: interleave dwords
+	SSE2_XSawp dq,  %1, %5, %4	
+	SSE2_XSawp dq,  %6, %2, %5
+	SSE2_XSawp dq,  %8, %7, %2
+	movdqa	%7, %9
+	movdqa	%9, %5		
+	SSE2_XSawp dq,  %7, %3, %5
+	
+	; pass 4: interleave qwords
+	SSE2_XSawp qdq,  %1, %8, %3
+	SSE2_XSawp qdq,  %4, %2, %8
+	SSE2_XSawp qdq,  %6, %7, %2
+	movdqa	%7, %9
+	movdqa	%9, %1		
+	SSE2_XSawp qdq,  %7, %5, %1
+	movdqa	%5, %9
+%endmacro
+
+; SSE2_LoadDiff8P dst, tmp, zero, mem1, mem2
+; Loads 8 bytes from mem1 and 8 bytes from mem2, widens both to words by
+; interleaving with the zero register, and leaves (mem1 - mem2) per word in dst.
+; tmp is clobbered; zero must hold all zeros on entry.
+; typical use: xmm0, xmm6, xmm7, [eax], [ecx]
+;xmm7 = 0, eax = pix1, ecx = pix2, xmm0 save the result
+%macro SSE2_LoadDiff8P 5
+    movq         %1, %4
+    punpcklbw    %1, %3
+    movq         %2, %5
+    punpcklbw    %2, %3
+    psubw        %1, %2
+%endmacro
+
+; SSE2_SumSub m1, m2, tmp
+; Per-word butterfly: m2 = m1 + m2, m1 = m1 - m2 (using the original m2).
+; tmp is clobbered (it keeps the original m2).
+%macro SSE2_SumSub 3
+	movdqa  %3, %2
+    paddw   %2, %1
+    psubw   %1, %3
+%endmacro
+
+
+; butterfly_1to16_sse dst, tmp, r
+; Broadcasts the low byte of a general register to all 16 bytes of dst.
+; r is the register letter only (a/b/c/d): the macro uses r-l, r-h and e-r-x.
+; The high byte of the 16-bit register (e.g. ah) and tmp are clobbered.
+%macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
+	mov %3h, %3l
+	movd %1, e%3x		; i.e, %1 = eax (=b0)
+	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0	
+	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0	
+%endmacro  
+
+; SSE2_Copy8Times dst, src
+; Replicates the low word of src (moved in with movd) into all 8 words of dst.
+%macro  SSE2_Copy8Times 2
+		movd	%1, %2
+		punpcklwd %1, %1
+		pshufd	%1,	%1,	0
+%endmacro
+
+; SSE2_Copy16Times dst, src
+; Replicates a byte value into all 16 bytes of dst.
+; The low word of src is copied to 8 words and then packed with unsigned
+; saturation, so that word must be in 0..255 to be reproduced exactly.
+%macro  SSE2_Copy16Times 2
+		movd		%1, %2
+		pshuflw		%1, %1, 0
+		punpcklqdq	%1, %1
+		packuswb	%1,	%1
+%endmacro
+
+
+
+;***********************************************************************
+;preprocessor constants
+;***********************************************************************
+; WELS_DW32 reg
+; Builds the constant 32 in every word without a memory load:
+; all ones -> shift right by 15 gives 1 -> shift left by 5 gives 32.
+;dw 32,32,32,32,32,32,32,32 for xmm
+;dw 32,32,32,32 for mm
+%macro WELS_DW32 1
+	pcmpeqw %1,%1
+	psrlw %1,15
+	psllw %1,5
+%endmacro
+
+; WELS_DW1 reg
+; Builds the constant 1 in every word: all ones, then logical shift right by 15.
+;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
+;dw 1, 1, 1, 1 for mm
+%macro WELS_DW1 1
+	pcmpeqw %1,%1
+	psrlw %1,15
+%endmacro
+
+; WELS_Zero reg
+; Clears the register by XOR-ing it with itself.
+;all 0 for xmm and mm
+%macro	WELS_Zero 1
+	pxor %1, %1
+%endmacro
+
+; WELS_DD1 reg
+; Builds the constant 1 in every dword: all ones, then logical shift right by 31.
+;dd 1, 1, 1, 1 for xmm
+;dd 1, 1 for mm
+%macro WELS_DD1 1
+	pcmpeqw %1,%1
+	psrld %1,31
+%endmacro
+
+; WELS_DB1 reg
+; Builds the constant 1 in every byte: words of 1 (all ones >> 15) are packed
+; to bytes with unsigned saturation.
+;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+%macro WELS_DB1 1
+	pcmpeqw %1,%1
+	psrlw %1,15
+	packuswb %1,%1
+%endmacro
+
+
+
+
+
+
--- /dev/null
+++ b/codec/encoder/core/asm/coeff.asm
@@ -1,0 +1,459 @@
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  coeff.asm
+;*
+;*  Abstract
+;*     cavlc
+;*
+;*  History
+;*      09/08/2010 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+
+
+
+SECTION .rodata align=16
+
+align 16
+; Eight bytes of 8: CavlcParamCal_sse2 adds this to the table entry looked up for the
+; upper mask byte, turning bit indices 0..7 into coefficient positions 8..15.
+sse2_b8 db 8, 8, 8, 8, 8, 8, 8, 8
+
+ALIGN  16
+; Byte mask with only byte 7 cleared: CavlcParamCal_sse2 ANDs it with a byte_1pos_table
+; entry to zero the entry's count byte while keeping the position bytes.
+sse2_b_1 db -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1
+
+align 16
+; byte_1pos_table: 256 entries of 8 bytes each, indexed by an 8-bit mask n (entry n is tagged ";n").
+;   byte 7     = number of bits set in n
+;   bytes 0..6 = indices of the set bits of n, highest first, zero-padded
+; For n = 255 only the indices 7..1 fit in front of the count byte.
+byte_1pos_table:
+	db 0,0,0,0,0,0,0,0, ;0
+	db 0,0,0,0,0,0,0,1, ;1
+	db 1,0,0,0,0,0,0,1, ;2
+	db 1,0,0,0,0,0,0,2, ;3
+	db 2,0,0,0,0,0,0,1, ;4
+	db 2,0,0,0,0,0,0,2, ;5
+	db 2,1,0,0,0,0,0,2, ;6
+	db 2,1,0,0,0,0,0,3, ;7
+	db 3,0,0,0,0,0,0,1, ;8
+	db 3,0,0,0,0,0,0,2, ;9
+	db 3,1,0,0,0,0,0,2, ;10
+	db 3,1,0,0,0,0,0,3, ;11
+	db 3,2,0,0,0,0,0,2, ;12
+	db 3,2,0,0,0,0,0,3, ;13
+	db 3,2,1,0,0,0,0,3, ;14
+	db 3,2,1,0,0,0,0,4, ;15
+	db 4,0,0,0,0,0,0,1, ;16
+	db 4,0,0,0,0,0,0,2, ;17
+	db 4,1,0,0,0,0,0,2, ;18
+	db 4,1,0,0,0,0,0,3, ;19
+	db 4,2,0,0,0,0,0,2, ;20
+	db 4,2,0,0,0,0,0,3, ;21
+	db 4,2,1,0,0,0,0,3, ;22
+	db 4,2,1,0,0,0,0,4, ;23
+	db 4,3,0,0,0,0,0,2, ;24
+	db 4,3,0,0,0,0,0,3, ;25
+	db 4,3,1,0,0,0,0,3, ;26
+	db 4,3,1,0,0,0,0,4, ;27
+	db 4,3,2,0,0,0,0,3, ;28
+	db 4,3,2,0,0,0,0,4, ;29
+	db 4,3,2,1,0,0,0,4, ;30
+	db 4,3,2,1,0,0,0,5, ;31
+	db 5,0,0,0,0,0,0,1, ;32
+	db 5,0,0,0,0,0,0,2, ;33
+	db 5,1,0,0,0,0,0,2, ;34
+	db 5,1,0,0,0,0,0,3, ;35
+	db 5,2,0,0,0,0,0,2, ;36
+	db 5,2,0,0,0,0,0,3, ;37
+	db 5,2,1,0,0,0,0,3, ;38
+	db 5,2,1,0,0,0,0,4, ;39
+	db 5,3,0,0,0,0,0,2, ;40
+	db 5,3,0,0,0,0,0,3, ;41
+	db 5,3,1,0,0,0,0,3, ;42
+	db 5,3,1,0,0,0,0,4, ;43
+	db 5,3,2,0,0,0,0,3, ;44
+	db 5,3,2,0,0,0,0,4, ;45
+	db 5,3,2,1,0,0,0,4, ;46
+	db 5,3,2,1,0,0,0,5, ;47
+	db 5,4,0,0,0,0,0,2, ;48
+	db 5,4,0,0,0,0,0,3, ;49
+	db 5,4,1,0,0,0,0,3, ;50
+	db 5,4,1,0,0,0,0,4, ;51
+	db 5,4,2,0,0,0,0,3, ;52
+	db 5,4,2,0,0,0,0,4, ;53
+	db 5,4,2,1,0,0,0,4, ;54
+	db 5,4,2,1,0,0,0,5, ;55
+	db 5,4,3,0,0,0,0,3, ;56
+	db 5,4,3,0,0,0,0,4, ;57
+	db 5,4,3,1,0,0,0,4, ;58
+	db 5,4,3,1,0,0,0,5, ;59
+	db 5,4,3,2,0,0,0,4, ;60
+	db 5,4,3,2,0,0,0,5, ;61
+	db 5,4,3,2,1,0,0,5, ;62
+	db 5,4,3,2,1,0,0,6, ;63
+	db 6,0,0,0,0,0,0,1, ;64
+	db 6,0,0,0,0,0,0,2, ;65
+	db 6,1,0,0,0,0,0,2, ;66
+	db 6,1,0,0,0,0,0,3, ;67
+	db 6,2,0,0,0,0,0,2, ;68
+	db 6,2,0,0,0,0,0,3, ;69
+	db 6,2,1,0,0,0,0,3, ;70
+	db 6,2,1,0,0,0,0,4, ;71
+	db 6,3,0,0,0,0,0,2, ;72
+	db 6,3,0,0,0,0,0,3, ;73
+	db 6,3,1,0,0,0,0,3, ;74
+	db 6,3,1,0,0,0,0,4, ;75
+	db 6,3,2,0,0,0,0,3, ;76
+	db 6,3,2,0,0,0,0,4, ;77
+	db 6,3,2,1,0,0,0,4, ;78
+	db 6,3,2,1,0,0,0,5, ;79
+	db 6,4,0,0,0,0,0,2, ;80
+	db 6,4,0,0,0,0,0,3, ;81
+	db 6,4,1,0,0,0,0,3, ;82
+	db 6,4,1,0,0,0,0,4, ;83
+	db 6,4,2,0,0,0,0,3, ;84
+	db 6,4,2,0,0,0,0,4, ;85
+	db 6,4,2,1,0,0,0,4, ;86
+	db 6,4,2,1,0,0,0,5, ;87
+	db 6,4,3,0,0,0,0,3, ;88
+	db 6,4,3,0,0,0,0,4, ;89
+	db 6,4,3,1,0,0,0,4, ;90
+	db 6,4,3,1,0,0,0,5, ;91
+	db 6,4,3,2,0,0,0,4, ;92
+	db 6,4,3,2,0,0,0,5, ;93
+	db 6,4,3,2,1,0,0,5, ;94
+	db 6,4,3,2,1,0,0,6, ;95
+	db 6,5,0,0,0,0,0,2, ;96
+	db 6,5,0,0,0,0,0,3, ;97
+	db 6,5,1,0,0,0,0,3, ;98
+	db 6,5,1,0,0,0,0,4, ;99
+	db 6,5,2,0,0,0,0,3, ;100
+	db 6,5,2,0,0,0,0,4, ;101
+	db 6,5,2,1,0,0,0,4, ;102
+	db 6,5,2,1,0,0,0,5, ;103
+	db 6,5,3,0,0,0,0,3, ;104
+	db 6,5,3,0,0,0,0,4, ;105
+	db 6,5,3,1,0,0,0,4, ;106
+	db 6,5,3,1,0,0,0,5, ;107
+	db 6,5,3,2,0,0,0,4, ;108
+	db 6,5,3,2,0,0,0,5, ;109
+	db 6,5,3,2,1,0,0,5, ;110
+	db 6,5,3,2,1,0,0,6, ;111
+	db 6,5,4,0,0,0,0,3, ;112
+	db 6,5,4,0,0,0,0,4, ;113
+	db 6,5,4,1,0,0,0,4, ;114
+	db 6,5,4,1,0,0,0,5, ;115
+	db 6,5,4,2,0,0,0,4, ;116
+	db 6,5,4,2,0,0,0,5, ;117
+	db 6,5,4,2,1,0,0,5, ;118
+	db 6,5,4,2,1,0,0,6, ;119
+	db 6,5,4,3,0,0,0,4, ;120
+	db 6,5,4,3,0,0,0,5, ;121
+	db 6,5,4,3,1,0,0,5, ;122
+	db 6,5,4,3,1,0,0,6, ;123
+	db 6,5,4,3,2,0,0,5, ;124
+	db 6,5,4,3,2,0,0,6, ;125
+	db 6,5,4,3,2,1,0,6, ;126
+	db 6,5,4,3,2,1,0,7, ;127
+	db 7,0,0,0,0,0,0,1, ;128
+	db 7,0,0,0,0,0,0,2, ;129
+	db 7,1,0,0,0,0,0,2, ;130
+	db 7,1,0,0,0,0,0,3, ;131
+	db 7,2,0,0,0,0,0,2, ;132
+	db 7,2,0,0,0,0,0,3, ;133
+	db 7,2,1,0,0,0,0,3, ;134
+	db 7,2,1,0,0,0,0,4, ;135
+	db 7,3,0,0,0,0,0,2, ;136
+	db 7,3,0,0,0,0,0,3, ;137
+	db 7,3,1,0,0,0,0,3, ;138
+	db 7,3,1,0,0,0,0,4, ;139
+	db 7,3,2,0,0,0,0,3, ;140
+	db 7,3,2,0,0,0,0,4, ;141
+	db 7,3,2,1,0,0,0,4, ;142
+	db 7,3,2,1,0,0,0,5, ;143
+	db 7,4,0,0,0,0,0,2, ;144
+	db 7,4,0,0,0,0,0,3, ;145
+	db 7,4,1,0,0,0,0,3, ;146
+	db 7,4,1,0,0,0,0,4, ;147
+	db 7,4,2,0,0,0,0,3, ;148
+	db 7,4,2,0,0,0,0,4, ;149
+	db 7,4,2,1,0,0,0,4, ;150
+	db 7,4,2,1,0,0,0,5, ;151
+	db 7,4,3,0,0,0,0,3, ;152
+	db 7,4,3,0,0,0,0,4, ;153
+	db 7,4,3,1,0,0,0,4, ;154
+	db 7,4,3,1,0,0,0,5, ;155
+	db 7,4,3,2,0,0,0,4, ;156
+	db 7,4,3,2,0,0,0,5, ;157
+	db 7,4,3,2,1,0,0,5, ;158
+	db 7,4,3,2,1,0,0,6, ;159
+	db 7,5,0,0,0,0,0,2, ;160
+	db 7,5,0,0,0,0,0,3, ;161
+	db 7,5,1,0,0,0,0,3, ;162
+	db 7,5,1,0,0,0,0,4, ;163
+	db 7,5,2,0,0,0,0,3, ;164
+	db 7,5,2,0,0,0,0,4, ;165
+	db 7,5,2,1,0,0,0,4, ;166
+	db 7,5,2,1,0,0,0,5, ;167
+	db 7,5,3,0,0,0,0,3, ;168
+	db 7,5,3,0,0,0,0,4, ;169
+	db 7,5,3,1,0,0,0,4, ;170
+	db 7,5,3,1,0,0,0,5, ;171
+	db 7,5,3,2,0,0,0,4, ;172
+	db 7,5,3,2,0,0,0,5, ;173
+	db 7,5,3,2,1,0,0,5, ;174
+	db 7,5,3,2,1,0,0,6, ;175
+	db 7,5,4,0,0,0,0,3, ;176
+	db 7,5,4,0,0,0,0,4, ;177
+	db 7,5,4,1,0,0,0,4, ;178
+	db 7,5,4,1,0,0,0,5, ;179
+	db 7,5,4,2,0,0,0,4, ;180
+	db 7,5,4,2,0,0,0,5, ;181
+	db 7,5,4,2,1,0,0,5, ;182
+	db 7,5,4,2,1,0,0,6, ;183
+	db 7,5,4,3,0,0,0,4, ;184
+	db 7,5,4,3,0,0,0,5, ;185
+	db 7,5,4,3,1,0,0,5, ;186
+	db 7,5,4,3,1,0,0,6, ;187
+	db 7,5,4,3,2,0,0,5, ;188
+	db 7,5,4,3,2,0,0,6, ;189
+	db 7,5,4,3,2,1,0,6, ;190
+	db 7,5,4,3,2,1,0,7, ;191
+	db 7,6,0,0,0,0,0,2, ;192
+	db 7,6,0,0,0,0,0,3, ;193
+	db 7,6,1,0,0,0,0,3, ;194
+	db 7,6,1,0,0,0,0,4, ;195
+	db 7,6,2,0,0,0,0,3, ;196
+	db 7,6,2,0,0,0,0,4, ;197
+	db 7,6,2,1,0,0,0,4, ;198
+	db 7,6,2,1,0,0,0,5, ;199
+	db 7,6,3,0,0,0,0,3, ;200
+	db 7,6,3,0,0,0,0,4, ;201
+	db 7,6,3,1,0,0,0,4, ;202
+	db 7,6,3,1,0,0,0,5, ;203
+	db 7,6,3,2,0,0,0,4, ;204
+	db 7,6,3,2,0,0,0,5, ;205
+	db 7,6,3,2,1,0,0,5, ;206
+	db 7,6,3,2,1,0,0,6, ;207
+	db 7,6,4,0,0,0,0,3, ;208
+	db 7,6,4,0,0,0,0,4, ;209
+	db 7,6,4,1,0,0,0,4, ;210
+	db 7,6,4,1,0,0,0,5, ;211
+	db 7,6,4,2,0,0,0,4, ;212
+	db 7,6,4,2,0,0,0,5, ;213
+	db 7,6,4,2,1,0,0,5, ;214
+	db 7,6,4,2,1,0,0,6, ;215
+	db 7,6,4,3,0,0,0,4, ;216
+	db 7,6,4,3,0,0,0,5, ;217
+	db 7,6,4,3,1,0,0,5, ;218
+	db 7,6,4,3,1,0,0,6, ;219
+	db 7,6,4,3,2,0,0,5, ;220
+	db 7,6,4,3,2,0,0,6, ;221
+	db 7,6,4,3,2,1,0,6, ;222
+	db 7,6,4,3,2,1,0,7, ;223
+	db 7,6,5,0,0,0,0,3, ;224
+	db 7,6,5,0,0,0,0,4, ;225
+	db 7,6,5,1,0,0,0,4, ;226
+	db 7,6,5,1,0,0,0,5, ;227
+	db 7,6,5,2,0,0,0,4, ;228
+	db 7,6,5,2,0,0,0,5, ;229
+	db 7,6,5,2,1,0,0,5, ;230
+	db 7,6,5,2,1,0,0,6, ;231
+	db 7,6,5,3,0,0,0,4, ;232
+	db 7,6,5,3,0,0,0,5, ;233
+	db 7,6,5,3,1,0,0,5, ;234
+	db 7,6,5,3,1,0,0,6, ;235
+	db 7,6,5,3,2,0,0,5, ;236
+	db 7,6,5,3,2,0,0,6, ;237
+	db 7,6,5,3,2,1,0,6, ;238
+	db 7,6,5,3,2,1,0,7, ;239
+	db 7,6,5,4,0,0,0,4, ;240
+	db 7,6,5,4,0,0,0,5, ;241
+	db 7,6,5,4,1,0,0,5, ;242
+	db 7,6,5,4,1,0,0,6, ;243
+	db 7,6,5,4,2,0,0,5, ;244
+	db 7,6,5,4,2,0,0,6, ;245
+	db 7,6,5,4,2,1,0,6, ;246
+	db 7,6,5,4,2,1,0,7, ;247
+	db 7,6,5,4,3,0,0,5, ;248
+	db 7,6,5,4,3,0,0,6, ;249
+	db 7,6,5,4,3,1,0,6, ;250
+	db 7,6,5,4,3,1,0,7, ;251
+	db 7,6,5,4,3,2,0,6, ;252
+	db 7,6,5,4,3,2,0,7, ;253
+	db 7,6,5,4,3,2,1,7, ;254
+	db 7,6,5,4,3,2,1,8, ;255
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+
+	
+;***********************************************************************
+;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx); 
+;***********************************************************************
+WELS_EXTERN CavlcParamCal_sse2
+CavlcParamCal_sse2:
+	; 32-bit stack-argument routine. Scans the coefficients from the highest index down,
+	; writes the non-zero ones to Level, their count to *total_coeffs, 16 bytes derived from
+	; the gaps between their positions to run, and returns in eax
+	; (highest non-zero index + 1 - count).
+	push ebx
+	push edi
+	push esi
+	
+	; three pushes + return address: the first argument is at [esp+16]
+	mov			eax,	[esp+16]	;coffLevel
+	mov			edi,	[esp+24]	;Level
+	mov			ebx,	[esp+32]	;endIdx
+	cmp			ebx,	3
+	jne			.Level16	
+	; endIdx == 3: only four coefficients (8 bytes) are loaded, the rest are treated as zero
+	pxor		xmm1,	xmm1
+	movq		xmm0,	[eax]	; 64-bit load (no explicit QWORD size keyword)
+	jmp			.Cal_begin		
+.Level16:	
+	; otherwise 16 coefficients are loaded; movdqa requires coffLevel to be 16-byte aligned
+	movdqa		xmm0,	[eax]
+	movdqa		xmm1,	[eax+16]
+.Cal_begin:
+    movdqa		xmm2,	xmm0		; copy is not read again before xmm2 is cleared below
+	packsswb	xmm0,	xmm1		; 16 words -> 16 signed-saturated bytes (zero stays zero)
+	movdqa		xmm4,	xmm0
+	pxor		xmm3,	xmm3
+	pcmpgtb		xmm0,	xmm3		; bytes > 0
+	pcmpgtb		xmm3,	xmm4		; bytes < 0
+	por			xmm0,	xmm3		; 0xFF where the coefficient is non-zero
+	pmovmskb	edx,	xmm0		; edx = 16-bit mask, bit i set when coefficient i != 0
+	cmp			edx,	0
+	; NOTE(review): on this early exit nothing is stored to *total_coeffs or run, and eax
+	; still holds the coffLevel pointer -- confirm the caller copes with that.
+	je near   .return
+	movdqa		xmm6,	[sse2_b_1]	; mask that clears byte 7 (the count byte) of a table entry
+	pcmpeqw		xmm7,	xmm7	;generate -1
+    mov			ebx,	0xff		; clears the upper 24 bits of ebx before bl is loaded
+    ;pinsrw		xmm6,	ebx,	3
+   
+    mov       bl,   dh				; ebx = mask byte of coefficients 8..15
+
+	lea       ebx,  [byte_1pos_table+8*ebx]
+	movq      xmm0, [ebx]			; xmm0 = table entry for the upper mask byte
+	pextrw    ecx,  xmm0, 3
+	shr       ecx,  8				; ecx = byte 7 of the entry = number of non-zero coefficients in 8..15
+    mov       dh,   cl				; keep that count in dh (the upper mask byte is no longer needed)
+ 
+.loopHighFind0:
+    ; copy the non-zero coefficients of indices 8..15 to Level, highest index first
+    cmp       ecx,   0
+    je        .loopHighFind0End
+    ;mov       esi, [ebx]
+    ;and       esi, 0xff
+    movzx	  esi, byte [ebx]		; next bit index from the table entry
+    add       esi, 8				; upper half: coefficient index = bit index + 8
+    mov       esi, [eax+2*esi]		; 32-bit load; only the low 16 bits are stored below
+    mov       [edi], si
+    add       edi,   2 
+    ;add       ebx,   1
+    inc		  ebx
+    dec       ecx
+	jmp       .loopHighFind0
+.loopHighFind0End:
+    mov       cl,   dh
+    cmp       cl,   8
+	pand      xmm0, xmm6			; clear the count byte (SSE op: the flags from cmp are preserved)
+    jne       .LowByteFind0
+    ; count == 8: the 8th loop iteration read the count byte (8) as an index, i.e. coffLevel[16];
+    ; overwrite that Level slot with the real last coefficient of this half, coffLevel[8].
+    ; NOTE(review): that iteration reads past a 16-entry coffLevel, and the store below
+    ; is 32 bits wide, so it also writes the following Level slot.
+    sub       edi,   2
+    mov       esi,   [eax+16]
+    mov       [edi], esi
+    add       edi,   2
+.LowByteFind0:
+    and       edx,  0xff			; edx = mask byte of coefficients 0..7
+	lea       ebx,  [byte_1pos_table+8*edx]
+	movq      xmm1, [ebx]			; xmm1 = table entry for the lower mask byte
+    pextrw    esi,  xmm1, 3
+    or        esi,  0xff
+    or        ecx,  0xff00
+    and       ecx,  esi				; ch = count of the lower half, cl = count of the upper half
+    shr       esi,  8				; esi = count of the lower half (loop counter)
+    pand      xmm1, xmm6			; clear the count byte of the lower entry
+.loopLowFind0:
+    ; copy the non-zero coefficients of indices 0..7 to Level, highest index first
+    cmp       esi, 0
+    je        .loopLowFind0End
+	;mov       edx, [ebx]
+	;and       edx, 0xff
+	movzx	  edx,	byte [ebx]
+	mov       edx, [eax+2*edx]		; 32-bit load; only the low 16 bits are stored below
+	mov       [edi], dx 
+	add       edi,   2 
+	;add       ebx,   1
+	inc		  ebx
+    dec       esi
+	jmp       .loopLowFind0
+.loopLowFind0End:
+    cmp       ch,  8
+    jne       .getLevelEnd
+    ; count == 8: same fix-up as above, the last slot must hold coffLevel[0]
+    sub       edi, 2
+    mov       edx, [eax]
+    mov       [edi], dx
+.getLevelEnd:
+	mov      edx, [esp+28]	;total_coeffs
+    ;mov      ebx,   ecx
+    ;and      ebx,   0xff
+    movzx	 ebx,	byte cl			; ebx = count of the upper half
+    add      cl,    ch				; cl = total number of non-zero coefficients
+	; NOTE(review): only one byte is stored although the prototype above declares int32_t*;
+	; the upper three bytes keep whatever the caller put there.
+	mov      [edx], cl
+;getRun
+    movq     xmm5, [sse2_b8]
+    paddb    xmm0, xmm5				; upper-half bit indices -> coefficient positions (+8)
+    pxor     xmm2, xmm2
+    pxor     xmm3, xmm3
+    mov      eax,  8
+    sub      eax,  ebx
+    shl      eax,  3				; eax = 8 * (8 - upper count)  (shift amount in bits)
+    shl      ebx,  3				; ebx = 8 * upper count        (shift amount in bits)
+	pinsrw   xmm2, ebx, 0
+    pinsrw   xmm3, eax, 0
+    psllq    xmm0, xmm3
+    psrlq    xmm0, xmm3				; keep only the first <upper count> position bytes
+    movdqa   xmm4, xmm1
+    psllq    xmm1, xmm2 			; move the lower-half positions behind the upper-half ones
+    psrlq    xmm4, xmm3 			; bytes shifted out of the low qword by the psllq above
+    punpcklqdq xmm1, xmm4			; ... become the high qword
+    por      xmm0,  xmm1			; xmm0 = all positions, highest first, zero-padded
+
+    pextrw   eax,   xmm0, 0
+    and		 eax,   0xff			; eax = highest non-zero position
+    inc      eax
+    sub      al,    cl				; return value: (highest position + 1) - total count
+	movdqa   xmm1,  xmm0
+	paddb    xmm1,  xmm7			; position - 1 (xmm7 is all ones)
+	psrldq   xmm0,  1				; byte i now holds position i+1
+	psubb    xmm1,  xmm0			; byte i = position[i] - 1 - position[i+1]
+    mov      ecx,   [esp+20] ;run
+	movdqa   [ecx], xmm1			; 16-byte store; movdqa requires run to be 16-byte aligned
+;getRunEnd
+.return:
+	pop esi
+	pop edi
+	pop ebx
+	ret
\ No newline at end of file
--- /dev/null
+++ b/codec/encoder/core/asm/cpuid.asm
@@ -1,0 +1,169 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	cpuid.asm
+;*
+;*  Abstract
+;*		verify cpuid feature support and cpuid detection
+;*
+;*  History
+;*      04/29/2009	Created
+;*
+;*************************************************************************/
+
+bits 32
+
+;******************************************************************************************
+; Macros
+;******************************************************************************************
+
+%macro WELS_EXTERN 1
+	; Exports symbol %1. When PREFIX is defined the exported name gets a leading
+	; underscore, and %1 is redefined so later uses of the plain name expand to _%1.
+	%ifdef PREFIX
+		global _%1
+		%define %1 _%1
+	%else
+		global %1
+	%endif
+%endmacro
+
+;******************************************************************************************
+; Code
+;******************************************************************************************
+
+SECTION .text
+
+; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
+; section CPUID - CPU Identification
+
+WELS_EXTERN WelsCPUIdVerify
+ALIGN 16
+;******************************************************************************************
+;   int32_t WelsCPUIdVerify()
+;******************************************************************************************
+WelsCPUIdVerify:
+	; Returns in eax a non-zero value when the CPUID instruction is available, 0 otherwise.
+	; CPUID exists exactly when software can toggle the ID flag (bit 21) of EFLAGS, so the
+	; flag is flipped, written back, re-read and compared with its original value.
+	; (The earlier code computed (x ^ 200000h) ^ x, which is 200000h regardless of the CPU.)
+    pushfd					; keep one copy of the caller's EFLAGS on the stack for the final restore
+	pushfd
+    pop     ecx				; ecx = original EFLAGS
+    mov     eax, ecx
+    xor     eax, 00200000h	; flip the ID flag (bit 21)
+    push    eax
+    popfd					; try to load the modified EFLAGS
+    pushfd
+    pop     eax				; eax = EFLAGS as actually accepted by the CPU
+	xor		eax, ecx		; bits that changed
+	and		eax, 00200000h	; eax - 0: not support; otherwise: support
+    popfd					; restore the caller's EFLAGS unchanged
+    ret
+
+WELS_EXTERN WelsCPUId
+ALIGN 16
+;****************************************************************************************************
+;   void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
+;****************************************************************************************************
+WelsCPUId:
+	; 32-bit stack-argument routine: executes CPUID for leaf uiIndex and stores the resulting
+	; EAX, EBX, ECX, EDX to *pFeatureA, *pFeatureB, *pFeatureC, *pFeatureD respectively.
+	push	ebx				; cpuid overwrites ebx; saved here and restored before returning
+	push	edi
+	
+	; two pushes + return address: the first argument is at [esp+12]
+	mov     eax, [esp+12]	; operating index
+	xor     ecx, ecx		; sub-leaf 0: some leaves take ECX as input, so do not pass caller garbage
+    cpuid					; cpuid
+	
+	; processing various information return
+	mov     edi, [esp+16]
+    mov     [edi], eax
+    mov     edi, [esp+20]
+    mov     [edi], ebx
+    mov     edi, [esp+24]
+    mov     [edi], ecx
+    mov     edi, [esp+28]
+    mov     [edi], edx
+
+	pop		edi	
+    pop     ebx
+	ret
+	
+WELS_EXTERN WelsCPUSupportAVX
+; must be called after CPUID leaf 1: pass the EAX and ECX values that leaf returned
+ALIGN 16
+;****************************************************************************************************
+;   int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )
+;****************************************************************************************************
+WelsCPUSupportAVX:
+	; Returns 1 in eax when both the CPU and the OS support AVX, otherwise 0.
+	; Arguments (stack): the EAX and ECX values returned by CPUID leaf 1.
+	mov		eax, [esp+4]
+	mov		ecx, [esp+8]
+
+	; CPUID.1:ECX bit 27 = OSXSAVE, bit 28 = AVX; both are required
+	and		ecx, 018000000H
+	cmp		ecx, 018000000H
+	jne		.unsupported
+
+	; XGETBV is usable: read extended control register 0 into EDX:EAX
+	xor		ecx, ecx
+	XGETBV
+	and		eax, 06H
+	cmp		eax, 06H			; bits 1 and 2: XMM and YMM state enabled by the OS
+	jne		.unsupported
+
+	mov		eax, 1
+	ret
+.unsupported:
+	xor		eax, eax
+	ret
+
+WELS_EXTERN WelsCPUSupportFMA
+; must be called after CPUID leaf 1: pass the EAX and ECX values that leaf returned
+ALIGN 16
+;****************************************************************************************************
+;   int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )
+;****************************************************************************************************
+WelsCPUSupportFMA:
+	; Returns 1 in eax when both the CPU and the OS support AVX with FMA, otherwise 0.
+	; Arguments (stack): the EAX and ECX values returned by CPUID leaf 1.
+	mov		eax, [esp+4]
+	mov		ecx, [esp+8]
+
+	; CPUID.1:ECX bit 12 = FMA, bit 27 = OSXSAVE, bit 28 = AVX; all three are required
+	and		ecx, 018001000H
+	cmp		ecx, 018001000H
+	jne		.unsupported
+
+	; XGETBV is usable: read extended control register 0 into EDX:EAX
+	xor		ecx, ecx
+	XGETBV
+	and		eax, 06H
+	cmp		eax, 06H			; bits 1 and 2: XMM and YMM state enabled by the OS
+	jne		.unsupported
+
+	mov		eax, 1
+	ret
+.unsupported:
+	xor		eax, eax
+	ret
+
+WELS_EXTERN WelsEmms
+ALIGN 16
+;******************************************************************************************
+;   void WelsEmms()
+;******************************************************************************************
+WelsEmms:
+	; Callable wrapper around the EMMS instruction; takes no arguments.
+	emms	; empty mmx technology states
+	ret
+
+
+
--- /dev/null
+++ b/codec/encoder/core/asm/dct.asm
@@ -1,0 +1,556 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  dct.asm
+;*
+;*  Abstract
+;*      WelsDctFourT4_sse2
+;*
+;*  History
+;*      8/4/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Constant
+;***********************************************************************		
+			
+align 16
+; Six groups of 16 words (two dw lines each); within a group the 8-word row pattern is
+; stored twice. Not referenced in this part of the file.
+; NOTE(review): presumably the 4x4 dequantisation scale factors, one group per QP % 6 -- confirm against the user of this table.
+SSE2_DeQuant8 dw  10, 13, 10, 13, 13, 16, 13, 16, 
+			dw	10, 13, 10, 13, 13, 16, 13, 16,
+            dw  11, 14, 11, 14, 14, 18, 14, 18, 
+			dw  11, 14, 11, 14, 14, 18, 14, 18,
+			dw  13, 16, 13, 16, 16, 20, 16, 20, 
+			dw  13, 16, 13, 16, 16, 20, 16, 20,
+            dw  14, 18, 14, 18, 18, 23, 18, 23, 
+			dw  14, 18, 14, 18, 18, 23, 18, 23,
+			dw  16, 20, 16, 20, 20, 25, 20, 25, 
+			dw  16, 20, 16, 20, 20, 25, 20, 25,
+            dw  18, 23, 18, 23, 23, 29, 23, 29, 
+			dw  18, 23, 18, 23, 23, 29, 23, 29
+			
+
+;***********************************************************************
+; MMX functions
+;***********************************************************************			
+
+%macro MMX_LoadDiff4P 5
+	; %1 = dst mm, %2 = scratch mm, %3 = address expression of 4 bytes (A),
+	; %4 = address expression of 4 bytes (B), %5 = mm register that must hold zero.
+	; On exit %1 holds the four 16-bit differences A[i] - B[i]; %2 is clobbered.
+	movd        %1, [%3]
+	movd        %2, [%4]
+	punpcklbw   %1, %5			; interleave with zero: bytes of A become words
+	punpcklbw   %2, %5			; interleave with zero: bytes of B become words
+	psubw       %1, %2
+%endmacro
+
+%macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm)
+	; Loads four rows of 4-pixel differences (pix1 - pix2) into %1..%4.
+	; Side effect: the address registers %5 and %7 are advanced by two strides.
+	MMX_LoadDiff4P %1, %9, %5,    %7,    %10
+	MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
+	lea  %5, [%5+2*%6]
+	lea  %7, [%7+2*%8]
+	MMX_LoadDiff4P %3, %9, %5,    %7,    %10
+	MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
+%endmacro
+
+%macro MMX_SumSubMul2 3
+	; With a = %1 and b = %2 on entry (packed words):
+	; on exit %1 = 2a + b, %3 = a - 2b, %2 = 2b.  ($1 is NASM's $-prefixed hex notation for 1.)
+	movq    %3, %1
+	psllw   %1, $1
+	paddw   %1, %2
+	psllw   %2, $1
+    psubw   %3, %2
+%endmacro
+
+%macro MMX_SumSubDiv2 3
+    ; With a = %1 and b = %2 on entry (packed words):
+    ; on exit %3 = a + (b >> 1), %1 = (a >> 1) - b, %2 unchanged (arithmetic shifts).
+    movq    %3, %2
+    psraw   %3, $1
+    paddw   %3, %1
+    psraw   %1, $1
+    psubw   %1, %2
+%endmacro
+
+%macro MMX_SumSub 3
+	; With a = %1 and b = %2 on entry (packed words):
+	; on exit %1 = a + b, %2 = b - a, %3 = b (scratch).
+	movq    %3, %2
+    psubw   %2, %1
+    paddw   %1, %3
+%endmacro
+
+%macro MMX_DCT 6
+    ; Inputs s0..s3 in %1..%4; %5 receives a result, %6 is scratch, %2 is clobbered.
+    ; On exit: %3 = s0+s1+s2+s3, %1 = 2(s0-s3)+(s1-s2),
+    ;          %4 = (s0+s3)-(s1+s2), %5 = (s0-s3)-2(s1-s2).
+    MMX_SumSub		%4, %1, %6
+    MMX_SumSub		%3, %2, %6
+    MMX_SumSub		%3, %4, %6
+    MMX_SumSubMul2  %1, %2, %5  
+%endmacro
+
+%macro MMX_IDCT 6
+    ; Inputs in %2..%5 (d = %2, c = %3, a = %4, b = %5); %1 receives a result, %6 is scratch.
+    ; With e = a+b, f = b-a, g = c+(d>>1), h = (c>>1)-d:
+    ; on exit %1 = e+g, %4 = e-g, %3 = f+h, %5 = f-h; %2 is left unchanged.
+    MMX_SumSub      %4, %5, %6
+    MMX_SumSubDiv2  %3, %2, %1
+    MMX_SumSub		%1, %4, %6
+	MMX_SumSub		%3, %5, %6
+%endmacro
+
+%macro MMX_StoreDiff4P 6
+    ; %1 = four residual words, %2 = scratch mm, %3 = rounding constant (words),
+    ; %4 = mm that must hold zero, %5 = 4-byte destination, %6 = 4-byte prediction source.
+    ; Stores clamp_u8(pred + ((%1 + %3) >> 6)) for four pixels to %5.
+    movd       %2, %6
+    punpcklbw  %2, %4			; prediction bytes -> words
+    paddw      %1, %3
+    psraw      %1, $6
+    paddsw     %1, %2
+    packuswb   %1, %2			; only the low 4 result bytes are stored below
+    movd       %5, %1
+%endmacro
+
+ALIGN 16
+;***********************************************************************
+;   void __cdecl WelsDctT4_mmx( int16_t *pDct[4], uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
+;***********************************************************************
+WELS_EXTERN WelsDctT4_mmx
+WelsDctT4_mmx:
+    ; Computes the 4x4 difference pix1 - pix2, applies MMX_DCT twice with a transpose after
+    ; each pass, and stores the 16 resulting words (32 bytes) to pDct.
+    ; MMX_Trans4x4W and WELSEMMS are defined outside this section.
+    push    ebx
+    ; one push + return address: pDct is at [esp+8], the remaining arguments follow
+    mov     eax, [esp+12]   ; pix1
+    mov     ebx, [esp+16]   ; i_pix1
+    mov     ecx, [esp+20]   ; pix2
+    mov     edx, [esp+24]   ; i_pix2
+
+    WELS_Zero    mm7
+    
+    MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, eax, ebx, ecx, edx, mm0, mm7
+
+    MMX_DCT			mm1, mm2, mm3 ,mm4, mm5, mm6           
+    MMX_Trans4x4W	mm3, mm1, mm4, mm5, mm2
+    
+    MMX_DCT			mm3, mm5, mm2 ,mm4, mm1, mm6                    
+    MMX_Trans4x4W	mm2, mm3, mm4, mm1, mm5
+
+    mov     eax, [esp+ 8]   ; pDct
+    movq    [eax+ 0],   mm2
+    movq    [eax+ 8],   mm1
+    movq    [eax+16],   mm5
+    movq    [eax+24],   mm4
+
+	WELSEMMS
+    pop     ebx
+    ret
+
+
+;***********************************************************************
+;   void __cdecl WelsIDctT4Rec_mmx(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs)
+;***********************************************************************
+WELS_EXTERN WelsIDctT4Rec_mmx
+WelsIDctT4Rec_mmx:
+	; Reads 16 coefficient words from rs, applies transpose + MMX_IDCT twice, then writes
+	; clamp_u8(pred + ((value + 32) >> 6)) for the 4x4 block to rec.
+	; MMX_Trans4x4W and WELSEMMS are defined outside this section.
+	push   ebx
+; argument offsets relative to esp after the single push above
+%define	pushsize	4
+%define     p_dst       esp+pushsize+4
+%define     i_dst       esp+pushsize+8
+%define     p_pred      esp+pushsize+12
+%define     i_pred      esp+pushsize+16
+%define     pDct        esp+pushsize+20
+
+	mov     eax, [pDct   ] 
+    movq    mm0, [eax+ 0]
+    movq    mm1, [eax+ 8]
+    movq    mm2, [eax+16]
+    movq    mm3, [eax+24]
+    mov     edx, [p_dst ]   
+    mov     ecx, [i_dst ]   
+    mov     eax, [p_pred]
+    mov     ebx, [i_pred]     
+
+	MMX_Trans4x4W		mm0, mm1, mm2, mm3, mm4
+	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
+    MMX_Trans4x4W		mm1, mm3, mm0, mm4, mm2
+	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6
+
+    WELS_Zero			mm7
+    WELS_DW32			mm6		; rounding constant 32 for the >> 6 in MMX_StoreDiff4P
+    
+    MMX_StoreDiff4P		mm3, mm0, mm6, mm7, [edx], [eax]
+    MMX_StoreDiff4P		mm4, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
+    lea     edx, [edx+2*ecx]
+    lea     eax, [eax+2*ebx]
+    MMX_StoreDiff4P		mm1, mm0, mm6, mm7, [edx], [eax]
+    MMX_StoreDiff4P		mm2, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
+    
+	WELSEMMS
+%undef	pushsize
+%undef  p_dst
+%undef  i_dst
+%undef  p_pred
+%undef  i_pred
+%undef  pDct
+    pop ebx
+    ret
+
+
+;***********************************************************************
+; SSE2 functions
+;***********************************************************************
+%macro SSE2_Store4x8p 6
+	; %1 = base address register, %2..%5 = four xmm inputs, %6 = scratch xmm.
+	; Regroups the inputs with two qword swaps and stores 64 bytes at %1.
+	; %3 and %6 are overwritten. SSE2_XSawp and MOVDQ are defined outside this section.
+	SSE2_XSawp qdq, %2, %3, %6
+	SSE2_XSawp qdq, %4, %5, %3
+	MOVDQ    [%1+0x00], %2 
+	MOVDQ    [%1+0x10], %4 
+	MOVDQ    [%1+0x20], %6 
+	MOVDQ    [%1+0x30], %3 
+%endmacro
+
+%macro SSE2_Load4x8p 6
+	; %1 = base address register; loads 64 bytes from %1 into %2, %4, %6, %3 and then
+	; regroups them with two qword swaps (%5 and %3 receive swap outputs).
+	; SSE2_XSawp and MOVDQ are defined outside this section.
+	MOVDQ    %2,	[%1+0x00]
+	MOVDQ    %4,	[%1+0x10]  
+	MOVDQ    %6,	[%1+0x20]  
+	MOVDQ    %3,	[%1+0x30]  
+	SSE2_XSawp qdq, %4, %3, %5
+	SSE2_XSawp qdq, %2, %6, %3
+%endmacro
+
+%macro SSE2_SumSubMul2 3
+    ; With a = %1 and b = %2 on entry (packed words):
+    ; on exit %1 = 2a + b, %3 = a - 2b; %2 is left unchanged.
+    movdqa  %3, %1
+    paddw   %1, %1
+    paddw   %1, %2
+    psubw   %3, %2
+    psubw   %3, %2
+%endmacro
+
+%macro SSE2_SumSubDiv2 4
+    ; With a = %1 and b = %2 on entry (packed words):
+    ; on exit %1 = a + (b >> 1), %4 = (a >> 1) - b, %2 = b >> 1, %3 = b (arithmetic shifts).
+    movdqa  %4, %1
+    movdqa  %3, %2
+    psraw   %2, $1
+    psraw   %4, $1
+    paddw   %1, %2
+    psubw   %4, %3
+%endmacro
+
+%macro SSE2_StoreDiff8p 6
+    ; Six-parameter variant (NASM selects the macro by argument count).
+    ; %1 = eight residual words, %2 = scratch xmm, %3 = rounding constant (words),
+    ; %4 = xmm that must hold zero, %5 = 8-byte destination, %6 = 8-byte prediction source.
+    ; Stores clamp_u8(pred + ((%1 + %3) >> 6)) for eight pixels to %5.
+    paddw       %1, %3
+    psraw       %1, $6
+    movq		%2, %6
+    punpcklbw   %2, %4			; prediction bytes -> words
+    paddsw      %2, %1
+    packuswb    %2, %2
+    movq	    %5, %2
+%endmacro
+
+%macro SSE2_StoreDiff8p 5
+    ; Five-parameter variant: no rounding or shift is applied to %1.
+    ; %1 = eight words to add, %2 = scratch xmm, %3 = xmm that must hold zero,
+    ; %4 = 8-byte destination, %5 = 8-byte prediction source.
+    ; Stores clamp_u8(pred + %1) for eight pixels to %4.
+    movq		%2, %5
+    punpcklbw   %2, %3			; prediction bytes -> words
+    paddsw      %2, %1
+    packuswb    %2, %2
+    movq	    %4, %2
+%endmacro
+
+%macro SSE2_Load8DC	6
+	; %1..%4 = output xmm registers, %5 = rounding constant (words), %6 = 16-byte aligned source of 8 DC words.
+	; Each DC is first replaced by (dc + %5) >> 6, then replicated four times:
+	; %1 = dc0 x4, dc1 x4;  %2 = dc2 x4, dc3 x4;  %3 = dc4 x4, dc5 x4;  %4 = dc6 x4, dc7 x4.
+	movdqa		%1,		%6		; %1 = dc0 .. dc7
+	paddw       %1,		%5
+    psraw       %1,		$6		; (dc + 32) >> 6	
+    
+    movdqa		%2,		%1
+    psrldq		%2,		4
+ 	punpcklwd	%2,		%2
+	punpckldq	%2,		%2		; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3	   
+
+    movdqa		%3,		%1
+    psrldq		%3,		8
+ 	punpcklwd	%3,		%3
+	punpckldq	%3,		%3		; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
+	
+	movdqa		%4,		%1
+    psrldq		%4,		12
+ 	punpcklwd	%4,		%4
+	punpckldq	%4,		%4		; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
+	    	
+	punpcklwd	%1,		%1
+	punpckldq	%1,		%1		; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1	
+%endmacro
+
+%macro SSE2_DCT 6
+    ; Inputs in %6, %1, %2, %3; %4 receives a result and %5 is scratch.
+    ; With s0 = %6, s1 = %1, s2 = %2, s3 = %3 on entry:
+    ; on exit %2 = s0+s1+s2+s3, %6 = 2(s0-s3)+(s1-s2),
+    ;         %3 = (s0+s3)-(s1+s2), %4 = (s0-s3)-2(s1-s2); %1 = s1-s2.
+    SSE2_SumSub		%6, %3,	%5						
+	SSE2_SumSub		%1, %2, %5																		
+	SSE2_SumSub		%3, %2, %5					
+	SSE2_SumSubMul2		%6, %1, %4               	
+%endmacro
+
+%macro SSE2_IDCT 7
+    ; Inputs in %7, %2, %1, %3; %4 receives a result, %5 and %6 are scratch, %3 is clobbered.
+    ; With a = %7, b = %2, c = %1, d = %3 on entry and
+    ; e = a+b, f = a-b, g = c+(d>>1), h = (c>>1)-d:
+    ; on exit %1 = e+g, %2 = e-g, %4 = f+h, %7 = f-h.
+    SSE2_SumSub       %7, %2, %6					
+    SSE2_SumSubDiv2     %1, %3, %5, %4              
+    SSE2_SumSub	     %2, %1, %5 
+    SSE2_SumSub		 %7, %4, %5
+%endmacro
+
+;***********************************************************************
+; void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
+;***********************************************************************
+WELS_EXTERN WelsDctFourT4_sse2
+ALIGN 16
+WelsDctFourT4_sse2:
+    ; Processes 8 rows of 8 pixels in two passes of four rows each: loads the differences
+    ; pix1 - pix2, applies SSE2_DCT twice with a transpose after each pass, and stores
+    ; 64 bytes of coefficients per pass to pDct (128 bytes in total).
+    ; SSE2_TransTwo4x4W is defined outside this section.
+    push    ebx
+    push	esi
+    ; two pushes + return address: the first argument is at [esp+12]
+    mov		esi, [esp+12] 	; pDct
+    mov     eax, [esp+16]   ; pix1
+    mov     ebx, [esp+20]   ; i_pix1
+    mov     ecx, [esp+24]   ; pix2
+    mov     edx, [esp+28]   ; i_pix2    
+    
+    pxor    xmm7, xmm7		; zero register used by SSE2_LoadDiff8P for byte -> word expansion
+
+	;Load 4x8
+	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [eax      ], [ecx]
+    SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [eax+ebx  ], [ecx+edx]
+	lea		eax, [eax + 2 * ebx]
+	lea		ecx, [ecx + 2 * edx]
+	SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [eax], [ecx]
+    SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
+	
+	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
+	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
+	SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2             		
+	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
+	
+	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5  
+	
+	; advance both pixel pointers to rows 4..7
+	lea		eax, [eax + 2 * ebx]
+	lea		ecx, [ecx + 2 * edx]
+    
+	;Load 4x8
+	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [eax      ], [ecx    ]
+    SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [eax+ebx  ], [ecx+edx]
+	lea		eax, [eax + 2 * ebx]
+	lea		ecx, [ecx + 2 * edx]	
+    SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [eax], [ecx]
+    SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
+	
+	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
+	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1		
+    SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2              		
+	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
+	
+	lea		esi, [esi+64]	; second 64-byte half of the output
+	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5 
+	
+    pop esi
+    pop ebx
+    ret
+
+
+; Stack-argument addresses; pushsize must be defined as the number of bytes pushed
+; before these are used. They are not %undef'd in this section and are reused by
+; WelsIDctRecI16x16Dc_sse2 below.
+%define		rec			esp + pushsize + 4
+%define		stride		esp + pushsize + 8
+%define		pred		esp + pushsize + 12
+%define		pred_stride	esp + pushsize + 16
+%define		rs			esp + pushsize + 20
+;***********************************************************************
+; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
+;***********************************************************************
+WELS_EXTERN WelsIDctFourT4Rec_sse2
+ALIGN 16
+WelsIDctFourT4Rec_sse2:
+	; Processes 8 rows of 8 pixels in two passes of four rows each: loads 64 bytes of
+	; coefficients from rs, applies transpose + SSE2_IDCT twice, then writes
+	; clamp_u8(pred + ((value + 32) >> 6)) to rec.
+	; SSE2_TransTwo4x4W is defined outside this section.
+%define	pushsize	8
+    push		ebx
+    push		esi
+    
+    mov			eax,		[rec]   
+    mov			ebx,		[stride]   
+    mov			ecx,		[pred]  
+    mov			edx,		[pred_stride]   
+    mov			esi,		[rs]  
+
+	;Load 4x8
+	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5 	
+	
+	SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
+  	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
+    SSE2_TransTwo4x4W	xmm1, xmm4, xmm0, xmm2, xmm3
+    SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
+    
+	WELS_Zero			xmm7
+    WELS_DW32			xmm6	; rounding constant 32 for the >> 6 in SSE2_StoreDiff8p
+
+	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [eax		],	[ecx]
+	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
+	lea		eax, [eax + 2 * ebx]
+	lea		ecx, [ecx + 2 * edx]	
+	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [eax],			[ecx]
+	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
+   
+    ; second pass: next 64 bytes of coefficients, rows 4..7 of rec/pred
+    add		esi, 64
+	lea		eax, [eax + 2 * ebx]
+	lea		ecx, [ecx + 2 * edx]
+   	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5 	
+	
+	SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
+	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0           
+    SSE2_TransTwo4x4W   xmm1, xmm4, xmm0, xmm2, xmm3
+	SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
+
+	; xmm6/xmm7 were used as scratch above, so the constants are rebuilt
+	WELS_Zero			xmm7
+    WELS_DW32			xmm6
+    
+	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [eax		],	[ecx]
+	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
+	lea		eax, [eax + 2 * ebx]
+	lea		ecx, [ecx + 2 * edx]	
+	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [eax],			[ecx]
+	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [eax + ebx],	[ecx + edx] 
+
+    pop		esi
+    pop		ebx
+    ret
+    
+  %macro SSE2_StoreDiff4x8p 8
+	; %1 = words added to columns 0..7, %2 = words added to columns 8..15, %3 = scratch xmm,
+	; %4 = xmm that must hold zero, %5 = dst address reg, %6 = pred address reg,
+	; %7 = dst stride reg, %8 = pred stride reg.
+	; Writes two 16-pixel rows: dst = clamp_u8(pred + value), using the 5-parameter SSE2_StoreDiff8p.
+   	SSE2_StoreDiff8p    %1, %3, %4, [%5],			[%6]
+	SSE2_StoreDiff8p    %1, %3, %4, [%5 + %7],		[%6 + %8]	
+	SSE2_StoreDiff8p    %2, %3, %4, [%5 + 8],		[%6 + 8]
+	SSE2_StoreDiff8p    %2, %3, %4, [%5 + %7 + 8],	[%6 + %8 + 8]	
+ %endmacro
+ 
+ ;***********************************************************************
+; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
+;***********************************************************************
+WELS_EXTERN WelsIDctRecI16x16Dc_sse2
+ALIGN 16
+; rec/stride/pred/pred_stride are the %defines introduced before WelsIDctFourT4Rec_sse2;
+; they expand with the pushsize defined here (two pushes = 8 bytes).
+%define		pushsize	8
+%define		luma_dc		esp + pushsize + 20
+WelsIDctRecI16x16Dc_sse2:
+    ; Adds one value per 4x4 sub-block to a 16x16 prediction: each of the 16 words at dct_dc
+    ; becomes (dc + 32) >> 6 and is added to the 4x4 pixels of its sub-block, with the
+    ; result clamped to 0..255 and written to rec. Two rows are written per macro call.
+    push		esi
+    push		edi
+    
+	mov			ecx,		[luma_dc]
+    mov			eax,		[rec]	
+    mov			edx,		[stride]	
+    mov			esi,		[pred]	
+    mov			edi,		[pred_stride]	    	
+	pxor		xmm7,		xmm7
+    WELS_DW32	xmm6
+    
+	; dc0..dc7 cover rows 0..7: xmm0/xmm1 for rows 0..3, xmm2/xmm3 for rows 4..7
+	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [ecx]
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+	
+	lea			eax,		[eax + 2 * edx]
+	lea			esi,		[esi + 2 * edi]	
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi	
+	  
+	lea			eax,		[eax + 2 * edx]
+	lea			esi,		[esi + 2 * edi]	
+	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
+	
+	lea			eax,		[eax + 2 * edx]
+	lea			esi,		[esi + 2 * edi]		
+	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
+	
+	; dc8..dc15 cover rows 8..15
+	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [ecx + 16]	
+	lea			eax,		[eax + 2 * edx]
+	lea			esi,		[esi + 2 * edi]		
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+	
+	lea			eax,		[eax + 2 * edx]
+	lea			esi,		[esi + 2 * edi]	
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi	
+	  
+	lea			eax,		[eax + 2 * edx]
+	lea			esi,		[esi + 2 * edi]	 
+	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
+	
+	lea			eax,		[eax + 2 * edx]
+	lea			esi,		[esi + 2 * edi]		
+	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
+		
+    pop		edi
+    pop		esi
+    ret
+
+
+
+%macro SSE2_SumSubD 3
+	; Dword version of SSE2_SumSub. With a = %1 and b = %2 on entry:
+	; on exit %2 = a + b, %1 = a - b, %3 = b (scratch).
+	movdqa  %3, %2
+    paddd   %2, %1
+    psubd   %1, %3
+%endmacro
+
+%macro SSE2_SumSubDiv2D 4
+	; With a = %1, b = %2 and rounding term r = %3 on entry (packed dwords):
+	; on exit %1 = (a + b + r) >> 1 and %4 = %1 - b, which equals (a - b + r) >> 1.
+	paddd   %1, %2
+	paddd	%1, %3
+	psrad	%1,	 1
+	movdqa	%4, %1
+	psubd	%4, %2
+%endmacro
+
+ %macro		SSE2_Load4Col	5
+	; %1 = dst xmm, %2..%4 = scratch xmm, %5 = address expression.
+	; Gathers the 16-bit values at byte offsets 0x00, 0x20, 0x80 and 0xa0 from %5,
+	; sign-extends them and places them in dwords 0..3 of %1. Clobbers edx.
+	movsx		edx,		WORD[%5]
+ 	movd		%1,			edx
+ 	movsx		edx,		WORD[%5 + 0x20]
+ 	movd		%2,			edx
+	punpckldq	%1,			%2
+	movsx		edx,		WORD[%5 + 0x80]
+ 	movd		%3,			edx
+	movsx		edx,		WORD[%5 + 0xa0]
+ 	movd		%4,			edx
+	punpckldq	%3,			%4
+	punpcklqdq	%1,			%3
+ %endmacro
+ 
+;***********************************************************************
+;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
+;***********************************************************************
+WELS_EXTERN WelsHadamardT4Dc_sse2
+WelsHadamardT4Dc_sse2:
+		; Gathers 16 words from pDct (one every 0x20 bytes within the groups addressed below),
+		; runs two add/subtract butterfly passes on them as dwords with a transpose in between,
+		; halves the result with rounding, and stores 16 saturated words to luma_dc.
+		; movdqa requires luma_dc to be 16-byte aligned. SSE2_Trans4x4D is defined outside this section.
+		mov			eax,		[esp + 4]	; luma_dc
+		mov			ecx,		[esp + 8]	; pDct
+		
+		SSE2_Load4Col	    xmm1, xmm5, xmm6, xmm0, ecx
+		SSE2_Load4Col	    xmm2, xmm5, xmm6, xmm0, ecx + 0x40
+		SSE2_Load4Col	    xmm3, xmm5, xmm6, xmm0, ecx + 0x100
+		SSE2_Load4Col	    xmm4, xmm5, xmm6, xmm0, ecx + 0x140
+		
+		SSE2_SumSubD		xmm1, xmm2, xmm7
+		SSE2_SumSubD		xmm3, xmm4, xmm7
+		SSE2_SumSubD		xmm2, xmm4, xmm7
+		SSE2_SumSubD		xmm1, xmm3, xmm7	
+
+		SSE2_Trans4x4D		xmm4, xmm2, xmm1, xmm3, xmm5	; pOut: xmm4,xmm3,xmm5,xmm1
+	
+		SSE2_SumSubD		xmm4, xmm3, xmm7
+		SSE2_SumSubD		xmm5, xmm1, xmm7
+
+		WELS_DD1 xmm6      									; rounding term: 1 in every dword
+		SSE2_SumSubDiv2D	xmm3, xmm1, xmm6, xmm0			; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
+		SSE2_SumSubDiv2D	xmm4, xmm5, xmm6, xmm1			; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
+        SSE2_Trans4x4D		xmm3, xmm0, xmm1, xmm4, xmm2	; pOut: xmm3,xmm4,xmm2,xmm1
+
+		packssdw	xmm3,	xmm4							; dwords -> words with signed saturation
+		packssdw	xmm2,	xmm1
+		movdqa	[eax+ 0],   xmm3
+		movdqa	[eax+16],   xmm2
+		
+		ret	
+
+
--- /dev/null
+++ b/codec/encoder/core/asm/deblock.asm
@@ -1,0 +1,2113 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  deblock.asm
+;*
+;*  Abstract
+;*      edge loop
+;*
+;*  History
+;*      08/07/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata pData
+%else
+SECTION .rodata align=16
+%endif
+
+SECTION .text
+
+;********************************************************************************
+;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                             int32_t iAlpha, int32_t iBeta)
+;********************************************************************************
+WELS_EXTERN   DeblockChromaEq4V_sse2	; filters 8 Cb and 8 Cr pixels across a horizontal edge: the rows at pPix-iStride (p0) and pPix (q0) are replaced by a 4-tap average wherever the alpha/beta tests pass
+
+ALIGN  16
+DeblockChromaEq4V_sse2:	; stack args: [ebp+8] pPixCb, [ebp+0Ch] pPixCr, [ebp+10h] iStride, [ebp+14h] iAlpha, [ebp+18h] iBeta
+  push        ebp  
+  mov         ebp,esp 
+  and         esp,0FFFFFFF0h ; align the frame to 16 bytes so the movdqa spills below are legal
+  sub         esp,68h ; 68h + the two pushes below (8 bytes) = 70h, keeping esp 16-byte aligned
+  mov         edx,[ebp+10h]      ;  iStride
+  mov         eax,[ebp+8]        ;  pPixCb
+  mov         ecx,[ebp+0Ch]      ;  pPixCr
+  movq        xmm4,[ecx] ; q0 row of Cr (8 bytes)
+  movq        xmm5,[edx+ecx] ; q1 row of Cr (pPixCr + iStride)
+  push        esi  
+  push        edi  
+  lea         esi,[edx+edx] ; 2*iStride
+  mov         edi,eax 
+  sub         edi,esi 
+  movq        xmm1,[edi] ; p1 row of Cb (pPixCb - 2*iStride)
+  mov         edi,ecx 
+  sub         edi,esi 
+  movq        xmm2,[edi] ; p1 row of Cr
+  punpcklqdq  xmm1,xmm2 ; xmm1 = p1: Cb in the low 8 bytes, Cr in the high 8 bytes (same layout for all rows)
+  mov         esi,eax 
+  sub         esi,edx ; esi = pPixCb - iStride, kept for the p0 store at the end
+  movq        xmm2,[esi] 
+  mov         edi,ecx 
+  sub         edi,edx ; edi = pPixCr - iStride, kept for the p0 store at the end
+  movq        xmm3,[edi] 
+  punpcklqdq  xmm2,xmm3 ; xmm2 = p0 (Cb | Cr)
+  movq        xmm3,[eax] 
+  punpcklqdq  xmm3,xmm4 ; xmm3 = q0 (Cb | Cr)
+  movq        xmm4,[edx+eax] 
+  mov       edx, [ebp + 14h] ; iAlpha
+  punpcklqdq  xmm4,xmm5 ; xmm4 = q1 (Cb | Cr)
+  movd        xmm5,edx 
+  mov       edx, [ebp + 18h] ; iBeta
+  pxor        xmm0,xmm0 ; zero, used to widen bytes to words
+  movdqa      xmm6,xmm5 
+  punpcklwd   xmm6,xmm5 
+  pshufd      xmm5,xmm6,0 ; xmm5 = low 16 bits of iAlpha in all 8 word lanes
+  movd        xmm6,edx 
+  movdqa      xmm7,xmm6 
+  punpcklwd   xmm7,xmm6 
+  pshufd      xmm6,xmm7,0 ; xmm6 = low 16 bits of iBeta in all 8 word lanes
+  movdqa      xmm7,xmm1 
+  punpckhbw   xmm1,xmm0 
+  punpcklbw   xmm7,xmm0 
+  movdqa      [esp+40h],xmm1 ; [esp+40h] = p1 Cr as words
+  movdqa      [esp+60h],xmm7 ; [esp+60h] = p1 Cb as words
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm7,xmm0 
+  movdqa      [esp+10h],xmm7 ; [esp+10h] = p0 Cb as words (slot is reused for the constant 2 later)
+  movdqa      xmm7,xmm3 
+  punpcklbw   xmm7,xmm0 
+  punpckhbw   xmm3,xmm0 
+  movdqa      [esp+50h],xmm7 ; [esp+50h] = q0 Cb as words
+  movdqa      xmm7,xmm4 
+  punpckhbw   xmm4,xmm0 
+  punpckhbw   xmm2,xmm0 ; xmm2 = p0 Cr as words
+  punpcklbw   xmm7,xmm0 ; xmm7 = q1 Cb as words
+  movdqa      [esp+30h],xmm3 ; [esp+30h] = q0 Cr as words
+  movdqa      xmm3,[esp+10h] ; xmm3 = p0 Cb
+  movdqa      xmm1,xmm3 
+  psubw       xmm1,[esp+50h] 
+  pabsw       xmm1,xmm1 ; |p0 - q0| (Cb). NOTE(review): pabsw is an SSSE3 instruction although this routine is named _sse2
+  movdqa      [esp+20h],xmm4 ; [esp+20h] = q1 Cr as words
+  movdqa      xmm0,xmm5 
+  pcmpgtw     xmm0,xmm1 ; Cb mask: alpha > |p0 - q0|
+  movdqa      xmm1,[esp+60h] 
+  psubw       xmm1,xmm3 
+  pabsw       xmm1,xmm1 ; |p1 - p0| (Cb)
+  movdqa      xmm4,xmm6 
+  pcmpgtw     xmm4,xmm1 ; beta > |p1 - p0|
+  pand        xmm0,xmm4 
+  movdqa      xmm1,xmm7 
+  psubw       xmm1,[esp+50h] 
+  pabsw       xmm1,xmm1 ; |q1 - q0| (Cb)
+  movdqa      xmm4,xmm6 
+  pcmpgtw     xmm4,xmm1 ; beta > |q1 - q0|
+  movdqa      xmm1,xmm2 
+  psubw       xmm1,[esp+30h] 
+  pabsw       xmm1,xmm1 ; |p0 - q0| (Cr)
+  pcmpgtw     xmm5,xmm1 ; Cr mask: alpha > |p0 - q0|
+  movdqa      xmm1,[esp+40h] 
+  pand        xmm0,xmm4 ; xmm0 = final Cb filter mask
+  psubw       xmm1,xmm2 
+  pabsw       xmm1,xmm1 ; |p1 - p0| (Cr)
+  movdqa      xmm4,xmm6 
+  pcmpgtw     xmm4,xmm1 
+  movdqa      xmm1,[esp+20h] 
+  psubw       xmm1,[esp+30h] 
+  pand        xmm5,xmm4 
+  pabsw       xmm1,xmm1 ; |q1 - q0| (Cr)
+  pcmpgtw     xmm6,xmm1 
+  pand        xmm5,xmm6 ; xmm5 = final Cr filter mask
+  mov         edx,2 
+  movsx       edx,dx 
+  movd        xmm1,edx 
+  movdqa      xmm4,xmm1 
+  punpcklwd   xmm4,xmm1 
+  pshufd      xmm1,xmm4,0 ; xmm1 = rounding constant 2 in every word lane
+  movdqa      xmm4,[esp+60h] 
+  movdqa      xmm6,xmm4 
+  paddw       xmm6,xmm4 ; 2*p1 (Cb)
+  paddw       xmm6,xmm3 ; + p0
+  paddw       xmm6,xmm7 ; + q1
+  movdqa      [esp+10h],xmm1 ; [esp+10h] now holds the constant 2
+  paddw       xmm6,[esp+10h] 
+  psraw       xmm6,2 ; filtered p0 (Cb) = (2*p1 + p0 + q1 + 2) >> 2
+  movdqa      xmm4,xmm0 
+  pandn       xmm4,xmm3 ; original p0 where the mask is clear
+  movdqa      xmm3,[esp+40h] 
+  movdqa      xmm1,xmm0 
+  pand        xmm1,xmm6 
+  por         xmm1,xmm4 ; xmm1 = blended p0 (Cb)
+  movdqa      xmm6,xmm3 
+  paddw       xmm6,xmm3 ; 2*p1 (Cr)
+  movdqa      xmm3,[esp+10h] ; xmm3 = constant 2
+  paddw       xmm6,xmm2 
+  paddw       xmm6,[esp+20h] 
+  paddw       xmm6,xmm3 
+  psraw       xmm6,2 ; filtered p0 (Cr)
+  movdqa      xmm4,xmm5 
+  pand        xmm4,xmm6 
+  movdqa      xmm6,xmm5 
+  pandn       xmm6,xmm2 
+  por         xmm4,xmm6 ; xmm4 = blended p0 (Cr)
+  packuswb    xmm1,xmm4 ; xmm1 = new p0 row bytes (Cb low, Cr high)
+  movdqa      xmm4,[esp+50h] 
+  movdqa      xmm6,xmm7 
+  paddw       xmm6,xmm7 ; 2*q1 (Cb)
+  paddw       xmm6,xmm4 ; + q0
+  paddw       xmm6,[esp+60h] ; + p1
+  paddw       xmm6,xmm3 
+  psraw       xmm6,2 ; filtered q0 (Cb) = (2*q1 + q0 + p1 + 2) >> 2
+  movdqa      xmm2,xmm0 
+  pand        xmm2,xmm6 
+  pandn       xmm0,xmm4 
+  por         xmm2,xmm0 ; xmm2 = blended q0 (Cb)
+  movdqa      xmm0,[esp+20h] 
+  movdqa      xmm6,xmm0 
+  paddw       xmm6,xmm0 ; 2*q1 (Cr)
+  movdqa      xmm0,[esp+30h] 
+  paddw       xmm6,xmm0 
+  paddw       xmm6,[esp+40h] 
+  movdqa      xmm4,xmm5 
+  paddw       xmm6,xmm3 
+  movq        [esi],xmm1 ; store new p0 for Cb at pPixCb - iStride
+  psraw       xmm6,2 ; filtered q0 (Cr)
+  pand        xmm4,xmm6 
+  pandn       xmm5,xmm0 
+  por         xmm4,xmm5 
+  packuswb    xmm2,xmm4 ; xmm2 = new q0 row bytes (Cb low, Cr high)
+  movq        [eax],xmm2 ; store new q0 for Cb at pPixCb
+  psrldq      xmm1,8 
+  movq        [edi],xmm1 ; store new p0 for Cr at pPixCr - iStride
+  pop         edi  
+  psrldq      xmm2,8 
+  movq        [ecx],xmm2 ; store new q0 for Cr at pPixCr
+  pop         esi  
+  mov         esp,ebp 
+  pop         ebp  
+  ret              
+
+;******************************************************************************
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+
+WELS_EXTERN  DeblockChromaLt4V_sse2	; filters 8 Cb and 8 Cr pixels across a horizontal edge: p0 += delta, q0 -= delta, with delta clipped to [-tc, tc] per pixel pair. NOTE(review): no ALIGN 16 precedes this label, unlike the other routines in this file
+
+DeblockChromaLt4V_sse2:	; stack args: [ebp+8] pPixCb, [ebp+0Ch] pPixCr, [ebp+10h] iStride, [ebp+14h] iAlpha, [ebp+18h] iBeta, [ebp+1Ch] pTC
+  push        ebp  
+  mov         ebp,esp 
+  and         esp,0FFFFFFF0h ; align the frame to 16 bytes for the movdqa spills
+  sub         esp,0E4h ; 0E4h + three pushes (12 bytes) = 0F0h, keeping esp 16-byte aligned
+  push        ebx  
+  push        esi  
+  mov         esi, [ebp+1Ch]      ;  pTC
+  movsx       ebx, byte [esi+2] ; pTC[2], sign-extended
+  push        edi  
+  movsx       di,byte [esi+3] ; pTC[3]
+  mov         word [esp+0Ch],bx ; spill pTC[2]
+  movsx       bx,byte  [esi+1] ; pTC[1]
+  movsx       esi,byte  [esi] ; pTC[0]
+  mov         word  [esp+0Eh],si ; spill pTC[0]
+  movzx       esi,di 
+  movd        xmm1,esi 
+  movzx       esi,di 
+  movd        xmm2,esi 
+  mov         si,word  [esp+0Ch] 
+  mov         edx, [ebp + 10h] ; iStride
+  mov         eax, [ebp + 08h] ; pPixCb
+  movzx       edi,si 
+  movzx       esi,si 
+  mov         ecx, [ebp + 0Ch] ; pPixCr
+  movd        xmm4,esi 
+  movzx       esi,bx 
+  movd        xmm5,esi 
+  movd        xmm3,edi 
+  movzx       esi,bx 
+  movd        xmm6,esi 
+  mov         si,word [esp+0Eh] 
+  movzx       edi,si 
+  movzx       esi,si 
+  punpcklwd   xmm6,xmm2 
+  pxor        xmm0,xmm0 
+  movdqa      [esp+40h],xmm0 ; [esp+40h] = 0 (reloaded into xmm1 below)
+  movd        xmm7,edi 
+  movd        xmm0,esi 
+  lea         esi,[edx+edx] ; 2*iStride
+  mov         edi,eax 
+  sub         edi,esi 
+  punpcklwd   xmm5,xmm1 
+  movdqa      xmm1,[esp+40h] ; xmm1 = 0, used to widen bytes to words
+  punpcklwd   xmm0,xmm4 
+  movq        xmm4,[edx+ecx] ; q1 row of Cr
+  punpcklwd   xmm7,xmm3 
+  movq        xmm3,[eax] ; q0 row of Cb
+  punpcklwd   xmm0,xmm6 
+  movq        xmm6,[edi] ; p1 row of Cb (pPixCb - 2*iStride)
+  punpcklwd   xmm7,xmm5 
+  punpcklwd   xmm0,xmm7 ; xmm0 = tc words {tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3}
+  mov         edi,ecx 
+  sub         edi,esi 
+  movdqa      xmm2,xmm1 
+  psubw       xmm2,xmm0 
+  movdqa      [esp+60h],xmm2 ; [esp+60h] = -tc
+  movq        xmm2, [edi] ; p1 row of Cr
+  punpcklqdq  xmm6,xmm2 ; xmm6 = p1 (Cb in the low 8 bytes, Cr in the high 8 bytes)
+  mov         esi,eax 
+  sub         esi,edx ; esi = pPixCb - iStride, kept for the p0 store
+  movq        xmm7,[esi] 
+  mov         edi,ecx 
+  sub         edi,edx ; edi = pPixCr - iStride, kept for the p0 store
+  movq        xmm2,[edi] 
+  punpcklqdq  xmm7,xmm2 ; xmm7 = p0 (Cb | Cr)
+  movq        xmm2,[ecx] 
+  punpcklqdq  xmm3,xmm2 ; xmm3 = q0 (Cb | Cr)
+  movq        xmm2,[edx+eax] 
+  movsx       edx,word [ebp + 14h] ; low 16 bits of iAlpha, sign-extended
+  punpcklqdq  xmm2,xmm4 ; xmm2 = q1 (Cb | Cr)
+  movdqa      [esp+0E0h],xmm2 ; [esp+0E0h] = q1 bytes
+  movd        xmm2,edx 
+  movsx       edx,word [ebp + 18h] ; low 16 bits of iBeta, sign-extended
+  movdqa      xmm4,xmm2 
+  punpcklwd   xmm4,xmm2 
+  movd        xmm2,edx 
+  movdqa      xmm5,xmm2 
+  punpcklwd   xmm5,xmm2 
+  pshufd      xmm2,xmm5,0 
+  movdqa      [esp+50h],xmm2 ; [esp+50h] = beta in every word lane
+  movdqa      xmm2,xmm6 
+  punpcklbw   xmm2,xmm1 
+  movdqa      [esp+0D0h],xmm3 ; [esp+0D0h] = q0 bytes
+  pshufd      xmm4,xmm4,0 ; xmm4 = alpha in every word lane
+  movdqa      [esp+30h],xmm2 ; [esp+30h] = p1 Cb as words
+  punpckhbw   xmm6,xmm1 
+  movdqa      [esp+80h],xmm6 ; [esp+80h] = p1 Cr as words
+  movdqa      xmm6,[esp+0D0h] 
+  punpckhbw   xmm6,xmm1 
+  movdqa      [esp+70h],xmm6 ; [esp+70h] = q0 Cr as words
+  movdqa      xmm6, [esp+0E0h] 
+  punpckhbw   xmm6,xmm1 
+  movdqa     [esp+90h],xmm6 ; [esp+90h] = q1 Cr as words
+  movdqa      xmm5, [esp+0E0h] 
+  movdqa      xmm2,xmm7 
+  punpckhbw   xmm7,xmm1 
+  punpcklbw   xmm5,xmm1 ; xmm5 = q1 Cb as words
+  movdqa       [esp+0A0h],xmm7 ; [esp+0A0h] = p0 Cr as words
+  punpcklbw   xmm3,xmm1 ; xmm3 = q0 Cb as words
+  mov         edx,4 
+  punpcklbw   xmm2,xmm1 ; xmm2 = p0 Cb as words
+  movsx       edx,dx 
+  movd        xmm6,edx 
+  movdqa      xmm7,xmm6 
+  punpcklwd   xmm7,xmm6 
+  pshufd      xmm6,xmm7,0 
+  movdqa      xmm7,[esp+30h] 
+  movdqa      [esp+20h],xmm6 ; [esp+20h] = rounding constant 4 in every word lane
+  psubw       xmm7,xmm5 ; p1 - q1 (Cb)
+  movdqa      xmm6,xmm0 
+  pcmpgtw     xmm6,xmm1 ; lanes where tc > 0
+  movdqa      xmm1,[esp+60h] ; xmm1 = -tc
+  movdqa      [esp+40h],xmm6 ; [esp+40h] = (tc > 0) mask, shared by Cb and Cr
+  movdqa      xmm6,xmm3 
+  psubw       xmm6,xmm2 
+  psllw       xmm6,2 ; (q0 - p0) << 2
+  paddw       xmm6,xmm7 
+  paddw       xmm6, [esp+20h] 
+  movdqa      xmm7, [esp+50h] 
+  psraw       xmm6,3 ; delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3
+  pmaxsw      xmm1,xmm6 ; max(-tc, delta)
+  movdqa      [esp+10h],xmm0 
+  movdqa      xmm6, [esp+10h] 
+  pminsw      xmm6,xmm1 ; min(tc, ...): delta clipped to [-tc, tc]
+  movdqa      [esp+10h],xmm6 ; [esp+10h] = clipped Cb delta
+  movdqa      xmm1,xmm2 
+  psubw       xmm1,xmm3 
+  pabsw       xmm1,xmm1 ; |p0 - q0| (Cb). NOTE(review): pabsw is an SSSE3 instruction although this routine is named _sse2
+  movdqa      xmm6,xmm4 
+  pcmpgtw     xmm6,xmm1 ; alpha > |p0 - q0|
+  movdqa      xmm1, [esp+30h] 
+  psubw       xmm1,xmm2 
+  pabsw       xmm1,xmm1 ; |p1 - p0|
+  pcmpgtw     xmm7,xmm1 ; beta > |p1 - p0|
+  movdqa      xmm1,[esp+50h] 
+  pand        xmm6,xmm7 
+  movdqa      xmm7,[esp+50h] 
+  psubw       xmm5,xmm3 
+  pabsw       xmm5,xmm5 ; |q1 - q0|
+  pcmpgtw     xmm1,xmm5 ; beta > |q1 - q0|
+  movdqa      xmm5,[esp+80h] 
+  psubw       xmm5,[esp+90h] ; p1 - q1 (Cr)
+  pand        xmm6,xmm1 
+  pand        xmm6,[esp+40h] ; xmm6 = final Cb filter mask (including tc > 0)
+  movdqa      xmm1,[esp+10h] 
+  pand        xmm1,xmm6 ; zero the delta in lanes that are not filtered
+  movdqa      xmm6,[esp+70h] 
+  movdqa      [esp+30h],xmm1 ; [esp+30h] = masked Cb delta
+  movdqa      xmm1,[esp+0A0h] ; xmm1 = p0 Cr
+  psubw       xmm6,xmm1 
+  psllw       xmm6,2 
+  paddw       xmm6,xmm5 
+  paddw       xmm6,[esp+20h] 
+  movdqa      xmm5,[esp+60h] 
+  psraw       xmm6,3 ; Cr delta, same formula as Cb
+  pmaxsw      xmm5,xmm6 
+  pminsw      xmm0,xmm5 ; xmm0 = Cr delta clipped to [-tc, tc]
+  movdqa      xmm5,[esp+70h] ; xmm5 = q0 Cr
+  movdqa      xmm6,xmm1 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm4,xmm6 ; alpha > |p0 - q0| (Cr)
+  movdqa      xmm6,[esp+80h] 
+  psubw       xmm6,xmm1 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm7,xmm6 ; beta > |p1 - p0| (Cr)
+  movdqa      xmm6,[esp+90h] 
+  pand        xmm4,xmm7 
+  movdqa      xmm7,[esp+50h] 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm7,xmm6 ; beta > |q1 - q0| (Cr)
+  pand        xmm4,xmm7 
+  pand        xmm4,[esp+40h] 
+  pand        xmm0,xmm4 ; xmm0 = masked Cr delta
+  movdqa      xmm4,[esp+30h] 
+  paddw       xmm2,xmm4 ; p0 + delta (Cb)
+  paddw       xmm1,xmm0 ; p0 + delta (Cr)
+  packuswb    xmm2,xmm1 ; saturate to bytes: new p0 row (Cb low, Cr high)
+  movq        [esi],xmm2 ; store new p0 for Cb at pPixCb - iStride
+  psubw       xmm3,xmm4 ; q0 - delta (Cb)
+  psubw       xmm5,xmm0 ; q0 - delta (Cr)
+  packuswb    xmm3,xmm5 ; new q0 row (Cb low, Cr high)
+  movq        [eax],xmm3 ; store new q0 for Cb at pPixCb
+  psrldq      xmm2,8 
+  movq        [edi],xmm2 ; store new p0 for Cr at pPixCr - iStride
+  pop         edi  
+  pop         esi  
+  psrldq      xmm3,8 
+  movq        [ecx],xmm3 ; store new q0 for Cr at pPixCr
+  pop         ebx  
+  mov         esp,ebp 
+  pop         ebp  
+  ret    
+  
+;***************************************************************************
+;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+;          int32_t iAlpha, int32_t iBeta)
+;***************************************************************************
+
+WELS_EXTERN     DeblockChromaEq4H_sse2	; filters 8 Cb and 8 Cr rows across a vertical edge: reads 4 bytes per row starting at pPix-2, transposes them into p1/p0/q0/q1 columns, averages p0/q0 where the alpha/beta tests pass, then transposes back and rewrites all 4 bytes of each row
+
+ALIGN  16
+  
+DeblockChromaEq4H_sse2:	; stack args: [ebp+8] pPixCb, [ebp+0Ch] pPixCr, [ebp+10h] iStride, [ebp+14h] iAlpha, [ebp+18h] iBeta
+  push        ebp  
+  mov         ebp,esp 
+  and         esp,0FFFFFFF0h ; align the frame to 16 bytes for the movdqa spills
+  sub         esp,0C8h  ; 0C8h + two pushes (8 bytes) = 0D0h, keeping esp 16-byte aligned once both registers are pushed
+  mov         ecx,dword [ebp+8] 
+  mov         edx,dword [ebp+0Ch] 
+  mov         eax,dword [ebp+10h] 
+  sub         ecx,2 ; pPixCb - 2: first of the 4 bytes straddling the edge
+  sub         edx,2 ; pPixCr - 2
+  push        esi  
+  lea         esi,[eax+eax*2] ; 3*iStride
+  mov         dword [esp+18h],ecx ; save pPixCb-2 (this slot becomes [esp+1Ch] after push edi)
+  mov         dword [esp+4],edx ; save pPixCr-2 (becomes [esp+8] after push edi)
+  lea         ecx,[ecx+eax*4] ; Cb row 4 base
+  lea         edx,[edx+eax*4] ; Cr row 4 base
+  lea         eax,[esp+7Ch] ; 64-byte transpose buffer (esp+80h after push edi, 16-byte aligned)
+  push        edi  
+  mov         dword [esp+14h],esi ; [esp+14h] = 3*iStride
+  mov         dword [esp+18h],ecx ; [esp+18h] = Cb row 4 base
+  mov         dword [esp+0Ch],edx ; [esp+0Ch] = Cr row 4 base
+  mov         dword [esp+10h],eax ; [esp+10h] = transpose buffer pointer
+  mov         esi,dword [esp+1Ch] 
+  mov         ecx,dword [ebp+10h] 
+  mov         edx,dword [esp+14h] 
+  movd        xmm0,dword [esi] ; Cb rows 0..3, 4 bytes each
+  movd        xmm1,dword [esi+ecx] 
+  movd        xmm2,dword [esi+ecx*2] 
+  movd        xmm3,dword [esi+edx] 
+  mov         esi,dword  [esp+8] 
+  movd        xmm4,dword [esi] ; Cr rows 0..3
+  movd        xmm5,dword [esi+ecx] 
+  movd        xmm6,dword [esi+ecx*2] 
+  movd        xmm7,dword [esi+edx] 
+  punpckldq   xmm0,xmm4 
+  punpckldq   xmm1,xmm5 
+  punpckldq   xmm2,xmm6 
+  punpckldq   xmm3,xmm7 
+  mov         esi,dword [esp+18h] 
+  mov         edi,dword [esp+0Ch] 
+  movd        xmm4,dword [esi] ; Cb row 4
+  movd        xmm5,dword [edi] ; Cr row 4
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm0,xmm4 ; xmm0 = {Cb row0, Cr row0, Cb row4, Cr row4}
+  movd        xmm4,dword [esi+ecx] 
+  movd        xmm5,dword [edi+ecx] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm1,xmm4 ; rows 1 and 5
+  movd        xmm4,dword [esi+ecx*2] 
+  movd        xmm5,dword [edi+ecx*2] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm2,xmm4 ; rows 2 and 6
+  movd        xmm4,dword [esi+edx] 
+  movd        xmm5,dword [edi+edx] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm3,xmm4 ; rows 3 and 7
+  movdqa      xmm6,xmm0 ; byte transpose of the 16 rows x 4 columns starts here
+  punpcklbw   xmm0,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm2,xmm3 
+  punpckhbw   xmm7,xmm3 
+  movdqa      xmm4,xmm0 
+  movdqa      xmm5,xmm6 
+  punpcklwd   xmm0,xmm2 
+  punpckhwd   xmm4,xmm2 
+  punpcklwd   xmm6,xmm7 
+  punpckhwd   xmm5,xmm7 
+  movdqa      xmm1,xmm0 
+  movdqa      xmm2,xmm4 
+  punpckldq   xmm0,xmm6 
+  punpckhdq   xmm1,xmm6 
+  punpckldq   xmm4,xmm5 
+  punpckhdq   xmm2,xmm5 
+  movdqa      xmm5,xmm0 
+  movdqa      xmm6,xmm1 
+  punpcklqdq  xmm0,xmm4 
+  punpckhqdq  xmm5,xmm4 
+  punpcklqdq  xmm1,xmm2 
+  punpckhqdq  xmm6,xmm2 
+  mov         edi,dword [esp+10h] 
+  movdqa      [edi],xmm0 ; buffer+00h = column 0 (p1): 8 Cb rows in the low bytes, 8 Cr rows in the high bytes
+  movdqa      [edi+10h],xmm5 ; buffer+10h = column 1 (p0)
+  movdqa      [edi+20h],xmm1 ; buffer+20h = column 2 (q0)
+  movdqa      [edi+30h],xmm6 ; buffer+30h = column 3 (q1)
+  movsx       ecx,word [ebp+14h] ; low 16 bits of iAlpha, sign-extended
+  movsx       edx,word [ebp+18h] ; low 16 bits of iBeta, sign-extended
+  movdqa      xmm6,[esp+80h] ; p1 bytes
+  movdqa      xmm4,[esp+90h] ; p0 bytes
+  movdqa      xmm5,[esp+0A0h] ; q0 bytes
+  movdqa      xmm7,[esp+0B0h] ; q1 bytes
+  pxor        xmm0,xmm0 ; zero, used to widen bytes to words
+  movd        xmm1,ecx 
+  movdqa      xmm2,xmm1 
+  punpcklwd   xmm2,xmm1 
+  pshufd      xmm1,xmm2,0 ; xmm1 = alpha in every word lane
+  movd        xmm2,edx 
+  movdqa      xmm3,xmm2 
+  punpcklwd   xmm3,xmm2 
+  pshufd      xmm2,xmm3,0 ; xmm2 = beta in every word lane
+  movdqa      xmm3,xmm6 
+  punpckhbw   xmm6,xmm0 
+  movdqa      [esp+60h],xmm6 ; [esp+60h] = p1 Cr as words
+  movdqa      xmm6,[esp+90h] 
+  punpckhbw   xmm6,xmm0 
+  movdqa      [esp+30h],xmm6 ; [esp+30h] = p0 Cr as words
+  movdqa      xmm6,[esp+0A0h] 
+  punpckhbw   xmm6,xmm0 
+  movdqa      [esp+40h],xmm6 ; [esp+40h] = q0 Cr as words
+  movdqa      xmm6,[esp+0B0h] 
+  punpckhbw   xmm6,xmm0 
+  movdqa      [esp+70h],xmm6 ; [esp+70h] = q1 Cr as words
+  punpcklbw   xmm7,xmm0 ; xmm7 = q1 Cb as words
+  punpcklbw   xmm4,xmm0 ; xmm4 = p0 Cb as words
+  punpcklbw   xmm5,xmm0 ; xmm5 = q0 Cb as words
+  punpcklbw   xmm3,xmm0 ; xmm3 = p1 Cb as words
+  movdqa      [esp+50h],xmm7 ; [esp+50h] = q1 Cb as words
+  movdqa      xmm6,xmm4 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6 ; |p0 - q0| (Cb). NOTE(review): pabsw is an SSSE3 instruction although this routine is named _sse2
+  movdqa      xmm0,xmm1 
+  pcmpgtw     xmm0,xmm6 ; Cb mask: alpha > |p0 - q0|
+  movdqa      xmm6,xmm3 
+  psubw       xmm6,xmm4 
+  pabsw       xmm6,xmm6 ; |p1 - p0| (Cb)
+  movdqa      xmm7,xmm2 
+  pcmpgtw     xmm7,xmm6 
+  movdqa      xmm6,[esp+50h] 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6 ; |q1 - q0| (Cb)
+  pand        xmm0,xmm7 
+  movdqa      xmm7,xmm2 
+  pcmpgtw     xmm7,xmm6 
+  movdqa      xmm6,[esp+30h] 
+  psubw       xmm6,[esp+40h] 
+  pabsw       xmm6,xmm6 ; |p0 - q0| (Cr)
+  pcmpgtw     xmm1,xmm6 ; Cr mask: alpha > |p0 - q0|
+  movdqa      xmm6,[esp+60h] 
+  psubw       xmm6,[esp+30h] 
+  pabsw       xmm6,xmm6 ; |p1 - p0| (Cr)
+  pand        xmm0,xmm7 ; xmm0 = final Cb filter mask
+  movdqa      xmm7,xmm2 
+  pcmpgtw     xmm7,xmm6 
+  movdqa      xmm6,[esp+70h] 
+  psubw       xmm6,[esp+40h] 
+  pabsw       xmm6,xmm6 ; |q1 - q0| (Cr)
+  pand        xmm1,xmm7 
+  pcmpgtw     xmm2,xmm6 
+  pand        xmm1,xmm2 ; xmm1 = final Cr filter mask
+  mov         eax,2 
+  movsx       ecx,ax 
+  movd        xmm2,ecx 
+  movdqa      xmm6,xmm2 
+  punpcklwd   xmm6,xmm2 
+  pshufd      xmm2,xmm6,0 
+  movdqa      [esp+20h],xmm2 ; [esp+20h] = rounding constant 2 in every word lane
+  movdqa      xmm2,xmm3 
+  paddw       xmm2,xmm3 ; 2*p1 (Cb)
+  paddw       xmm2,xmm4 ; + p0
+  paddw       xmm2,[esp+50h] ; + q1
+  paddw       xmm2,[esp+20h] 
+  psraw       xmm2,2 ; filtered p0 (Cb) = (2*p1 + p0 + q1 + 2) >> 2
+  movdqa      xmm6,xmm0 
+  pand        xmm6,xmm2 
+  movdqa      xmm2,xmm0 
+  pandn       xmm2,xmm4 
+  por         xmm6,xmm2 ; xmm6 = blended p0 (Cb)
+  movdqa      xmm2,[esp+60h] 
+  movdqa      xmm7,xmm2 
+  paddw       xmm7,xmm2 ; 2*p1 (Cr)
+  paddw       xmm7,[esp+30h] 
+  paddw       xmm7,[esp+70h] 
+  paddw       xmm7,[esp+20h] 
+  movdqa      xmm4,xmm1 
+  movdqa      xmm2,xmm1 
+  pandn       xmm2,[esp+30h] 
+  psraw       xmm7,2 ; filtered p0 (Cr)
+  pand        xmm4,xmm7 
+  por         xmm4,xmm2 ; xmm4 = blended p0 (Cr)
+  movdqa      xmm2,[esp+50h] 
+  packuswb    xmm6,xmm4 
+  movdqa      [esp+90h],xmm6 ; overwrite the p0 column in the transpose buffer
+  movdqa      xmm6,xmm2 
+  paddw       xmm6,xmm2 ; 2*q1 (Cb)
+  movdqa      xmm2,[esp+20h] 
+  paddw       xmm6,xmm5 ; + q0
+  paddw       xmm6,xmm3 ; + p1
+  movdqa      xmm4,xmm0 
+  pandn       xmm0,xmm5 
+  paddw       xmm6,xmm2 
+  psraw       xmm6,2 ; filtered q0 (Cb) = (2*q1 + q0 + p1 + 2) >> 2
+  pand        xmm4,xmm6 
+  por         xmm4,xmm0 ; xmm4 = blended q0 (Cb)
+  movdqa      xmm0,[esp+70h] 
+  movdqa      xmm5,xmm0 
+  paddw       xmm5,xmm0 ; 2*q1 (Cr)
+  movdqa      xmm0,[esp+40h] 
+  paddw       xmm5,xmm0 
+  paddw       xmm5,[esp+60h] 
+  movdqa      xmm3,xmm1 
+  paddw       xmm5,xmm2 
+  psraw       xmm5,2 ; filtered q0 (Cr)
+  pand        xmm3,xmm5 
+  pandn       xmm1,xmm0 
+  por         xmm3,xmm1 ; xmm3 = blended q0 (Cr)
+  packuswb    xmm4,xmm3 
+  movdqa      [esp+0A0h],xmm4 ; overwrite the q0 column in the transpose buffer
+  mov         esi,dword [esp+10h] 
+  movdqa      xmm0,[esi] ; reload the four columns (p1 and q1 unchanged) for the inverse transpose
+  movdqa      xmm1,[esi+10h] 
+  movdqa      xmm2,[esi+20h] 
+  movdqa      xmm3,[esi+30h] 
+  movdqa      xmm6,xmm0 
+  punpcklbw   xmm0,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm2,xmm3 
+  punpckhbw   xmm7,xmm3 
+  movdqa      xmm4,xmm0 
+  movdqa      xmm5,xmm6 
+  punpcklwd   xmm0,xmm2 
+  punpckhwd   xmm4,xmm2 
+  punpcklwd   xmm6,xmm7 
+  punpckhwd   xmm5,xmm7 
+  movdqa      xmm1,xmm0 
+  movdqa      xmm2,xmm4 
+  punpckldq   xmm0,xmm6 
+  punpckhdq   xmm1,xmm6 
+  punpckldq   xmm4,xmm5 
+  punpckhdq   xmm2,xmm5 
+  movdqa      xmm5,xmm0 
+  movdqa      xmm6,xmm1 
+  punpcklqdq  xmm0,xmm4 ; xmm0 = {Cb row0, Cr row0, Cb row4, Cr row4}
+  punpckhqdq  xmm5,xmm4 ; xmm5 = rows 1 and 5
+  punpcklqdq  xmm1,xmm2 ; xmm1 = rows 2 and 6
+  punpckhqdq  xmm6,xmm2 ; xmm6 = rows 3 and 7
+  mov         esi,dword [esp+1Ch] ; pPixCb - 2
+  mov         ecx,dword [ebp+10h] ; iStride
+  mov         edx,dword [esp+14h] ; 3*iStride
+  mov         edi,dword [esp+8] ; pPixCr - 2
+  movd        dword [esi],xmm0 ; write back Cb rows 0..3
+  movd        dword [esi+ecx],xmm5 
+  movd        dword [esi+ecx*2],xmm1 
+  movd        dword [esi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  mov         esi,dword [esp+18h] ; Cb row 4 base
+  movd        dword [edi],xmm0 ; write back Cr rows 0..3
+  movd        dword [edi+ecx],xmm5 
+  movd        dword [edi+ecx*2],xmm1 
+  movd        dword [edi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  movd        dword [esi],xmm0 ; write back Cb rows 4..7
+  movd        dword [esi+ecx],xmm5 
+  movd        dword [esi+ecx*2],xmm1 
+  movd        dword [esi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  mov         edi,dword [esp+0Ch] ; Cr row 4 base
+  movd        dword [edi],xmm0 ; write back Cr rows 4..7
+  movd        dword [edi+ecx],xmm5 
+  movd        dword [edi+ecx*2],xmm1 
+  movd        dword [edi+edx],xmm6 
+  pop         edi  
+  pop         esi  
+  mov         esp,ebp 
+  pop         ebp  
+  ret              
+  
+;*******************************************************************************
+;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+  
+WELS_EXTERN  DeblockChromaLt4H_sse2	; filters 8 Cb and 8 Cr rows across a vertical edge: reads 4 bytes per row starting at pPix-2, transposes them into p1/p0/q0/q1 columns, applies p0 += delta / q0 -= delta with delta clipped to [-tc, tc], then transposes back and rewrites all 4 bytes of each row
+  
+ALIGN  16
+
+DeblockChromaLt4H_sse2:	; stack args: [ebp+8] pPixCb, [ebp+0Ch] pPixCr, [ebp+10h] iStride, [ebp+14h] iAlpha, [ebp+18h] iBeta, [ebp+1Ch] pTC
+  push        ebp  
+  mov         ebp,esp 
+  and         esp,0FFFFFFF0h ; align the frame to 16 bytes for the movdqa spills
+  sub         esp,108h   ; 108h + two pushes (8 bytes) = 110h, keeping esp 16-byte aligned once both registers are pushed
+  mov         ecx,dword [ebp+8] 
+  mov         edx,dword [ebp+0Ch] 
+  mov         eax,dword [ebp+10h] 
+  sub         ecx,2 ; pPixCb - 2: first of the 4 bytes straddling the edge
+  sub         edx,2 ; pPixCr - 2
+  push        esi  
+  lea         esi,[eax+eax*2] ; 3*iStride
+  mov         dword [esp+10h],ecx ; save pPixCb-2 (this slot becomes [esp+14h] after push edi)
+  mov         dword [esp+4],edx ; save pPixCr-2 (becomes [esp+8] after push edi)
+  lea         ecx,[ecx+eax*4] ; Cb row 4 base
+  lea         edx,[edx+eax*4] ; Cr row 4 base
+  lea         eax,[esp+6Ch] ; 64-byte transpose buffer (esp+70h after push edi, 16-byte aligned)
+  push        edi  
+  mov         dword [esp+0Ch],esi ; [esp+0Ch] = 3*iStride
+  mov         dword [esp+18h],ecx ; [esp+18h] = Cb row 4 base
+  mov         dword [esp+10h],edx ; [esp+10h] = Cr row 4 base
+  mov         dword [esp+1Ch],eax ; [esp+1Ch] = transpose buffer pointer
+  mov         esi,dword [esp+14h] 
+  mov         ecx,dword [ebp+10h] 
+  mov         edx,dword [esp+0Ch] 
+  movd        xmm0,dword [esi] ; Cb rows 0..3, 4 bytes each
+  movd        xmm1,dword [esi+ecx] 
+  movd        xmm2,dword [esi+ecx*2] 
+  movd        xmm3,dword [esi+edx] 
+  mov         esi,dword [esp+8] 
+  movd        xmm4,dword [esi] ; Cr rows 0..3
+  movd        xmm5,dword [esi+ecx] 
+  movd        xmm6,dword [esi+ecx*2] 
+  movd        xmm7,dword [esi+edx] 
+  punpckldq   xmm0,xmm4 
+  punpckldq   xmm1,xmm5 
+  punpckldq   xmm2,xmm6 
+  punpckldq   xmm3,xmm7 
+  mov         esi,dword [esp+18h] 
+  mov         edi,dword [esp+10h] 
+  movd        xmm4,dword [esi] ; Cb row 4
+  movd        xmm5,dword [edi] ; Cr row 4
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm0,xmm4 ; xmm0 = {Cb row0, Cr row0, Cb row4, Cr row4}
+  movd        xmm4,dword [esi+ecx] 
+  movd        xmm5,dword [edi+ecx] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm1,xmm4 ; rows 1 and 5
+  movd        xmm4,dword [esi+ecx*2] 
+  movd        xmm5,dword [edi+ecx*2] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm2,xmm4 ; rows 2 and 6
+  movd        xmm4,dword [esi+edx] 
+  movd        xmm5,dword [edi+edx] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm3,xmm4 ; rows 3 and 7
+  movdqa      xmm6,xmm0 ; byte transpose of the 16 rows x 4 columns starts here
+  punpcklbw   xmm0,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm2,xmm3 
+  punpckhbw   xmm7,xmm3 
+  movdqa      xmm4,xmm0 
+  movdqa      xmm5,xmm6 
+  punpcklwd   xmm0,xmm2 
+  punpckhwd   xmm4,xmm2 
+  punpcklwd   xmm6,xmm7 
+  punpckhwd   xmm5,xmm7 
+  movdqa      xmm1,xmm0 
+  movdqa      xmm2,xmm4 
+  punpckldq   xmm0,xmm6 
+  punpckhdq   xmm1,xmm6 
+  punpckldq   xmm4,xmm5 
+  punpckhdq   xmm2,xmm5 
+  movdqa      xmm5,xmm0 
+  movdqa      xmm6,xmm1 
+  punpcklqdq  xmm0,xmm4 
+  punpckhqdq  xmm5,xmm4 
+  punpcklqdq  xmm1,xmm2 
+  punpckhqdq  xmm6,xmm2 
+  mov         edi,dword [esp+1Ch] 
+  movdqa      [edi],xmm0 ; buffer+00h ([esp+70h]) = column 0 (p1): 8 Cb rows in the low bytes, 8 Cr rows in the high bytes
+  movdqa      [edi+10h],xmm5 ; buffer+10h ([esp+80h]) = column 1 (p0)
+  movdqa      [edi+20h],xmm1 ; buffer+20h ([esp+90h]) = column 2 (q0)
+  movdqa      [edi+30h],xmm6 ; buffer+30h ([esp+0A0h]) = column 3 (q1)
+  mov         eax,dword [ebp+1Ch] ; pTC
+  movsx       cx,byte [eax+3] ; pTC[3]
+  movsx       dx,byte [eax+2] ; pTC[2]
+  movsx       si,byte [eax+1] ; pTC[1]
+  movsx       ax,byte [eax] ; pTC[0]
+  movzx       edi,cx 
+  movzx       ecx,cx 
+  movd        xmm2,ecx 
+  movzx       ecx,dx 
+  movzx       edx,dx 
+  movd        xmm3,ecx 
+  movd        xmm4,edx 
+  movzx       ecx,si 
+  movzx       edx,si 
+  movd        xmm5,ecx 
+  pxor        xmm0,xmm0 
+  movd        xmm6,edx 
+  movzx       ecx,ax 
+  movdqa      [esp+60h],xmm0 ; [esp+60h] = 0 (reloaded into xmm1 below)
+  movzx       edx,ax 
+  movsx       eax,word [ebp+14h] ; low 16 bits of iAlpha, sign-extended
+  punpcklwd   xmm6,xmm2 
+  movd        xmm1,edi 
+  movd        xmm7,ecx 
+  movsx       ecx,word [ebp+18h] ; low 16 bits of iBeta, sign-extended
+  movd        xmm0,edx 
+  punpcklwd   xmm7,xmm3 
+  punpcklwd   xmm5,xmm1 
+  movdqa      xmm1,[esp+60h] ; xmm1 = 0, used to widen bytes to words
+  punpcklwd   xmm7,xmm5 
+  movdqa      xmm5,[esp+0A0h] ; q1 bytes
+  punpcklwd   xmm0,xmm4 
+  punpcklwd   xmm0,xmm6 
+  movdqa      xmm6, [esp+70h] ; p1 bytes
+  punpcklwd   xmm0,xmm7 ; xmm0 = tc words {tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3}
+  movdqa      xmm7,[esp+80h] ; p0 bytes
+  movdqa      xmm2,xmm1 
+  psubw       xmm2,xmm0 
+  movdqa      [esp+0D0h],xmm2 ; [esp+0D0h] = -tc
+  movd        xmm2,eax 
+  movdqa      xmm3,xmm2 
+  punpcklwd   xmm3,xmm2 
+  pshufd      xmm4,xmm3,0 ; xmm4 = alpha in every word lane
+  movd        xmm2,ecx 
+  movdqa      xmm3,xmm2 
+  punpcklwd   xmm3,xmm2 
+  pshufd      xmm2,xmm3,0 
+  movdqa      xmm3, [esp+90h] ; q0 bytes
+  movdqa      [esp+50h],xmm2 ; [esp+50h] = beta in every word lane
+  movdqa      xmm2,xmm6 
+  punpcklbw   xmm2,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      [esp+40h],xmm2 ; [esp+40h] = p1 Cb as words
+  movdqa      [esp+0B0h],xmm6 ; [esp+0B0h] = p1 Cr as words
+  movdqa      xmm6,[esp+90h] 
+  movdqa      xmm2,xmm7 
+  punpckhbw   xmm7,xmm1 
+  punpckhbw   xmm6,xmm1 
+  punpcklbw   xmm2,xmm1 ; xmm2 = p0 Cb as words
+  punpcklbw   xmm3,xmm1 ; xmm3 = q0 Cb as words
+  punpcklbw   xmm5,xmm1 ; xmm5 = q1 Cb as words
+  movdqa      [esp+0F0h],xmm7 ; [esp+0F0h] = p0 Cr as words
+  movdqa      [esp+0C0h],xmm6 ; [esp+0C0h] = q0 Cr as words
+  movdqa      xmm6, [esp+0A0h] 
+  punpckhbw   xmm6,xmm1 
+  movdqa      [esp+0E0h],xmm6 ; [esp+0E0h] = q1 Cr as words
+  mov         edx,4 
+  movsx       eax,dx 
+  movd        xmm6,eax 
+  movdqa      xmm7,xmm6 
+  punpcklwd   xmm7,xmm6 
+  pshufd      xmm6,xmm7,0 
+  movdqa      [esp+30h],xmm6 ; [esp+30h] = rounding constant 4 in every word lane
+  movdqa      xmm7, [esp+40h] 
+  psubw       xmm7,xmm5 ; p1 - q1 (Cb)
+  movdqa      xmm6,xmm0 
+  pcmpgtw     xmm6,xmm1 ; lanes where tc > 0
+  movdqa      [esp+60h],xmm6 ; [esp+60h] = (tc > 0) mask, shared by Cb and Cr
+  movdqa      xmm1, [esp+0D0h] ; xmm1 = -tc
+  movdqa      xmm6,xmm3 
+  psubw       xmm6,xmm2 
+  psllw       xmm6,2 ; (q0 - p0) << 2
+  paddw       xmm6,xmm7 
+  paddw       xmm6,[esp+30h] 
+  psraw       xmm6,3 ; delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3
+  pmaxsw      xmm1,xmm6 ; max(-tc, delta)
+  movdqa      xmm7,[esp+50h] 
+  movdqa      [esp+20h],xmm0 
+  movdqa      xmm6, [esp+20h] 
+  pminsw      xmm6,xmm1 ; min(tc, ...): delta clipped to [-tc, tc]
+  movdqa      [esp+20h],xmm6 ; [esp+20h] = clipped Cb delta
+  movdqa      xmm6,xmm4 
+  movdqa      xmm1,xmm2 
+  psubw       xmm1,xmm3 
+  pabsw       xmm1,xmm1 ; |p0 - q0| (Cb). NOTE(review): pabsw is an SSSE3 instruction although this routine is named _sse2
+  pcmpgtw     xmm6,xmm1 ; alpha > |p0 - q0|
+  movdqa      xmm1, [esp+40h] 
+  psubw       xmm1,xmm2 
+  pabsw       xmm1,xmm1 ; |p1 - p0|
+  pcmpgtw     xmm7,xmm1 ; beta > |p1 - p0|
+  movdqa      xmm1, [esp+50h] 
+  pand        xmm6,xmm7 
+  movdqa      xmm7, [esp+50h] 
+  psubw       xmm5,xmm3 
+  pabsw       xmm5,xmm5 ; |q1 - q0|
+  pcmpgtw     xmm1,xmm5 ; beta > |q1 - q0|
+  movdqa      xmm5, [esp+0B0h] 
+  psubw       xmm5,[esp+0E0h] ; p1 - q1 (Cr)
+  pand        xmm6,xmm1 
+  pand        xmm6, [esp+60h] ; xmm6 = final Cb filter mask (including tc > 0)
+  movdqa      xmm1, [esp+20h] 
+  pand        xmm1,xmm6 ; zero the delta in lanes that are not filtered
+  movdqa      xmm6, [esp+0C0h] 
+  movdqa      [esp+40h],xmm1 ; [esp+40h] = masked Cb delta
+  movdqa      xmm1, [esp+0F0h] ; xmm1 = p0 Cr
+  psubw       xmm6,xmm1 
+  psllw       xmm6,2 
+  paddw       xmm6,xmm5 
+  paddw       xmm6, [esp+30h] 
+  movdqa      xmm5, [esp+0D0h] 
+  psraw       xmm6,3 ; Cr delta, same formula as Cb
+  pmaxsw      xmm5,xmm6 
+  pminsw      xmm0,xmm5 ; xmm0 = Cr delta clipped to [-tc, tc]
+  movdqa      xmm5,[esp+0C0h] ; xmm5 = q0 Cr
+  movdqa      xmm6,xmm1 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm4,xmm6 ; alpha > |p0 - q0| (Cr)
+  movdqa      xmm6,[esp+0B0h] 
+  psubw       xmm6,xmm1 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm7,xmm6 ; beta > |p1 - p0| (Cr)
+  movdqa      xmm6, [esp+0E0h] 
+  pand        xmm4,xmm7 
+  movdqa      xmm7, [esp+50h] 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm7,xmm6 ; beta > |q1 - q0| (Cr)
+  pand        xmm4,xmm7 
+  pand        xmm4,[esp+60h] 
+  pand        xmm0,xmm4 ; xmm0 = masked Cr delta
+  movdqa      xmm4, [esp+40h] 
+  paddw       xmm2,xmm4 ; p0 + delta (Cb)
+  paddw       xmm1,xmm0 ; p0 + delta (Cr)
+  psubw       xmm3,xmm4 ; q0 - delta (Cb)
+  psubw       xmm5,xmm0 ; q0 - delta (Cr)
+  packuswb    xmm2,xmm1 ; saturate to bytes: new p0 column
+  packuswb    xmm3,xmm5 ; new q0 column
+  movdqa      [esp+80h],xmm2 ; overwrite the p0 column in the transpose buffer
+  movdqa      [esp+90h],xmm3 ; overwrite the q0 column in the transpose buffer
+  mov         esi,dword [esp+1Ch] 
+  movdqa      xmm0, [esi] ; reload the four columns (p1 and q1 unchanged) for the inverse transpose
+  movdqa      xmm1, [esi+10h] 
+  movdqa      xmm2, [esi+20h] 
+  movdqa      xmm3, [esi+30h] 
+  movdqa      xmm6,xmm0 
+  punpcklbw   xmm0,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm2,xmm3 
+  punpckhbw   xmm7,xmm3 
+  movdqa      xmm4,xmm0 
+  movdqa      xmm5,xmm6 
+  punpcklwd   xmm0,xmm2 
+  punpckhwd   xmm4,xmm2 
+  punpcklwd   xmm6,xmm7 
+  punpckhwd   xmm5,xmm7 
+  movdqa      xmm1,xmm0 
+  movdqa      xmm2,xmm4 
+  punpckldq   xmm0,xmm6 
+  punpckhdq   xmm1,xmm6 
+  punpckldq   xmm4,xmm5 
+  punpckhdq   xmm2,xmm5 
+  movdqa      xmm5,xmm0 
+  movdqa      xmm6,xmm1 
+  punpcklqdq  xmm0,xmm4 ; xmm0 = {Cb row0, Cr row0, Cb row4, Cr row4}
+  punpckhqdq  xmm5,xmm4 ; xmm5 = rows 1 and 5
+  punpcklqdq  xmm1,xmm2 ; xmm1 = rows 2 and 6
+  punpckhqdq  xmm6,xmm2 ; xmm6 = rows 3 and 7
+  mov         esi,dword [esp+14h] ; pPixCb - 2
+  mov         ecx,dword [ebp+10h] ; iStride
+  mov         edx,dword [esp+0Ch] ; 3*iStride
+  mov         edi,dword [esp+8] ; pPixCr - 2
+  movd        dword [esi],xmm0 ; write back Cb rows 0..3
+  movd        dword [esi+ecx],xmm5 
+  movd        dword [esi+ecx*2],xmm1 
+  movd        dword [esi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  mov         esi,dword [esp+18h] ; Cb row 4 base
+  movd        dword [edi],xmm0 ; write back Cr rows 0..3
+  movd        dword [edi+ecx],xmm5 
+  movd        dword [edi+ecx*2],xmm1 
+  movd        dword [edi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  movd        dword [esi],xmm0 ; write back Cb rows 4..7
+  movd        dword [esi+ecx],xmm5 
+  movd        dword [esi+ecx*2],xmm1 
+  movd        dword [esi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  mov         edi,dword [esp+10h] ; Cr row 4 base
+  movd        dword [edi],xmm0 ; write back Cr rows 4..7
+  movd        dword [edi+ecx],xmm5 
+  movd        dword [edi+ecx*2],xmm1 
+  movd        dword [edi+edx],xmm6  
+  pop         edi  
+  pop         esi   
+  mov         esp,ebp 
+  pop         ebp  
+  ret     
+  
+  
+  
+;*******************************************************************************
+;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
+;                                 int32_t iBeta, int8_t * pTC)
+;*******************************************************************************
+  
+
+; -----------------------------------------------------------------------------
+; DeblockLumaLt4V_sse2 (32-bit cdecl, arguments read from the stack)
+;
+;   [ebp+8]   pPix    : address of the row just below the edge (q0). Rows at
+;                       pPix-3*stride .. pPix+2*stride are read with movdqa,
+;                       so every one of them must be 16-byte aligned.
+;   [ebp+12]  stride  : distance in bytes between two rows
+;   [ebp+16]  alpha   : low 16 bits are used (movsx word), compared to |q0-p0|
+;   [ebp+20]  beta    : low 16 bits are used, compared to the other differences
+;   [ebp+24]  tc ptr  : 4 signed bytes; byte k applies to pixels 4k..4k+3
+;
+; 16 pixels of the edge are filtered at once (low 8 as words first, then the
+; high 8).  Rows p1, p0, q0 and q1 are written back; p2 and q2 are only read.
+;
+; NOTE: the original code used pabsw, which is an SSSE3 instruction although
+; the routine is named (and presumably dispatched) as SSE2.  It is replaced
+; below by an SSE2-only sequence that yields bit-identical results.
+; -----------------------------------------------------------------------------
+
+; |x| for 8 signed words using SSE2 only: x = max(x, 0 - x).
+; Like pabsw, -32768 stays -32768 (psubw wraps).  Uses the 16-byte scratch slot
+; at [esp+432]; only valid while esp is at its value after the three pushes.
+%macro ABSW_SSE2_LT4V 1
+	movdqa	[esp+432], %1
+	pxor	%1, %1
+	psubw	%1, [esp+432]
+	pmaxsw	%1, [esp+432]
+%endmacro
+
+WELS_EXTERN  DeblockLumaLt4V_sse2
+  
+ALIGN  16
+
+DeblockLumaLt4V_sse2:
+    push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H
+	sub	esp, 436				; 000001b4H: 420 bytes of locals + 16-byte abs scratch
+	mov	eax, dword [ebp+8]			; eax = pPix (q0 row)
+	mov	ecx, dword [ebp+12]			; ecx = stride
+
+	pxor	xmm0, xmm0
+	push	ebx
+	mov	edx, dword [ebp+24]			; edx = pointer to the 4 tc bytes
+	movdqa	[esp+424-384], xmm0			; zero vector (same slot as [esp+432-384] after all pushes)
+	push	esi
+
+	lea	esi, [ecx+ecx*2]
+	push	edi
+	mov	edi, eax
+	sub	edi, esi
+	movdqa	xmm0, [edi]				; p2 row (pPix - 3*stride)
+
+	lea	esi, [ecx+ecx]
+	movdqa	[esp+432-208], xmm0
+	mov	edi, eax
+	sub	edi, esi				; edi = p1 row address, kept for the final store
+	movdqa	xmm0, [edi]
+	movdqa	[esp+448-208], xmm0
+
+	mov	ebx, eax
+	sub	ebx, ecx				; ebx = p0 row address
+	movdqa	xmm0, [ebx]
+	movdqa	[esp+464-208], xmm0
+
+	movdqa	xmm0, [eax]				; q0 row
+
+	add	ecx, eax				; ecx = q1 row address
+	movdqa	[esp+480-208], xmm0
+	movdqa	xmm0, [ecx]
+	mov	dword [esp+432-404], ecx		; remember q1 row address
+
+	movsx	ecx, word [ebp+16]			; alpha
+	movdqa	[esp+496-208], xmm0
+	movdqa	xmm0, [esi+eax]				; q2 row (pPix + 2*stride)
+
+	movsx	si, byte [edx]				; tc[0]
+	movdqa	[esp+512-208], xmm0
+	movd	xmm0, ecx
+	movsx	ecx, word [ebp+20]			; beta
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	pshufd	xmm0, xmm1, 0
+	movdqa	[esp+432-112], xmm0			; alpha in all 8 words
+	movd	xmm0, ecx
+	movsx	cx, byte [edx+1]			; tc[1]
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	mov	dword [esp+432-408], ebx		; remember p0 row address
+	movzx	ebx, cx
+	pshufd	xmm0, xmm1, 0				; beta in all 8 words
+	movd	xmm1, ebx
+	movzx	ebx, cx
+	movd	xmm2, ebx
+	movzx	ebx, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, si
+	movd	xmm5, ecx
+	movzx	ecx, si
+	movd	xmm6, ecx
+	movzx	ecx, si
+	movd	xmm7, ecx
+	movzx	ecx, si
+	movdqa	[esp+432-336], xmm0
+	movd	xmm0, ecx
+
+	movsx	cx, byte [edx+3]			; tc[3]
+	movsx	dx, byte [edx+2]			; tc[2]
+	movd	xmm3, ebx
+	punpcklwd xmm0, xmm4
+	movzx	esi, cx
+	punpcklwd xmm6, xmm2
+	punpcklwd xmm5, xmm1
+	punpcklwd xmm0, xmm6
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	punpcklwd xmm0, xmm7
+	movdqa	[esp+432-400], xmm0			; words: tc[0] x4, tc[1] x4 (low 8 pixels)
+	movd	xmm0, esi
+	movzx	esi, cx
+	movd	xmm2, esi
+	movzx	esi, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, dx
+	movd	xmm3, esi
+	movd	xmm5, ecx
+	punpcklwd xmm5, xmm0
+
+	movdqa	xmm0, [esp+432-384]			; xmm0 = zero
+	movzx	ecx, dx
+	movd	xmm6, ecx
+	movzx	ecx, dx
+	movzx	edx, dx
+	punpcklwd xmm6, xmm2
+	movd	xmm7, ecx
+	movd	xmm1, edx
+
+	; ---- low 8 pixels: widen bytes to words ----
+	movdqa	xmm2, [esp+448-208]
+	punpcklbw xmm2, xmm0				; xmm2 = p1
+
+	mov	ecx, 4
+	movsx	edx, cx					; edx = 4 (rounding term, broadcast later)
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	movdqa	xmm5, [esp+496-208]
+	movdqa	xmm3, [esp+464-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-240], xmm5			; q1
+	movdqa	xmm5, [esp+512-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-352], xmm5			; q2
+	punpcklwd xmm1, xmm4
+	movdqa	xmm4, [esp+432-208]
+	punpcklwd xmm1, xmm6
+	movdqa	xmm6, [esp+480-208]
+	punpcklwd xmm1, xmm7				; xmm1 = words: tc[2] x4, tc[3] x4 (high 8 pixels)
+	punpcklbw xmm6, xmm0				; xmm6 = q0
+	punpcklbw xmm3, xmm0				; xmm3 = p0
+	punpcklbw xmm4, xmm0				; xmm4 = p2
+	movdqa	xmm7, xmm3
+	psubw	xmm7, xmm4
+	ABSW_SSE2_LT4V	xmm7				; |p0 - p2|
+	movdqa	[esp+432-272], xmm4
+	movdqa	xmm4, [esp+432-336]			; xmm4 = beta
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5			; mask: |p0 - p2| < beta
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-352]
+	ABSW_SSE2_LT4V	xmm7				; |q0 - q2|
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5			; mask: |q0 - q2| < beta
+	movdqa	xmm5, xmm3
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5			; (p0 + q0 + 1) >> 1
+	movdqa	xmm5, [esp+432-400]
+	psubw	xmm5, [esp+432-288]
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5			; tc plus 1 for each mask that is set (masks are -1)
+	movdqa	xmm5, xmm6
+	psubw	xmm5, xmm3
+	movdqa	[esp+432-32], xmm6
+	psubw	xmm6, [esp+432-240]
+	movdqa	xmm7, xmm5
+	movdqa	[esp+432-384], xmm5			; q0 - p0 (the zero slot is reused from here on)
+	movdqa	xmm5, [esp+432-112]
+	ABSW_SSE2_LT4V	xmm7
+	pcmpgtw	xmm5, xmm7				; |q0 - p0| < alpha
+	ABSW_SSE2_LT4V	xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6				; |q0 - q1| < beta
+
+	pand	xmm5, xmm7
+	movdqa	xmm6, xmm3
+	psubw	xmm6, xmm2
+	ABSW_SSE2_LT4V	xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6				; |p0 - p1| < beta
+	movdqa	xmm6, [esp+432-400]
+	pand	xmm5, xmm7
+	movdqa	xmm7, xmm6
+	pcmpeqw	xmm6, xmm0
+	pcmpgtw	xmm7, xmm0
+	por	xmm7, xmm6				; tc >= 0
+	pand	xmm5, xmm7
+	movdqa	[esp+432-320], xmm5			; combined "filter this pixel" mask
+	movd	xmm5, edx
+	movdqa	xmm6, xmm5
+	punpcklwd xmm6, xmm5
+	pshufd	xmm5, xmm6, 0
+	movdqa	[esp+432-336], xmm5			; 4 in all 8 words (beta stays in xmm4)
+	movdqa	xmm5, [esp+432-224]
+	movdqa	[esp+432-368], xmm5
+	movdqa	xmm6, xmm0
+	psubw	xmm6, xmm5				; lower clamp bound
+	movdqa	xmm5, [esp+432-384]
+	psllw	xmm5, 2
+	movdqa	xmm7, xmm2
+	psubw	xmm7, [esp+432-240]
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+432-336]
+	movdqa	xmm5, [esp+432-368]
+	psraw	xmm7, 3					; (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3
+	pmaxsw	xmm6, xmm7
+	pminsw	xmm5, xmm6
+
+	pand	xmm5, [esp+432-320]
+	movdqa	xmm6, [esp+432-400]
+	movdqa	[esp+432-64], xmm5			; clamped p0/q0 delta, low half
+	movdqa	[esp+432-384], xmm6
+	movdqa	xmm5, xmm0
+	psubw	xmm5, xmm6
+	movdqa	[esp+432-368], xmm5			; -tc
+	movdqa	xmm6, xmm5
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm5, [esp+432-304]
+	movdqa	xmm7, xmm2
+	paddw	xmm7, xmm2
+	psubw	xmm5, xmm7
+	psraw	xmm5, 1					; (p2 + avg(p0,q0) - 2*p1) >> 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-384]
+	pminsw	xmm5, xmm6
+
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-288]
+	movdqa	xmm6, [esp+432-240]
+	movdqa	[esp+432-96], xmm5			; clamped p1 delta, low half
+	movdqa	xmm5, [esp+432-352]
+	paddw	xmm5, [esp+432-304]
+	movdqa	xmm7, xmm6
+	paddw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-368]
+	psubw	xmm5, xmm7
+
+	; ---- from here the widening of the high 8 pixels is interleaved ----
+	movdqa	xmm7, [esp+496-208]
+	psraw	xmm5, 1					; (q2 + avg(p0,q0) - 2*q1) >> 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-400]
+	pminsw	xmm5, xmm6
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-256]
+	movdqa	xmm6, [esp+448-208]
+	punpckhbw xmm7, xmm0
+	movdqa	[esp+432-352], xmm7			; q1, high half
+
+	movdqa	xmm7, [esp+512-208]
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-48], xmm5			; clamped q1 delta, low half
+	movdqa	xmm5, [esp+432-208]
+	movdqa	[esp+432-368], xmm6			; p1, high half
+	movdqa	xmm6, [esp+464-208]
+	punpckhbw xmm7, xmm0
+	punpckhbw xmm5, xmm0
+	movdqa	[esp+432-384], xmm7			; q2, high half
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-400], xmm6			; p0, high half
+
+	movdqa	xmm7, [esp+432-400]
+	movdqa	xmm6, [esp+480-208]
+	psubw	xmm7, xmm5
+	movdqa	[esp+432-16], xmm5			; p2, high half
+	ABSW_SSE2_LT4V	xmm7
+	punpckhbw xmm6, xmm0				; q0, high half
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5			; mask: |p0 - p2| < beta (high)
+
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-384]
+	ABSW_SSE2_LT4V	xmm7
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5			; mask: |q0 - q2| < beta (high)
+
+	movdqa	xmm5, [esp+432-400]
+	movdqa	[esp+432-80], xmm6
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5			; (p0 + q0 + 1) >> 1 (high)
+
+	movdqa	xmm5, xmm1
+	psubw	xmm5, [esp+432-288]
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5
+	movdqa	xmm5, xmm6
+	psubw	xmm5, [esp+432-400]
+	psubw	xmm6, [esp+432-352]
+	movdqa	[esp+432-272], xmm5			; q0 - p0 (high)
+	movdqa	xmm7, xmm5
+	movdqa	xmm5, [esp+432-112]
+	ABSW_SSE2_LT4V	xmm7
+	pcmpgtw	xmm5, xmm7
+	movdqa	xmm7, xmm4
+	ABSW_SSE2_LT4V	xmm6
+	pcmpgtw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-368]
+
+	pand	xmm5, xmm7
+	movdqa	xmm7, [esp+432-400]
+	psubw	xmm7, xmm6
+	psubw	xmm6, [esp+432-352]
+	ABSW_SSE2_LT4V	xmm7
+	pcmpgtw	xmm4, xmm7
+	pand	xmm5, xmm4
+
+	paddw	xmm2, [esp+432-96]			; new p1, low half
+	movdqa	xmm4, xmm1
+	pcmpgtw	xmm4, xmm0
+	movdqa	xmm7, xmm1
+	pcmpeqw	xmm7, xmm0
+	por	xmm4, xmm7
+	pand	xmm5, xmm4
+	movdqa	xmm4, [esp+432-224]
+	movdqa	[esp+432-320], xmm5			; combined filter mask (high)
+	movdqa	xmm5, [esp+432-272]
+	movdqa	xmm7, xmm0
+	psubw	xmm7, xmm4
+	psubw	xmm0, xmm1				; xmm0 = -tc (high); no longer zero
+	psllw	xmm5, 2
+	paddw	xmm6, xmm5
+	paddw	xmm6, [esp+432-336]
+	movdqa	xmm5, [esp+432-368]
+	movdqa	[esp+432-336], xmm0
+	psraw	xmm6, 3
+	pmaxsw	xmm7, xmm6
+	pminsw	xmm4, xmm7
+	pand	xmm4, [esp+432-320]
+	movdqa	xmm6, xmm0
+	movdqa	xmm0, [esp+432-16]
+	paddw	xmm0, [esp+432-304]
+	movdqa	[esp+432-272], xmm4			; clamped p0/q0 delta, high half
+	movdqa	xmm4, [esp+432-368]
+	paddw	xmm4, xmm4
+	psubw	xmm0, xmm4
+
+	movdqa	xmm4, [esp+432-64]
+	psraw	xmm0, 1
+	pmaxsw	xmm6, xmm0
+	movdqa	xmm0, [esp+432-400]
+	movdqa	xmm7, xmm1
+	pminsw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-320]
+	pand	xmm7, xmm6
+	pand	xmm7, [esp+432-288]
+	paddw	xmm5, xmm7				; new p1, high half
+	packuswb xmm2, xmm5				; xmm2 = new p1 row
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm0, xmm5
+	paddw	xmm3, xmm4
+	packuswb xmm3, xmm0				; xmm3 = new p0 row (p0 + delta)
+
+	movdqa	xmm0, [esp+432-32]
+	psubw	xmm0, xmm4
+	movdqa	xmm4, [esp+432-80]
+	psubw	xmm4, xmm5
+
+	movdqa	xmm5, [esp+432-240]
+	paddw	xmm5, [esp+432-48]			; new q1, low half
+	packuswb xmm0, xmm4				; new q0 row (q0 - delta)
+	movdqa	xmm4, [esp+432-384]
+	paddw	xmm4, [esp+432-304]
+	movdqa	[esp+480-208], xmm0
+	movdqa	xmm0, [esp+432-352]
+	movdqa	xmm7, xmm0
+	paddw	xmm0, xmm0
+
+	mov	ecx, dword [esp+432-408]		; ecx = p0 row address
+
+	mov	edx, dword [esp+432-404]		; edx = q1 row address
+	psubw	xmm4, xmm0
+	movdqa	xmm0, [esp+432-336]
+	movdqa	[edi], xmm2				; store p1 row
+	psraw	xmm4, 1
+	pmaxsw	xmm0, xmm4
+	pminsw	xmm1, xmm0
+	movdqa	xmm0, [esp+480-208]
+
+	pop	edi
+	pand	xmm1, xmm6
+	pand	xmm1, [esp+428-256]			; same slot as [esp+432-256]; esp moved by the pop
+	movdqa	[ecx], xmm3				; store p0 row
+	paddw	xmm7, xmm1
+	pop	esi
+	packuswb xmm5, xmm7
+	movdqa	[eax], xmm0				; store q0 row
+	movdqa	[edx], xmm5				; store q1 row
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+
+
+;*******************************************************************************
+;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
+;                                 int32_t iBeta)
+;
+;    pPix is the row just below the edge (q0).  The eight rows from
+;    pPix-4*iStride to pPix+3*iStride are read with movdqa, so each of them must
+;    be 16-byte aligned.  Only the low 16 bits of iAlpha / iBeta are used.
+;    16 pixels are processed (low 8 as words, then high 8); rows p2, p1, p0,
+;    q0, q1 and q2 are written back, p3 and q3 are only read.
+;
+;    NOTE: the original code used pabsw, which is an SSSE3 instruction although
+;    the routine is named (and presumably dispatched) as SSE2.  It is replaced
+;    below by an SSE2-only sequence that yields bit-identical results.
+;*******************************************************************************
+
+; |x| for 8 signed words using SSE2 only: x = max(x, 0 - x).
+; Like pabsw, -32768 stays -32768 (psubw wraps).  Uses the 16-byte scratch slot
+; at [esp+640]; only valid while esp is at its value after the three pushes.
+%macro ABSW_SSE2_EQ4V 1
+	movdqa	[esp+640], %1
+	pxor	%1, %1
+	psubw	%1, [esp+640]
+	pmaxsw	%1, [esp+640]
+%endmacro
+
+WELS_EXTERN  DeblockLumaEq4V_sse2
+  
+ALIGN  16
+
+DeblockLumaEq4V_sse2:
+
+	push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H
+	sub	esp, 644				; 00000284H: 628 bytes of locals + 16-byte abs scratch
+	mov	eax, dword [ebp+8]			; eax = pPix (q0 row)
+	mov	ecx, dword [ebp+12]			; ecx = iStride
+	push	ebx
+	push	esi
+
+	lea	edx, [ecx*4]
+	pxor	xmm0, xmm0
+	movdqa	xmm2, xmm0				; xmm2 = zero
+
+	movdqa	xmm0, [ecx+eax]				; q1 row
+	mov	esi, eax
+	sub	esi, edx
+	movdqa	xmm3, [esi]				; p3 row (pPix - 4*stride)
+	movdqa	xmm5, [eax]				; q0 row
+	push	edi
+	lea	edi, [ecx+ecx]
+	lea	ebx, [ecx+ecx*2]
+	mov	dword [esp+640-600], edi		; 2*stride
+	mov	esi, eax
+	sub	esi, edi				; esi = p1 row address, kept for the final store
+	movdqa	xmm1, [esi]
+	movdqa	 [esp+720-272], xmm0
+	mov	edi, eax
+	sub	edi, ecx				; edi = p0 row address, kept for the final store
+	movdqa	xmm4, [edi]
+	add	ecx, eax
+	mov	dword [esp+640-596], ecx		; remember q1 row address
+
+	mov	ecx, dword [esp+640-600]
+	movdqa	xmm0, [ecx+eax]				; q2 row
+	movdqa	 [esp+736-272], xmm0
+
+	movdqa	xmm0, [eax+ebx]				; q3 row (pPix + 3*stride)
+	mov	edx, eax
+	sub	edx, ebx				; edx = p2 row address, kept for the final store
+
+	movsx	ebx, word [ebp+16]			; iAlpha
+	movdqa	xmm6, [edx]
+	add	ecx, eax				; ecx = q2 row address, kept for the final store
+	movdqa	 [esp+752-272], xmm0
+	movd	xmm0, ebx
+
+	movsx	ebx, word [ebp+20]			; iBeta
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0
+	movdqa	 [esp+640-320], xmm0			; alpha in all 8 words
+	movd	xmm0, ebx
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0
+
+	; ---- low 8 pixels: widen bytes to words and build the masks ----
+	movdqa	xmm7, [esp+736-272]
+	punpcklbw xmm7, xmm2
+	movdqa	 [esp+640-416], xmm7			; q2
+	movdqa	 [esp+640-512], xmm0			; beta in all 8 words
+	movdqa	xmm0, xmm1
+	movdqa	 [esp+672-272], xmm1
+	movdqa	xmm1, xmm4
+	movdqa	 [esp+704-272], xmm5
+	punpcklbw xmm5, xmm2				; xmm5 = q0
+	punpcklbw xmm1, xmm2				; xmm1 = p0
+
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm1
+	ABSW_SSE2_EQ4V	xmm7
+	movdqa	 [esp+640-560], xmm7			; |q0 - p0|
+	punpcklbw xmm0, xmm2
+	movdqa	 [esp+688-272], xmm4
+	movdqa	xmm4, [esp+720-272]
+	movdqa	 [esp+640-480], xmm0			; p1
+
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm0
+
+	movdqa	xmm0, [esp+640-512]
+	ABSW_SSE2_EQ4V	xmm7
+	punpcklbw xmm4, xmm2
+	pcmpgtw	xmm0, xmm7				; |p0 - p1| < beta
+	movdqa	 [esp+640-384], xmm4			; q1
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm4
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+656-272], xmm6
+	punpcklbw xmm6, xmm2
+	ABSW_SSE2_EQ4V	xmm7
+	movdqa	 [esp+640-48], xmm2			; zero
+	movdqa	 [esp+640-368], xmm6			; p2
+	movdqa	 [esp+640-144], xmm1			; p0
+	movdqa	 [esp+640-400], xmm5			; q0
+	pcmpgtw	xmm4, xmm7				; |q0 - q1| < beta
+	pand	xmm0, xmm4
+	movdqa	xmm4, [esp+640-320]
+	pcmpgtw	xmm4, [esp+640-560]			; |q0 - p0| < alpha
+	pand	xmm0, xmm4				; xmm0 = "filter this pixel" mask (low half)
+
+	mov	ebx, 2
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, [esp+640-320]
+	psraw	xmm4, 2
+	pshufd	xmm7, xmm7, 0				; 2 in all 8 words
+	paddw	xmm4, xmm7
+	movdqa	 [esp+640-576], xmm4			; (alpha >> 2) + 2
+	pcmpgtw	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4			; |q0 - p0| < (alpha >> 2) + 2
+
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+640-624], xmm7
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm6
+	ABSW_SSE2_EQ4V	xmm7
+	pcmpgtw	xmm4, xmm7				; |p0 - p2| < beta
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-544], xmm4			; p-side selector between the two filter forms
+	movdqa	xmm4, [esp+640-512]
+	movdqa	xmm7, xmm5
+	psubw	xmm7, [esp+640-416]
+	ABSW_SSE2_EQ4V	xmm7
+	pcmpgtw	xmm4, xmm7				; |q0 - q2| < beta
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4			; q-side selector between the two filter forms
+
+	movdqa	xmm4, [esp+640-544]
+	pandn	xmm4, xmm6
+	movdqa	 [esp+640-16], xmm4
+	mov	ebx, 4
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, xmm3
+	punpcklbw xmm4, xmm2
+	psllw	xmm4, 1
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6
+	paddw	xmm4, [esp+640-480]
+
+	movdqa	xmm6, [esp+640-560]
+	pshufd	xmm7, xmm7, 0				; 4 in all 8 words
+	paddw	xmm4, xmm1
+	movdqa	 [esp+640-592], xmm7
+	paddw	xmm4, xmm5
+	paddw	xmm4, xmm7				; 2*p3 + 3*p2 + p1 + p0 + q0 + 4 (shifted by 3 below)
+	movdqa	xmm7, [esp+640-416]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-80], xmm6
+	movdqa	xmm6, [esp+752-272]
+	punpcklbw xmm6, xmm2
+	psllw	xmm6, 1
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-384]
+
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm6, xmm5
+	paddw	xmm6, xmm1
+	paddw	xmm6, [esp+640-592]
+	psraw	xmm6, 3					; (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-112], xmm6
+	movdqa	xmm6, [esp+640-544]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-336], xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-528], xmm6
+	movdqa	xmm6, [esp+640-368]
+	paddw	xmm6, xmm7
+	movdqa	xmm7, xmm1
+	psraw	xmm4, 3
+	pand	xmm4, [esp+640-544]
+	paddw	xmm7, xmm5
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+
+	paddw	xmm5, xmm1
+	psraw	xmm6, 2					; (p2 + p1 + p0 + q0 + 2) >> 2
+	pand	xmm7, xmm6
+
+	movdqa	xmm6, [esp+640-384]
+	movdqa	 [esp+640-64], xmm7
+	movdqa	xmm7, [esp+640-560]
+	pandn	xmm7, xmm6
+	movdqa	 [esp+640-304], xmm7
+	movdqa	xmm7, [esp+640-560]
+	movdqa	 [esp+640-528], xmm7
+	movdqa	xmm7, [esp+640-416]
+	paddw	xmm7, xmm6
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2					; (q2 + q1 + q0 + p0 + 2) >> 2
+	pand	xmm5, xmm7
+	movdqa	 [esp+640-32], xmm5
+
+	movdqa	xmm5, [esp+640-544]
+	movdqa	 [esp+640-528], xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	xmm7, xmm5
+	paddw	xmm7, xmm5
+	movdqa	xmm5, xmm1
+	paddw	xmm5, xmm6
+	paddw	xmm6, [esp+640-592]
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2					; (2*p1 + p0 + q1 + 2) >> 2
+	pandn	xmm5, xmm7
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm7, xmm1
+	paddw	xmm7, [esp+640-400]
+	movdqa	xmm1, [esp+640-544]
+	movdqa	 [esp+640-352], xmm5
+	movdqa	xmm5, [esp+640-368]
+	psllw	xmm7, 1
+	paddw	xmm7, xmm6
+	paddw	xmm5, xmm7
+
+	movdqa	xmm7, [esp+640-400]
+	psraw	xmm5, 3					; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
+	pand	xmm1, xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	 [esp+640-96], xmm1
+	movdqa	xmm1, [esp+640-560]
+	movdqa	 [esp+640-528], xmm1
+	movdqa	xmm1, [esp+640-384]
+	movdqa	xmm6, xmm1
+	paddw	xmm6, xmm1
+	paddw	xmm1, [esp+640-400]
+	paddw	xmm1, [esp+640-144]
+	paddw	xmm7, xmm5
+	paddw	xmm5, [esp+640-592]
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+	psraw	xmm6, 2					; (2*q1 + q0 + p1 + 2) >> 2
+	psllw	xmm1, 1
+	paddw	xmm1, xmm5
+
+	movdqa	xmm5, [esp+656-272]
+	pandn	xmm7, xmm6
+	movdqa	xmm6, [esp+640-416]
+	paddw	xmm6, xmm1
+	movdqa	xmm1, [esp+640-560]
+	psraw	xmm6, 3					; (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3
+	pand	xmm1, xmm6
+
+	; ---- high 8 pixels: widen bytes to words and build the masks ----
+	movdqa	xmm6, [esp+704-272]
+	movdqa	 [esp+640-128], xmm1
+	movdqa	xmm1, [esp+672-272]
+	punpckhbw xmm1, xmm2
+	movdqa	 [esp+640-448], xmm1			; p1 (high)
+	movdqa	xmm1, [esp+688-272]
+	punpckhbw xmm1, xmm2
+	punpckhbw xmm6, xmm2
+	movdqa	 [esp+640-288], xmm7
+	punpckhbw xmm5, xmm2				; xmm5 = p2 (high)
+	movdqa	 [esp+640-496], xmm1			; p0 (high)
+	movdqa	 [esp+640-432], xmm6			; q0 (high)
+
+	movdqa	xmm7, [esp+720-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-464], xmm7			; q1 (high)
+
+	movdqa	xmm7, [esp+736-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-528], xmm7			; q2 (high)
+
+	movdqa	xmm7, xmm6
+
+	psubw	xmm6, [esp+640-464]
+	psubw	xmm7, xmm1
+	ABSW_SSE2_EQ4V	xmm7
+	movdqa	 [esp+640-560], xmm7			; |q0 - p0| (high)
+	por	xmm4, [esp+640-16]
+	ABSW_SSE2_EQ4V	xmm6
+	movdqa	xmm7, xmm1
+	psubw	xmm7, [esp+640-448]
+
+	movdqa	xmm1, [esp+640-512]
+	ABSW_SSE2_EQ4V	xmm7
+	pcmpgtw	xmm1, xmm7
+	movdqa	xmm7, [esp+640-512]
+	pcmpgtw	xmm7, xmm6
+	movdqa	xmm6, [esp+640-320]
+	pand	xmm1, xmm7
+	movdqa	xmm7, [esp+640-560]
+	pcmpgtw	xmm6, xmm7
+	pand	xmm1, xmm6				; xmm1 = "filter this pixel" mask (high half)
+
+	movdqa	xmm6, [esp+640-576]
+	pcmpgtw	xmm6, xmm7
+
+	movdqa	xmm7, [esp+640-496]
+	punpckhbw xmm3, xmm2				; xmm3 = p3 (high)
+	movdqa	 [esp+640-560], xmm6
+	movdqa	xmm6, [esp+640-512]
+	psubw	xmm7, xmm5
+	ABSW_SSE2_EQ4V	xmm7
+	pcmpgtw	xmm6, xmm7
+
+	pand	xmm6, [esp+640-560]
+	movdqa	xmm7, [esp+640-432]
+	psubw	xmm7, [esp+640-528]
+
+	psllw	xmm3, 1
+	movdqa	 [esp+640-544], xmm6			; p-side selector (high)
+	movdqa	xmm6, [esp+640-512]
+
+	movdqa	xmm2, [esp+640-544]			; xmm2 is no longer zero from here on
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, [esp+640-448]
+	paddw	xmm3, [esp+640-496]
+	ABSW_SSE2_EQ4V	xmm7
+	pcmpgtw	xmm6, xmm7
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-560], xmm6			; q-side selector (high)
+
+	; ---- select filtered / original values and pack the p2 row ----
+	movdqa	xmm6, xmm0
+	pand	xmm6, xmm4
+	movdqa	xmm4, xmm0
+	pandn	xmm4, [esp+640-368]
+	por	xmm6, xmm4
+	movdqa	xmm4, [esp+640-432]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm3, 3
+	pand	xmm3, xmm2
+	pandn	xmm2, xmm5
+	por	xmm3, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm3
+	movdqa	xmm3, [esp+640-64]
+	por	xmm3, [esp+640-336]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm5
+	por	xmm7, xmm2
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-480]
+	por	xmm2, xmm3
+	packuswb xmm6, xmm7
+	movdqa	 [esp+640-336], xmm2
+	movdqa	 [esp+656-272], xmm6			; new p2 row
+	movdqa	xmm6, [esp+640-544]
+	movdqa	xmm2, xmm5
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm3, xmm1
+	movdqa	xmm7, [esp+640-496]
+	paddw	xmm7, xmm4
+	paddw	xmm2, xmm7
+	paddw	xmm2, [esp+640-624]
+	movdqa	xmm7, [esp+640-544]
+	psraw	xmm2, 2
+	pand	xmm6, xmm2
+	movdqa	xmm2, [esp+640-448]
+	pandn	xmm7, xmm2
+	por	xmm6, xmm7
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm1
+	pandn	xmm6, xmm2
+	paddw	xmm2, [esp+640-496]
+	paddw	xmm2, xmm4
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-336]
+	packuswb xmm6, xmm3
+	psllw	xmm2, 1
+	movdqa	 [esp+672-272], xmm6			; new p1 row
+	movdqa	xmm6, [esp+640-96]
+	por	xmm6, [esp+640-352]
+
+	movdqa	xmm3, xmm0
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm0
+	pandn	xmm6, [esp+640-144]
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-352], xmm3
+	movdqa	xmm3, [esp+640-464]
+	paddw	xmm3, [esp+640-592]
+	paddw	xmm2, xmm3
+	movdqa	xmm3, [esp+640-448]
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-496]
+	psraw	xmm5, 3
+	pand	xmm6, xmm5
+	movdqa	xmm5, [esp+640-464]
+	paddw	xmm2, xmm5
+	paddw	xmm5, [esp+640-432]
+	movdqa	xmm4, xmm3
+	paddw	xmm4, xmm3
+	paddw	xmm4, xmm2
+	paddw	xmm4, [esp+640-624]
+	movdqa	xmm2, [esp+640-544]
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm4, 2
+	pandn	xmm2, xmm4
+	por	xmm6, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-496]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm6
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-352]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+688-272], xmm2			; new p0 row
+	movdqa	xmm2, [esp+640-128]
+	por	xmm2, [esp+640-288]
+
+	movdqa	xmm4, xmm0
+	pand	xmm4, xmm2
+	paddw	xmm5, xmm6
+	movdqa	xmm2, xmm0
+	pandn	xmm2, [esp+640-400]
+	por	xmm4, xmm2
+	movdqa	xmm2, [esp+640-528]
+	psllw	xmm5, 1
+	paddw	xmm5, xmm3
+	movdqa	xmm3, [esp+640-560]
+	paddw	xmm2, xmm5
+	psraw	xmm2, 3
+	movdqa	 [esp+640-288], xmm4
+	movdqa	xmm4, [esp+640-560]
+	pand	xmm4, xmm2
+	movdqa	xmm2, [esp+640-464]
+	movdqa	xmm5, xmm2
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-432]
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm7, xmm1
+	paddw	xmm5, xmm2
+	paddw	xmm5, [esp+640-624]
+	movdqa	xmm6, [esp+640-560]
+	psraw	xmm5, 2
+	pandn	xmm3, xmm5
+	por	xmm4, xmm3
+	movdqa	xmm3, [esp+640-32]
+	por	xmm3, [esp+640-304]
+	pand	xmm7, xmm4
+	movdqa	xmm4, [esp+640-432]
+	movdqa	xmm5, [esp+640-464]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm4
+	paddw	xmm4, [esp+640-496]
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-288]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+704-272], xmm2			; new q0 row
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-384]
+	por	xmm2, xmm3
+	movdqa	 [esp+640-304], xmm2
+	movdqa	xmm2, [esp+640-528]
+	movdqa	xmm3, xmm2
+	paddw	xmm3, [esp+640-464]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-624]
+	psraw	xmm3, 2
+	pand	xmm6, xmm3
+	movdqa	xmm3, [esp+640-560]
+	movdqa	xmm4, xmm3
+	pandn	xmm4, xmm5
+	por	xmm6, xmm4
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-304]
+	movdqa	xmm4, xmm1
+	pandn	xmm4, xmm5
+	por	xmm7, xmm4
+
+	movdqa	xmm4, xmm0
+	pandn	xmm0, [esp+640-416]
+	packuswb xmm6, xmm7				; xmm6 = new q1 row
+	movdqa	xmm7, [esp+640-112]
+	por	xmm7, [esp+640-80]
+	pand	xmm4, xmm7
+	por	xmm4, xmm0
+	movdqa	xmm0, [esp+752-272]
+	punpckhbw xmm0, [esp+640-48]			; q3 (high), widened against the saved zero vector
+	psllw	xmm0, 1
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm5
+	paddw	xmm0, [esp+640-432]
+	paddw	xmm0, [esp+640-496]
+	paddw	xmm0, [esp+640-592]
+	psraw	xmm0, 3
+	pand	xmm0, xmm3
+	movdqa	xmm7, xmm1
+	pandn	xmm3, xmm2
+	por	xmm0, xmm3
+	pand	xmm7, xmm0
+
+	; ---- write the six modified rows back ----
+	movdqa	xmm0, [esp+656-272]
+	movdqa	 [edx], xmm0				; p2 row
+
+	movdqa	xmm0, [esp+672-272]
+
+	mov	edx, dword [esp+640-596]		; edx = q1 row address
+	movdqa	 [esi], xmm0				; p1 row
+	movdqa	xmm0, [esp+688-272]
+	movdqa	 [edi], xmm0				; p0 row
+	movdqa	xmm0, [esp+704-272]
+
+	pop	edi
+	pandn	xmm1, xmm2
+	movdqa	 [eax], xmm0				; q0 row
+	por	xmm7, xmm1
+	pop	esi
+	packuswb xmm4, xmm7
+	movdqa	 [edx], xmm6				; q1 row
+	movdqa	 [ecx], xmm4				; q2 row
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+  
+    
+;********************************************************************************
+;
+;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
+;
+;   Reads 8 bytes from each of the 16 rows pPixY + k*iStride (k = 0..15), runs
+;   them through SSE2_TransTwo8x8B and writes eight 16-byte vectors to pDst.
+;   pDst is written with movdqa and therefore has to be 16-byte aligned.
+;   (SSE2_TransTwo8x8B is defined outside this section; judging by its name it
+;   transposes two 8x8 byte blocks - confirm against its definition.)
+;
+;********************************************************************************
+
+WELS_EXTERN  DeblockLumaTransposeH2V_sse2
+
+ALIGN  16
+
+DeblockLumaTransposeH2V_sse2:
+    push    ebp
+    push    ebx
+    mov     ebp, esp                    ; two pushes: first argument is at [ebp + 12]
+    and     esp, -16
+    sub     esp, 16                     ; one aligned 16-byte scratch slot at [esp]
+
+    mov     eax, [ebp + 12]             ; eax = pPixY (row 0)
+    mov     ecx, [ebp + 16]             ; ecx = iStride
+    lea     ebx, [ecx + ecx*2]          ; ebx = 3 * iStride
+    lea     edx, [eax + ecx*8]          ; edx = row 8
+
+    ; rows 0..3 go to the low qwords, rows 8..11 to the high qwords
+    movq    xmm0, [eax]
+    movq    xmm1, [eax + ecx]
+    movq    xmm2, [eax + ecx*2]
+    movq    xmm3, [eax + ebx]
+    movq    xmm7, [edx]
+    punpcklqdq xmm0, xmm7
+    movq    xmm7, [edx + ecx]
+    punpcklqdq xmm1, xmm7
+    movq    xmm7, [edx + ecx*2]
+    punpcklqdq xmm2, xmm7
+    movq    xmm7, [edx + ebx]
+    punpcklqdq xmm3, xmm7
+
+    ; rows 4..6 go to the low qwords, rows 12..14 to the high qwords
+    lea     eax, [eax + ecx*4]
+    lea     edx, [edx + ecx*4]
+    movq    xmm4, [eax]
+    movq    xmm5, [eax + ecx]
+    movq    xmm6, [eax + ecx*2]
+    movq    xmm7, [edx]
+    punpcklqdq xmm4, xmm7
+    movq    xmm7, [edx + ecx]
+    punpcklqdq xmm5, xmm7
+    movq    xmm7, [edx + ecx*2]
+    punpcklqdq xmm6, xmm7
+
+    ; rows 7 and 15: every xmm register is live, so xmm0 is parked on the stack
+    movdqa  [esp], xmm0
+    movq    xmm7, [eax + ebx]
+    movq    xmm0, [edx + ebx]
+    punpcklqdq xmm7, xmm0
+    movdqa  xmm0, [esp]
+
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+    ; output order (per the original note): m5, m3, m4, m8, m6, m2, m7, m1
+
+    mov     eax, [ebp + 20]             ; eax = pDst
+    movdqa  [eax],       xmm4
+    movdqa  [eax + 16],  xmm2
+    movdqa  [eax + 32],  xmm3
+    movdqa  [eax + 48],  xmm7
+    movdqa  [eax + 64],  xmm5
+    movdqa  [eax + 80],  xmm1
+    movdqa  [eax + 96],  xmm6
+    movdqa  [eax + 112], xmm0
+
+    mov     esp, ebp
+    pop     ebx
+    pop     ebp
+    ret
+    
+    
+    
+;*******************************************************************************************
+;
+;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
+;
+;   Loads eight 16-byte vectors from pSrc (movdqa, so pSrc must be 16-byte
+;   aligned), runs them through SSE2_TransTwo8x8B and writes 8 bytes to each of
+;   the 16 rows pPixY + k*iStride (k = 0..15).
+;
+;*******************************************************************************************
+
+WELS_EXTERN   DeblockLumaTransposeV2H_sse2
+
+ALIGN  16
+
+DeblockLumaTransposeV2H_sse2:
+    push    ebp
+    mov     ebp, esp
+
+    and     esp, -16
+    sub     esp, 16                     ; aligned scratch slot handed to the transpose macro
+
+    mov     edx, [ebp + 8]              ; edx = pPixY (destination row 0)
+    mov     ecx, [ebp + 12]             ; ecx = iStride
+    mov     eax, [ebp + 16]             ; eax = pSrc
+
+    movdqa  xmm0, [eax]
+    movdqa  xmm1, [eax + 16]
+    movdqa  xmm2, [eax + 32]
+    movdqa  xmm3, [eax + 48]
+    movdqa  xmm4, [eax + 64]
+    movdqa  xmm5, [eax + 80]
+    movdqa  xmm6, [eax + 96]
+    movdqa  xmm7, [eax + 112]
+
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+    ; output order (per the original note): m5, m3, m4, m8, m6, m2, m7, m1
+
+    lea     eax, [ecx + ecx*2]          ; eax = 3 * iStride
+
+    ; rows 0..7 take the low qword of each result register
+    movq    [edx],         xmm4
+    movq    [edx + ecx],   xmm2
+    movq    [edx + ecx*2], xmm3
+    movq    [edx + eax],   xmm7
+    lea     edx, [edx + ecx*4]
+    movq    [edx],         xmm5
+    movq    [edx + ecx],   xmm1
+    movq    [edx + ecx*2], xmm6
+    movq    [edx + eax],   xmm0
+    lea     edx, [edx + ecx*4]
+
+    ; rows 8..15 take the high qword: shift it down, then store
+    psrldq  xmm4, 8
+    movq    [edx],         xmm4
+    psrldq  xmm2, 8
+    movq    [edx + ecx],   xmm2
+    psrldq  xmm3, 8
+    movq    [edx + ecx*2], xmm3
+    psrldq  xmm7, 8
+    movq    [edx + eax],   xmm7
+    lea     edx, [edx + ecx*4]
+    psrldq  xmm5, 8
+    movq    [edx],         xmm5
+    psrldq  xmm1, 8
+    movq    [edx + ecx],   xmm1
+    psrldq  xmm6, 8
+    movq    [edx + ecx*2], xmm6
+    psrldq  xmm0, 8
+    movq    [edx + eax],   xmm0
+
+    mov     esp, ebp
+    pop     ebp
+    ret
\ No newline at end of file
--- /dev/null
+++ b/codec/encoder/core/asm/expand_picture.asm
@@ -1,0 +1,653 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  expand_picture.asm
+;*
+;*  Abstract
+;*      MMX/SSE2 routines for expanding (padding) the borders of a picture
+;*
+;*  History
+;*      09/25/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+;SECTION .rodata pData align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+;%define PADDING_SIZE_ASM 	32 	; PADDING_LENGTH
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+
+
+SECTION .text
+
+WELS_EXTERN ExpandPictureLuma_sse2
+WELS_EXTERN ExpandPictureChromaAlign_sse2	; for chroma alignment
+WELS_EXTERN ExpandPictureChromaUnalign_sse2	; for chroma unalignment
+
+;;;;;;;expanding result;;;;;;;
+
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;----------------------------
+;aaaa|attttttttttttttttb|bbbb
+;llll|l                r|rrrr
+;llll|l                r|rrrr
+;llll|l                r|rrrr
+;llll|l                r|rrrr
+;llll|l                r|rrrr
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;----------------------------
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+
+; Store the 8-byte value %3 to four consecutive rows starting at [%1], then
+; leave %1 advanced by 4 * %2.
+;   %1 = destination pointer register, %2 = row step register, %3 = mm register
+%macro mov_line_8x4_mmx		3	; dst, stride, mm?
+%rep 2
+	movq [%1], %3
+	movq [%1+%2], %3
+	lea %1, [%1+2*%2]
+%endrep
+%endmacro
+
+; Same four 8-byte row stores as mov_line_8x4_mmx, but %1 is advanced by only
+; 3 * %2, so it ends up pointing at the last row that was written.
+;   %1 = destination pointer register, %2 = row step register, %3 = mm register
+%macro mov_line_end8x4_mmx		3	; dst, stride, mm?
+	movq [%1], %3			; row 0
+	movq [%1+%2], %3		; row 1
+	movq [%1+2*%2], %3		; row 2
+	lea %1, [%1+2*%2]
+	movq [%1+%2], %3		; row 3
+	lea %1, [%1+%2]
+%endmacro
+
+; Store the 16-byte value %3 to four consecutive rows starting at [%1], then
+; leave %1 advanced by 4 * %2.
+;   %1 = destination pointer register, %2 = row step register, %3 = xmm register
+;   %4 = 'a' or 'u', selecting movdqa (aligned) or movdqu (unaligned) stores
+%macro mov_line_16x4_sse2	4	; dst, stride, xmm?, u/a
+%rep 2
+	movdq%4 [%1], %3
+	movdq%4 [%1+%2], %3
+	lea %1, [%1+2*%2]
+%endrep
+%endmacro
+
+; Same four 16-byte row stores as mov_line_16x4_sse2, but %1 is advanced by
+; only 3 * %2, so it ends up pointing at the last row that was written.
+;   %1 = destination pointer register, %2 = row step register, %3 = xmm register
+;   %4 = 'a' or 'u', selecting movdqa (aligned) or movdqu (unaligned) stores
+%macro mov_line_end16x4_sse2	4	; dst, stride, xmm?, u/a
+	movdq%4 [%1], %3		; row 0
+	movdq%4 [%1+%2], %3		; row 1
+	movdq%4 [%1+2*%2], %3		; row 2
+	lea %1, [%1+2*%2]
+	movdq%4 [%1+%2], %3		; row 3
+	lea %1, [%1+%2]
+%endmacro
+
+; Fill four consecutive 32-byte rows starting at [%1] with the 16-byte value %3
+; (two aligned stores per row), then leave %1 advanced by 4 * %2.
+;   %1 = destination pointer register, %2 = row step register, %3 = xmm register
+%macro mov_line_32x4_sse2	3	; dst, stride, xmm?
+%rep 2
+	movdqa [%1], %3
+	movdqa [%1+16], %3
+	movdqa [%1+%2], %3
+	movdqa [%1+%2+16], %3
+	lea %1, [%1+2*%2]
+%endrep
+%endmacro
+
+; Same four 32-byte row fills as mov_line_32x4_sse2, but %1 is advanced by only
+; 3 * %2, so it ends up pointing at the last row that was written.
+;   %1 = destination pointer register, %2 = row step register, %3 = xmm register
+%macro mov_line_end32x4_sse2	3	; dst, stride, xmm?
+	movdqa [%1], %3			; row 0
+	movdqa [%1+16], %3
+	movdqa [%1+%2], %3		; row 1
+	movdqa [%1+%2+16], %3
+	movdqa [%1+2*%2], %3		; row 2
+	movdqa [%1+2*%2+16], %3
+	lea %1, [%1+2*%2]
+	movdqa [%1+%2], %3		; row 3
+	movdqa [%1+%2+16], %3
+	lea %1, [%1+%2]
+%endmacro
+
+; Replicate the first and the last picture row into the top and bottom padding.
+;   %1 = padding size: 32 (luma, 32 padding rows) or 16 (chroma, 16 padding rows);
+;        any other value expands to nothing.
+; Register contract on entry (taken from the original notes and the code below):
+;   ebx = picture width in pixels; processed in 16-pixel columns (ebx >> 4),
+;         the chroma variant additionally handles one trailing 8-pixel column
+;         when (width & 15) != 0
+;   esi = first picture row (read), edi = first destination row of the top padding
+;   eax = last picture row (read),  ebp = first destination row of the bottom padding
+;   ecx = row step (-stride on entry)
+; Each column writes its rows and finishes on the last row written (the *_end*
+; helpers advance by 3 rows only); ecx is then negated, so the next column walks
+; the rows in the opposite direction.
+; Clobbers ebx, ecx, esi, edi, eax, ebp, xmm0, xmm1 (chroma also edx, mm0, mm1).
+; A width below 16 is tolerated: the 16-pixel loop is skipped instead of being
+; entered with a zero count (dec/jnz would otherwise wrap around).
+%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]		
+%if %1 == 32		; for luma
+	sar ebx, 04h 	; number of 16-pixel columns
+	jz near .top_bottom_done	; sar sets ZF on a zero result: nothing to expand
+.top_bottom_loops:
+	; top
+	movdqa xmm0, [esi]		; first line of picture pData
+	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 edi, ecx, xmm0, a
+	mov_line_16x4_sse2 edi, ecx, xmm0, a
+	mov_line_16x4_sse2 edi, ecx, xmm0, a
+	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 edi, ecx, xmm0, a
+	mov_line_16x4_sse2 edi, ecx, xmm0, a
+	mov_line_end16x4_sse2 edi, ecx, xmm0, a
+	
+	; bottom
+	movdqa xmm1, [eax] 		; last line of picture pData
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a
+	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
+		
+	lea esi, [esi+16]		; top pSrc
+	lea edi, [edi+16]		; top dst
+	lea eax, [eax+16]		; bottom pSrc
+	lea ebp, [ebp+16]		; bottom dst
+	neg ecx 			; next column walks the rows in the opposite direction
+	
+	dec ebx
+	jnz near .top_bottom_loops		
+.top_bottom_done:
+%elif %1 == 16	; for chroma
+	mov edx, ebx	; keep the full width for the 8-pixel remainder test
+	sar ebx, 04h 	; number of 16-pixel columns
+	jz near .top_bottom_tail	; narrower than 16 pixels: only the remainder is left
+.top_bottom_loops:
+	; top
+	movdqa xmm0, [esi]		; first line of picture pData
+	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 edi, ecx, xmm0, a
+	mov_line_16x4_sse2 edi, ecx, xmm0, a
+	mov_line_end16x4_sse2 edi, ecx, xmm0, a	
+	
+	; bottom
+	movdqa xmm1, [eax] 		; last line of picture pData
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a
+	mov_line_16x4_sse2 ebp, ecx, xmm1, a
+	mov_line_end16x4_sse2 ebp, ecx, xmm1, a	
+		
+	lea esi, [esi+16]		; top pSrc
+	lea edi, [edi+16]		; top dst
+	lea eax, [eax+16]		; bottom pSrc
+	lea ebp, [ebp+16]		; bottom dst
+	neg ecx 			; next column walks the rows in the opposite direction
+	
+	dec ebx
+	jnz near .top_bottom_loops
+
+.top_bottom_tail:
+	; for remaining 8 bytes
+	and edx, 0fh		; width not a multiple of 16? ('and' already sets ZF)
+	jz near .to_be_continued	; no left to exit here
+
+	; top
+	movq mm0, [esi]		; remained 8 byte
+	mov_line_8x4_mmx edi, ecx, mm0	; dst, stride, mm?
+	mov_line_8x4_mmx edi, ecx, mm0	; dst, stride, mm?
+	mov_line_8x4_mmx edi, ecx, mm0	; dst, stride, mm?
+	mov_line_end8x4_mmx edi, ecx, mm0	; dst, stride, mm?
+	; bottom
+	movq mm1, [eax]
+	mov_line_8x4_mmx ebp, ecx, mm1	; dst, stride, mm?
+	mov_line_8x4_mmx ebp, ecx, mm1	; dst, stride, mm?
+	mov_line_8x4_mmx ebp, ecx, mm1	; dst, stride, mm?
+	mov_line_end8x4_mmx ebp, ecx, mm1	; dst, stride, mm?
+	WELSEMMS
+
+.to_be_continued:
+%endif
+%endmacro
+
+; Replicate the first and the last pixel of every picture row into the left and
+; right padding.
+;   %1 = padding size: 32 (luma, 32 bytes per side) or 16 (chroma, 16 bytes per
+;        side); any other value expands to nothing.
+;   %2 = 'a' or 'u': aligned / unaligned store for the chroma right border only
+;        (luma always stores with movdqa).
+; Register contract on entry (from the original notes):
+;   ecx = number of rows (must be non-zero: the loop body runs before the test)
+;   esi = left source pixel,  edi = left padding destination
+;   ebx = right source pixel, ebp = right padding destination
+;   edx = stride
+; al is loaded with the pixel and handed to butterfly_1to16_sse (defined outside
+; this section), which is expected to replicate it into all 16 bytes.
+%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a	
+%if %1 == 32 || %1 == 16
+.left_right_loops:
+	; left border
+	mov al, byte [esi]
+	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
+	movdqa [edi], xmm0
+%if %1 == 32
+	movdqa [edi+16], xmm0		; luma: second half of the 32-byte border
+%endif
+
+	; right border
+	mov al, byte [ebx]
+	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
+%if %1 == 32
+	movdqa [ebp], xmm1
+	movdqa [ebp+16], xmm1
+%else
+	movdq%2 [ebp], xmm1		; chroma: might not be 16-byte aligned
+%endif
+
+	; step all four pointers down one row
+	lea esi, [esi+edx]
+	lea edi, [edi+edx]
+	lea ebx, [ebx+edx]
+	lea ebp, [ebp+edx]
+
+	dec ecx
+	jnz near .left_right_loops
+%endif
+%endmacro
+
+%macro exp_cross_sse2	2	; %1: iPaddingSize [luma(32)/chroma(16)], %2: u/a suffix forwarded for the TR/BR corners (chroma branch only)
+	; fill values - top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
+	; dst pointers - edi: TL, ebp: TR, eax: BL, ebx: BR; ecx: -stride (handed to the helpers as their stride argument)
+%if %1 == 32		; luma
+	; TL (helper macros are defined outside this view; presumably each call fills 4 rows of 32 bytes, 8 calls = 32 rows)
+	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
+	mov_line_end32x4_sse2	edi, ecx, xmm3	; dst, stride, xmm?
+
+	; TR
+	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
+	mov_line_end32x4_sse2	ebp, ecx, xmm4	; dst, stride, xmm?
+
+	; BL
+	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
+	mov_line_end32x4_sse2	eax, ecx, xmm5	; dst, stride, xmm?
+
+	; BR
+	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
+	mov_line_end32x4_sse2	ebx, ecx, xmm6	; dst, stride, xmm?
+%elif %1 == 16	; chroma
+	; TL (the left corners always pass 'a'; presumably 4 calls x 4 rows of 16 bytes = 16 rows)
+	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
+	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
+	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?	
+	mov_line_end16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
+
+	; TR (the right corners pass %2: callers point ebp/ebx at pDst+width, which may not be 16-byte aligned for chroma)
+	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
+	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
+	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?	
+	mov_line_end16x4_sse2 ebp, ecx, xmm4, %2	; dst, stride, xmm?
+
+	; BL
+	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
+	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
+	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?	
+	mov_line_end16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
+
+	; BR
+	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
+	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
+	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?	
+	mov_line_end16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
+%endif
+%endmacro
+
+ALIGN 16
+;***********************************************************************----------------
+; void ExpandPictureLuma_sse2(	uint8_t *pDst,
+;									const int32_t iStride,
+;									const int32_t iWidth,
+;									const int32_t iHeight	);
+;***********************************************************************----------------
+ExpandPictureLuma_sse2:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; 5 pushes (20 bytes) + return address: pDst/iStride/iWidth/iHeight are now at [esp+24]/[esp+28]/[esp+32]/[esp+36]
+	; for both top and bottom border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; p_dst
+	mov edx, [esp+28]						; stride
+	mov ebx, [esp+32]						; width
+	mov eax, [esp+36]						; height
+	; also prepare for cross border pData top-left: xmm3
+;	xor ecx, ecx
+	mov cl, byte [esi]						; top-left pixel of the picture
+	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	; load top border
+	mov ecx, edx							; stride	
+	neg ecx 								; -stride
+	lea edi, [esi+ecx]						; last line of top border
+	; load bottom border 
+	dec eax									; h-1
+	imul eax, edx 							; (h-1)*stride
+	lea eax, [esi+eax]						; last line of picture pData
+	sal edx, 05h							; 32*stride
+	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 32 * stride
+	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
+	dec ebx									; width-1
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+;	xor edx, edx
+	mov dl, byte [eax]						; bottom-left
+	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	mov dl, byte [ebx]						; bottom-right
+	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	; for top & bottom expanding	
+	mov ebx, [esp+32]						; width
+	exp_top_bottom_sse2	32	
+	
+	; for both left and right border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; p_dst: left border pSrc
+	mov edx, [esp+28]						; stride
+	mov ebx, [esp+32]						; width
+	mov ecx, [esp+36]						; height: row counter of the left/right loop
+	; load left border
+	mov eax, -32 							; luma=-32, chroma=-16
+	lea edi, [esi+eax]						; left border dst
+	dec ebx									; width-1
+	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
+	lea ebp, [ebx+1]						; right border dst	
+	; prepare for cross border pData: top-right with xmm4
+;	xor eax, eax
+	mov al, byte [ebx]						; top-right
+	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	; for left & right border expanding
+	exp_left_right_sse2	32, a
+	
+	; for cross border [top-left, top-right, bottom-left, bottom-right]
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; p_dst
+	mov ecx, [esp+28]						; stride
+	mov ebx, [esp+32]						; width
+	mov edx, [esp+36]						; height
+	; xmm3..xmm6 already hold the four corner fill values (set up above); compute the corner dst pointers and pad
+	mov eax, -32							; luma=-32, chroma=-16
+	neg ecx										; -stride
+	lea edi, [esi+eax]						; p_dst - 32
+	lea edi, [edi+ecx]				; last line of top-left border
+	lea ebp, [esi+ebx]				; p_dst + width
+	lea ebp, [ebp+ecx]				; last line of top-right border
+	add edx, 32								; height+32(16), luma=32, chroma=16
+	mov ecx, [esp+28]					; stride
+	imul edx, ecx							; (height+32(16)) * stride
+	lea eax, [edi+edx]						; last line of bottom-left border
+	lea ebx, [ebp+edx]						; last line of bottom-right border	
+	neg ecx										; -stride
+	; for cross (corner) border expanding
+	exp_cross_sse2		32, a	
+	
+;	sfence									; commit cache write back memory
+	
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	
+	ret
+
+ALIGN 16
+;***********************************************************************----------------
+; void ExpandPictureChromaAlign_sse2(	uint8_t *pDst,
+;										const int32_t iStride,
+;										const int32_t iWidth,
+;										const int32_t iHeight	);
+;***********************************************************************----------------
+ExpandPictureChromaAlign_sse2:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; 5 pushes (20 bytes) + return address: pDst/iStride/iWidth/iHeight are now at [esp+24]/[esp+28]/[esp+32]/[esp+36]
+	; for both top and bottom border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; p_dst
+	mov edx, [esp+28]						; stride
+	mov ebx, [esp+32]						; width
+	mov eax, [esp+36]						; height
+	; also prepare for cross border pData top-left: xmm3
+;	xor ecx, ecx
+	mov cl, byte [esi]						; top-left pixel of the picture
+	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	; load top border
+	mov ecx, edx							; stride	
+	neg ecx 								; -stride
+	lea edi, [esi+ecx]						; last line of top border
+	; load bottom border 
+	dec eax									; h-1
+	imul eax, edx 							; (h-1)*stride
+	lea eax, [esi+eax]						; last line of picture pData
+	sal edx, 04h							; 16*stride
+	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 16 * stride
+	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
+	dec ebx									; width-1
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+;	xor edx, edx
+	mov dl, byte [eax]						; bottom-left
+	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	mov dl, byte [ebx]						; bottom-right
+	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	; for top & bottom expanding	
+	mov ebx, [esp+32]						; width
+	exp_top_bottom_sse2	16	
+	
+	; for both left and right border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; p_dst: left border pSrc
+	mov edx, [esp+28]						; stride
+	mov ebx, [esp+32]						; width
+	mov ecx, [esp+36]						; height: row counter of the left/right loop
+	; load left border
+	mov eax, -16 							; luma=-32, chroma=-16
+	lea edi, [esi+eax]						; left border dst
+	dec ebx									; width-1
+	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
+	lea ebp, [ebx+1]						; right border dst	
+	; prepare for cross border pData: top-right with xmm4
+;	xor eax, eax
+	mov al, byte [ebx]						; top-right
+	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	; for left & right border expanding
+	exp_left_right_sse2	16, a				; 'a': right-border stores use movdqa
+	
+	; for cross border [top-left, top-right, bottom-left, bottom-right]
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; p_dst
+	mov ecx, [esp+28]						; stride
+	mov ebx, [esp+32]						; width
+	mov edx, [esp+36]						; height
+	; xmm3..xmm6 already hold the four corner fill values (set up above); compute the corner dst pointers and pad
+	mov eax, -16							; chroma=-16
+	neg ecx										; -stride
+	lea edi, [esi+eax]						; p_dst - 16
+	lea edi, [edi+ecx]				; last line of top-left border
+	lea ebp, [esi+ebx]				; p_dst + width
+	lea ebp, [ebp+ecx]				; last line of top-right border
+	mov ecx, [esp+28]						; stride
+	add edx, 16							; height+16, luma=32, chroma=16
+	imul edx, ecx							; (height+16) * stride
+	lea eax, [edi+edx]						; last line of bottom-left border
+	lea ebx, [ebp+edx]						; last line of bottom-right border
+	neg ecx										; -stride
+	; for cross (corner) border expanding
+	exp_cross_sse2		16, a
+	
+;	sfence									; commit cache write back memory
+	
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	
+	ret
+
+ALIGN 16
+;***********************************************************************----------------
+; void ExpandPictureChromaUnalign_sse2(	uint8_t *pDst,
+;										const int32_t iStride,
+;										const int32_t iWidth,
+;										const int32_t iHeight	);
+;***********************************************************************----------------
+ExpandPictureChromaUnalign_sse2:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; 5 pushes (20 bytes) + return address: pDst/iStride/iWidth/iHeight are now at [esp+24]/[esp+28]/[esp+32]/[esp+36]
+	; for both top and bottom border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; p_dst
+	mov edx, [esp+28]						; stride
+	mov ebx, [esp+32]						; width
+	mov eax, [esp+36]						; height
+	; also prepare for cross border pData top-left: xmm3
+;	xor ecx, ecx
+	mov cl, byte [esi]						; top-left pixel of the picture
+	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	; load top border
+	mov ecx, edx							; stride	
+	neg ecx 								; -stride
+	lea edi, [esi+ecx]						; last line of top border
+	; load bottom border 
+	dec eax									; h-1
+	imul eax, edx 							; (h-1)*stride
+	lea eax, [esi+eax]						; last line of picture pData
+	sal edx, 04h							; 16*stride
+	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 16 * stride
+	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
+	dec ebx									; width-1
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+;	xor edx, edx
+	mov dl, byte [eax]						; bottom-left
+	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	mov dl, byte [ebx]						; bottom-right
+	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	; for top & bottom expanding	
+	mov ebx, [esp+32]						; width
+	exp_top_bottom_sse2	16	
+	
+	; for both left and right border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; p_dst: left border pSrc
+	mov edx, [esp+28]						; stride
+	mov ebx, [esp+32]						; width
+	mov ecx, [esp+36]						; height: row counter of the left/right loop
+	; load left border
+	mov eax, -16 							; luma=-32, chroma=-16
+	lea edi, [esi+eax]						; left border dst
+	dec ebx									; width-1
+	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
+	lea ebp, [ebx+1]						; right border dst	
+	; prepare for cross border pData: top-right with xmm4
+;	xor eax, eax
+	mov al, byte [ebx]						; top-right
+	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	; for left & right border expanding
+	exp_left_right_sse2	16, u				; 'u': right-border stores use movdqu
+	
+	; for cross border [top-left, top-right, bottom-left, bottom-right]
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	mov esi, [esp+24]						; p_dst
+	mov ecx, [esp+28]						; stride
+	mov ebx, [esp+32]						; width
+	mov edx, [esp+36]						; height
+	; xmm3..xmm6 already hold the four corner fill values (set up above); compute the corner dst pointers and pad
+	neg ecx									; -stride
+	mov eax, -16							; chroma=-16
+	lea edi, [esi+eax]						; p_dst - 16
+	lea edi, [edi+ecx]				; last line of top-left border
+	lea ebp, [esi+ebx]						; p_dst + width
+	lea ebp, [ebp+ecx]				; last line of top-right border
+	mov ecx, [esp+28]						; stride
+	add edx, 16							; height+16, luma=32, chroma=16
+	imul edx, ecx							; (height+16) * stride
+	lea eax, [edi+edx]						; last line of bottom-left border
+	lea ebx, [ebp+edx]						; last line of bottom-right border
+	neg ecx									; -stride
+	; for cross (corner) border expanding
+	exp_cross_sse2		16, u
+	
+;	sfence									; commit cache write back memory
+	
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	
+	ret
+
--- /dev/null
+++ b/codec/encoder/core/asm/intra_pred.asm
@@ -1,0 +1,1473 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  intra_pred.asm
+;*
+;*  Abstract
+;*      sse2 function for intra predict operations
+;*
+;*  History
+;*      18/09/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+BITS 32
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata data	; COFF output takes a section-type qualifier; "data" declares an initialised data section
+%else
+SECTION .rodata align=16
+%endif
+
+align 16
+sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0	; 16x16 plane: multiplied by b for output columns 0..7
+align 16
+sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8	; 16x16 plane: gradient weights; also multiplied by b for output columns 8..15
+align 16
+sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1	; 16x16 plane: gradient weights of the mirrored samples
+
+; for chroma plane mode (4-word tables, read with movq)
+sse2_plane_inc_c dw 1, 2, 3, 4
+sse2_plane_dec_c dw 4, 3, 2, 1
+align 16
+sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4	; chroma plane: multiplied by b for output columns 0..7
+
+align 16
+mmx_01bytes:		times 16	db 1	; 0x01 in every byte: byte-broadcast multiplier and low-bit mask
+;align 16
+;sse_0x0004bytes:	times 8		dw 4
+;ALIGN 16
+;sse_f000 db  255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+
+align 16
+mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
+
+
+;***********************************************************************
+; macros
+;***********************************************************************
+;Builds the constant db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 in a register without reading memory
+;%1 receives the result; %2 is a scratch register and is left with all bits set
+%macro SSE_DB_1_2REG 2
+      pxor %1, %1			; %1 = 0
+      pcmpeqw %2, %2		; %2 = all ones, i.e. -1 in every byte
+      psubb %1, %2			; %1 = 0 - (-1) = 1 in every byte
+%endmacro
+
+;typical arguments: xmm0, xmm1, xmm2, eax, ecx  (%1 = result, %2/%3 = temporaries, %4 = row pointer, %5 = stride)
+;lower 64 bits of %1 receive the result: 4 copies of byte [%4-1] followed by 4 copies of byte [%4+%5-1]
+%macro SSE2_PRED_H_4X4_TWO_LINE 5
+    movd		%1,	[%4-1]		; low byte = pixel left of the first row
+	movdqa		%3,	%1
+	punpcklbw	%1,	%3			; every byte doubled
+	movdqa		%3,	%1
+	punpcklbw	%1,	%3			; every byte quadrupled: low dword = 4 copies of the left pixel
+	
+	;add			%4,	%5
+	movd		%2,	[%4+%5-1]	; same for the second row
+	movdqa		%3,	%2
+	punpcklbw	%2,	%3
+	movdqa		%3,	%2
+	punpcklbw	%2,	%3	
+	punpckldq	%1,	%2			; low qword = [row0 x4, row1 x4]
+%endmacro
+
+%macro  SUMW_HORIZON1 2		; %1: 8 words in, low word = their sum (unsigned saturating adds) out; %2: scratch
+	movdqa      %2, %1
+	psrldq      %2, 8		; bring words 4..7 down onto words 0..3
+	paddusw     %1, %2
+	movdqa      %2, %1
+	psrldq      %2, 4		; bring words 2..3 down onto words 0..1
+	paddusw     %1, %2
+	movdqa      %2, %1
+	psrldq      %2, 2		; bring word 1 down onto word 0
+	paddusw     %1, %2
+%endmacro
+
+%macro	LOAD_COLUMN 6		; %1: result, %2-%4: temporaries, %5: pointer register (advanced by 8*%6), %6: stride register
+		movd	%1,	[%5]		; 4 bytes of row 0
+		movd	%2,	[%5+%6]		; 4 bytes of row 1
+		punpcklbw %1,	%2		; rows 0/1 interleaved byte by byte
+		lea		%5,	[%5+2*%6]
+		movd	%3,	[%5]		; row 2
+		movd	%2,	[%5+%6]		; row 3
+		punpcklbw %3,	%2
+		punpcklwd %1,	%3		; top dword of %1 = byte 3 of rows 0..3
+		lea		%5,	[%5+2*%6]	
+		movd	%4,	[%5]		; row 4
+		movd	%2,	[%5+%6]		; row 5
+		punpcklbw %4,	%2
+		lea		%5,	[%5+2*%6]	
+		movd	%3,	[%5]		; row 6
+		movd	%2,	[%5+%6]		; row 7
+		lea		%5,	[%5+2*%6]
+		punpcklbw %3,	%2
+		punpcklwd %4,	%3		; top dword of %4 = byte 3 of rows 4..7
+		punpckhdq %1,	%4		; high 8 bytes of %1 = byte 3 of rows 0..7, in row order
+%endmacro	
+
+%macro  SUMW_HORIZON 3			; %1: 8 words in, low dword = horizontal sum out; %2: scratch; %3: supplies the upper word of each widened dword
+	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
+	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04  (each word paired with a word of %3)
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26 
+	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
+	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
+	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
+%endmacro
+
+
+%macro  COPY_16_TIMES 2			; %1: pointer register, %2: xmm result = byte [%1-1] replicated 16 times
+		movdqa		%2,	[%1-16]	; aligned load: %1-16 must be 16-byte aligned
+		psrldq		%2,	15		; keep only the last byte, i.e. [%1-1]
+		pmuludq		%2,	[mmx_01bytes]	; x * 0x01010101: byte replicated across the low dword
+		pshufd		%2,	%2, 0	; low dword replicated to all four dwords
+%endmacro
+
+%macro  COPY_16_TIMESS 3		; same as COPY_16_TIMES but for address %1+%3: %2 = byte [%1+%3-1] replicated 16 times
+		movdqa		%2,	[%1+%3-16]	; aligned load: %1+%3-16 must be 16-byte aligned
+		psrldq		%2,	15		; keep only the last byte, i.e. [%1+%3-1]
+		pmuludq		%2,	[mmx_01bytes]	; x * 0x01010101: byte replicated across the low dword
+		pshufd		%2,	%2, 0	; low dword replicated to all four dwords
+%endmacro
+
+%macro	LOAD_COLUMN_C 6		; %1: result, %2/%3: temporaries, %4: unused here, %5: pointer register (advanced by 4*%6), %6: stride register
+		movd	%1,	[%5]		; 4 bytes of row 0
+		movd	%2,	[%5+%6]		; 4 bytes of row 1
+		punpcklbw %1,%2			; rows 0/1 interleaved byte by byte
+		lea		%5,	[%5+2*%6]
+		movd	%3,	[%5]		; row 2
+		movd	%2,	[%5+%6]		; row 3
+		punpcklbw %3,	%2
+		punpckhwd %1,	%3		; with 64-bit mm registers (as used in this file): high dword = byte 3 of rows 0..3
+		lea		%5,	[%5+2*%6]			
+%endmacro
+
+%macro LOAD_2_LEFT_AND_ADD 0		; uses fixed registers: eax = row pointer, ecx = stride, ebx = accumulator, edx = scratch
+        lea         eax, [eax+2*ecx]			; advance two rows
+        movzx		edx, byte [eax-0x01]		; pixel left of the first of the two rows
+        add			ebx, edx
+        movzx		edx, byte [eax+ecx-0x01]	; pixel left of the second row
+        add			ebx, edx
+%endmacro
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+WELS_EXTERN WelsI4x4LumaPredH_sse2
+WELS_EXTERN WelsI4x4LumaPredDDR_mmx
+WELS_EXTERN WelsI4x4LumaPredDc_sse2
+WELS_EXTERN WelsI16x16LumaPredPlane_sse2
+
+ALIGN 16
+;***********************************************************************
+;   void __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;   Horizontal prediction: row r of the 4x4 output (4 bytes per row) is filled with pRef[r*stride-1]
+;	pred must align to 16
+;***********************************************************************
+WelsI4x4LumaPredH_sse2:
+	mov			eax,	[esp+8]			;pRef
+	mov			ecx,	[esp+12]		;stride
+
+	movzx		edx,	byte [eax-1]	;left pixel of row 0
+	movd		xmm0,	edx
+	pmuludq		xmm0,	[mmx_01bytes]	;x * 0x01010101: pixel replicated in the low dword
+	
+	movzx		edx,	byte [eax+ecx-1]	;left pixel of row 1
+	movd		xmm1,	edx
+	pmuludq		xmm1,	[mmx_01bytes]
+	
+	unpcklps	xmm0,	xmm1			;low qword of xmm0 = [row0, row1]
+
+	lea			eax,	[eax+ecx*2]
+	movzx		edx,	byte [eax-1]	;left pixel of row 2
+	movd		xmm2,	edx
+	pmuludq		xmm2,	[mmx_01bytes]
+	
+	movzx		edx,	byte [eax+ecx-1]	;left pixel of row 3
+	movd		xmm3,	edx	
+	pmuludq		xmm3,	[mmx_01bytes]
+	
+	unpcklps	xmm2,	xmm3			;low qword of xmm2 = [row2, row3]
+	unpcklpd	xmm0,	xmm2			;xmm0 = [row0, row1, row2, row3]
+	
+	mov			edx,	[esp+4]			;pred
+	movdqa		[edx],	xmm0			;aligned 16-byte store of the whole block
+	
+	ret
+	
+;***********************************************************************
+; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WelsI16x16LumaPredPlane_sse2:
+%define pushsize	4
+		push	esi
+		mov		esi,	[esp + pushsize + 8]	; pRef
+		mov		ecx,	[esp + pushsize + 12]	; stride
+		sub		esi,	1
+		sub		esi,	ecx						; esi = pRef - stride - 1 (top-left neighbour)
+		
+		;for H
+		pxor	xmm7,	xmm7	
+		movq	xmm0,	[esi]					; top[-1..6]
+		movdqa	xmm5,	[sse2_plane_dec]
+		punpcklbw xmm0,	xmm7
+		pmullw	xmm0,	xmm5					; weights 8..1
+		movq	xmm1,	[esi + 9]				; top[8..15]
+		movdqa	xmm6,	[sse2_plane_inc]
+		punpcklbw xmm1,	xmm7
+		pmullw	xmm1,	xmm6					; weights 1..8
+		psubw	xmm1,	xmm0
+		
+		SUMW_HORIZON	xmm1,xmm0,xmm2			; xmm2 is not zeroed; only the low 16 bits of the sum are used below and they do not depend on it
+		movd    eax,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
+		movsx	eax,	ax
+		imul	eax,	5
+		add		eax,	32
+		sar		eax,	6			; b = (5 * H + 32) >> 6;
+		SSE2_Copy8Times	xmm1, eax	; xmm1 = b,b,b,b,b,b,b,b
+		
+		movzx	edx,	BYTE [esi+16]			; top[15]
+		sub	esi, 3								; movd loads 4 bytes; the left-neighbour pixel is the last of them
+		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, esi, ecx	; left column, rows -1..6; esi advances 8 rows
+			
+		add		esi,	3
+		movzx	eax,	BYTE [esi+8*ecx]		; left[15]
+		add		edx,	eax
+		shl		edx,	4			;	a = (left[15*stride] + top[15]) << 4;
+		
+		sub	esi, 3
+		add		esi,	ecx						; skip row 7
+		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, esi, ecx	; left column, rows 8..15
+		pxor	xmm4,	xmm4	
+		punpckhbw xmm0,	xmm4					; widen the 8 column bytes held in the high half
+		pmullw	xmm0,	xmm5					; weights 8..1
+		punpckhbw xmm7,	xmm4
+		pmullw	xmm7,	xmm6					; weights 1..8
+		psubw	xmm7,	xmm0
+		
+		SUMW_HORIZON   xmm7,xmm0,xmm2			; as above: only the low 16 bits of the result are consumed
+		movd    eax,   xmm7			; V
+		movsx	eax,	ax
+
+		imul	eax,	5
+		add		eax,	32
+		sar		eax,	6				; c = (5 * V + 32) >> 6;
+		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c		
+		
+		mov		esi,	[esp + pushsize + 4]	; pred
+		add		edx,	16
+		imul	eax,	-7
+		add		edx,	eax				; s = a + 16 + (-7)*c		
+		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s		
+		
+		xor		eax,	eax						; row counter
+		movdqa	xmm5,	[sse2_plane_inc_minus]
+		
+get_i16x16_luma_pred_plane_sse2_1:
+		movdqa	xmm2,	xmm1
+		pmullw	xmm2,	xmm5					; b * (-7..0): columns 0..7
+		paddw	xmm2,	xmm0
+		psraw	xmm2,	5
+		movdqa	xmm3,	xmm1
+		pmullw	xmm3,	xmm6					; b * (1..8): columns 8..15
+		paddw	xmm3,	xmm0
+		psraw	xmm3,	5	
+		packuswb xmm2,	xmm3					; clip to 0..255 and pack the 16 pixels of the row
+		movdqa	[esi],	xmm2					; aligned store: pred rows are 16 bytes apart
+		paddw	xmm0,	xmm4					; s += c for the next row
+		add		esi,	16
+		inc		eax
+		cmp		eax,	16
+		jnz get_i16x16_luma_pred_plane_sse2_1					
+		
+		pop		esi
+		ret
+		
+		
+		
+;***********************************************************************
+; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+
+%macro SSE2_PRED_H_16X16_TWO_LINE 1	; %1: byte offset into pred (edx) of the first of the two rows; uses eax = row pointer, ecx = stride
+    lea     eax,	[eax+ecx*2]				; advance two source rows
+    
+    COPY_16_TIMES	eax,	xmm0			; xmm0 = 16 copies of [eax-1]
+    movdqa			[edx+%1],	xmm0
+   COPY_16_TIMESS eax,	xmm0,	ecx			; xmm0 = 16 copies of [eax+ecx-1]
+   ; COPY_16_TIMES	eax + ecx,	xmm0
+    movdqa  [edx+%1+0x10],	xmm0
+%endmacro
+
+WELS_EXTERN WelsI16x16LumaPredH_sse2
+WelsI16x16LumaPredH_sse2:
+    mov     edx, [esp+4]    ; pred
+    mov     eax, [esp+8]	; pRef
+    mov     ecx, [esp+12]   ; stride
+    ; rows 0 and 1 are done inline; each macro call below advances eax by two rows and fills two more
+    COPY_16_TIMES eax,	xmm0				; xmm0 = 16 copies of the pixel left of row 0
+    movdqa  [edx],		xmm0
+    COPY_16_TIMESS eax,	xmm0,	ecx			; xmm0 = 16 copies of the pixel left of row 1
+    movdqa  [edx+0x10],	xmm0
+    
+	SSE2_PRED_H_16X16_TWO_LINE   0x20 
+	SSE2_PRED_H_16X16_TWO_LINE   0x40
+	SSE2_PRED_H_16X16_TWO_LINE   0x60
+	SSE2_PRED_H_16X16_TWO_LINE   0x80
+	SSE2_PRED_H_16X16_TWO_LINE   0xa0
+	SSE2_PRED_H_16X16_TWO_LINE   0xc0
+	SSE2_PRED_H_16X16_TWO_LINE   0xe0
+   
+    ret
+    
+;***********************************************************************
+; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredV_sse2
+WelsI16x16LumaPredV_sse2:
+    ; Vertical 16x16 prediction.
+    ; The 16 pixels of the row directly above the block (pRef - stride)
+    ; are copied into each of the 16 rows of pred; pred rows are 16 bytes
+    ; apart.  The source row and pred are both accessed with movdqa, so
+    ; both addresses have to be 16-byte aligned.
+    ; Modifies eax, ecx, edx and xmm0.
+    mov     eax, [esp+8]            ; pRef
+    mov     ecx, [esp+12]           ; stride
+    mov     edx, [esp+4]            ; pred
+
+    sub     eax, ecx                ; eax = pRef - stride
+    movdqa  xmm0, [eax]             ; xmm0 = the 16 pixels above the block
+
+    ; Emit the 16 row stores at assembly time instead of writing them
+    ; out by hand: pred offsets 0, 16, 32, ..., 240, in ascending order.
+%assign wels_i16x16_pred_v_offset 0
+%rep 16
+    movdqa  [edx+wels_i16x16_pred_v_offset], xmm0
+%assign wels_i16x16_pred_v_offset wels_i16x16_pred_v_offset+16
+%endrep
+
+    ; No stack space is used and no callee-saved register is touched,
+    ; so there is nothing to restore before returning.
+
+    ret
+    
+;***********************************************************************
+; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WELS_EXTERN WelsIChromaPredPlane_sse2
+WelsIChromaPredPlane_sse2:
+%define pushsize	4
+		push	esi
+		mov		esi,	[esp + pushsize + 8]	;pRef
+		mov		ecx,	[esp + pushsize + 12]	;stride
+		sub		esi,	1
+		sub		esi,	ecx						; esi = pRef - stride - 1 (top-left neighbour)
+		
+		pxor	mm7,	mm7	
+		movq	mm0,	[esi]					; top[-1..6]
+		movq	mm5,	[sse2_plane_dec_c]
+		punpcklbw mm0,	mm7						; top[-1..2] widened to words
+		pmullw	mm0,	mm5						; weights 4..1
+		movq	mm1,	[esi + 5]				; top[4..11], only the first four are used
+		movq	mm6,	[sse2_plane_inc_c]
+		punpcklbw mm1,	mm7						; top[4..7] widened to words
+		pmullw	mm1,	mm6						; weights 1..4
+		psubw	mm1,	mm0
+		
+		movq2dq xmm1,   mm1
+		pxor    xmm2,   xmm2
+		SUMW_HORIZON	xmm1,xmm0,xmm2			; H in the low 16 bits
+		movd    eax,	xmm1
+		movsx	eax,	ax
+		imul	eax,	17
+		add		eax,	16
+		sar		eax,	5			; b = (17 * H + 16) >> 5;
+		SSE2_Copy8Times	xmm1, eax	; xmm1 = b,b,b,b,b,b,b,b
+		
+		movzx	edx,	BYTE [esi+8]			; top[7]
+		sub	esi, 3								; movd loads 4 bytes; the left-neighbour pixel is the last of them
+		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, esi, ecx	; left column, rows -1..2; esi advances 4 rows
+
+		add		esi,	3
+		movzx	eax,	BYTE [esi+4*ecx]		; left[7]
+		add		edx,	eax
+		shl		edx,	4			; a = (left[7*stride] + top[7]) << 4;
+		
+		sub	esi, 3
+		add		esi,	ecx						; skip row 3
+		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, esi, ecx	; left column, rows 4..7
+		pxor	mm4,	mm4	
+		punpckhbw mm0,	mm4						; widen the 4 column bytes held in the high half
+		pmullw	mm0,	mm5						; weights 4..1
+		punpckhbw mm7,	mm4
+		pmullw	mm7,	mm6						; weights 1..4
+		psubw	mm7,	mm0
+		
+		movq2dq xmm7,   mm7
+		pxor    xmm2,   xmm2
+		SUMW_HORIZON	xmm7,xmm0,xmm2
+		movd    eax,    xmm7			; V
+		movsx	eax,	ax
+
+		imul	eax,	17
+		add		eax,	16
+		sar		eax,	5				; c = (17 * V + 16) >> 5;
+		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c		
+		
+		mov		esi,	[esp + pushsize + 4]	; pred
+		add		edx,	16
+		imul	eax,	-3
+		add		edx,	eax				; s = a + 16 + (-3)*c		
+		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s		
+		
+		xor		eax,	eax						; row counter
+		movdqa	xmm5,	[sse2_plane_mul_b_c]
+		
+get_i_chroma_pred_plane_sse2_1:
+		movdqa	xmm2,	xmm1
+		pmullw	xmm2,	xmm5					; b * (-3..4): columns 0..7
+		paddw	xmm2,	xmm0
+		psraw	xmm2,	5
+		packuswb xmm2,	xmm2					; clip to 0..255
+		movq	[esi],	xmm2					; one 8-pixel row; pred rows are 8 bytes apart
+		paddw	xmm0,	xmm4					; s += c for the next row
+		add		esi,	8
+		inc		eax
+		cmp		eax,	8
+		jnz get_i_chroma_pred_plane_sse2_1					
+		
+		pop		esi
+		WELSEMMS
+		ret	
+		
+ALIGN 16
+;***********************************************************************
+;	0 |1 |2 |3 |4 |
+;	6 |7 |8 |9 |10|
+;	11|12|13|14|15|
+;	16|17|18|19|20|
+;	21|22|23|24|25|
+;	7 is the start pixel of current 4x4 block
+;	pred[7] = ([6]+[0]*2+[1]+2)/4
+;
+;   void __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;   (diagonal down-right prediction; the four 4-byte output rows are written 4 bytes apart)
+;***********************************************************************
+WelsI4x4LumaPredDDR_mmx:	
+	mov			edx,[esp+4]			;pred
+	mov         eax,[esp+8]			;pRef
+	mov			ecx,[esp+12]		;stride
+	
+	movq        mm1,[eax+ecx-8]		;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
+	movq        mm2,[eax-8]			;get value of 6 mm2[8] = 6
+	sub			eax, ecx			;mov eax to above line of current block(position of 1)
+	punpckhbw   mm2,[eax-8]			;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
+	movd        mm3,[eax]			;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+	punpckhwd   mm1,mm2				;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
+	psllq       mm3,18h				;mm3[5]=[1]
+	psrlq       mm1,28h				;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+	por         mm3,mm1				;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
+	movq        mm1,mm3				;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+	lea			eax,[eax+ecx*2-8h]		;set eax point to 12
+	movq        mm4,[eax+ecx]		;get value of 16, mm4[8]=[16]
+	psllq       mm3,8				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
+	psrlq       mm4,38h				;mm4[1]=[16]
+	por         mm3,mm4				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
+	movq        mm2,mm3				;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
+	movq        mm4,[eax+ecx*2]		;mm4[8]=[21]
+	psllq       mm3,8				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
+	psrlq       mm4,38h				;mm4[1]=[21]
+	por         mm3,mm4				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
+	movq        mm4,mm3				;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
+	pavgb       mm3,mm1				;mm3=([11]+[21]+1)/2
+	pxor        mm1,mm4				;find odd value in the lowest bit of each byte
+	pand        mm1,[mmx_01bytes]	;set the odd bit
+	psubusb     mm3,mm1				;decrease 1 from odd bytes
+	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2
+	
+	movd        [edx+12],mm2 		;row 3 (bottom row) = low 4 filtered bytes
+	psrlq       mm2,8 				;each row above is the same sequence shifted by one byte
+	movd        [edx+8],mm2 		;row 2
+	psrlq       mm2,8 
+	movd        [edx+4],mm2 		;row 1
+	psrlq       mm2,8 
+	movd        [edx],mm2			;row 0
+	WELSEMMS
+	ret
+	
+ALIGN 16
+;***********************************************************************
+;	0 |1 |2 |3 |4 |
+;	5 |6 |7 |8 |9 |
+;	10|11|12|13|14|
+;	15|16|17|18|19|
+;	20|21|22|23|24|
+;	6 is the start pixel of current 4x4 block
+;	pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
+;
+;   void __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;   (all 16 output bytes receive the same DC value; written with one aligned 16-byte store)
+;***********************************************************************
+WelsI4x4LumaPredDc_sse2:	
+	mov         eax,[esp+8]			;pRef
+	mov			ecx,[esp+12]		;stride
+	push		ebx					;from here on the stack arguments are 4 bytes further away
+		
+	movzx		edx,	byte [eax-1h]		;left pixel of row 0
+	
+	sub			eax,	ecx					;eax = row above the block
+	movd		xmm0,	[eax]				;the 4 top pixels
+	pxor		xmm1,	xmm1
+	psadbw		xmm0,	xmm1				;sum of the 4 top pixels
+	
+	movd		ebx,	xmm0
+	add			ebx,	edx
+	
+	movzx		edx,	byte [eax+ecx*2-1h]	;left pixel of row 1
+	add			ebx,	edx
+	
+	lea			eax,	[eax+ecx*2-1]		;eax = address of the left pixel of row 1
+	movzx		edx,	byte [eax+ecx]		;left pixel of row 2
+	add			ebx,	edx
+	
+	movzx		edx,	byte [eax+ecx*2]	;left pixel of row 3
+	add			ebx,	edx
+	add			ebx,	4
+	sar			ebx,	3					;(sum of 8 neighbours + 4) / 8
+	imul		ebx,	0x01010101			;replicate the DC byte across the dword
+	
+	mov			edx,	[esp+8]			;pred (first argument: esp+4 before the push, esp+8 now)
+	movd		xmm0,	ebx
+	pshufd		xmm0,	xmm0,	0
+	movdqa		[edx],	xmm0
+				
+	pop ebx
+	ret	
+	
+ALIGN 16
+;***********************************************************************
+;	void __cdecl WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;   fill each of the 8 rows with 8 copies of the pixel to its left
+;***********************************************************************
+%macro MMX_PRED_H_8X8_ONE_LINE 4	; %1: mm work register, %2: unused, %3: row pointer register, %4: destination address expression
+	movq		%1,		[%3-8]			; 8 bytes ending at the pixel left of the row
+	psrlq		%1,		38h				; keep only the last byte, i.e. [%3-1]
+	
+	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
+	pmullw		%1,		[mmx_01bytes]	; x * 0x0101: pixel in both bytes of the low word
+	pshufw		%1,		%1,	0			; low word replicated to all four words
+	movq		[%4],	%1
+%endmacro
+
+%macro MMX_PRED_H_8X8_ONE_LINEE 4	; as MMX_PRED_H_8X8_ONE_LINE, but for the row one stride (ecx) below %3
+	movq		%1,		[%3+ecx-8]		; 8 bytes ending at the pixel left of that row
+	psrlq		%1,		38h				; keep only the last byte, i.e. [%3+ecx-1]
+	
+	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
+	pmullw		%1,		[mmx_01bytes]	; x * 0x0101: pixel in both bytes of the low word
+	pshufw		%1,		%1,	0			; low word replicated to all four words
+	movq		[%4],	%1
+%endmacro
+
+WELS_EXTERN WelsIChromaPredH_mmx
+WelsIChromaPredH_mmx:
+	mov			edx,	[esp+4]			;pred
+	mov         eax,	[esp+8]			;pRef
+	mov			ecx,	[esp+12]		;stride
+	; row 0 is done inline (same steps as MMX_PRED_H_8X8_ONE_LINE); pred rows are 8 bytes apart
+	movq		mm0,	[eax-8]
+	psrlq		mm0,	38h
+	
+	;pmuludq		mm0,	[mmx_01bytes]		;extend to 4 bytes
+	pmullw		mm0,		[mmx_01bytes]
+	pshufw		mm0,	mm0,	0
+	movq		[edx],	mm0
+	
+	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+8		; row 1
+	
+	lea			eax,[eax+ecx*2]
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax,edx+16		; row 2
+	
+	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+24	; row 3
+	
+	lea			eax,[eax+ecx*2]
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax,edx+32		; row 4
+	
+	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+40	; row 5
+	
+	lea			eax,[eax+ecx*2]
+	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax,edx+48		; row 6
+
+	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+56		; row 7
+	WELSEMMS
+	ret	
+	
+ALIGN 16
+;***********************************************************************
+;	void __cdecl WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;   copy the 4 pixels above the block into each of the 4 rows of pred (one aligned 16-byte store)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredV_sse2
+WelsI4x4LumaPredV_sse2:
+	mov			edx,	[esp+4]			;pred
+	mov         eax,	[esp+8]			;pRef
+	mov			ecx,	[esp+12]		;stride
+	
+	sub			eax,	ecx				;eax = row above the block
+	movd		xmm0,	[eax]			;the 4 top pixels
+	pshufd		xmm0,	xmm0,	0		;replicated to all four dwords, i.e. all four rows
+	movdqa		[edx],	xmm0
+	ret	
+
+ALIGN 16
+;***********************************************************************
+;	void __cdecl WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;   copy the 8 pixels above the block into each of the 8 rows of pred (pred rows are 8 bytes apart)
+;***********************************************************************
+WELS_EXTERN WelsIChromaPredV_sse2
+WelsIChromaPredV_sse2:
+	mov			edx,		[esp+4]			;pred
+	mov         eax,		[esp+8]			;pRef
+	mov			ecx,		[esp+12]		;stride
+	
+	sub			eax,		ecx				;eax = row above the block
+	movq		xmm0,		[eax]			;the 8 top pixels
+	movdqa		xmm1,		xmm0
+	punpcklqdq	xmm0,		xmm1			;xmm0 = the top row twice = two output rows
+
+	movdqa		[edx],		xmm0			;rows 0-1
+	movdqa		[edx+16],	xmm0			;rows 2-3
+	movdqa		[edx+32],	xmm0			;rows 4-5
+	movdqa		[edx+48],	xmm0			;rows 6-7
+	ret
+	
+	
+	ALIGN 16
+;***********************************************************************
+;	lt|t0|t1|t2|t3|
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	t3 is never used
+;   destination:
+;	|a |b |c |d |
+;	|e |f |a |b |
+;	|g |h |e |f |
+;	|i |j |g |h |
+
+;   a = (1 + lt + l0)>>1
+;   e = (1 + l0 + l1)>>1
+;   g = (1 + l1 + l2)>>1
+;   i = (1 + l2 + l3)>>1
+
+;   d = (2 + t0 + (t1<<1) + t2)>>2
+;   c = (2 + lt + (t0<<1) + t1)>>2
+;   b = (2 + l0 + (lt<<1) + t0)>>2
+
+;   f = (2 + l1 + (l0<<1) + lt)>>2
+;   h = (2 + l2 + (l1<<1) + l0)>>2
+;   j = (2 + l3 + (l2<<1) + l1)>>2   
+;   [b a f e h g j i] + [d c b a] --> mov to memory
+;   
+;   void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredHD_mmx
+WelsI4x4LumaPredHD_mmx:	
+	mov			edx, [esp+4]			; pred
+	mov         eax, [esp+8]			; pRef
+	mov			ecx, [esp+12]           ; stride
+	sub         eax, ecx
+	movd        mm0, [eax-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
+	
+	movd        mm1, [eax+2*ecx-4]        
+	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1	
+	lea         eax, [eax+2*ecx]
+	movd        mm2, [eax+2*ecx-4]        
+	punpcklbw   mm2, [eax+ecx-4]        ; mm2[7] = l2, mm2[6] = l3
+	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
+	psrlq       mm2, 20h
+	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
+	
+	movq        mm1, mm0
+	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
+	movq        mm2, mm0
+	psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
+	movq        mm3, mm2
+	movq        mm4, mm1
+	pavgb       mm1, mm0
+	
+	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
+	pand        mm4, [mmx_01bytes]	    ; set the odd bit
+	psubusb     mm1, mm4				; decrease 1 from odd bytes
+	
+	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
+	
+	movq        mm4, mm0
+	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
+	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
+	
+	psrlq       mm2, 20h
+	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
+	movq        mm4, mm3
+	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  j]
+	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
+	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
+	
+	movd        [edx], mm2
+	movd        [edx+12], mm3
+	psrlq       mm3, 10h
+	movd        [edx+8], mm3
+	psrlq       mm3, 10h
+	movd        [edx+4], mm3
+	WELSEMMS
+	ret
+	
+	
+	
+ALIGN 16
+;***********************************************************************
+;	lt|t0|t1|t2|t3|
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	only l0..l3 are read; lt and t0..t3 are never used
+;   destination:
+;	|a |b |c |d |
+;	|c |d |e |f |
+;	|e |f |g |g |
+;	|g |g |g |g |
+
+;   a = (1 + l0 + l1)>>1
+;   c = (1 + l1 + l2)>>1
+;   e = (1 + l2 + l3)>>1
+;   g = l3
+
+;   b = (2 + l0 + (l1<<1) + l2)>>2
+;   d = (2 + l1 + (l2<<1) + l3)>>2
+;   f = (2 + l2 + (l3<<1) + l3)>>2
+ 
+;   [g g f e d c b a] + [g g g g] --> mov to memory
+;   The 16 result bytes are written to pred as four consecutive 4-byte rows.
+;   void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredHU_mmx
+WelsI4x4LumaPredHU_mmx:	
+	mov			edx, [esp+4]			; pred
+	mov         eax, [esp+8]			; pRef
+	mov			ecx, [esp+12]           ; stride
+	
+	movd        mm0, [eax-4]            ; mm0[3] = l0
+	punpcklbw   mm0, [eax+ecx-4]        ; mm0[7] = l1, mm0[6] = l0
+	lea         eax, [eax+2*ecx]        ; eax = pRef + 2*stride
+	movd        mm2, [eax-4]            ; mm2[3] = l2
+	movd        mm4, [eax+ecx-4]        ; mm4[3] = l3
+	punpcklbw   mm2, mm4                ; mm2[7] = l3, mm2[6] = l2
+	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
+	
+	psrlq       mm4, 18h                ; isolate l3 in byte 0
+	psllq       mm4, 38h                ; mm4 = [l3 00 00 00 00 00 00 00]
+	psrlq       mm0, 8h                 ; mm0 = [00 l3 l2 l1 l0 xx xx xx]
+	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
+	
+	movq        mm1, mm0
+	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
+	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
+	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
+	
+	movq        mm2, mm0
+	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
+	movq        mm5, mm2
+	pavgb       mm2, mm0                ; rounded-up average of the two outer taps
+	; pavgb rounds up; subtracting (x ^ y) & 1 turns it into floor((x + y) / 2)
+	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
+	pand        mm5, [mmx_01bytes]	    ; set the odd bit (mmx_01bytes: presumably 0x01 in every byte)
+	psubusb     mm2, mm5				; decrease 1 from odd bytes
+	
+	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
+	
+	psrlq       mm2, 8h
+	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
+	
+	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
+	punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
+	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
+	
+	psrlq       mm4, 20h
+	movd        [edx+12], mm4           ; row 3 = g g g g
+	
+	movd        [edx], mm1              ; row 0 = a b c d
+	psrlq       mm1, 10h
+	movd        [edx+4], mm1            ; row 1 = c d e f
+	psrlq       mm1, 10h
+	movd        [edx+8], mm1            ; row 2 = e f g g
+	WELSEMMS
+	ret
+	
+	
+	
+ALIGN 16
+;***********************************************************************
+;	lt|t0|t1|t2|t3|
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	l3 is never used
+;   destination:
+;	|a |b |c |d |
+;	|e |f |g |h |
+;	|i |a |b |c |
+;	|j |e |f |g |
+
+;   a = (1 + lt + t0)>>1
+;   b = (1 + t0 + t1)>>1
+;   c = (1 + t1 + t2)>>1
+;   d = (1 + t2 + t3)>>1
+
+;   e = (2 + l0 + (lt<<1) + t0)>>2
+;   f = (2 + lt + (t0<<1) + t1)>>2
+;   g = (2 + t0 + (t1<<1) + t2)>>2
+
+;   h = (2 + t1 + (t2<<1) + t3)>>2
+;   i = (2 + lt + (l0<<1) + l1)>>2
+;   j = (2 + l0 + (l1<<1) + l2)>>2   
+;   The 16 result bytes are written to pred as four consecutive 4-byte rows.
+;   void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredVR_mmx
+WelsI4x4LumaPredVR_mmx:	
+	mov			edx, [esp+4]			; pred
+	mov         eax, [esp+8]			; pRef
+	mov			ecx, [esp+12]           ; stride
+	sub         eax, ecx                ; eax = pRef - stride (row holding lt, t0, ...)
+	movq        mm0, [eax-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt 00 00 00]
+	
+	movd        mm1, [eax+2*ecx-4]      ; 4 bytes ending at l1 (l1 lands in byte 3)
+	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1	
+	lea         eax, [eax+2*ecx]        ; eax = pRef + stride
+	movq        mm2, [eax+ecx-8]        ; mm2[7] = l2
+	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
+	psrlq       mm2, 28h                ; mm2 = [00 00 00 00 00 l0 l1 l2]
+	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
+	
+	movq        mm1, mm0
+	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
+	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
+	
+	movq        mm2, mm0
+	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
+	movq        mm3, mm2
+	pavgb       mm2, mm0                ; rounded-up average of the two outer taps
+	; pavgb rounds up; subtracting (x ^ y) & 1 turns it into floor((x + y) / 2)
+	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
+	pand        mm3, [mmx_01bytes]	    ; set the odd bit (mmx_01bytes: presumably 0x01 in every byte)
+	psubusb     mm2, mm3				; decrease 1 from odd bytes
+	
+	movq        mm3, mm0
+	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
+	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
+	movq        mm2, mm3
+	
+	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
+	movd        [edx], mm1              ; row 0 = a b c d
+	
+	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
+	movd        [edx+4], mm2            ; row 1 = e f g h
+	
+	movq        mm4, mm3
+	psllq       mm4, 20h
+	psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
+	
+	movq        mm5, mm3
+	psllq       mm5, 28h
+	psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
+	
+	psllq       mm1, 8h                 ; make room for i in byte 0
+	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
+	movd        [edx+8], mm4            ; row 2 = i a b c
+	
+	psllq       mm2, 8h                 ; make room for j in byte 0
+	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
+	movd        [edx+12], mm5           ; row 3 = j e f g
+	WELSEMMS
+	ret
+	
+ALIGN 16
+;***********************************************************************
+;	lt|t0|t1|t2|t3|t4|t5|t6|t7
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	lt and l0..l3 are never used; only t0..t7 are read
+;   destination:
+;	|a |b |c |d |
+;	|b |c |d |e |
+;	|c |d |e |f |
+;	|d |e |f |g |
+
+;   a = (2 + t0 + t2 + (t1<<1))>>2
+;   b = (2 + t1 + t3 + (t2<<1))>>2
+;   c = (2 + t2 + t4 + (t3<<1))>>2
+;   d = (2 + t3 + t5 + (t4<<1))>>2
+
+;   e = (2 + t4 + t6 + (t5<<1))>>2
+;   f = (2 + t5 + t7 + (t6<<1))>>2
+;   g = (2 + t6 + t7 + (t7<<1))>>2
+ 
+;   [g f e d c b a] --> mov to memory
+;   The 16 result bytes are written to pred as four consecutive 4-byte rows.
+;   void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredDDL_mmx
+WelsI4x4LumaPredDDL_mmx:	
+	mov			edx, [esp+4]			; pred
+	mov         eax, [esp+8]			; pRef
+	mov			ecx, [esp+12]           ; stride
+	sub         eax, ecx                ; eax = pRef - stride (row above the block)
+	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+	movq        mm1, mm0
+	movq        mm2, mm0
+	
+	movq        mm3, mm0
+	psrlq       mm3, 38h
+	psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
+	
+	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
+	psrlq       mm2, 8h
+	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
+
+	movq        mm3, mm1
+	pavgb       mm1, mm2                ; rounded-up average of the two outer taps
+	pxor        mm3, mm2				; find odd value in the lowest bit of each byte
+	pand        mm3, [mmx_01bytes]	    ; set the odd bit (mmx_01bytes: presumably 0x01 in every byte)
+	psubusb     mm1, mm3				; decrease 1 from odd bytes, giving floor((x + y) / 2)
+	
+	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
+	
+	psrlq       mm0, 8h
+	movd        [edx], mm0              ; row 0 = a b c d
+	psrlq       mm0, 8h
+	movd        [edx+4], mm0            ; row 1 = b c d e
+	psrlq       mm0, 8h
+	movd        [edx+8], mm0            ; row 2 = c d e f
+	psrlq       mm0, 8h
+	movd        [edx+12], mm0           ; row 3 = d e f g
+	WELSEMMS
+	ret
+	
+	
+ALIGN 16
+;***********************************************************************
+;	lt|t0|t1|t2|t3|t4|t5|t6|t7
+;	l0|
+;	l1|
+;	l2|
+;	l3|
+;	lt and l0..l3 are never used; t7 is loaded but does not reach the output
+;   destination:
+;	|a |b |c |d |
+;	|e |f |g |h |
+;	|b |c |d |i |
+;	|f |g |h |j |
+
+;   a = (1 + t0 + t1)>>1
+;   b = (1 + t1 + t2)>>1
+;   c = (1 + t2 + t3)>>1
+;   d = (1 + t3 + t4)>>1
+;   i = (1 + t4 + t5)>>1
+
+;   e = (2 + t0 + (t1<<1) + t2)>>2
+;   f = (2 + t1 + (t2<<1) + t3)>>2
+;   g = (2 + t2 + (t3<<1) + t4)>>2
+;   h = (2 + t3 + (t4<<1) + t5)>>2
+;   j = (2 + t4 + (t5<<1) + t6)>>2
+ 
+;   [i d c b a] + [j h g f e] --> mov to memory
+;   The 16 result bytes are written to pred as four consecutive 4-byte rows.
+;   void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredVL_mmx
+WelsI4x4LumaPredVL_mmx:	
+	mov			edx, [esp+4]			; pred
+	mov         eax, [esp+8]			; pRef
+	mov			ecx, [esp+12]           ; stride
+	
+	sub         eax, ecx                ; eax = pRef - stride (row above the block)
+	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+	movq        mm1, mm0
+	movq        mm2, mm0
+	
+	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
+	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
+
+	movq        mm3, mm1
+	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
+	
+	movq        mm4, mm2
+	pavgb       mm2, mm0	
+	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
+	pand        mm4, [mmx_01bytes]	    ; set the odd bit (mmx_01bytes: presumably 0x01 in every byte)
+	psubusb     mm2, mm4				; decrease 1 from odd bytes, giving floor((x + y) / 2)
+	
+	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
+	
+	movd        [edx], mm3              ; row 0 = a b c d
+	psrlq       mm3, 8h
+	movd        [edx+8], mm3            ; row 2 = b c d i
+	
+	movd        [edx+4], mm2            ; row 1 = e f g h
+	psrlq       mm2, 8h
+	movd        [edx+12], mm2           ; row 3 = f g h j
+	WELSEMMS
+	ret
+	
+ALIGN 16
+;***********************************************************************
+;   Fills the 64 bytes at pred with four averages built from the 8 bytes above pRef and the 8 bytes left of it.
+;   void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsIChromaPredDc_sse2
+WelsIChromaPredDc_sse2:	
+	push        ebx
+	mov         eax, [esp+12]			; pRef
+	mov			ecx, [esp+16]           ; stride
+	
+	sub         eax, ecx                ; eax = pRef - stride (row above the block)
+	movq        mm0, [eax]              ; mm0 = the 8 top pixels
+	
+	; ebx accumulates the left-column pixels of rows 0..3 (named l1..l4 here);
+	; edx is only a scratch register for each load
+	movzx		ebx, byte [eax+ecx-0x01] ; l1
+	; eax advances two rows per lea
+	lea         eax, [eax+2*ecx]
+	movzx		edx, byte [eax-0x01]     ; l2
+	add			ebx, edx
+	movzx		edx, byte [eax+ecx-0x01] ; l3
+	add			ebx, edx
+	lea         eax, [eax+2*ecx]
+	movzx		edx, byte [eax-0x01]     ; l4
+	add			ebx, edx
+	movd        mm1, ebx                 ; mm1 = l1+l2+l3+l4
+	
+	; same accumulation for the left-column pixels of rows 4..7 (l5..l8);
+	; ebx is restarted by the movzx below
+	movzx		ebx, byte [eax+ecx-0x01] ; l5
+	; eax advances two rows per lea
+	lea         eax, [eax+2*ecx]
+	movzx		edx, byte [eax-0x01]     ; l6
+	add			ebx, edx
+	movzx		edx, byte [eax+ecx-0x01] ; l7
+	add			ebx, edx
+	lea         eax, [eax+2*ecx]
+	movzx		edx, byte [eax-0x01]     ; l8
+	add			ebx, edx
+	movd        mm2, ebx                 ; mm2 = l5+l6+l7+l8
+	
+	movq        mm3, mm0
+	psrlq       mm0, 0x20                ; mm0 = top pixels 4..7
+	psllq       mm3, 0x20
+	psrlq       mm3, 0x20                ; mm3 = top pixels 0..3
+	pxor		mm4, mm4
+	psadbw		mm0, mm4                 ; sum of absolute differences against zero = byte sum
+	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2	
+	
+	paddq       mm3, mm1                 ; top 0..3 + left 0..3
+	movq        mm1, mm2
+	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
+	
+	movq        mm4, [mmx_0x02]          ; rounding constant (presumably the value 2 - defined outside this view)
+	
+	paddq       mm0, mm4
+	psrlq       mm0, 0x02                ; 4-sample sum: (sum2 + 2) >> 2
+	
+	paddq       mm2, mm4
+	psrlq       mm2, 0x02                ; 4-sample sum: (sum3 + 2) >> 2
+	
+	paddq       mm3, mm4
+	paddq       mm3, mm4
+	psrlq       mm3, 0x03                ; 8-sample sum: (sum1 + 4) >> 3
+	
+	paddq       mm1, mm4
+	paddq       mm1, mm4
+	psrlq       mm1, 0x03                ; 8-sample sum: (sum4 + 4) >> 3
+	
+	pmuludq     mm0, [mmx_01bytes]       ; replicate the byte into 4 bytes (assumes mmx_01bytes is 0x01 in every byte)
+	pmuludq     mm3, [mmx_01bytes]
+	psllq       mm0, 0x20
+	pxor        mm0, mm3                 ; mm0 = m_up
+	
+	pmuludq     mm2, [mmx_01bytes]
+	pmuludq     mm1, [mmx_01bytes]
+	psllq       mm1, 0x20
+	pxor        mm1, mm2                 ; mm1 = m_down
+	
+	mov         edx, [esp+8]			 ; pred
+	
+	movq        [edx], mm0               ; rows 0..3 use m_up
+	movq        [edx+0x08], mm0
+	movq        [edx+0x10], mm0
+	movq        [edx+0x18], mm0
+	
+	movq        [edx+0x20], mm1          ; rows 4..7 use m_down
+	movq        [edx+0x28], mm1
+	movq        [edx+0x30], mm1
+	movq        [edx+0x38], mm1
+	
+	pop         ebx
+	WELSEMMS
+	ret
+	
+	
+	
+ALIGN 16
+;***********************************************************************
+;   Fills the 256 bytes at pred with one rounded average of the row above pRef and the column left of it.
+;   void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredDc_sse2
+WelsI16x16LumaPredDc_sse2:	
+	push        ebx
+	mov         eax, [esp+12]			; pRef
+	mov			ecx, [esp+16]           ; stride
+	
+	sub         eax, ecx                ; eax = pRef - stride (row above the block)
+	movdqa      xmm0, [eax]             ; read one row (aligned load: this address must be 16-byte aligned)
+	pxor		xmm1, xmm1
+	psadbw		xmm0, xmm1              ; two partial byte sums, one per qword
+	movdqa      xmm1, xmm0
+	psrldq      xmm1, 0x08              ; xmm1 = sum of the upper 8 bytes
+	pslldq      xmm0, 0x08
+	psrldq      xmm0, 0x08              ; xmm0 = sum of the lower 8 bytes
+	paddw       xmm0, xmm1              ; xmm0 = sum of the 16 top pixels
+	
+	; ebx accumulates the left-column pixels; edx is a scratch register.
+	; The first two rows are added here.
+	movzx		ebx, byte [eax+ecx-0x01]
+	; NOTE(review): LOAD_2_LEFT_AND_ADD is defined outside this view; presumably each use adds two more rows to ebx
+	movzx		edx, byte [eax+2*ecx-0x01]
+	add			ebx, edx
+	lea         eax, [eax+ecx]
+	LOAD_2_LEFT_AND_ADD
+	LOAD_2_LEFT_AND_ADD
+	LOAD_2_LEFT_AND_ADD
+	LOAD_2_LEFT_AND_ADD
+	LOAD_2_LEFT_AND_ADD
+	LOAD_2_LEFT_AND_ADD
+	LOAD_2_LEFT_AND_ADD
+	add         ebx, 0x10               ; rounding term for the >> 5 below
+	movd        xmm1, ebx
+	paddw       xmm0, xmm1              ; top sum + left sum + 16
+	psrld       xmm0, 0x05              ; divide by 32
+	pmuludq     xmm0, [mmx_01bytes]     ; replicate the byte into 4 bytes (assumes mmx_01bytes is 0x01 in every byte)
+	pshufd      xmm0, xmm0, 0           ; broadcast that dword to all 16 bytes
+	
+	mov         edx, [esp+8]			; pred
+	movdqa      [edx], xmm0
+	movdqa      [edx+0x10], xmm0
+	movdqa      [edx+0x20], xmm0
+	movdqa      [edx+0x30], xmm0
+	movdqa      [edx+0x40], xmm0
+	movdqa      [edx+0x50], xmm0
+	movdqa      [edx+0x60], xmm0
+	movdqa      [edx+0x70], xmm0
+	movdqa      [edx+0x80], xmm0
+	movdqa      [edx+0x90], xmm0
+	movdqa      [edx+0xa0], xmm0
+	movdqa      [edx+0xb0], xmm0
+	movdqa      [edx+0xc0], xmm0
+	movdqa      [edx+0xd0], xmm0
+	movdqa      [edx+0xe0], xmm0
+	movdqa      [edx+0xf0], xmm0
+	
+	pop         ebx
+
+	ret
+
+;***********************************************************************
+; Computes a cost for the V, H and DC 4x4 predictions, writes the cheapest prediction to pRed and its index to *pBestMode.
+;int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc, 
+;                             uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
+; The three unnamed int32_t arguments are added to the DC, H and V costs in that order; the winning cost is returned in eax.
+;***********************************************************************
+WELS_EXTERN WelsSmpleSatdThree4x4_sse2
+align 16
+WelsSmpleSatdThree4x4_sse2:
+	push      ebx
+	push      esi
+	push      edi
+	mov       eax,  [esp+24];p_enc
+	mov       ebx,  [esp+28];linesize_enc
+	
+	; load source 4x4 samples and Hadamard transform
+    movd      xmm0, [eax]
+    movd      xmm1, [eax+ebx]
+    lea       eax , [eax+2*ebx]
+    movd      xmm2, [eax]
+    movd      xmm3, [eax+ebx]
+    punpckldq xmm0, xmm2
+    punpckldq xmm1, xmm3
+       
+    pxor      xmm6, xmm6
+    punpcklbw xmm0, xmm6
+    punpcklbw xmm1, xmm6
+    
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
+    SSE2_XSawp  qdq, xmm0, xmm2, xmm3
+    
+    movdqa    xmm4, xmm0
+    paddw     xmm0, xmm3
+    psubw     xmm4, xmm3
+
+    movdqa    xmm2, xmm0
+    punpcklwd xmm0, xmm4
+    punpckhwd xmm4, xmm2
+    
+	SSE2_XSawp  dq,  xmm0, xmm4, xmm3
+	SSE2_XSawp  qdq, xmm0, xmm3, xmm5
+
+    movdqa    xmm7, xmm0
+    paddw     xmm0, xmm5
+    psubw     xmm7, xmm5
+    
+	SSE2_XSawp  qdq,  xmm0, xmm7, xmm1
+    
+    ; Hadamard transform results are saved in xmm0 and xmm2
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
+  	
+	; load top boundary samples: [a b c d]
+    mov       eax,  [esp+16];p_dec
+	sub		  eax,	[esp+20];linesize_dec
+	movzx     ecx,  byte [eax]
+	movzx     edx,  byte [eax+1]
+	movzx     esi,  byte [eax+2]
+	movzx     edi,  byte [eax+3]
+	
+	; get the transform results of top boundary samples: [a b c d]
+	add       edx, ecx ; edx = a + b
+	add       edi, esi ; edi = c + d
+	add       ecx, ecx ; ecx = a + a
+	add       esi, esi ; esi = c + c
+	sub       ecx, edx ; ecx = a + a - a - b = a - b
+	sub       esi, edi ; esi = c + c - c - d = c - d
+	add       edi, edx ; edi = (a + b) + (c + d)
+	add       edx, edx
+	sub       edx, edi ; edx = (a + b) - (c + d)
+	add       esi, ecx ; esi = (a - b) + (c - d)
+	add       ecx, ecx
+	sub       ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]
+	
+	movdqa    xmm6, xmm0 ; keep copies of the source transform for the H and DC passes
+	movdqa    xmm7, xmm2
+	movd      xmm5, edi ; store the edi for DC mode
+	pxor      xmm3, xmm3
+	pxor      xmm4, xmm4
+	pinsrw    xmm3, edi, 0
+	pinsrw    xmm3, esi, 4
+	psllw     xmm3, 2
+	pinsrw    xmm4, edx, 0
+	pinsrw    xmm4, ecx, 4
+	psllw     xmm4, 2
+	
+	; get the satd of V (prediction built from the top boundary)
+	psubw     xmm0, xmm3
+	psubw     xmm2, xmm4
+	
+	WELS_AbsW  xmm0, xmm1
+	WELS_AbsW  xmm2, xmm1
+    paddusw        xmm0, xmm2
+    SUMW_HORIZON1  xmm0, xmm1 ; satd of V is stored in xmm0
+	
+	; load left boundary samples: [a b c d]'
+    mov       eax,  [esp+16] ; p_dec
+	mov       ebx,  [esp+20] ; linesize_dec
+	movzx     ecx,  byte [eax-1]
+	movzx     edx,  byte [eax+ebx-1]
+	lea       eax , [eax+2*ebx]
+	movzx     esi,  byte [eax-1]
+	movzx     edi,  byte [eax+ebx-1]
+	
+	; get the transform results of left boundary samples: [a b c d]'
+	add       edx, ecx ; edx = a + b
+	add       edi, esi ; edi = c + d
+	add       ecx, ecx ; ecx = a + a
+	add       esi, esi ; esi = c + c
+	sub       ecx, edx ; ecx = a + a - a - b = a - b
+	sub       esi, edi ; esi = c + c - c - d = c - d
+	add       edi, edx ; edi = (a + b) + (c + d)
+	add       edx, edx
+	sub       edx, edi ; edx = (a + b) - (c + d)
+	add       esi, ecx ; esi = (a - b) + (c - d)
+	add       ecx, ecx
+	sub       ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'
+	
+	; store the transform results in xmm3	
+    movd      xmm3, edi
+	pinsrw    xmm3, edx, 1
+	pinsrw    xmm3, ecx, 2
+	pinsrw    xmm3, esi, 3
+	psllw     xmm3, 2
+	
+	; get the satd of H (prediction built from the left boundary)
+	movdqa    xmm2, xmm6
+	movdqa    xmm4, xmm7
+	psubw     xmm2, xmm3
+	WELS_AbsW  xmm2, xmm1
+	WELS_AbsW  xmm4, xmm1
+    paddusw        xmm2, xmm4
+    SUMW_HORIZON1  xmm2, xmm1 ; satd of H is stored in xmm2
+
+	; DC result is stored in xmm1
+	add       edi, 4 ; edi = left sum + rounding term
+	movd      xmm1, edi
+	paddw     xmm1, xmm5 ; + top sum saved earlier
+	psrlw     xmm1, 3 ; DC = (top sum + left sum + 4) >> 3
+	movdqa    xmm5, xmm1 ; keep the DC pixel value for the DC-mode store
+	psllw     xmm1, 4 ; DC * 16, subtracted from the first transform coefficient below
+	
+    ; get the satd of DC
+    psubw          xmm6, xmm1
+    WELS_AbsW  xmm6, xmm1
+	WELS_AbsW  xmm7, xmm1
+    paddusw        xmm6, xmm7
+    SUMW_HORIZON1  xmm6, xmm1 ; satd of DC is stored in xmm6
+    
+    ; comparing order: DC H V
+    mov       edx, [esp+32] ; pRed
+    movd      eax, xmm6 ; DC
+    movd      edi, xmm2 ; H
+    movd      esi, xmm0 ; V
+    and       eax, 0xffff
+    shr       eax, 1
+    and       edi, 0xffff
+    shr       edi, 1
+    and       esi, 0xffff
+    shr       esi, 1
+    add       eax, [esp+40] ; 7th argument is added to the DC cost
+    add       edi, [esp+44] ; 8th argument is added to the H cost
+    add       esi, [esp+48] ; 9th argument is added to the V cost
+    cmp       ax, di ; NOTE(review): signed 16-bit compare of 32-bit sums - confirm the costs always fit
+    jg near   not_dc
+    cmp       ax, si
+    jg near   not_dc_h
+    
+    ; for DC mode
+    movd      ebx, xmm5 
+    imul      ebx, 0x01010101 ; replicate the DC byte into 4 bytes
+    movd	  xmm5, ebx
+	pshufd    xmm5, xmm5, 0
+	movdqa    [edx], xmm5
+	mov       ebx, [esp+36] ; pBestMode
+	mov       dword [ebx], 0x02 ; mode index written for DC; eax still holds the DC cost
+	pop       edi
+    pop       esi
+    pop       ebx
+    ret
+    
+not_dc:
+    cmp       di, si
+    jg near   not_dc_h
+    
+    ; for H mode
+    SSE_DB_1_2REG  xmm6, xmm7
+    mov       eax,  [esp+16]
+	mov       ebx,  [esp+20]
+    movzx     ecx,  byte [eax-1]
+	movd      xmm0, ecx
+    pmuludq   xmm0, xmm6
+
+	movzx     ecx,  byte [eax+ebx-1]
+	movd      xmm1, ecx
+    pmuludq   xmm1, xmm6 
+%if 1
+    punpckldq xmm0, xmm1
+%else    
+	unpcklps  xmm0,	xmm1
+%endif
+	lea       eax,	[eax+ebx*2]
+	movzx	  ecx,	byte [eax-1]
+	movd	  xmm2,	ecx
+    pmuludq   xmm2, xmm6  
+
+	movzx	  ecx,	byte [eax+ebx-1]
+	movd	  xmm3,	ecx	
+    pmuludq   xmm3, xmm6  
+%if 1
+    punpckldq  xmm2, xmm3
+    punpcklqdq xmm0, xmm2
+%else
+	unpcklps  xmm2,	xmm3
+	unpcklpd  xmm0,	xmm2
+%endif	
+	movdqa	  [edx],xmm0
+	
+	mov       eax, edi ; return the H cost
+    mov       ebx, [esp+36] ; pBestMode
+	mov       dword [ebx], 0x01 ; mode index written for H
+    
+    pop       edi
+    pop       esi
+    pop       ebx
+    ret
+not_dc_h:
+    ; for V mode
+    mov       eax,  [esp+16]
+    sub		  eax,	[esp+20]
+	movd	  xmm0,	[eax]
+	pshufd	  xmm0,	xmm0, 0
+	movdqa	  [edx],xmm0
+	
+	mov       eax, esi ; return the V cost
+    mov       ebx, [esp+36] ; pBestMode
+	mov       dword [ebx], 0x00 ; mode index written for V
+    
+    pop       edi
+    pop       esi
+    pop       ebx
+    ret
+    
+
--- /dev/null
+++ b/codec/encoder/core/asm/intra_pred_util.asm
@@ -1,0 +1,156 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  intra_pred_util.asm
+;*
+;*  Abstract
+;*      mmxext/sse for WelsFillingPred8to16, WelsFillingPred8x2to16 and 
+;*		WelsFillingPred1to16 etc.
+;*
+;*  History
+;*      09/29/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+;SECTION .rodata pData align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+
+SECTION .text
+
+WELS_EXTERN WelsFillingPred8to16_mmx
+WELS_EXTERN WelsFillingPred8x2to16_mmx
+WELS_EXTERN WelsFillingPred1to16_mmx
+WELS_EXTERN WelsFillingPred8x2to16_sse2
+WELS_EXTERN WelsFillingPred1to16_sse2
+
+
+ALIGN 16
+;***********************************************************************----------------
+; void WelsFillingPred8to16_mmx( uint8_t *pred, uint8_t *v );
+; Copies the 8 bytes at v into pred[0..7] and again into pred[8..15].
+;***********************************************************************----------------
+WelsFillingPred8to16_mmx:
+	mov edx, [esp+8]	; edx = v
+	movq mm0, [edx]		; fetch the 8 source bytes before any store
+	mov edx, [esp+4]	; edx = pred (v is no longer needed)
+	movq [edx+8], mm0	; upper half
+	movq [edx  ], mm0	; lower half
+
+	WELSEMMS
+	ret
+
+ALIGN 16
+;***********************************************************************----------------
+; void WelsFillingPred8x2to16_mmx( uint8_t *pred, uint8_t *v );
+; Copies 16 bytes from v to pred as two 8-byte MMX moves.
+;***********************************************************************----------------
+WelsFillingPred8x2to16_mmx:
+	mov edx, [esp+8]	; edx = v
+	movq mm1, [edx+8]	; high 8 bytes
+	movq mm0, [edx  ]	; low 8 bytes
+
+	mov edx, [esp+4]	; edx = pred; both loads are done before any store
+	movq [edx  ], mm0
+	movq [edx+8], mm1
+
+	WELSEMMS
+	ret
+
+%macro butterfly_1to8_mmx	3	; %1 = mm dst, %2 = mm tmp, %3 = register letter (a/b/c/d) whose low byte (%3l) holds the source byte b0
+	mov %3h, %3l	; low word of e%3x becomes b0 b0
+	movd %2, e%3x		; %2 low dword = e%3x (only its low word is used below)
+	pshufw %1, %2, 00h	; broadcast word 0: %1 = b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro 
+
+ALIGN 16
+;***********************************************************************----------------
+; void WelsFillingPred1to16_mmx( uint8_t *pred, const uint8_t v );
+; Fills pred[0..15] with the byte v.
+;***********************************************************************----------------
+WelsFillingPred1to16_mmx:
+	mov dl, byte [esp+8]	; dl = v
+	butterfly_1to8_mmx	mm0, mm1, d	; mm0 = v in all 8 bytes (mm1 and dh are scratch)
+
+	mov ecx, [esp+4]		; ecx = pred
+	movq [ecx+8], mm0
+	movq [ecx  ], mm0
+
+	WELSEMMS
+
+	ret
+
+ALIGN 16
+;***********************************************************************----------------
+; void WelsFillingPred8x2to16_sse2( uint8_t *pred, uint8_t *v );
+; Copies 16 bytes from v to pred with aligned SSE2 moves, so both
+; pointers have to be 16-byte aligned.
+;***********************************************************************----------------
+WelsFillingPred8x2to16_sse2:
+	mov edx, [esp+8]	; edx = v
+	movdqa xmm0, [edx]
+	mov edx, [esp+4]	; edx = pred
+	movdqa [edx], xmm0
+	ret
+
+ALIGN 16
+;***********************************************************************----------------
+; void WelsFillingPred1to16_sse2( uint8_t *pred, const uint8_t v );
+;***********************************************************************----------------
+WelsFillingPred1to16_sse2:
+	mov eax, [esp+4]		; pred
+; NOTE(review): butterfly_1to16_sse is defined outside this view; presumably it leaves v in every byte of xmm0
+	mov cl, byte [esp+8]	; v
+	butterfly_1to16_sse	xmm0, xmm1, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
+	
+	movdqa [eax], xmm0	; aligned 16-byte store: pred must be 16-byte aligned
+	
+	ret
--- /dev/null
+++ b/codec/encoder/core/asm/mb_copy.asm
@@ -1,0 +1,687 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mb_copy.asm
+;*
+;*  Abstract
+;*      mb_copy 
+;*
+;*
+;*********************************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN WelsCopy16x16_sse2			; 16 rows x 16 bytes, aligned loads and stores
+WELS_EXTERN WelsCopy16x16NotAligned_sse2	; 16 rows x 16 bytes, unaligned loads, aligned stores
+WELS_EXTERN WelsCopy8x8_mmx			; 8 rows x 8 bytes
+WELS_EXTERN WelsCopy16x8NotAligned_sse2	; 8 rows x 16 bytes, unaligned loads, aligned stores
+WELS_EXTERN WelsCopy8x16_mmx		; 16 rows x 8 bytes
+WELS_EXTERN UpdateMbMv_sse2		; fills 64 bytes with one 32-bit value
+
+;***********************************************************************
+; void WelsCopy16x16_sse2(	uint8_t* Dst,
+;							int32_t  iStrideD,
+;							uint8_t* Src,
+;							int32_t  iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy16x16_sse2:		; copies 16 rows of 16 bytes from Src to Dst; every access is movdqa, so all rows must be 16-byte aligned
+	push esi
+	push edi
+	push ebx	
+	; three registers pushed: the return address is at [esp+12] and the first argument at [esp+16]
+	mov edi, [esp+16]	; Dst
+	mov eax, [esp+20]	; iStrideD
+	mov esi, [esp+24]	; Src
+	mov ecx, [esp+28]	; iStrideS
+
+	lea ebx, [eax+2*eax]	; ebx = 3 * iStrideD
+	lea edx, [ecx+2*ecx]	; edx = 3 * iStrideS
+	; rows 0..7: load eight source rows into xmm0..xmm7
+	movdqa xmm0, [esi]
+	movdqa xmm1, [esi+ecx]
+	movdqa xmm2, [esi+2*ecx]
+	movdqa xmm3, [esi+edx]
+	lea esi, [esi+4*ecx]	; advance Src by four rows
+	movdqa xmm4, [esi]
+	movdqa xmm5, [esi+ecx]
+	movdqa xmm6, [esi+2*ecx]
+	movdqa xmm7, [esi+edx]
+	lea esi, [esi+4*ecx]	; Src now points at row 8
+	; rows 0..7: store them to the destination
+	movdqa [edi], xmm0
+	movdqa [edi+eax], xmm1
+	movdqa [edi+2*eax], xmm2
+	movdqa [edi+ebx], xmm3
+	lea edi, [edi+4*eax]	; advance Dst by four rows
+	movdqa [edi], xmm4
+	movdqa [edi+eax], xmm5
+	movdqa [edi+2*eax], xmm6
+	movdqa [edi+ebx], xmm7
+	lea edi, [edi+4*eax]	; Dst now points at row 8
+	; rows 8..15: load
+	movdqa xmm0, [esi]
+	movdqa xmm1, [esi+ecx]
+	movdqa xmm2, [esi+2*ecx]
+	movdqa xmm3, [esi+edx]
+	lea esi, [esi+4*ecx]
+	movdqa xmm4, [esi]
+	movdqa xmm5, [esi+ecx]
+	movdqa xmm6, [esi+2*ecx]
+	movdqa xmm7, [esi+edx]
+	; rows 8..15: store
+	movdqa [edi], xmm0
+	movdqa [edi+eax], xmm1
+	movdqa [edi+2*eax], xmm2
+	movdqa [edi+ebx], xmm3
+	lea edi, [edi+4*eax]
+	movdqa [edi], xmm4
+	movdqa [edi+eax], xmm5
+	movdqa [edi+2*eax], xmm6
+	movdqa [edi+ebx], xmm7	
+
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+;***********************************************************************
+; void WelsCopy16x16NotAligned_sse2(	uint8_t* Dst,
+;							int32_t  iStrideD,
+;							uint8_t* Src,
+;							int32_t  iStrideS )
+;***********************************************************************
+ALIGN 16
+; Dst rows must be 16-byte aligned (movdqa stores); Src may have any alignment (movdqu loads), 12/29/2011
+WelsCopy16x16NotAligned_sse2:	; copies 16 rows of 16 bytes from Src to Dst
+	push esi
+	push edi
+	push ebx	
+	; three registers pushed: the return address is at [esp+12] and the first argument at [esp+16]
+	mov edi, [esp+16]	; Dst
+	mov eax, [esp+20]	; iStrideD
+	mov esi, [esp+24]	; Src
+	mov ecx, [esp+28]	; iStrideS
+
+	lea ebx, [eax+2*eax]	; ebx = 3 * iStrideD
+	lea edx, [ecx+2*ecx]	; edx = 3 * iStrideS
+	; rows 0..7: unaligned loads into xmm0..xmm7
+	movdqu xmm0, [esi]
+	movdqu xmm1, [esi+ecx]
+	movdqu xmm2, [esi+2*ecx]
+	movdqu xmm3, [esi+edx]
+	lea esi, [esi+4*ecx]	; advance Src by four rows
+	movdqu xmm4, [esi]
+	movdqu xmm5, [esi+ecx]
+	movdqu xmm6, [esi+2*ecx]
+	movdqu xmm7, [esi+edx]
+	lea esi, [esi+4*ecx]	; Src now points at row 8
+	; rows 0..7: aligned stores
+	movdqa [edi], xmm0
+	movdqa [edi+eax], xmm1
+	movdqa [edi+2*eax], xmm2
+	movdqa [edi+ebx], xmm3
+	lea edi, [edi+4*eax]	; advance Dst by four rows
+	movdqa [edi], xmm4
+	movdqa [edi+eax], xmm5
+	movdqa [edi+2*eax], xmm6
+	movdqa [edi+ebx], xmm7
+	lea edi, [edi+4*eax]	; Dst now points at row 8
+	; rows 8..15: unaligned loads
+	movdqu xmm0, [esi]
+	movdqu xmm1, [esi+ecx]
+	movdqu xmm2, [esi+2*ecx]
+	movdqu xmm3, [esi+edx]
+	lea esi, [esi+4*ecx]
+	movdqu xmm4, [esi]
+	movdqu xmm5, [esi+ecx]
+	movdqu xmm6, [esi+2*ecx]
+	movdqu xmm7, [esi+edx]
+	; rows 8..15: aligned stores
+	movdqa [edi], xmm0
+	movdqa [edi+eax], xmm1
+	movdqa [edi+2*eax], xmm2
+	movdqa [edi+ebx], xmm3
+	lea edi, [edi+4*eax]
+	movdqa [edi], xmm4
+	movdqa [edi+eax], xmm5
+	movdqa [edi+2*eax], xmm6
+	movdqa [edi+ebx], xmm7	
+	
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+; 12/29/2011
+;***********************************************************************
+; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
+;							int32_t  iStrideD,
+;							uint8_t* Src,
+;							int32_t  iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy16x8NotAligned_sse2:	; copies 8 rows of 16 bytes; Src is read with movdqu, Dst is written with movdqa (16-byte aligned rows)
+	push esi
+	push edi
+	push ebx	
+	; three registers pushed: the return address is at [esp+12] and the first argument at [esp+16]
+	mov edi, [esp+16]	; Dst
+	mov eax, [esp+20]	; iStrideD
+	mov esi, [esp+24]	; Src
+	mov ecx, [esp+28]	; iStrideS
+
+	lea ebx, [eax+2*eax]	; ebx = 3 * iStrideD
+	lea edx, [ecx+2*ecx]	; edx = 3 * iStrideS
+	; load all eight source rows into xmm0..xmm7
+	movdqu xmm0, [esi]
+	movdqu xmm1, [esi+ecx]
+	movdqu xmm2, [esi+2*ecx]
+	movdqu xmm3, [esi+edx]
+	lea esi, [esi+4*ecx]	; advance Src by four rows
+	movdqu xmm4, [esi]
+	movdqu xmm5, [esi+ecx]
+	movdqu xmm6, [esi+2*ecx]
+	movdqu xmm7, [esi+edx]	
+	; store all eight rows
+	movdqa [edi], xmm0
+	movdqa [edi+eax], xmm1
+	movdqa [edi+2*eax], xmm2
+	movdqa [edi+ebx], xmm3
+	lea edi, [edi+4*eax]	; advance Dst by four rows
+	movdqa [edi], xmm4
+	movdqa [edi+eax], xmm5
+	movdqa [edi+2*eax], xmm6
+	movdqa [edi+ebx], xmm7
+	
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+;***********************************************************************
+; void WelsCopy8x16_mmx(uint8_t* Dst,
+;                       int32_t  iStrideD,
+;                       uint8_t* Src,
+;                       int32_t  iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy8x16_mmx:	; copies 16 rows of 8 bytes from Src to Dst, eight rows at a time through mm0..mm7
+	push ebx
+	; one register pushed: the return address is at [esp+4] and the first argument at [esp+8]
+	mov eax, [esp + 8 ]           ;Dst
+	mov ecx, [esp + 12]           ;iStrideD
+	mov ebx, [esp + 16]           ;Src
+	mov edx, [esp + 20]           ;iStrideS
+	; rows 0..7: load, stepping Src two rows at a time
+	movq mm0, [ebx]	
+	movq mm1, [ebx+edx]	
+	lea ebx, [ebx+2*edx]
+	movq mm2, [ebx]	
+	movq mm3, [ebx+edx]	
+	lea ebx, [ebx+2*edx]
+	movq mm4, [ebx]	
+	movq mm5, [ebx+edx]	
+	lea ebx, [ebx+2*edx]
+	movq mm6, [ebx]	
+	movq mm7, [ebx+edx]	
+	lea ebx, [ebx+2*edx]	; Src now points at row 8
+	; rows 0..7: store, stepping Dst two rows at a time
+	movq [eax], mm0	
+	movq [eax+ecx], mm1	
+	lea eax, [eax+2*ecx]
+	movq [eax], mm2	
+	movq [eax+ecx], mm3
+	lea eax, [eax+2*ecx]
+	movq [eax], mm4	
+	movq [eax+ecx], mm5
+	lea eax, [eax+2*ecx]
+	movq [eax], mm6	
+	movq [eax+ecx], mm7
+	lea eax, [eax+2*ecx]	; Dst now points at row 8
+	; rows 8..15: load
+	movq mm0, [ebx]	
+	movq mm1, [ebx+edx]	
+	lea ebx, [ebx+2*edx]
+	movq mm2, [ebx]	
+	movq mm3, [ebx+edx]	
+	lea ebx, [ebx+2*edx]
+	movq mm4, [ebx]	
+	movq mm5, [ebx+edx]	
+	lea ebx, [ebx+2*edx]
+	movq mm6, [ebx]	
+	movq mm7, [ebx+edx]		
+	; rows 8..15: store
+	movq [eax], mm0	
+	movq [eax+ecx], mm1	
+	lea eax, [eax+2*ecx]
+	movq [eax], mm2	
+	movq [eax+ecx], mm3
+	lea eax, [eax+2*ecx]
+	movq [eax], mm4	
+	movq [eax+ecx], mm5
+	lea eax, [eax+2*ecx]
+	movq [eax], mm6	
+	movq [eax+ecx], mm7	
+
+	WELSEMMS		; macro from asm_inc.asm; presumably emits emms to release the MMX state
+	pop ebx	
+	ret
+	
+;***********************************************************************
+; void WelsCopy8x8_mmx(  uint8_t* Dst,
+;                        int32_t  iStrideD,
+;                        uint8_t* Src,
+;                        int32_t  iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy8x8_mmx:	; copies 8 rows of 8 bytes from Src to Dst through mm0..mm7
+	push ebx
+	push esi
+	mov eax, [esp + 12]           ;Dst (two registers pushed, so the first argument is at [esp+12])
+	mov ecx, [esp + 16]           ;iStrideD
+	mov esi, [esp + 20]           ;Src
+	mov ebx, [esp + 24]           ;iStrideS
+	lea edx, [ebx+2*ebx]          ;edx = 3 * iStrideS, used only as a prefetch offset
+
+	; prefetch the two rows that the next load pair will read (rows 2 and 3)
+	prefetchnta [esi+2*ebx]
+	prefetchnta [esi+edx]
+	movq mm0, [esi]               ;row 0
+	movq mm1, [esi+ebx]           ;row 1
+	lea esi, [esi+2*ebx]
+	; prefetch rows 4 and 5
+	prefetchnta [esi+2*ebx]
+	prefetchnta [esi+edx]
+	movq mm2, [esi]               ;row 2
+	movq mm3, [esi+ebx]           ;row 3
+	lea esi, [esi+2*ebx]
+	; prefetch rows 6 and 7
+	prefetchnta [esi+2*ebx]
+	prefetchnta [esi+edx]
+	movq mm4, [esi]               ;row 4
+	movq mm5, [esi+ebx]           ;row 5
+	lea esi, [esi+2*ebx]
+	movq mm6, [esi]               ;row 6
+	movq mm7, [esi+ebx]           ;row 7
+	; all eight rows are loaded before the first store
+	movq [eax], mm0
+	movq [eax+ecx], mm1
+	lea eax, [eax+2*ecx]
+	movq [eax], mm2
+	movq [eax+ecx], mm3
+	lea eax, [eax+2*ecx]
+	movq [eax], mm4
+	movq [eax+ecx], mm5
+	lea eax, [eax+2*ecx]
+	movq [eax], mm6
+	movq [eax+ecx], mm7
+		
+	WELSEMMS		; macro from asm_inc.asm; presumably emits emms to release the MMX state
+	pop esi	
+	pop ebx
+	ret
+	
+; (dunhuang@cisco), 12/21/2011
+;***********************************************************************
+; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
+;***********************************************************************
+ALIGN 16
+UpdateMbMv_sse2:	; fills the 64 bytes at pMvBuffer with 16 copies of the 32-bit value read from the sMv argument slot
+	mov eax, [esp+4]	; eax = pMvBuffer; must be 16-byte aligned for the movdqa stores below
+	movd xmm0, [esp+8]	; low dword of xmm0 = the 4 bytes passed by value as sMv
+	pshufd xmm1, xmm0, $0	; selector 0 replicates dword 0 into all four dwords
+	movdqa [eax     ], xmm1	; bytes  0..15
+	movdqa [eax+0x10], xmm1	; bytes 16..31
+	movdqa [eax+0x20], xmm1	; bytes 32..47
+	movdqa [eax+0x30], xmm1	; bytes 48..63
+	ret
+
+
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+;SECTION .rodata pData align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+;read 16 bytes from unaligned memory; args: destination xmm register, address expression (without brackets)
+%macro SSE2_READ_UNA 2
+	movq	%1, [%2]		; low 8 bytes; movq clears the upper half of the register
+	movhps	%1,	[%2+8]	; upper 8 bytes
+%endmacro
+
+;write 16 bytes to unaligned memory; args: address expression (without brackets), source xmm register
+%macro SSE2_WRITE_UNA 2
+	movq	[%1],	%2		; low 8 bytes
+	movhps	[%1+8], %2	; upper 8 bytes
+%endmacro
+
+ALIGN 16
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN PixelAvgWidthEq8_mmx		; rounded average of two sources, 8 bytes per row
+WELS_EXTERN PixelAvgWidthEq16_sse2		; rounded average of two sources, 16 bytes per row
+
+WELS_EXTERN McCopyWidthEq4_mmx			; row copy, 4 bytes per row
+WELS_EXTERN McCopyWidthEq8_mmx			; row copy, 8 bytes per row
+WELS_EXTERN McCopyWidthEq16_sse2		; row copy, 16 bytes per row
+                          
+
+ALIGN 16
+;***********************************************************************
+; void PixelAvgWidthEq8_mmx( uint8_t *dst,  int32_t iDstStride,
+;                           uint8_t *pSrc1, int32_t iSrc1Stride,
+;                           uint8_t *pSrc2, int32_t iSrc2Stride,
+;                           int32_t iHeight );
+;***********************************************************************
+PixelAvgWidthEq8_mmx:	; dst row = pavgb (rounded-up byte average) of the pSrc1 row and the pSrc2 row, 8 bytes wide
+    push        ebp
+    push        ebx
+    push        esi
+    push        edi
+	; four registers pushed: the return address is at [esp+16] and the first argument at [esp+20]
+    mov         edi, [esp+20]       ; dst
+    mov         esi, [esp+28]       ; pSrc1
+    mov         edx, [esp+36]       ; pSrc2
+    mov         ebp, [esp+24]       ; iDstStride
+    mov         eax, [esp+32]       ; iSrc1Stride
+    mov         ebx, [esp+40]       ; iSrc2Stride
+    mov         ecx, [esp+44]       ; iHeight
+	sar			ecx, 2			; each loop pass handles four rows. NOTE(review): iHeight < 4 gives 0 here and the dec/jne below would wrap - callers presumably pass multiples of 4
+.height_loop:
+	movq        mm0, [esi]	
+    pavgb       mm0, [edx]
+    movq        [edi], mm0		; row 0 of this pass
+	movq		mm1, [esi+eax]		
+	pavgb		mm1, [edx+ebx]
+	movq		[edi+ebp], mm1	; row 1
+	lea         edi, [edi+2*ebp]
+	lea         esi, [esi+2*eax]
+	lea         edx, [edx+2*ebx]
+
+	movq        mm2, [esi]	
+	pavgb       mm2, [edx]
+    movq        [edi], mm2		; row 2
+	movq		mm3, [esi+eax]	
+	pavgb		mm3, [edx+ebx]
+	movq		[edi+ebp], mm3	; row 3
+	lea         edi, [edi+2*ebp]
+	lea         esi, [esi+2*eax]
+	lea         edx, [edx+2*ebx]
+	
+	dec         ecx
+    jne         .height_loop
+	
+	WELSEMMS		; macro from asm_inc.asm; presumably emits emms to release the MMX state
+    pop         edi
+    pop         esi
+    pop         ebx
+    pop         ebp
+    ret
+
+
+ALIGN 16
+;***********************************************************************
+; void PixelAvgWidthEq16_sse2( uint8_t *dst,  int32_t iDstStride,
+;                          uint8_t *pSrc1, int32_t iSrc1Stride,
+;                          uint8_t *pSrc2, int32_t iSrc2Stride,
+;                          int32_t iHeight );
+;***********************************************************************
+PixelAvgWidthEq16_sse2:	; dst row = pavgb (rounded-up byte average) of the pSrc1 row and the pSrc2 row, 16 bytes wide; no alignment is required
+	push        ebp
+    push        ebx
+    push        esi
+    push        edi
+	; four registers pushed: the return address is at [esp+16] and the first argument at [esp+20]
+    mov         edi, [esp+20]       ; dst
+    mov         esi, [esp+28]       ; pSrc1
+    mov         edx, [esp+36]       ; pSrc2
+    mov         ebp, [esp+24]       ; iDstStride
+    mov         eax, [esp+32]       ; iSrc1Stride
+    mov         ebx, [esp+40]       ; iSrc2Stride
+    mov         ecx, [esp+44]       ; iHeight
+	sar			ecx, 2			; each loop pass handles four rows. NOTE(review): iHeight < 4 gives 0 here and the dec/jne below would wrap - callers presumably pass multiples of 4
+.height_loop:
+	movdqu      xmm0, [esi]
+	movdqu      xmm1, [edx]
+	movdqu      xmm2, [esi+eax]
+	movdqu      xmm3, [edx+ebx]	
+	pavgb       xmm0, xmm1
+	pavgb       xmm2, xmm3
+	movdqu      [edi], xmm0		; row 0 of this pass
+	movdqu      [edi+ebp], xmm2	; row 1
+	lea			edi, [edi+2*ebp]
+	lea			esi, [esi+2*eax]
+	lea			edx, [edx+2*ebx]	
+
+	movdqu      xmm4, [esi]
+	movdqu      xmm5, [edx]
+	movdqu      xmm6, [esi+eax]
+	movdqu      xmm7, [edx+ebx]	
+	pavgb       xmm4, xmm5
+	pavgb       xmm6, xmm7
+	movdqu      [edi], xmm4		; row 2
+	movdqu      [edi+ebp], xmm6	; row 3
+	lea         edi, [edi+2*ebp]
+	lea         esi, [esi+2*eax]
+    lea         edx, [edx+2*ebx]	
+    
+	dec         ecx
+	jne         .height_loop
+	
+    pop         edi
+    pop         esi
+    pop         ebx
+    pop         ebp
+    ret
+
+
+ALIGN 16
+;***********************************************************************
+; void PixelAvgWidthEq16_ssse3(uint8_t *pDst,  int32_t iDstStride,
+;                          uint8_t *pSrc1, int32_t iSrc1Stride,
+;                          uint8_t *pSrc2, int32_t iSrc2Stride,
+;                          int32_t iHeight );
+; Writes pavgb(pSrc1 row, pSrc2 row) to pDst, 16 bytes per row, one row per loop pass (at least one row
+; is always written). pDst rows must be 16-byte aligned. pSrc1 may have any alignment: only offsets 0
+; and 1 in a 16-byte line use aligned loads (these rely on iSrc1Stride keeping that offset), others use movdqu.
+;***********************************************************************
+WELS_EXTERN PixelAvgWidthEq16_ssse3
+PixelAvgWidthEq16_ssse3:
+    push        ebp
+    push        ebx
+    push        esi
+    push        edi
+    mov         edi, [esp+20]       ; pDst (four registers pushed, so the first argument is at [esp+20])
+    mov         esi, [esp+24]       ; iDstStride
+    mov         ebx, [esp+28]       ; pSrc1
+    mov         eax, [esp+32]       ; iSrc1Stride
+    mov         ecx, [esp+36]       ; pSrc2
+    mov         ebp, [esp+40]       ; iSrc2Stride
+    mov         edx, [esp+44]       ; iHeight, counted down once per row
+    test        ebx, 0x0e           ; bits 1..3 set: the offset is neither 0 nor 1, so no aligned trick applies
+    jnz         .src1_unaligned
+    test        ebx, 0x01
+    jz          .src1_aligned
+    and         ebx, 0xfffffff0     ; offset 1: round pSrc1 down; the loop rebuilds the wanted bytes with palignr
+.src1_offset_1:
+    movdqa  xmm1, [ebx+16]
+    movdqu  xmm2, [ecx]
+    palignr xmm1, [ebx], 1          ; bytes 1..16 of the two aligned lines = the 16 bytes at the original pSrc1
+    pavgb   xmm1, xmm2
+    movdqa  [edi], xmm1
+    add     ebx, eax
+    add     ecx, ebp
+    add     edi, esi
+    dec     edx
+    jg      .src1_offset_1
+    jmp     .done
+.src1_aligned:
+    movdqa  xmm1, [ebx]
+    movdqu  xmm2, [ecx]
+    pavgb   xmm1, xmm2
+    movdqa  [edi], xmm1
+    add     ebx, eax
+    add     ecx, ebp
+    add     edi, esi
+    dec     edx
+    jg      .src1_aligned
+    jmp     .done
+.src1_unaligned:
+    movdqu  xmm1, [ebx]             ; exact pSrc1, any alignment
+    movdqu  xmm2, [ecx]
+    pavgb   xmm1, xmm2
+    movdqa  [edi], xmm1
+    add     ebx, eax
+    add     ecx, ebp
+    add     edi, esi
+    dec     edx
+    jg      .src1_unaligned
+.done:
+    pop         edi
+    pop         esi
+    pop         ebx
+    pop         ebp
+    ret
+
+
+ALIGN 16
+;*******************************************************************************
+;  void McCopyWidthEq4_mmx( uint8_t *pSrc, int32_t iSrcStride,
+;                          uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
+;*******************************************************************************
+McCopyWidthEq4_mmx:	; copies iHeight rows of 4 bytes from pSrc to pDst through the general register ebx
+    push    esi
+    push    edi
+    push    ebx
+	; three registers pushed: the return address is at [esp+12] and the first argument at [esp+16]
+	; the loop tests its counter after the copy, so iHeight must be at least 1
+    mov esi,  [esp+16]	; pSrc
+    mov eax, [esp+20]	; iSrcStride
+    mov edi,  [esp+24]	; pDst
+    mov ecx,  [esp+28]	; iDstStride
+    mov edx,  [esp+32]	; iHeight
+ALIGN 4
+.height_loop:
+	mov ebx, [esi]		; one 4-byte row
+	mov [edi], ebx
+	
+	add esi, eax
+	add edi, ecx
+	dec edx
+	jnz .height_loop
+	WELSEMMS   		; NOTE(review): no MMX register is touched in this routine, so this looks unnecessary - confirm before removing
+	pop	   ebx
+    pop     edi
+    pop     esi
+    ret
+
+ALIGN 16
+;*******************************************************************************
+;   void McCopyWidthEq8_mmx( uint8_t *pSrc, int32_t iSrcStride,
+;                           uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
+;*******************************************************************************
+McCopyWidthEq8_mmx:	; copies iHeight rows of 8 bytes from pSrc to pDst through mm0
+    push  esi
+    push  edi
+	mov  esi, [esp+12]	; pSrc (two registers pushed, so the first argument is at [esp+12])
+	mov eax, [esp+16]	; iSrcStride
+	mov edi, [esp+20]	; pDst
+	mov ecx, [esp+24]	; iDstStride
+	mov edx, [esp+28]	; iHeight; the loop tests it after the copy, so it must be at least 1
+
+ALIGN 4
+.height_loop:
+	movq mm0, [esi]		; one 8-byte row
+	movq [edi], mm0
+	add esi, eax
+	add edi, ecx
+	dec edx
+	jnz .height_loop
+	
+	WELSEMMS   		; macro from asm_inc.asm; presumably emits emms to release the MMX state
+    pop     edi
+    pop     esi
+    ret
+	
+ALIGN 16
+;***********************************************************************
+;   void McCopyWidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
+;***********************************************************************
+McCopyWidthEq16_sse2:	; copies iHeight rows of 16 bytes, two rows per loop pass, with no alignment requirement
+    push    esi
+    push    edi
+	; two registers pushed: the return address is at [esp+8] and the first argument at [esp+12]
+    mov     esi, [esp+12]       ; pSrc
+    mov     eax, [esp+16]       ; iSrcStride
+    mov     edi, [esp+20]       ; pDst
+    mov     edx, [esp+24]       ; iDstStride
+    mov     ecx, [esp+28]       ; iHeight; must be even and non-zero for the sub/jnz pair below to terminate
+
+ALIGN 4
+.height_loop:
+    SSE2_READ_UNA	xmm0, esi		; source row 0 of this pass
+    SSE2_READ_UNA	xmm1, esi+eax	; source row 1
+    SSE2_WRITE_UNA	edi, xmm0
+    SSE2_WRITE_UNA	edi+edx, xmm1
+
+	sub		ecx,	2			; sets the flags tested by jnz; the two lea instructions leave them unchanged
+    lea     esi, [esi+eax*2]
+    lea     edi, [edi+edx*2]
+    jnz     .height_loop
+  
+    pop     edi
+    pop     esi
+    ret
--- /dev/null
+++ b/codec/encoder/core/asm/mc_chroma.asm
@@ -1,0 +1,317 @@
+;*!
+;* \copy
+;*     Copyright (c)  2004-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_chroma.asm
+;*
+;*  Abstract
+;*      mmx motion compensation for chroma
+;*
+;*  History
+;*      10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:		; eight words of 32: rounding term added before the "psrlw 6" in the xmm chroma routines
+	dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:			; four words of 32: the same rounding term for the mm chroma routine
+	dw 32,32,32,32
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *src, 
+;							int32_t iSrcStride, 
+;							uint8_t *pDst, 
+;							int32_t iDstStride, 
+;							uint8_t *pABCD, 
+;							int32_t iHeight );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq4_mmx
+McChromaWidthEq4_mmx:	; per row: dst[x] = (A*s[x] + B*s[x+1] + C*below[x] + D*below[x+1] + 32) >> 6 for x = 0..3, with A..D = pABCD[0..3]
+	push esi
+	push edi
+	push ebx
+	; 12 bytes pushed, hence the "12 +" in every argument offset below
+	mov eax, [esp +12 + 20]	; pABCD
+	movd mm3, [eax]		; bytes A B C D
+	WELS_Zero mm7		; macro from asm_inc.asm; presumably clears the register - mm7 is used as the zero for unpacking
+	punpcklbw mm3, mm3	; A A B B C C D D
+	movq      mm4, mm3
+	punpcklwd mm3, mm3       ; A A A A B B B B
+	punpckhwd mm4, mm4		 ; C C C C D D D D
+	; widen each weight to four 16-bit lanes
+	movq	  mm5, mm3
+	punpcklbw mm3, mm7	; mm3 = A in four words
+	punpckhbw mm5, mm7	; mm5 = B in four words
+	
+	movq	  mm6, mm4
+	punpcklbw mm4, mm7	; mm4 = C in four words
+	punpckhbw mm6, mm7	; mm6 = D in four words
+	
+	mov esi, [esp +12+ 4]   	; src
+	mov eax, [esp + 12 + 8]   	; iSrcStride
+	mov edi, [esp + 12 + 12]  	; pDst
+	mov edx, [esp + 12 + 16]  	; iDstStride
+    mov ecx, [esp + 12 + 24]   	; iHeight; tested after each row, so it must be at least 1
+		
+	lea ebx, [esi + eax]	; ebx = the row below the current one
+	movd mm0, [esi]		; current row, pixels x
+	movd mm1, [esi+1]	; current row, pixels x+1
+	punpcklbw mm0, mm7
+	punpcklbw mm1, mm7
+.xloop:
+	; on entry mm0 / mm1 hold the current row at x / x+1 as words
+	pmullw mm0, mm3
+	pmullw mm1, mm5
+	paddw  mm0, mm1		; A*s[x] + B*s[x+1]
+	
+	movd  mm1, [ebx]
+	punpcklbw mm1, mm7
+	movq mm2, mm1		; keep the unscaled lower row (x): it becomes the current row of the next pass
+	pmullw mm1, mm4
+	paddw mm0, mm1		; + C*below[x]
+	
+	movd mm1, [ebx+1]
+	punpcklbw mm1, mm7
+	movq mm7, mm1		; park the unscaled lower row (x+1) in mm7 while mm1 is scaled
+	pmullw mm1,mm6
+	paddw mm0, mm1		; + D*below[x+1]
+	movq mm1,mm7		; restore it as the next pass's x+1 row
+
+	paddw mm0, [h264_d0x20_mmx]	; + 32 rounding
+	psrlw mm0, 6
+	
+	WELS_Zero mm7		; mm7 was used as a temporary above; make it zero again
+	packuswb mm0, mm7
+	movd [edi], mm0		; four result bytes
+
+	movq mm0, mm2		; lower row becomes the current row
+	
+	lea edi, [edi +edx  ]
+	lea ebx, [ebx + eax]
+
+	dec ecx
+	jnz near .xloop
+	WELSEMMS		; macro from asm_inc.asm; presumably emits emms to release the MMX state
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc, 
+;						int32_t iSrcStride, 
+;						uint8_t *pDst, 
+;						int32_t iDstStride, 
+;						uint8_t *pABCD, 
+;						int32_t iHeight );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2
+McChromaWidthEq8_sse2:	; per row: dst[x] = (A*s[x] + B*s[x+1] + C*below[x] + D*below[x+1] + 32) >> 6 for x = 0..7, with A..D = pABCD[0..3]
+	push esi
+	push edi
+	push ebx
+	; 12 bytes pushed, hence the "12 +" in every argument offset below
+	mov eax, [esp +12 + 20]	; pABCD
+	movd xmm3, [eax]	; bytes A B C D
+	WELS_Zero xmm7		; macro from asm_inc.asm; presumably clears the register - xmm7 is used as the zero for unpacking
+	punpcklbw  xmm3, xmm3	; A A B B C C D D
+	punpcklwd  xmm3, xmm3	; four copies of each weight byte
+	
+	movdqa	   xmm4, xmm3
+	punpckldq  xmm3, xmm3	; eight A bytes, eight B bytes
+	punpckhdq  xmm4, xmm4	; eight C bytes, eight D bytes
+	movdqa     xmm5, xmm3
+	movdqa	   xmm6, xmm4
+	; widen each weight to eight 16-bit lanes
+	punpcklbw  xmm3, xmm7	; xmm3 = A in eight words
+	punpckhbw  xmm5, xmm7	; xmm5 = B in eight words
+	punpcklbw  xmm4, xmm7	; xmm4 = C in eight words
+	punpckhbw  xmm6, xmm7	; xmm6 = D in eight words
+	
+	mov esi, [esp +12+ 4]   	; pSrc
+	mov eax, [esp + 12 + 8]   	; iSrcStride
+	mov edi, [esp + 12 + 12]  	; pDst
+	mov edx, [esp + 12 + 16]  	; iDstStride
+    mov ecx, [esp + 12 + 24]   	; iHeight; tested after each row, so it must be at least 1
+		
+	lea ebx, [esi + eax]	; ebx = the row below the current one
+	movq xmm0, [esi]	; current row, pixels x
+	movq xmm1, [esi+1]	; current row, pixels x+1
+	punpcklbw xmm0, xmm7
+	punpcklbw xmm1, xmm7
+.xloop:
+	; on entry xmm0 / xmm1 hold the current row at x / x+1 as words
+	pmullw xmm0, xmm3
+	pmullw xmm1, xmm5
+	paddw  xmm0, xmm1	; A*s[x] + B*s[x+1]
+	
+	movq  xmm1, [ebx]
+	punpcklbw xmm1, xmm7
+	movdqa xmm2, xmm1	; keep the unscaled lower row (x): it becomes the current row of the next pass
+	pmullw xmm1, xmm4
+	paddw xmm0, xmm1	; + C*below[x]
+	
+	movq xmm1, [ebx+1]
+	punpcklbw xmm1, xmm7
+	movdqa xmm7, xmm1	; park the unscaled lower row (x+1) in xmm7 while xmm1 is scaled
+	pmullw xmm1, xmm6
+	paddw xmm0, xmm1	; + D*below[x+1]
+	movdqa xmm1,xmm7	; restore it as the next pass's x+1 row
+
+	paddw xmm0, [h264_d0x20_sse2]	; + 32 rounding
+	psrlw xmm0, 6
+	
+	WELS_Zero xmm7		; xmm7 was used as a temporary above; make it zero again
+	packuswb xmm0, xmm7
+	movq [edi], xmm0	; eight result bytes
+
+	movdqa xmm0, xmm2	; lower row becomes the current row
+	
+	lea edi, [edi +edx  ]
+	lea ebx, [ebx + eax]
+
+	dec ecx
+	jnz near .xloop
+	
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
+;						 int32_t iSrcStride, 
+;                        uint8_t *pDst,  
+;                        int32_t iDstStride,
+;                        uint8_t *pABCD,
+;					     int32_t iHeight);
+;***********************************************************************
+WELS_EXTERN McChromaWidthEq8_ssse3
+McChromaWidthEq8_ssse3:
+	push esi
+	push edi
+	; Per row: dst[x] = (A*s[x] + B*s[x+1] + C*below[x] + D*below[x+1] + 32) >> 6 for x = 0..7,
+	mov eax, [esp + 8 + 20]	; pABCD (8 bytes pushed, hence the "8 +" in every argument offset)
+	; with A..D = pABCD[0..3]. Two rows are produced per loop pass, so iHeight must be even and
+	; non-zero. Only esi and edi need saving; xmm7 is loaded with the rounding constant below.
+    movd   xmm5, [eax]   	; bytes A B C D
+    punpcklwd xmm5, xmm5  	; AB AB CD CD
+    punpckldq xmm5, xmm5 	; AB AB AB AB CD CD CD CD
+    movdqa    xmm6, xmm5
+    punpcklqdq xmm5, xmm5	; xmm5 = eight (A,B) byte pairs
+    punpckhqdq xmm6, xmm6    	; xmm6 = eight (C,D) byte pairs
+    
+	mov eax, [esp + 8 + 4]   	; pSrc
+	mov edx, [esp + 8 + 8]   	; iSrcStride
+	mov esi, [esp + 8 + 12]  	; pDst
+	mov edi, [esp + 8 + 16]  	; iDstStride
+    mov ecx, [esp + 8 + 24]   	; iHeight
+    
+    sub esi, edi		; pre-decrement pDst by two rows: the loop starts by adding two rows
+    sub esi, edi
+	movdqa xmm7, [h264_d0x20_sse2]	; eight words of 32 (rounding)
+
+	movdqu xmm0, [eax]	; 16 bytes are read although only 9 are used
+	movdqa xmm1, xmm0
+	psrldq xmm1, 1
+	punpcklbw xmm0, xmm1	; xmm0 = (s[x], s[x+1]) byte pairs of the first row
+	
+.hloop_chroma:	
+	lea	esi, [esi+2*edi]
+	
+	movdqu xmm2, [eax+edx]
+	movdqa xmm3, xmm2
+	psrldq xmm3, 1
+	punpcklbw xmm2, xmm3	; pairs of the row below
+	movdqa      xmm4, xmm2	; kept unscaled: it is the upper row of the second output row
+	
+    pmaddubsw  xmm0, xmm5	; A*s[x] + B*s[x+1]
+    pmaddubsw  xmm2, xmm6	; C*below[x] + D*below[x+1]
+    paddw      xmm0, xmm2
+    paddw      xmm0, xmm7
+	psrlw      xmm0, 6
+    packuswb   xmm0, xmm0
+    movq       [esi],xmm0	; first output row of this pass
+    
+    lea eax, [eax+2*edx]
+    movdqu xmm2, [eax]
+    movdqa xmm3, xmm2
+    psrldq xmm3, 1
+    punpcklbw xmm2, xmm3
+    movdqa      xmm0, xmm2	; kept unscaled: it is the upper row of the next pass
+    
+    pmaddubsw  xmm4, xmm5
+    pmaddubsw  xmm2, xmm6
+    paddw      xmm4, xmm2
+    paddw      xmm4, xmm7
+	psrlw      xmm4, 6
+    packuswb   xmm4, xmm4
+    movq       [esi+edi],xmm4	; second output row of this pass
+	
+	sub ecx, 2
+	jnz .hloop_chroma
+	; restore the two callee-saved registers used above
+	pop edi
+	pop esi
+
+	ret
+
+
--- /dev/null
+++ b/codec/encoder/core/asm/mc_luma.asm
@@ -1,0 +1,1052 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_luma.asm
+;*
+;*  Abstract
+;*      sse2 motion compensation
+;*
+;*  History
+;*      17/08/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+%macro SSE_LOAD_8P 3	; args: destination xmm, xmm that must hold zero, memory operand; loads 8 bytes and widens them to 8 words
+	movq %1, %3		; 8 source bytes into the low half
+	punpcklbw %1, %2	; interleave with the zero register: each byte becomes a 16-bit word
+%endmacro
+
+%macro FILTER_HV_W8 9	; args 1..6: six rows a..f as words, args 7 and 8: scratch xmm, arg 9: 8-byte destination; stores clip((a - 5b + 20c + 20d - 5e + f + round) >> 5)
+	paddw	%1, %6			; a + f (arg 1 is overwritten and ends up holding the result)
+	movdqa	%8, %3
+	movdqa	%7, %2
+	paddw	%1, [h264_w0x10_1]	; rounding constant defined outside this section; presumably eight words of 16 - confirm
+	paddw	%8, %4			; c + d
+	paddw	%7, %5			; b + e
+	psllw	%8, 2			; 4(c + d)
+	psubw	%8, %7			; 4(c + d) - (b + e)
+	paddw	%1, %8
+	psllw	%8, 2			; 16(c + d) - 4(b + e)
+	paddw	%1, %8			; total: a + f + 20(c + d) - 5(b + e) + round
+	psraw   %1, 5
+	WELS_Zero %8			; leaves arg 8 zeroed; callers reuse it as the zero register of the next SSE_LOAD_8P
+	packuswb %1, %8			; saturate the words to unsigned bytes
+	movq    %9, %1
+%endmacro
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN McHorVer22Width8HorFirst_sse2	; horizontal 6-tap pass, 8 columns, stores unrounded 16-bit sums
+WELS_EXTERN McHorVer02WidthEq8_sse2		; vertical 6-tap filter, 8 columns
+WELS_EXTERN McHorVer20WidthEq16_sse2		; horizontal 6-tap filter, 16 columns
+
+ALIGN 16
+;***********************************************************************
+; void McHorVer20WidthEq16_sse2(  uint8_t *pSrc, 
+;								int32_t iSrcStride, 
+;								uint8_t *pDst, 
+;								int32_t iDstStride, 
+;								int32_t iHeight
+;                      );
+;***********************************************************************
+McHorVer20WidthEq16_sse2:	; horizontal 6-tap filter (1,-5,20,20,-5,1), 16 output bytes per row, computed as two 8-pixel halves
+	push	esi
+	push	edi
+	; two registers pushed: the return address is at [esp+8] and the first argument at [esp+12]
+	; per pixel: dst[x] = clip((s[x-2] - 5*s[x-1] + 20*s[x] + 20*s[x+1] - 5*s[x+2] + s[x+3] + round) >> 5)
+	mov esi, [esp + 12]         	; pSrc
+	mov eax, [esp + 16]         	; iSrcStride
+	mov edi, [esp + 20]         	; pDst
+	mov ecx, [esp + 28]         	; iHeight; tested after each row, so it must be at least 1
+	mov edx, [esp + 24]			; iDstStride
+	sub esi, 2                  	; the leftmost tap is two pixels left of the output position
+	
+	WELS_Zero  xmm7			; macro from asm_inc.asm; presumably clears the register - xmm7 is the zero for unpacking and packing
+	movdqa xmm6, [h264_w0x10_1]	; rounding constant defined outside this section; presumably eight words of 16 - confirm
+.y_loop:
+	; left half: output pixels 0..7
+	movq xmm0, [esi]
+	punpcklbw xmm0, xmm7		; tap 0
+	movq xmm1, [esi+5]
+	punpcklbw xmm1, xmm7		; tap 5
+	movq xmm2, [esi+1]
+	punpcklbw xmm2, xmm7		; tap 1
+	movq xmm3, [esi+4]
+	punpcklbw xmm3, xmm7		; tap 4
+	movq xmm4, [esi+2]
+	punpcklbw xmm4, xmm7		; tap 2
+	movq xmm5, [esi+3]
+	punpcklbw xmm5, xmm7		; tap 3
+	
+	paddw xmm2, xmm3		; t1 + t4
+	paddw xmm4, xmm5		; t2 + t3
+	psllw xmm4, 2
+	psubw xmm4, xmm2		; 4(t2 + t3) - (t1 + t4)
+	paddw xmm0, xmm1		; t0 + t5
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4		; t0 + t5 + 20(t2 + t3) - 5(t1 + t4)
+	paddw xmm0, xmm6
+	psraw xmm0, 5
+	packuswb xmm0, xmm7		; saturate to unsigned bytes
+	movq [edi], xmm0
+	; right half: output pixels 8..15, same computation 8 bytes further right
+	movq xmm0, [esi+8]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [esi+5+8]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [esi+1+8]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [esi+4+8]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [esi+2+8]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [esi+3+8]
+	punpcklbw xmm5, xmm7
+	
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	paddw xmm0, xmm6
+	psraw xmm0, 5
+	packuswb xmm0, xmm7
+	movq [edi+8], xmm0
+	
+	
+	add esi, eax
+	add edi, edx
+	dec ecx
+	jnz .y_loop
+	pop edi
+	pop esi
+	ret
+
+
+ALIGN 16
+;***********************************************************************
+; void McHorVer22Width8HorFirst_sse2( uint8_t*pSrc, 
+;									int32_t iSrcStride, 
+;									uint8_t* pTap,	
+;									int32_t iTapStride,
+;									int32_t iHeight);
+;***********************************************************************
+McHorVer22Width8HorFirst_sse2:	; horizontal 6-tap pass over 8 columns; stores the raw 16-bit sums (no rounding, shift or clipping) to pTap
+	push esi
+	push edi
+	push ebx
+	mov esi, [esp+16]     ;pSrc (three registers pushed, so the first argument is at [esp+16])
+	mov eax, [esp+20]	;src_stride
+	mov edi, [esp+24]		;tap; each row is written with movdqa, so it must be 16-byte aligned
+	mov edx, [esp+28]	;tap_stride, added to edi as a byte count
+	mov ebx, [esp+32]	;i_height = number of rows processed, counted from two rows above pSrc
+	pxor xmm7, xmm7		; zero register for the byte-to-word unpacking
+	; NOTE(review): unlike McHorVer20WidthEq16_sse2 there is no "sub esi, 2" here - presumably the caller passes pSrc - 2; confirm
+	sub esi, eax				; start two rows above pSrc; presumably the later vertical pass needs five extra rows
+	sub esi, eax
+		
+.yloop_width_8:
+	movq xmm0, [esi]
+	punpcklbw xmm0, xmm7	; tap 0
+	movq xmm1, [esi+5]
+	punpcklbw xmm1, xmm7	; tap 5
+	movq xmm2, [esi+1]
+	punpcklbw xmm2, xmm7	; tap 1
+	movq xmm3, [esi+4]
+	punpcklbw xmm3, xmm7	; tap 4
+	movq xmm4, [esi+2]
+	punpcklbw xmm4, xmm7	; tap 2
+	movq xmm5, [esi+3]
+	punpcklbw xmm5, xmm7	; tap 3
+	
+	paddw xmm2, xmm3	; t1 + t4
+	paddw xmm4, xmm5	; t2 + t3
+	psllw xmm4, 2
+	psubw xmm4, xmm2	; 4(t2 + t3) - (t1 + t4)
+	paddw xmm0, xmm1	; t0 + t5
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4	; t0 + t5 + 20(t2 + t3) - 5(t1 + t4)
+	movdqa [edi], xmm0	; eight 16-bit intermediate values
+		
+	add esi, eax
+	add edi, edx
+	dec ebx
+	jnz .yloop_width_8
+	pop ebx
+	pop edi
+	pop esi
+	ret
+	
+;***********************************************************************
+; void McHorVer02WidthEq8_sse2( uint8_t *pSrc, 
+;                       int32_t iSrcStride, 
+;                       uint8_t *pDst, 
+;                       int32_t iDstStride, 
+;                       int32_t iHeight )
+;***********************************************************************
+ALIGN 16
+McHorVer02WidthEq8_sse2:	; vertical 6-tap filter over 8 columns: each output row is FILTER_HV_W8 of six consecutive source rows, starting two rows above pSrc
+	push esi
+	push edi
+	; two registers pushed: the return address is at [esp+8] and the first argument at [esp+12]
+	mov esi, [esp + 12]           ; pSrc
+	mov edx, [esp + 16]	          ; iSrcStride
+	mov edi, [esp + 20]           ; pDst
+	mov eax, [esp + 24]           ; iDstStride
+	mov ecx, [esp + 28]           ; iHeight; decremented after each output row, so it must be at least 1
+
+	sub esi, edx			; the first tap is two rows above the output row
+	sub esi, edx
+
+	WELS_Zero xmm7			; macro from asm_inc.asm; presumably clears the register - zero for the first loads
+	; prime the window: source rows 0..5 (relative to pSrc - 2 rows) go to xmm0..xmm5 as words
+	SSE_LOAD_8P xmm0, xmm7, [esi]
+	SSE_LOAD_8P xmm1, xmm7, [esi+edx]
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm2, xmm7, [esi]
+	SSE_LOAD_8P xmm3, xmm7, [esi+edx]
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm4, xmm7, [esi]
+	SSE_LOAD_8P xmm5, xmm7, [esi+edx]
+	; The eight steps below rotate the six-row window through xmm0..xmm7: each FILTER_HV_W8 destroys its first row register and leaves its second scratch register zeroed, and that zeroed register is the zero operand of the following SSE_LOAD_8P.
+.start:	
+	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
+	dec ecx
+	jz near .xx_exit
+
+	lea esi, [esi+2*edx]		; esi advances every second step; the odd rows are read at [esi+edx]
+	SSE_LOAD_8P xmm6, xmm7, [esi]
+	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
+	dec ecx
+	jz near .xx_exit
+	
+	lea edi, [edi+2*eax]		; edi likewise advances every second step; the odd rows are written at [edi+eax]
+	SSE_LOAD_8P xmm7, xmm0, [esi+edx]
+	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
+	dec ecx
+	jz near .xx_exit
+
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm0, xmm1, [esi]
+	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [edi+eax]
+	dec ecx
+	jz near .xx_exit
+
+	lea edi, [edi+2*eax]
+	SSE_LOAD_8P xmm1, xmm2, [esi+edx]
+	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [edi]
+	dec ecx
+	jz near .xx_exit
+
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm2, xmm3, [esi]
+	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [edi+eax]
+	dec ecx
+	jz near .xx_exit
+
+	lea edi, [edi+2*eax]
+	SSE_LOAD_8P xmm3, xmm4, [esi+edx]
+	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [edi]
+	dec ecx
+	jz near .xx_exit
+
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm4, xmm5, [esi]
+	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [edi+eax]
+	dec ecx
+	jz near .xx_exit
+	; after eight outputs the window is back in xmm0..xmm4; load the sixth row and start the cycle again
+	lea edi, [edi+2*eax]
+	SSE_LOAD_8P xmm5, xmm6, [esi+edx]
+	jmp near .start
+
+.xx_exit:
+	pop edi
+	pop esi
+	ret
+
+
+
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+; COFF output uses a named read-only section; other formats request
+; 16-byte section alignment explicitly.
+%ifdef FORMAT_COFF
+SECTION .rodata pData
+%else
+SECTION .rodata align=16
+%endif
+
+;***********************************************************************
+; Rounding constants used by the luma interpolation routines below
+;***********************************************************************
+; Eight words of 16: rounding term added before the ">> 5" in McHorVer20_sse2.
+ALIGN 16
+h264_w0x10_1:
+	dw 16, 16, 16, 16, 16, 16, 16, 16
+; Eight words of 32: rounding term added before the ">> 6" in FILTER_VER.
+ALIGN 16
+h264_mc_hc_32:
+	dw 32, 32, 32, 32, 32, 32, 32, 32
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+; Entry points exported from this part of the file.
+WELS_EXTERN McHorVer20_sse2
+WELS_EXTERN McHorVer02_sse2
+WELS_EXTERN McHorVer22VerLastAlign_sse2
+WELS_EXTERN McHorVer22VerLastUnAlign_sse2
+WELS_EXTERN McHorVer22HorFirst_sse2
+
+
+;***********************************************************************
+; void McHorVer02_sse2(	uint8_t *pSrc, 
+;                       int32_t iSrcStride, 
+;                       uint8_t *pDst, 
+;                       int32_t iDstStride,
+;						int32_t iWidth, 
+;                       int32_t iHeight )
+;
+; Vertical filter processed in 8-pixel-wide columns (iWidth >> 3 of
+; them).  Each column starts two rows above pSrc, keeps six source rows
+; in xmm registers and feeds them to FILTER_HV_W8; see the rotation
+; scheme in the .start loop.
+;
+; SSE_LOAD_8P, FILTER_HV_W8 and WELS_Zero are macros defined outside
+; this part of the file (NOTE(review): semantics inferred from the call
+; pattern only - confirm against their definitions).
+;
+; Register use: esi = source row pointer, edx = iSrcStride,
+;               edi = destination row pointer, eax = iDstStride,
+;               ebx = columns left (iWidth >> 3), ecx = rows left.
+;
+; Constraints visible in the code:
+;   * The first output row is produced before .start and ecx is
+;     decremented there without a zero test, so iHeight must be >= 2.
+;   * .x_loop_dec rebuilds esi/edi from the ORIGINAL stack arguments
+;     plus 8, so a third or later column would repeat the second one;
+;     only iWidth >> 3 equal to 1 or 2 is handled correctly.
+;***********************************************************************
+ALIGN 16
+McHorVer02_sse2:
+	push esi
+	push edi
+	push ebx
+	
+	; three pushes + return address: first argument is at [esp+16]
+	mov esi, [esp + 16]           ; pSrc
+	mov edx, [esp + 20]	          ; iSrcStride
+	mov edi, [esp + 24]           ; pDst
+	mov eax, [esp + 28]           ; iDstStride
+	mov ecx, [esp + 36]           ; iHeight
+	mov ebx, [esp + 32]			  ; iWidth
+	shr ebx, 3				; number of 8-pixel columns
+	sub esi, edx			; start two rows above pSrc
+	sub esi, edx
+	
+.xloop:	
+	WELS_Zero xmm7			
+	; preload six rows for the first output row of this column
+	SSE_LOAD_8P xmm0, xmm7, [esi]
+	SSE_LOAD_8P xmm1, xmm7, [esi+edx]
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm2, xmm7, [esi]
+	SSE_LOAD_8P xmm3, xmm7, [esi+edx]
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm4, xmm7, [esi]
+	SSE_LOAD_8P xmm5, xmm7, [esi+edx]
+	
+	; first output row is handled outside the rotating loop
+	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
+	dec ecx					; no zero test here (see constraints above)
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm6, xmm7, [esi]
+	; slide the six-row window down by one row
+	movdqa xmm0,xmm1
+	movdqa xmm1,xmm2
+	movdqa xmm2,xmm3
+	movdqa xmm3,xmm4
+	movdqa xmm4,xmm5
+	movdqa xmm5,xmm6
+	add edi, eax
+	sub esi, edx			; esi back to the last loaded row, as .start expects
+	
+.start:	
+	; 8-step unrolled loop; register roles rotate by one each step
+	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
+	dec ecx
+	jz near .x_loop_dec
+
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm6, xmm7, [esi]
+	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea edi, [edi+2*eax]
+	SSE_LOAD_8P xmm7, xmm0, [esi+edx]
+	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
+	dec ecx
+	jz near .x_loop_dec
+
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm0, xmm1, [esi]
+	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [edi+eax]
+	dec ecx
+	jz near .x_loop_dec
+
+	lea edi, [edi+2*eax]
+	SSE_LOAD_8P xmm1, xmm2, [esi+edx]
+	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [edi]
+	dec ecx
+	jz near .x_loop_dec
+
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm2, xmm3, [esi]
+	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [edi+eax]
+	dec ecx
+	jz near .x_loop_dec
+
+	lea edi, [edi+2*eax]
+	SSE_LOAD_8P xmm3, xmm4, [esi+edx]
+	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [edi]
+	dec ecx
+	jz near .x_loop_dec
+
+	lea esi, [esi+2*edx]
+	SSE_LOAD_8P xmm4, xmm5, [esi]
+	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [edi+eax]
+	dec ecx
+	jz near .x_loop_dec
+
+	lea edi, [edi+2*eax]
+	SSE_LOAD_8P xmm5, xmm6, [esi+edx]
+	jmp near .start
+
+.x_loop_dec:	
+	; column finished: move to the next 8-pixel column, if any
+	dec ebx
+	jz  near .xx_exit
+	mov esi, [esp + 16]           ; reload original pSrc
+	mov edi, [esp + 24]           ; reload original pDst
+	sub esi, edx
+	sub esi, edx
+	add esi, 8				; fixed +8 from the original pointers (second column only)
+	add edi, 8
+	mov ecx, [esp + 36] 	; reload iHeight
+	jmp near .xloop
+
+.xx_exit:
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+ALIGN 16                  
+;***********************************************************************
+; void McHorVer20_sse2(		uint8_t *pSrc, 
+;                       int32_t iSrcStride, 
+;						uint8_t *pDst, 
+;						int32_t iDstStride, 
+;						int32_t iWidth,
+;						int32_t iHeight
+;                      );
+;
+; Horizontal 6-tap filter with rounding for rows of 9 or 17 pixels.
+; For every output pixel the six neighbours s0..s5, starting 2 bytes to
+; the left of the output position, are combined as
+;     ((s0+s5) - 5*(s1+s4) + 20*(s2+s3) + 16) >> 5
+; and packed to bytes with unsigned saturation.
+;
+; iWidth == 9 takes the 9-pixel path; ANY other value takes the
+; 17-pixel path.  Both row loops are bottom-tested (iHeight >= 1).
+;
+; Register use: esi = pSrc - 2, eax = iSrcStride, edi = pDst,
+;               edx = iDstStride, ecx = iWidth, ebx = rows left,
+;               xmm7 = zero (temporarily reused as scratch, see below).
+;***********************************************************************
+McHorVer20_sse2:
+	push esi
+	push edi
+	push ebx
+	; three pushes + return address: first argument is at [esp+16]
+	mov esi, [esp+16]     ; pSrc
+	mov eax, [esp+20]	; iSrcStride
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov ecx, [esp+32]	; iWidth
+	mov ebx, [esp+36]	; iHeight
+	sub esi, 2			; leftmost tap is two pixels left of the output
+	pxor xmm7, xmm7	
+	
+	cmp ecx, 9
+	jne near .width_17	
+	
+.yloop_width_9:	
+	movq xmm0, [esi]		; s0
+	punpcklbw xmm0, xmm7
+	movq xmm1, [esi+5]		; s5
+	punpcklbw xmm1, xmm7
+	movq xmm2, [esi+1]		; s1
+	punpcklbw xmm2, xmm7
+	movq xmm3, [esi+4]		; s4
+	punpcklbw xmm3, xmm7
+	movq xmm4, [esi+2]		; s2
+	punpcklbw xmm4, xmm7
+	movq xmm5, [esi+3]		; s3
+	punpcklbw xmm5, xmm7
+	
+	; xmm6/xmm7 are used as scratch so xmm1..xmm5 survive for the
+	; second (shifted-by-one) filter below
+	movdqa xmm7, xmm2
+	paddw   xmm7, xmm3		; B = s1+s4
+	movdqa xmm6, xmm4
+	paddw   xmm6, xmm5		; C = s2+s3
+	psllw xmm6, 2			; 4C
+	psubw xmm6, xmm7		; 4C - B
+	paddw xmm0, xmm1		; A = s0+s5
+	paddw xmm0, xmm6		; A + 4C - B
+	psllw xmm6, 2			; 16C - 4B
+	paddw xmm0, xmm6		; A - 5B + 20C
+	paddw xmm0, [h264_w0x10_1]	; + 16
+	psraw  xmm0, 5
+	packuswb xmm0, xmm0
+	movd [edi], xmm0		; output pixels 0..3
+	
+	pxor  xmm7, xmm7		; restore the zero register
+	movq xmm0, [esi+6]		; s6
+	punpcklbw xmm0, xmm7
+	
+	; same filter shifted right by one pixel, reusing s1..s5 and s6
+	paddw xmm4, xmm1		; s2+s5
+	paddw xmm5, xmm3		; s3+s4
+	psllw xmm5, 2
+	psubw xmm5, xmm4
+	paddw xmm2, xmm0		; s1+s6
+	paddw xmm2, xmm5
+	psllw xmm5, 2
+	paddw xmm2, xmm5
+	paddw xmm2, [h264_w0x10_1]
+	psraw  xmm2, 5
+	packuswb xmm2, xmm2
+	movq [edi+1], xmm2		; output pixels 1..8 (overlaps the movd above)
+		
+	add esi, eax
+	add edi, edx
+	dec ebx
+	jnz .yloop_width_9
+	pop ebx
+	pop edi
+	pop esi
+	ret
+	
+	
+.width_17:
+.yloop_width_17:
+	; --- output pixels 0..7 ---
+	movq xmm0, [esi]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [esi+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [esi+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [esi+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [esi+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [esi+3]
+	punpcklbw xmm5, xmm7
+	
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	paddw xmm0, [h264_w0x10_1]
+	psraw  xmm0, 5
+	packuswb xmm0, xmm0
+	movq [edi], xmm0
+		
+	; --- output pixels 8..16: same scheme as the 9-pixel path, 8 bytes further right ---
+	movq xmm0, [esi+8]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [esi+5+8]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [esi+1+8]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [esi+4+8]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [esi+2+8]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [esi+3+8]
+	punpcklbw xmm5, xmm7
+	
+	movdqa xmm7, xmm2
+	paddw   xmm7, xmm3
+	movdqa xmm6, xmm4
+	paddw   xmm6, xmm5
+	psllw xmm6, 2
+	psubw xmm6, xmm7
+	paddw xmm0, xmm1
+	paddw xmm0, xmm6
+	psllw xmm6, 2
+	paddw xmm0, xmm6
+	paddw xmm0, [h264_w0x10_1]
+	psraw  xmm0, 5
+	packuswb xmm0, xmm0
+	movd [edi+8], xmm0		; output pixels 8..11
+	
+	
+	pxor  xmm7, xmm7		; restore the zero register
+	movq xmm0, [esi+6+8]
+	punpcklbw xmm0, xmm7
+	
+	paddw xmm4, xmm1
+	paddw xmm5, xmm3
+	psllw xmm5, 2
+	psubw xmm5, xmm4
+	paddw xmm2, xmm0
+	paddw xmm2, xmm5
+	psllw xmm5, 2
+	paddw xmm2, xmm5
+	paddw xmm2, [h264_w0x10_1]
+	psraw  xmm2, 5
+	packuswb xmm2, xmm2
+	movq [edi+9], xmm2		; output pixels 9..16
+	add esi, eax
+	add edi, edx
+	dec ebx
+	jnz .yloop_width_17
+	pop ebx
+	pop edi
+	pop esi
+	ret
+	
+	
+
+ALIGN 16
+;***********************************************************************
+;void McHorVer22HorFirst_sse2
+;							(uint8_t *pSrc, 
+;							int32_t iSrcStride, 
+;							uint8_t * pTap,
+;							int32_t iTapStride,
+;							int32_t iWidth,int32_t iHeight);
+;
+; Horizontal 6-tap pass that writes 16-bit intermediate sums
+;     (s0+s5) - 5*(s1+s4) + 20*(s2+s3)
+; (no rounding, no shift) to pTap, for rows of 9 or 17 samples.
+; Processing starts two rows above pSrc.  No horizontal offset is
+; applied to pSrc inside this routine: the taps are read from
+; [esi+0..5] (and +6 / +8.. for the later samples) as passed in.
+; iTapStride is added to the byte pointer directly, i.e. it is in bytes.
+;
+; iWidth == 9 takes the 9-sample path; ANY other value takes the
+; 17-sample path.  The 17-sample path stores its first 8 words with
+; movdqa, so the tap row address must be 16-byte aligned there.
+; Both row loops are bottom-tested (iHeight >= 1).
+;
+; Register use: esi = source row, eax = iSrcStride, edi = tap row,
+;               edx = iTapStride, ecx = iWidth, ebx = rows left,
+;               xmm7 = zero (temporarily reused as scratch).
+;***********************************************************************
+McHorVer22HorFirst_sse2:
+	push esi
+	push edi
+	push ebx
+	; three pushes + return address: first argument is at [esp+16]
+	mov esi, [esp+16]     ; pSrc
+	mov eax, [esp+20]	; iSrcStride
+	mov edi, [esp+24]	; pTap
+	mov edx, [esp+28]	; iTapStride
+	mov ecx, [esp+32]	; iWidth
+	mov ebx, [esp+36]	; iHeight
+	pxor xmm7, xmm7	
+	
+	sub esi, eax				; step back two source rows. Original note: "need more 5 lines" (presumably iHeight already includes the 5 extra rows the vertical pass needs - TODO confirm against the caller)
+	sub esi, eax
+	
+	cmp ecx, 9
+	jne near .width_17	
+	
+.yloop_width_9:	
+	movq xmm0, [esi]		; s0
+	punpcklbw xmm0, xmm7
+	movq xmm1, [esi+5]		; s5
+	punpcklbw xmm1, xmm7
+	movq xmm2, [esi+1]		; s1
+	punpcklbw xmm2, xmm7
+	movq xmm3, [esi+4]		; s4
+	punpcklbw xmm3, xmm7
+	movq xmm4, [esi+2]		; s2
+	punpcklbw xmm4, xmm7
+	movq xmm5, [esi+3]		; s3
+	punpcklbw xmm5, xmm7
+	
+	; xmm6/xmm7 are scratch here so xmm1..xmm5 survive for the
+	; second (shifted-by-one) filter below
+	movdqa xmm7, xmm2
+	paddw   xmm7, xmm3
+	movdqa xmm6, xmm4
+	paddw   xmm6, xmm5
+	psllw xmm6, 2
+	psubw xmm6, xmm7
+	paddw xmm0, xmm1
+	paddw xmm0, xmm6
+	psllw xmm6, 2
+	paddw xmm0, xmm6
+	movd [edi], xmm0		; 4 bytes = tap words 0..1
+	
+	pxor  xmm7, xmm7		; restore the zero register
+	movq xmm0, [esi+6]		; s6
+	punpcklbw xmm0, xmm7
+	
+	; same filter shifted right by one sample
+	paddw xmm4, xmm1
+	paddw xmm5, xmm3
+	psllw xmm5, 2
+	psubw xmm5, xmm4
+	paddw xmm2, xmm0
+	paddw xmm2, xmm5
+	psllw xmm5, 2
+	paddw xmm2, xmm5
+	; tap words 1..8, written as two 8-byte halves (no 16-byte alignment needed)
+	movq [edi+2], xmm2	
+	movhps [edi+2+8], xmm2	
+	
+	add esi, eax
+	add edi, edx
+	dec ebx
+	jnz .yloop_width_9
+	pop ebx
+	pop edi
+	pop esi
+	ret
+	
+	
+.width_17:
+.yloop_width_17:
+	; --- tap words 0..7 ---
+	movq xmm0, [esi]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [esi+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [esi+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [esi+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [esi+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [esi+3]
+	punpcklbw xmm5, xmm7
+	
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	movdqa [edi], xmm0		; aligned 16-byte store
+		
+	; --- tap words 8..16: same scheme as the 9-sample path, 8 source bytes further right ---
+	movq xmm0, [esi+8]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [esi+5+8]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [esi+1+8]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [esi+4+8]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [esi+2+8]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [esi+3+8]
+	punpcklbw xmm5, xmm7
+	
+	movdqa xmm7, xmm2
+	paddw   xmm7, xmm3
+	movdqa xmm6, xmm4
+	paddw   xmm6, xmm5
+	psllw xmm6, 2
+	psubw xmm6, xmm7
+	paddw xmm0, xmm1
+	paddw xmm0, xmm6
+	psllw xmm6, 2
+	paddw xmm0, xmm6
+	movd [edi+16], xmm0		; 4 bytes = tap words 8..9
+	
+	
+	pxor  xmm7, xmm7		; restore the zero register
+	movq xmm0, [esi+6+8]
+	punpcklbw xmm0, xmm7
+	
+	paddw xmm4, xmm1
+	paddw xmm5, xmm3
+	psllw xmm5, 2
+	psubw xmm5, xmm4
+	paddw xmm2, xmm0
+	paddw xmm2, xmm5
+	psllw xmm5, 2
+	paddw xmm2, xmm5
+	; tap words 9..16, written as two 8-byte halves
+	movq [edi+18], xmm2	
+	movhps [edi+18+8], xmm2	
+	
+	add esi, eax
+	add edi, edx
+	dec ebx
+	jnz .yloop_width_17
+	pop ebx
+	pop edi
+	pop esi
+	ret
+	
+	
+; FILTER_VER r0, r1, r2, r3, r4, r5, tmpA, tmpB, dst
+;   %1..%6 : six consecutive rows of eight signed 16-bit tap values
+;   %7, %8 : scratch registers (overwritten)
+;   %9     : 8-byte destination memory operand
+; With A = %1+%6, B = %2+%5, C = %3+%4 the macro stores
+;     ( C + ((((A - B) >> 2) + C - B) >> 2) + 32 ) >> 6
+; packed to 8 unsigned-saturated bytes.  The two staged ">> 2" steps
+; stand in for (A - 5B + 20C) / 16 while staying in 16-bit lanes.
+; %1 is destroyed; %2..%6 are left unchanged.
+%macro FILTER_VER 9
+	paddw  %1, %6			; A
+	movdqa %7, %2
+	movdqa %8, %3
+	
+	
+	paddw %7, %5			; B
+	paddw %8, %4			; C
+	
+	psubw  %1, %7   		; A - B
+	psraw   %1, 2	  		; (A - B) >> 2
+	paddw  %1, %8   		; + C
+	psubw  %1, %7 			; - B
+	psraw   %1, 2			; >> 2
+	paddw  %8, %1   		; C + ...
+	paddw  %8, [h264_mc_hc_32]	; + 32 (rounding)
+	psraw   %8, 6
+	packuswb %8, %8
+	movq %9, %8
+%endmacro
+;***********************************************************************
+;void McHorVer22VerLastAlign_sse2(
+;											uint8_t *pTap, 
+;											int32_t iTapStride, 
+;											uint8_t * pDst,
+;											int32_t iDstStride,
+;											int32_t iWidth,
+;											int32_t iHeight);
+;
+; Vertical pass over 16-bit tap rows: six consecutive tap rows of eight
+; words are combined by FILTER_VER into eight output bytes per row.
+; Tap rows are read with movdqa, so every tap row address used must be
+; 16-byte aligned.  iTapStride is added to the byte pointer directly,
+; i.e. it is in bytes.
+;
+; Register use: esi = tap row pointer, eax = iTapStride,
+;               edi = destination row pointer, edx = iDstStride,
+;               ebx = columns left (iWidth >> 3), ecx = rows left.
+; ebp is saved and restored but not otherwise used.
+;
+; Constraints visible in the code:
+;   * The first output row is produced before .start and ecx is
+;     decremented there without a zero test, so iHeight must be >= 2.
+;   * .x_loop_dec rebuilds esi/edi from the ORIGINAL stack arguments
+;     plus 16 / 8 bytes, so only iWidth >> 3 equal to 1 or 2 is handled
+;     correctly.
+;***********************************************************************
+
+ McHorVer22VerLastAlign_sse2:
+	push esi
+	push edi
+	push ebx
+	push ebp
+	
+	; four pushes + return address: first argument is at [esp+20]
+	mov esi, [esp+20]		; pTap
+	mov eax, [esp+24]		; iTapStride
+	mov edi, [esp+28]		; pDst
+	mov edx, [esp+32]		; iDstStride
+	mov ebx, [esp+36]		; iWidth
+	mov ecx, [esp+40]		; iHeight
+	shr ebx, 3				; number of 8-sample columns
+	
+.width_loop:
+	; preload six tap rows for the first output row of this column
+	movdqa xmm0, [esi]
+	movdqa xmm1, [esi+eax]
+	lea esi, [esi+2*eax]
+	movdqa xmm2, [esi]
+	movdqa xmm3, [esi+eax]
+	lea esi, [esi+2*eax]
+	movdqa xmm4, [esi]
+	movdqa xmm5, [esi+eax]
+	
+	; first output row is handled outside the rotating loop
+	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
+	dec ecx					; no zero test here (see constraints above)
+	lea esi, [esi+2*eax]
+	movdqa xmm6, [esi]
+	
+	; slide the six-row window down by one row
+	movdqa xmm0, xmm1
+	movdqa xmm1, xmm2
+	movdqa xmm2, xmm3
+	movdqa xmm3, xmm4
+	movdqa xmm4, xmm5
+	movdqa xmm5, xmm6
+	
+	add edi, edx
+	sub esi, eax			; esi back to the row before the one just loaded, as .start expects
+	
+.start:
+	; 8-step unrolled loop; register roles rotate by one each step
+	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea esi, [esi+2*eax]
+	movdqa xmm6, [esi]
+	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea edi, [edi+2*edx]
+	movdqa xmm7, [esi+eax]
+	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea esi, [esi+2*eax]
+	movdqa xmm0, [esi]
+	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea edi, [edi+2*edx]
+	movdqa xmm1, [esi+eax]
+	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea esi, [esi+2*eax]
+	movdqa xmm2, [esi]
+	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea edi, [edi+2*edx]
+	movdqa xmm3, [esi+eax]
+	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea esi, [esi+2*eax]
+	movdqa xmm4, [esi]
+	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea edi, [edi+2*edx]
+	movdqa xmm5, [esi+eax]
+	jmp near .start
+	
+.x_loop_dec:
+	; column finished: move to the next 8-sample column, if any
+	dec ebx
+	jz near .exit
+	mov esi, [esp+20]		; reload original pTap
+	mov edi, [esp+28]		; reload original pDst
+	mov ecx, [esp+40]		; reload iHeight
+	add esi, 16				; 8 words further right in the tap buffer
+	add edi, 8				; 8 bytes further right in the destination
+	jmp .width_loop
+	
+	
+	
+.exit:
+	pop ebp
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+;***********************************************************************
+;void McHorVer22VerLastUnAlign_sse2(
+;											uint8_t *pTap, 
+;											int32_t iTapStride, 
+;											uint8_t * pDst,
+;											int32_t iDstStride,
+;											int32_t iWidth,
+;											int32_t iHeight);
+;
+; Same vertical pass as McHorVer22VerLastAlign_sse2, but every tap row
+; is read with movdqu, so the tap buffer does not have to be 16-byte
+; aligned.  Six consecutive tap rows of eight 16-bit words are combined
+; by FILTER_VER into eight output bytes per row.  iTapStride is added
+; to the byte pointer directly, i.e. it is in bytes.
+;
+; Register use: esi = tap row pointer, eax = iTapStride,
+;               edi = destination row pointer, edx = iDstStride,
+;               ebx = columns left (iWidth >> 3), ecx = rows left.
+; ebp is saved and restored but not otherwise used.
+;
+; Constraints visible in the code:
+;   * The first output row is produced before .start and ecx is
+;     decremented there without a zero test, so iHeight must be >= 2.
+;   * .x_loop_dec rebuilds esi/edi from the ORIGINAL stack arguments
+;     plus 16 / 8 bytes, so only iWidth >> 3 equal to 1 or 2 is handled
+;     correctly.
+;***********************************************************************
+
+ McHorVer22VerLastUnAlign_sse2:
+	push esi
+	push edi
+	push ebx
+	push ebp
+	
+	; four pushes + return address: first argument is at [esp+20]
+	mov esi, [esp+20]		; pTap
+	mov eax, [esp+24]		; iTapStride
+	mov edi, [esp+28]		; pDst
+	mov edx, [esp+32]		; iDstStride
+	mov ebx, [esp+36]		; iWidth
+	mov ecx, [esp+40]		; iHeight
+	shr ebx, 3				; number of 8-sample columns
+	
+.width_loop:
+	; preload six tap rows for the first output row of this column
+	movdqu xmm0, [esi]
+	movdqu xmm1, [esi+eax]
+	lea esi, [esi+2*eax]
+	movdqu xmm2, [esi]
+	movdqu xmm3, [esi+eax]
+	lea esi, [esi+2*eax]
+	movdqu xmm4, [esi]
+	movdqu xmm5, [esi+eax]
+	
+	; first output row is handled outside the rotating loop
+	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
+	dec ecx					; no zero test here (see constraints above)
+	lea esi, [esi+2*eax]
+	movdqu xmm6, [esi]
+	
+	; slide the six-row window down by one row
+	movdqa xmm0, xmm1
+	movdqa xmm1, xmm2
+	movdqa xmm2, xmm3
+	movdqa xmm3, xmm4
+	movdqa xmm4, xmm5
+	movdqa xmm5, xmm6
+	
+	add edi, edx
+	sub esi, eax			; esi back to the row before the one just loaded, as .start expects
+	
+.start:
+	; 8-step unrolled loop; register roles rotate by one each step
+	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea esi, [esi+2*eax]
+	movdqu xmm6, [esi]
+	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea edi, [edi+2*edx]
+	movdqu xmm7, [esi+eax]
+	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea esi, [esi+2*eax]
+	movdqu xmm0, [esi]
+	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea edi, [edi+2*edx]
+	movdqu xmm1, [esi+eax]
+	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea esi, [esi+2*eax]
+	movdqu xmm2, [esi]
+	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea edi, [edi+2*edx]
+	movdqu xmm3, [esi+eax]
+	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea esi, [esi+2*eax]
+	movdqu xmm4, [esi]
+	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
+	dec ecx
+	jz near .x_loop_dec
+	
+	lea edi, [edi+2*edx]
+	movdqu xmm5, [esi+eax]
+	jmp near .start
+	
+.x_loop_dec:
+	; column finished: move to the next 8-sample column, if any
+	dec ebx
+	jz near .exit
+	mov esi, [esp+20]		; reload original pTap
+	mov edi, [esp+28]		; reload original pDst
+	mov ecx, [esp+40]		; reload iHeight
+	add esi, 16				; 8 words further right in the tap buffer
+	add edi, 8				; 8 bytes further right in the destination
+	jmp .width_loop
+	
+	
+	
+.exit:
+	pop ebp
+	pop ebx
+	pop edi
+	pop esi
+	ret
--- /dev/null
+++ b/codec/encoder/core/asm/memzero.asm
@@ -1,0 +1,135 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  memzero.asm
+;*
+;*  Abstract
+;*      Cache prefetch and memory-zeroing helpers (MMX / SSE2)
+;*
+;*  History
+;*      9/16/2009 Created
+;*
+;*
+;*************************************************************************/
+
+BITS 32
+
+%include "asm_inc.asm"
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text			
+		
+ALIGN 16
+;***********************************************************************
+;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
+;
+; Issues a non-temporal prefetch hint (prefetchnta) for the address
+; passed as the single stack argument.  No memory is written and no
+; MMX/XMM register is touched; eax is overwritten with the address.
+;***********************************************************************
+WELS_EXTERN WelsPrefetchZero_mmx
+WelsPrefetchZero_mmx:
+	mov  eax,[esp+4]		; _A
+	prefetchnta [eax]
+	ret 			
+
+
+ALIGN 16
+;***********************************************************************
+;   void WelsSetMemZeroAligned64_sse2(void *dst, int32_t size)
+;
+; Zero-fills `size` bytes at `dst`, 64 bytes per iteration, using four
+; aligned 16-byte stores (movdqa): dst must be 16-byte aligned.
+;
+; The byte count is negated and counted up towards zero in steps of
+; 0x40.  The loop is bottom-tested and exits only when the counter is
+; exactly zero, so `size` must be a positive multiple of 64; a size of
+; 0 or a non-multiple would not stop at the end of the buffer.
+;***********************************************************************
+WELS_EXTERN WelsSetMemZeroAligned64_sse2
+WelsSetMemZeroAligned64_sse2:
+		mov		eax,	[esp + 4]          ; dst
+		mov		ecx,	[esp + 8]          ; size
+		neg		ecx                        ; ecx = -size, counts up to 0
+			
+		pxor	xmm0,		xmm0
+.memzeroa64_sse2_loops:
+		movdqa	[eax],		xmm0
+		movdqa	[eax+16],	xmm0
+		movdqa	[eax+32],	xmm0
+		movdqa	[eax+48],	xmm0
+		add		eax, 0x40
+		
+		add ecx, 0x40
+		jnz near .memzeroa64_sse2_loops
+			
+		ret	
+
+ALIGN 16
+;***********************************************************************
+;   void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
+;
+; Zero-fills `size` bytes at `dst`, 64 bytes per iteration, using eight
+; 8-byte MMX stores.
+;
+; The byte count is negated and counted up towards zero in steps of
+; 0x40.  The loop is bottom-tested and exits only when the counter is
+; exactly zero, so `size` must be a positive multiple of 64.
+; WELSEMMS (macro defined outside this file section) is invoked before
+; returning because MMX registers were used.
+;***********************************************************************
+WELS_EXTERN WelsSetMemZeroSize64_mmx
+WelsSetMemZeroSize64_mmx:
+		mov		eax,	[esp + 4]          ; dst
+		mov		ecx,	[esp + 8]          ; size
+		neg		ecx                        ; ecx = -size, counts up to 0
+			
+		pxor	mm0,		mm0
+.memzero64_mmx_loops:
+		movq	[eax],		mm0
+		movq	[eax+8],	mm0
+		movq	[eax+16],	mm0
+		movq	[eax+24],	mm0
+		movq	[eax+32],	mm0
+		movq	[eax+40],	mm0
+		movq	[eax+48],	mm0
+		movq	[eax+56],	mm0		
+		add		eax,		0x40
+		
+		add ecx, 0x40
+		jnz near .memzero64_mmx_loops
+			
+		WELSEMMS	
+		ret	
+	
+ALIGN 16		
+;***********************************************************************
+;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
+;
+; Zero-fills `size` bytes at `dst`, one 8-byte MMX store per iteration.
+;
+; The byte count is negated and counted up towards zero in steps of 8.
+; The loop is bottom-tested and exits only when the counter is exactly
+; zero, so `size` must be a positive multiple of 8.
+; WELSEMMS (macro defined outside this file section) is invoked before
+; returning because MMX registers were used.
+;***********************************************************************
+WELS_EXTERN WelsSetMemZeroSize8_mmx
+WelsSetMemZeroSize8_mmx:
+		mov		eax,	[esp + 4]		; dst
+		mov		ecx,	[esp + 8]		; size
+		neg		ecx						; ecx = -size, counts up to 0
+		pxor	mm0,		mm0
+		
+.memzero8_mmx_loops:
+		movq	[eax],		mm0
+		add		eax,		0x08
+	
+		add		ecx,		0x08
+		jnz near .memzero8_mmx_loops
+		
+		WELSEMMS	
+		ret	
+
+							
--- /dev/null
+++ b/codec/encoder/core/asm/quant.asm
@@ -1,0 +1,394 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  quant.asm
+;*
+;*  Abstract
+;*      sse2 quantize inter-block
+;*
+;*  History
+;*      7/6/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+SECTION .text	
+;************************************************
+;NEW_QUANT 
+;************************************************
+
+; SSE2_Quant8 work, sign, ff, mf, mem
+;   %1 : work register (overwritten)
+;   %2 : register that receives the per-word sign mask (overwritten)
+;   %3 : eight 16-bit rounding offsets (ff)
+;   %4 : eight 16-bit multipliers (mf)
+;   %5 : 16-byte memory operand holding eight signed 16-bit coefficients
+; Quantizes the eight coefficients in place:
+;     q = sign(x) * ( (sat_u16(|x| + ff) * mf) >> 16 )
+; MOVDQ is defined outside this part of the file (presumably a 128-bit
+; move alias - confirm whether it requires alignment).
+%macro SSE2_Quant8  5
+		MOVDQ	%1, %5
+		pxor	%2, %2							
+		pcmpgtw	%2, %1							; %2 = 0xFFFF where x < 0
+		pxor	%1, %2							
+		psubw	%1, %2							; %1 = |x|
+		paddusw	%1, %3							; + ff, unsigned saturation
+		pmulhuw	%1, %4							; high 16 bits of unsigned product
+		pxor	%1, %2							; re-apply the sign
+		psubw	%1, %2
+		MOVDQ	%5, %1
+%endmacro
+
+; SSE2_QuantMax8 work, sign, ff, mf, mem, max
+; Same in-place quantization as SSE2_Quant8, and additionally folds the
+; quantized magnitudes (before the sign is re-applied) into %6 with a
+; per-word signed maximum (pmaxsw).
+;   %1 : work register (overwritten)
+;   %2 : sign-mask register (overwritten)
+;   %3 : eight 16-bit rounding offsets (ff)
+;   %4 : eight 16-bit multipliers (mf)
+;   %5 : 16-byte memory operand with eight signed 16-bit coefficients
+;   %6 : running per-word maximum (updated)
+%macro SSE2_QuantMax8  6
+		MOVDQ	%1, %5
+		pxor	%2, %2							
+		pcmpgtw	%2, %1							; %2 = 0xFFFF where x < 0
+		pxor	%1, %2							
+		psubw	%1, %2								; %1 = |x|
+		paddusw	%1, %3							; + ff, unsigned saturation
+		pmulhuw	%1, %4							; high 16 bits of unsigned product
+		pmaxsw	%6, %1							; track the largest magnitude per lane
+		pxor	%1, %2							; re-apply the sign
+		psubw	%1, %2
+		MOVDQ	%5, %1
+%endmacro
+
+; Stack slots of the 1st..4th cdecl arguments.  The offsets assume
+; nothing has been pushed since entry (esp still points at the return
+; address), which holds for the routines below that use them.
+%define pDct				esp + 4
+%define ff					esp + 8
+%define mf					esp + 12
+%define max					esp + 16
+;***********************************************************************
+;	void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
+;
+; Quantizes 16 coefficients (two groups of eight words) at pDct in
+; place.  The first eight words at ff and at mf are loaded once and
+; applied to both groups (see SSE2_Quant8 for the arithmetic).
+; Clobbers eax, ecx, edx, xmm0-xmm3.
+;***********************************************************************
+WELS_EXTERN WelsQuant4x4_sse2
+align 16
+WelsQuant4x4_sse2:
+		mov		eax,  [ff]		
+		mov		ecx,  [mf]			
+		MOVDQ	xmm2, [eax]		; rounding offsets
+		MOVDQ	xmm3, [ecx]		; multipliers
+		
+		mov		edx,  [pDct]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]	
+
+		ret
+	
+;***********************************************************************
+;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
+;
+; Quantizes 16 coefficients at pDct in place using a single scalar
+; rounding offset (ff) and a single scalar multiplier (mf), both passed
+; by value.  Only the low 16 bits of each argument are read.
+; SSE2_Copy8Times is defined outside this part of the file; presumably
+; it broadcasts the low word of the given register to all eight lanes
+; (confirm against its definition).
+; Clobbers eax (low word), ecx (low word), edx, xmm0-xmm3.
+;***********************************************************************
+WELS_EXTERN WelsQuant4x4Dc_sse2
+align 16
+WelsQuant4x4Dc_sse2:
+		mov		ax,		[mf]		; upper half of eax is left as-is
+		SSE2_Copy8Times xmm3, eax						
+		
+		mov		cx, [ff]			; upper half of ecx is left as-is
+		SSE2_Copy8Times xmm2, ecx						
+
+		mov		edx,  [pDct]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
+				
+		ret		
+		
+;***********************************************************************
+;	void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
+;
+; Quantizes 64 coefficients (0x80 bytes, eight groups of eight words)
+; at pDct in place.  The first eight words at ff and at mf are loaded
+; once and applied to every group (see SSE2_Quant8 for the arithmetic).
+; Clobbers eax, ecx, edx, xmm0-xmm3.
+;***********************************************************************
+WELS_EXTERN WelsQuantFour4x4_sse2
+align 16
+WelsQuantFour4x4_sse2:
+		mov		eax,  [ff]		
+		mov		ecx,  [mf]			
+		MOVDQ	xmm2, [eax]		; rounding offsets
+		MOVDQ	xmm3, [ecx]		; multipliers
+		
+		mov		edx,  [pDct]	
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]	
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x20]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x30]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x40]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x50]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x60]
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x70]	
+
+		ret
+
+;***********************************************************************
+;	void WelsQuantFour4x4Max_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf, int16_t *max);
+;
+; Quantizes 64 coefficients (eight groups of eight words) at pDct in
+; place, like WelsQuantFour4x4_sse2, and additionally writes four
+; 16-bit maxima of the quantized magnitudes to `max` (8 bytes).
+; ff is read as eight 16-bit words, the same as in the other quant
+; routines in this file.
+;
+; Each pair of 16-byte groups (0x00/0x10, 0x20/0x30, 0x40/0x50,
+; 0x60/0x70) accumulates into its own register xmm4..xmm7, i.e. one
+; accumulator per 32 bytes of coefficients.
+; SSE2_TransTwo4x4W is defined outside this part of the file.
+; NOTE(review): the reduction after it presumably leaves one maximum
+; per 32-byte group in the four stored words - confirm against the
+; macro's output register assignment.
+; Clobbers eax, ecx, edx, xmm0-xmm7.
+;***********************************************************************
+WELS_EXTERN WelsQuantFour4x4Max_sse2
+align 16
+WelsQuantFour4x4Max_sse2:
+		mov		eax,  [ff]		
+		mov		ecx,  [mf]			
+		MOVDQ	xmm2, [eax]		; rounding offsets
+		MOVDQ	xmm3, [ecx]		; multipliers
+		
+		mov		edx,  [pDct]		
+		; clear the four running-maximum accumulators
+		pxor	xmm4, xmm4
+		pxor	xmm5, xmm5
+		pxor	xmm6, xmm6
+		pxor	xmm7, xmm7
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx	   ], xmm4		
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10], xmm4
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x20], xmm5
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x30], xmm5
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x40], xmm6
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x50], xmm6
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x60], xmm7
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x70], xmm7
+		
+		; rearrange the accumulators, then reduce them with word-wise maxima
+		SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
+		pmaxsw  xmm0,  xmm4	
+		pmaxsw  xmm0,  xmm5
+		pmaxsw  xmm0,  xmm7			
+		; fold the upper 64 bits onto the lower 64 bits
+		movdqa	xmm1,  xmm0
+		punpckhqdq	xmm0, xmm1
+		pmaxsw	xmm0, xmm1
+
+		mov		edx,  [max]	
+		movq	[edx], xmm0		; four 16-bit maxima
+			
+		ret		
+
+; MMX_Copy4Times dst_mm, src32
+; Broadcasts the low 16 bits of the 32-bit operand %2 into all four
+; words of the MMX register %1.
+%macro  MMX_Copy4Times 2
+		movd		%1, %2
+		punpcklwd	%1, %1			; w0 w0 w1 w1
+		punpckldq	%1,	%1			; w0 w0 w0 w0
+%endmacro
+
+SECTION .text
+
+; MMX_Quant4 val, sign, ff, mf
+;   %1 : four signed 16-bit coefficients, quantized in place
+;   %2 : register that receives the per-word sign mask (overwritten)
+;   %3 : four 16-bit rounding offsets (ff)
+;   %4 : four 16-bit multipliers (mf)
+; Computes q = sign(x) * ( (sat_u16(|x| + ff) * mf) >> 16 ) per word.
+%macro MMX_Quant4  4
+		pxor	%2, %2							
+		pcmpgtw	%2, %1							; %2 = 0xFFFF where x < 0
+		pxor	%1, %2							
+		psubw	%1, %2							; %1 = |x|
+		paddusw	%1, %3							; + ff, unsigned saturation
+		pmulhuw	%1, %4							; high 16 bits of unsigned product
+		pxor	%1, %2							; re-apply the sign
+		psubw	%1, %2
+%endmacro
+
+; Stack slots of the 4th and 5th cdecl arguments (valid while nothing
+; has been pushed since entry).
+%define dct2x2				esp + 16
+%define iChromaDc			esp + 20
+;***********************************************************************
+;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
+;
+; Gathers the first coefficient of four blocks spaced 0x20 bytes apart
+; in the buffer given by the 1st argument, zeroes those four words in
+; the buffer, applies a 2x2 Hadamard transform, quantizes the four
+; results with the scalar ff / mf, stores them (8 bytes) to the
+; buffers given by the 4th and 5th arguments, and returns the number
+; of non-zero quantized values in eax.
+;
+; Naming note: the stack-slot macros do not match the prototype names.
+;   [pDct]      = esp+4  = 1st argument (`rs` in the prototype)
+;   [ff]        = esp+8  = 2nd argument
+;   [mf]        = esp+12 = 3rd argument
+;   [dct2x2]    = esp+16 = 4th argument (`pDct` in the prototype)
+;   [iChromaDc] = esp+20 = 5th argument (`block` in the prototype)
+;
+; Clobbers eax, ecx, edx, mm0-mm3, mm5; WELSEMMS is invoked before
+; returning.
+;***********************************************************************
+WELS_EXTERN WelsHadamardQuant2x2_mmx
+align 16
+WelsHadamardQuant2x2_mmx:
+
+		mov			eax,			[pDct]
+		; movd reads 32 bits, but only the low word of each load is used
+		movd		mm0,			[eax]
+		movd		mm1,			[eax + 0x20]
+		punpcklwd	mm0,			mm1			; low words: dc0, dc1
+		movd		mm3,			[eax + 0x40]
+		movd		mm1,			[eax + 0x60]
+		punpcklwd	mm3,			mm1			; low words: dc2, dc3
+		
+		; clear the four gathered coefficients in the source buffer
+		mov			cx,				0
+		mov			[eax],			cx
+		mov			[eax + 0x20],	cx
+		mov			[eax + 0x40],	cx
+		mov			[eax + 0x60],	cx
+		
+		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
+		movq		mm5,			mm3
+		paddw		mm3,			mm0			; dc0+dc2, dc1+dc3
+		psubw		mm0,			mm5			; dc0-dc2, dc1-dc3
+		punpcklwd	mm3,			mm0			; s0, d0, s1, d1
+		movq		mm1,			mm3
+		psrlq		mm1,			32			; s1, d1, 0, 0
+		movq		mm5,			mm1
+		paddw		mm1,			mm3			; s0+s1, d0+d1, ...
+		psubw		mm3,			mm5			; s0-s1, d0-d1, ...
+		punpcklwd	mm1,			mm3			; s0+s1, s0-s1, d0+d1, d0-d1
+		
+		;quant_2x2_dc
+		mov			ax,				[mf]
+		MMX_Copy4Times	mm3,		eax		
+		mov			cx,				[ff]
+		MMX_Copy4Times	mm2,		ecx
+		MMX_Quant4		mm1,	mm0,	mm2,	mm3
+		
+		; store dct_2x2
+		mov			edx,			[dct2x2]	
+		movq		[edx],			mm1
+		mov			ecx,			[iChromaDc]
+		movq		[ecx],			mm1
+		
+		; pNonZeroCount of dct_2x2
+		pcmpeqb		mm2,			mm2		; mm2 = FF 
+		pxor		mm3,			mm3
+		packsswb	mm1,			mm3		; four saturated bytes + four zero bytes
+		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
+		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
+		psadbw		mm1,			mm3		; sum of the bytes = number of non-zero values
+		movd		eax,			mm1
+					
+		WELSEMMS
+		ret
+	
+;***********************************************************************
+;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff,  int16_t mf);
+;
+; Read-only variant of WelsHadamardQuant2x2_mmx: gathers the first
+; coefficient of four blocks spaced 0x20 bytes apart at pDct, applies
+; the 2x2 Hadamard transform and the scalar ff / mf quantization, and
+; returns the number of non-zero quantized values in eax.  Nothing is
+; written back to memory.
+; Clobbers eax, ecx, mm0-mm3, mm5; WELSEMMS is invoked before returning.
+;***********************************************************************
+WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
+align 16
+WelsHadamardQuant2x2Skip_mmx:
+
+		mov			eax,			[pDct]
+		; movd reads 32 bits, but only the low word of each load is used
+		movd		mm0,			[eax]
+		movd		mm1,			[eax + 0x20]
+		punpcklwd	mm0,			mm1			; low words: dc0, dc1
+		movd		mm3,			[eax + 0x40]
+		movd		mm1,			[eax + 0x60]
+		punpcklwd	mm3,			mm1			; low words: dc2, dc3
+		
+		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
+		movq		mm5,			mm3
+		paddw		mm3,			mm0			; dc0+dc2, dc1+dc3
+		psubw		mm0,			mm5			; dc0-dc2, dc1-dc3
+		punpcklwd	mm3,			mm0			; s0, d0, s1, d1
+		movq		mm1,			mm3
+		psrlq		mm1,			32			; s1, d1, 0, 0
+		movq		mm5,			mm1
+		paddw		mm1,			mm3			; s0+s1, d0+d1, ...
+		psubw		mm3,			mm5			; s0-s1, d0-d1, ...
+		punpcklwd	mm1,			mm3			; s0+s1, s0-s1, d0+d1, d0-d1
+		
+		;quant_2x2_dc
+		mov			ax,				[mf]
+		MMX_Copy4Times	mm3,		eax		
+		mov			cx,				[ff]
+		MMX_Copy4Times	mm2,		ecx
+		MMX_Quant4		mm1,	mm0,	mm2,	mm3
+		
+		; pNonZeroCount of dct_2x2
+		pcmpeqb		mm2,			mm2		; mm2 = FF 
+		pxor		mm3,			mm3
+		packsswb	mm1,			mm3		; four saturated bytes + four zero bytes
+		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
+		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
+		psadbw		mm1,			mm3		; sum of the bytes = number of non-zero values
+		movd		eax,			mm1
+			
+		WELSEMMS		
+		ret	
+		
+		
+; SSE2_DeQuant8 mem, tmp, mf
+;   %1 : 16-byte memory operand with eight 16-bit coefficients (in/out)
+;   %2 : scratch register (overwritten)
+;   %3 : eight 16-bit multipliers
+; Multiplies the eight coefficients by %3 word-wise (low 16 bits of
+; each product, pmullw) and writes them back.
+%macro SSE2_DeQuant8 3  
+    MOVDQ  %2, %1
+    pmullw %2, %3
+    MOVDQ  %1, %2
+%endmacro 
+
+
+ALIGN  16
+;***********************************************************************
+; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
+;
+; Multiplies 16 coefficients (two groups of eight words) at pDct in
+; place by the eight 16-bit multipliers at mf; the same eight
+; multipliers are applied to both groups.
+; mf is read with movdqa and must therefore be 16-byte aligned.
+; Clobbers ecx, edx, xmm0, xmm1.
+;***********************************************************************
+align 16
+WELS_EXTERN WelsDequant4x4_sse2
+WelsDequant4x4_sse2:
+	;ecx = dequant_mf[qp], edx = pDct
+	mov		ecx,  [esp + 8]
+	mov		edx,  [esp + 4]
+
+	movdqa  xmm1, [ecx]
+	SSE2_DeQuant8 [edx		],  xmm0, xmm1
+	SSE2_DeQuant8 [edx+0x10	],  xmm0, xmm1
+
+    ret
+
+;***********************************************************************====
+;void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* mf);
+;
+; Multiplies 64 coefficients (0x80 bytes, eight groups of eight words)
+; at pDct in place by the eight 16-bit multipliers at mf; the same
+; eight multipliers are applied to every group.
+; mf is read with movdqa and must therefore be 16-byte aligned.
+; Clobbers ecx, edx, xmm0, xmm1.
+;***********************************************************************====
+    
+align 16
+
+WELS_EXTERN WelsDequantFour4x4_sse2
+WelsDequantFour4x4_sse2:
+    ;ecx = dequant_mf[qp], edx = pDct
+	mov		ecx,  [esp + 8]
+	mov		edx,  [esp + 4]
+
+	movdqa  xmm1, [ecx]
+	SSE2_DeQuant8 [edx		],  xmm0, xmm1
+	SSE2_DeQuant8 [edx+0x10	],  xmm0, xmm1
+	SSE2_DeQuant8 [edx+0x20	],  xmm0, xmm1
+	SSE2_DeQuant8 [edx+0x30	],  xmm0, xmm1
+	SSE2_DeQuant8 [edx+0x40	],  xmm0, xmm1
+	SSE2_DeQuant8 [edx+0x50	],  xmm0, xmm1
+	SSE2_DeQuant8 [edx+0x60	],  xmm0, xmm1
+	SSE2_DeQuant8 [edx+0x70	],  xmm0, xmm1
+
+    ret
+
+;***********************************************************************
+;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
+;
+; Multiplies the 16 coefficients at rs by the scalar mf (low 16 bits of
+; the 2nd argument, broadcast to every lane), runs them through two
+; rounds of butterfly + transpose steps (the "ihdm_4x4" section) and
+; writes the 16 results back to rs.
+;
+; SSE2_Copy8Times, SSE2_SumSub and SSE2_TransTwo4x4W are macros defined
+; outside this part of the file; the per-line register notes below are
+; the original author's and describe SSE2_SumSub's intended effect
+; (NOTE(review): confirm against the macro definitions).
+; Clobbers eax, ecx (low word), xmm0-xmm5.
+;***********************************************************************
+WELS_EXTERN WelsDequantIHadamard4x4_sse2
+align 16
+WelsDequantIHadamard4x4_sse2:
+		mov			eax,			[esp + 4]				; rs
+		mov			cx,				[esp + 8]				; mf (low word only)
+		
+		; WelsDequantLumaDc4x4
+		SSE2_Copy8Times	xmm1,		ecx		
+		;psrlw		xmm1,		2		; for the (>>2) in ihdm
+		MOVDQ		xmm0,		[eax]
+		MOVDQ		xmm2,		[eax+0x10]
+		pmullw		xmm0,		xmm1		
+		pmullw		xmm2,		xmm1
+
+		; ihdm_4x4
+		; split each 8-word register so every 4-word row sits in the low half of its own register
+		movdqa		xmm1,		xmm0
+		psrldq		xmm1,		8
+		movdqa		xmm3,		xmm2
+		psrldq		xmm3,		8		
+		
+		SSE2_SumSub		xmm0, xmm3,	xmm5					; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3	
+		SSE2_SumSub		xmm1, xmm2, xmm5					; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2														
+		SSE2_SumSub		xmm3, xmm2, xmm5					; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
+		SSE2_SumSub		xmm0, xmm1, xmm5               		; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
+
+		SSE2_TransTwo4x4W	xmm2, xmm1, xmm3, xmm0, xmm4		
+		SSE2_SumSub		xmm2, xmm4,	xmm5		
+		SSE2_SumSub		xmm1, xmm0, xmm5																		
+		SSE2_SumSub		xmm4, xmm0, xmm5							
+		SSE2_SumSub		xmm2, xmm1, xmm5 
+		SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
+		
+		; recombine pairs of 4-word rows and store the 16 results
+		punpcklqdq	xmm0,		xmm1
+		MOVDQ		[eax],		xmm0
+		
+		punpcklqdq	xmm2,		xmm3
+		MOVDQ		[eax+16],	xmm2			
+		ret
+	
\ No newline at end of file
--- /dev/null
+++ b/codec/encoder/core/asm/satd_sad.asm
@@ -1,0 +1,2189 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  satd_sad.asm
+;*
+;*  Abstract
+;*      WelsSampleSatd4x4_sse2
+;*      WelsSampleSatd8x8_sse2
+;*      WelsSampleSatd16x8_sse2
+;*      WelsSampleSatd8x16_sse2
+;*      WelsSampleSatd16x16_sse2
+;*      
+;*      WelsSampleSad16x8_sse2
+;*      WelsSampleSad16x16_sse2
+;*
+;*  History
+;*      8/5/2009 Created
+;*     24/9/2009 modified
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+
+;***********************************************************************
+; Data
+;
+; 16-byte aligned constant vectors; every user in this file loads them
+; with movdqa.
+;***********************************************************************
+SECTION .rodata align=16
+
+; byte multipliers for pmaddubsw: low 8 bytes add adjacent byte pairs,
+; high 8 bytes subtract them (first minus second)
+align 16
+HSumSubDB1:   db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
+; word multipliers for pmaddwd: difference of each adjacent word pair
+align 16
+HSumSubDW1:   dw 1,-1,1,-1,1,-1,1,-1
+; word multipliers for pmaddwd: sum of each adjacent word pair
+align 16
+PDW1:  dw 1,1,1,1,1,1,1,1
+; the value 2 in the low word of each quadword; used as the rounding term
+; of the "(sum+2)>>2" step in SSE41_ChromaGetX38x8Satd
+align 16
+PDQ2:  dw 2,0,0,0,2,0,0,0
+; byte multipliers for pmaddubsw used by WelsSampleSatd4x4_sse41:
+; per 8 bytes, two pair sums followed by two pair differences
+align 16
+HSwapSumSubDB1:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 BEGIN
+;
+;***********************************************************************
+; MMX_DW_1_2REG dst, tmp
+; Sets every 16-bit lane of dst to 1 without touching memory:
+; dst = 0 - (-1). tmp is left with all bits set.
+%macro MMX_DW_1_2REG 2
+      pxor %1, %1			; dst = 0
+      pcmpeqw %2, %2		; tmp = 0xFFFF in every word (-1)
+      psubw %1, %2			; dst = 0 - (-1) = 1 per word
+%endmacro
+
+; SSE2_SumWHorizon1 acc, tmp
+; Folds the eight words of acc with unsigned saturating adds (paddusw) so
+; that word 0 of acc ends up holding the sum of all eight lanes; the other
+; lanes hold partial sums. tmp is clobbered.
+%macro  SSE2_SumWHorizon1 2
+	movdqa      %2, %1
+	psrldq      %2, 8			; bring words 4..7 down onto 0..3
+	paddusw     %1, %2
+	movdqa      %2, %1
+	psrldq      %2, 4			; bring words 2..3 down onto 0..1
+	paddusw     %1, %2
+	movdqa      %2, %1
+	psrldq      %2, 2			; bring word 1 down onto word 0
+	paddusw     %1, %2
+%endmacro
+
+; SSE2_HDMTwo4x4 r1, r2, r3, r4, tmp
+; Four SSE2_SumSub butterflies over the four row registers, with tmp as the
+; scratch register handed to SSE2_SumSub.
+; NOTE(review): SSE2_SumSub is defined outside this view; the output
+; register order quoted below is taken from the original comment.
+%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
+   SSE2_SumSub %1, %2, %5
+   SSE2_SumSub %3, %4, %5 
+   SSE2_SumSub %2, %4, %5 
+   SSE2_SumSub %1, %3, %5 
+%endmacro 
+
+; SSE2_SumAbs4 a, b, tmp1, c, d, tmp2, acc
+; Applies WELS_AbsW to a and b (scratch tmp1) and to c and d (scratch tmp2),
+; then adds all four into acc with unsigned saturating word adds.
+; a and c are overwritten with the pair sums a+b and c+d.
+; NOTE(review): WELS_AbsW is defined outside this view; presumably a
+; per-word absolute value -- confirm.
+%macro SSE2_SumAbs4 7  
+	WELS_AbsW %1, %3
+	WELS_AbsW %2, %3
+	WELS_AbsW %4, %6
+	WELS_AbsW %5, %6
+	paddusw       %1, %2
+	paddusw       %4, %5
+	paddusw       %7, %1
+	paddusw       %7, %4
+%endmacro 
+
+; SSE2_SumWHorizon acc, tmp, zero
+; Horizontal sum of the eight words of acc; the total is left in the low
+; dword of acc. tmp is clobbered. The punpcklwd against the third operand
+; widens words to dwords, so every caller in this file passes a register
+; that was cleared with pxor.
+%macro  SSE2_SumWHorizon 3
+	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
+	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04 
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26 
+	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
+	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
+	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
+%endmacro
+
+; SSE2_GetSatd8x8
+; Processes 8 rows of 8 pixels as two groups of 4 rows and accumulates the
+; absolute transformed differences into xmm6 (saturating word adds).
+; Register contract, as used by the callers in this file:
+;   eax/ebx = first block pointer / stride, ecx/edx = second block pointer /
+;   stride, xmm7 = zero on entry, xmm6 = running accumulator.
+; Clobbers xmm0-xmm5. eax and ecx are advanced by 6 rows in total (three
+; "lea +2*stride"), so a caller continuing downwards adds 2 more rows.
+; NOTE(review): SSE2_LoadDiff8P and SSE2_TransTwo4x4W are defined outside
+; this view; presumably "load 8 pixels from both sources and subtract as
+; words" and "transpose two 4x4 word blocks" -- confirm.
+%macro SSE2_GetSatd8x8 0
+	; rows 0..3
+	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[eax],[ecx]
+	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[eax+ebx],[ecx+edx]
+	lea                 eax, [eax+2*ebx]
+	lea                 ecx, [ecx+2*edx]
+	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[eax],[ecx]
+	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]
+	
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5 
+	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+	
+	; rows 4..7
+	lea					eax,    [eax+2*ebx]
+    lea					ecx,    [ecx+2*edx]
+	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[eax],[ecx]
+	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[eax+ebx],[ecx+edx]
+	lea                 eax, [eax+2*ebx]
+	lea                 ecx, [ecx+2*edx]
+	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[eax],[ecx]
+	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]
+	
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5 
+	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6	
+%endmacro
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+; Arguments (read after "push ebx", hence the +4 on every offset):
+;   [esp+8]  -> eax  first 4x4 block
+;   [esp+12] -> ebx  stride of the first block
+;   [esp+16] -> ecx  second 4x4 block
+;   [esp+20] -> edx  stride of the second block
+; Returns in eax: (low 16 bits of the saturating sum of absolute
+; transformed differences) >> 1.
+; ebx is saved and restored; xmm0-xmm7 are clobbered.
+; NOTE(review): SSE2_XSawp and WELS_AbsW are defined outside this view.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd4x4_sse2
+align 16
+WelsSampleSatd4x4_sse2:
+	push      ebx
+	mov       eax,  [esp+8]
+	mov       ebx,  [esp+12]
+	mov       ecx,  [esp+16]
+	mov       edx,  [esp+20]    
+	
+    ; gather the four 4-byte rows of the first block: xmm0 = rows 0|2, xmm1 = rows 1|3
+    movd      xmm0, [eax]
+    movd      xmm1, [eax+ebx]
+    lea       eax , [eax+2*ebx]
+    movd      xmm2, [eax]
+    movd      xmm3, [eax+ebx]
+    punpckldq xmm0, xmm2
+    punpckldq xmm1, xmm3
+   
+    ; same layout for the second block: xmm4 = rows 0|2, xmm5 = rows 1|3
+    movd      xmm4, [ecx]
+    movd      xmm5, [ecx+edx]
+    lea       ecx , [ecx+2*edx]
+    movd      xmm6, [ecx]
+    movd      xmm7, [ecx+edx]
+    punpckldq xmm4, xmm6
+    punpckldq xmm5, xmm7
+
+    ; zero-extend bytes to words; xmm6 stays zero and later serves as the accumulator
+    pxor      xmm6, xmm6
+    punpcklbw xmm0, xmm6
+    punpcklbw xmm1, xmm6
+    punpcklbw xmm4, xmm6
+    punpcklbw xmm5, xmm6
+    
+    ; pixel differences as signed words
+    psubw     xmm0, xmm4
+    psubw     xmm1, xmm5
+
+    ; butterfly stages (sum / difference), interleaved with lane swaps
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
+    SSE2_XSawp qdq, xmm0, xmm2, xmm3
+    
+    movdqa     xmm4, xmm0
+    paddw      xmm0, xmm3
+    psubw      xmm4, xmm3
+
+    movdqa         xmm2, xmm0
+    punpcklwd      xmm0, xmm4
+    punpckhwd      xmm4, xmm2
+    
+	SSE2_XSawp     dq,  xmm0, xmm4, xmm3
+	SSE2_XSawp     qdq, xmm0, xmm3, xmm5
+
+    movdqa         xmm7, xmm0
+    paddw          xmm0, xmm5
+    psubw          xmm7, xmm5
+    
+	SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
+
+    movdqa         xmm2, xmm0
+    paddw          xmm0, xmm1
+    psubw          xmm2, xmm1
+    
+    ; accumulate absolute values into xmm6 (still zero at this point)
+    WELS_AbsW  xmm0, xmm3   
+    paddusw        xmm6, xmm0
+	WELS_AbsW  xmm2, xmm4   
+    paddusw        xmm6, xmm2
+    SSE2_SumWHorizon1  xmm6, xmm4
+	movd           eax,  xmm6
+    and            eax,  0xffff		; only word 0 holds the total
+    shr            eax,  1
+	pop            ebx
+	ret
+ 
+ ;***********************************************************************
+ ;
+ ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+ ;
+ ; Arguments (read after "push ebx"): [esp+8] first block, [esp+12] its
+ ; stride, [esp+16] second block, [esp+20] its stride.
+ ; One SSE2_GetSatd8x8 pass; the word accumulator xmm6 is halved per lane
+ ; and then summed horizontally. Result in eax. ebx is preserved.
+ ;
+ ;***********************************************************************
+ WELS_EXTERN WelsSampleSatd8x8_sse2
+align 16
+ WelsSampleSatd8x8_sse2:
+	 push   ebx
+	 mov    eax,    [esp+8]
+	 mov    ebx,    [esp+12]
+	 mov    ecx,    [esp+16]
+	 mov    edx,    [esp+20]    
+	 pxor   xmm6,   xmm6		; accumulator
+     pxor   xmm7,   xmm7     	; zero register needed by the macros below
+     SSE2_GetSatd8x8	 
+     psrlw   xmm6,  1			; halve each lane before the horizontal sum
+	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	 movd    eax,   xmm6
+	 pop     ebx
+	 ret
+ 
+ ;***********************************************************************
+ ;
+ ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+ ;
+ ; Same argument layout as WelsSampleSatd8x8_sse2. Two SSE2_GetSatd8x8
+ ; passes stacked vertically (rows 0-7, then rows 8-15) share the
+ ; accumulator xmm6. Result in eax. ebx is preserved.
+ ;
+ ;***********************************************************************
+ WELS_EXTERN WelsSampleSatd8x16_sse2
+align 16
+ WelsSampleSatd8x16_sse2:
+	 push   ebx
+	 mov    eax,    [esp+8]
+	 mov    ebx,    [esp+12]
+	 mov    ecx,    [esp+16]
+	 mov    edx,    [esp+20]    
+	 pxor   xmm6,   xmm6
+     pxor   xmm7,   xmm7  
+        
+	 SSE2_GetSatd8x8	 
+     ; the macro leaves the pointers on row 6; step 2 more rows to reach row 8
+     lea    eax,    [eax+2*ebx]
+     lea    ecx,    [ecx+2*edx]     
+	 SSE2_GetSatd8x8	
+	  
+	 psrlw   xmm6,  1
+	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	 movd    eax,   xmm6
+	 pop     ebx
+	 ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+; Same argument layout as WelsSampleSatd8x8_sse2. Two SSE2_GetSatd8x8
+; passes side by side (columns 0-7, then columns 8-15) share the
+; accumulator xmm6. Result in eax. ebx is preserved.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse2
+align 16
+WelsSampleSatd16x8_sse2:
+	push   ebx
+	mov    eax,    [esp+8]
+	mov    ebx,    [esp+12]
+	mov    ecx,    [esp+16]
+	mov    edx,    [esp+20]    
+	pxor   xmm6,   xmm6
+    pxor   xmm7,   xmm7
+    
+	SSE2_GetSatd8x8
+	; reload the original pointers and move 8 pixels to the right
+	mov    eax,    [esp+8]
+    mov    ecx,    [esp+16]
+    add    eax,    8
+    add    ecx,    8    
+	SSE2_GetSatd8x8
+
+	psrlw   xmm6,  1
+	SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	movd    eax,   xmm6
+	pop     ebx
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+; Same argument layout as WelsSampleSatd8x8_sse2. Four SSE2_GetSatd8x8
+; passes (left column top/bottom, then right column top/bottom) share the
+; saturating word accumulator xmm6. Result in eax. ebx is preserved.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x16_sse2
+align 16
+WelsSampleSatd16x16_sse2:
+	push   ebx
+	mov    eax,    [esp+8]
+	mov    ebx,    [esp+12]
+	mov    ecx,    [esp+16]
+	mov    edx,    [esp+20]    
+	pxor   xmm6,   xmm6
+    pxor   xmm7,   xmm7
+    
+	; left half: rows 0-7, then rows 8-15
+	SSE2_GetSatd8x8		
+	lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]	
+	SSE2_GetSatd8x8
+	
+	; right half: reload the original pointers, shift by 8 pixels
+	mov    eax,    [esp+8]
+	mov    ecx,    [esp+16]
+	add    eax,    8
+	add    ecx,    8
+	
+	SSE2_GetSatd8x8	
+	lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]	
+	SSE2_GetSatd8x8
+	
+ ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
+    psrlw   xmm6,  1
+	SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	movd    eax,   xmm6
+	pop     ebx
+	ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 BEGIN 
+;
+;***********************************************************************
+
+; SSE41_I16x16Get8WSumSub v, t1, t2
+; Transforms one vector of border pixels into 8 words using the constant
+; registers xmm5 (HSumSubDB1), xmm6 (HSumSubDW1) and xmm7 (PDW1):
+;   - pmaddubsw with xmm5 forms pair sums / pair differences of the bytes;
+;   - pmaddwd with xmm7 and xmm6 forms sums and differences of word pairs;
+;   - the dword results are interleaved, added into xmm4 (DC accumulator),
+;     packed back to signed words and multiplied by 4 (psllw 2).
+; Result in v; t1 and t2 are clobbered; xmm4 is updated.
+; The callers in this file pass v with the same 8 bytes in both halves.
+%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 
+	pmaddubsw    %1, xmm5
+	movdqa       %2, %1
+	pmaddwd      %1, xmm7
+	pmaddwd      %2, xmm6
+	movdqa       %3, %1
+	punpckldq    %1, %2 
+	punpckhdq    %2, %3 
+	movdqa       %3, %1
+	punpcklqdq   %1, %2
+	punpckhqdq   %3, %2
+	paddd        xmm4, %1 ;for dc
+	paddd        xmm4, %3 ;for dc
+	packssdw     %1, %3
+	psllw        %1, 2
+%endmacro
+; SSE41_ChromaGet8WSumSub v, t1, t2, dcOut
+; Chroma variant of SSE41_I16x16Get8WSumSub: same pmaddubsw / pmaddwd
+; sequence with xmm5, xmm6, xmm7 as constants, but instead of accumulating
+; into xmm4 it hands the DC-related dwords back in dcOut (low quadwords of
+; the two interleaved results). Result in v (packed words, multiplied by 4);
+; t1 and t2 are clobbered.
+%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2  
+	pmaddubsw    %1, xmm5
+	movdqa       %2, %1
+	pmaddwd      %1, xmm7
+	pmaddwd      %2, xmm6
+	movdqa       %3, %1
+	punpckldq    %1, %2 
+	punpckhdq    %2, %3 
+	movdqa       %3, %1
+	punpcklqdq   %1, %2
+	punpckhqdq   %3, %2
+;    paddd        xmm4, %1 ;for dc
+;	 paddd        xmm4, %3 ;for dc
+	movdqa       %4, %1
+	punpcklqdq   %4, %3 
+	packssdw     %1, %3
+	psllw        %1, 2
+%endmacro
+
+; SSE41_GetX38x4SatdDec
+; Loads four rows of 8 pixels from eax (stride ebx), widens them to words
+; and runs HDMTwo4x4 / TransTwo4x4W / HDMTwo4x4 on them.
+; eax is advanced by 4 rows. Clobbers xmm0-xmm3 and xmm7; per the original
+; comment the results are left in xmm7, xmm1, xmm3, xmm2.
+%macro SSE41_GetX38x4SatdDec 0
+	pxor        xmm7,   xmm7
+	movq        xmm0,   [eax]
+	movq        xmm1,   [eax+ebx]
+	lea         eax,    [eax+2*ebx]
+	movq        xmm2,   [eax]
+	movq        xmm3,   [eax+ebx]
+	lea         eax,    [eax+2*ebx]
+	punpcklbw   xmm0,   xmm7
+	punpcklbw   xmm1,   xmm7
+	punpcklbw   xmm2,   xmm7
+	punpcklbw   xmm3,   xmm7
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
+	;doesn't need another transpose
+%endmacro
+; SSE41_GetX38x4SatdV idx, off
+; Accumulates the cost against the "V" data stored at [esi+off .. esi+off+15]
+; into xmm4. For each of the four transformed rows (xmm7, xmm1, xmm3, xmm2)
+; a vector is built that is zero except for word 0 and word 4, which are
+; taken from the stored data; the row is subtracted (saturating), the
+; absolute value taken and added to xmm4.
+; The first parameter is not referenced by the body. Clobbers xmm0.
+%macro SSE41_GetX38x4SatdV 2
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2],   0
+	pinsrw      xmm0,   word[esi+%2+8], 4
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0 
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2+2],  0
+	pinsrw      xmm0,   word[esi+%2+10], 4
+	psubsw      xmm0,   xmm1
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0 
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2+4],  0
+	pinsrw      xmm0,   word[esi+%2+12], 4
+	psubsw      xmm0,   xmm3
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0 
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2+6],  0
+	pinsrw      xmm0,   word[esi+%2+14], 4
+	psubsw      xmm0,   xmm2
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0 
+%endmacro
+; SSE41_GetX38x4SatdH idx, off, base
+; Accumulates the cost against the "H" data into xmm5: the 8 bytes at
+; [esi+base+8*idx] are duplicated into both halves, xmm7 is subtracted,
+; and the absolute value is added to xmm5 together with |xmm1|+|xmm2|+|xmm3|.
+; Side effects relied on by the DC macros that follow: xmm1 and xmm3 are
+; replaced by their absolute values and xmm2 is left holding
+; |xmm1|+|xmm2|+|xmm3|. The second parameter is not referenced by the body.
+%macro SSE41_GetX38x4SatdH  3
+	movq        xmm0,   [esi+%3+8*%1]
+	punpcklqdq  xmm0,   xmm0
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm5,   xmm0
+	pabsw       xmm1,   xmm1
+	pabsw       xmm2,   xmm2
+	pabsw       xmm3,   xmm3
+	paddw       xmm2,   xmm1;for DC
+	paddw       xmm2,   xmm3;for DC
+	paddw       xmm5,   xmm2
+%endmacro
+; SSE41_I16X16GetX38x4SatdDC
+; Accumulates the cost against the DC value kept in mm4 into xmm6:
+; mm4 is copied to both quadwords of xmm0, xmm7 is subtracted, and the
+; absolute value plus xmm2 is added to xmm6.
+; Expects xmm2 as left by SSE41_GetX38x4SatdH. Clobbers xmm0.
+%macro SSE41_I16X16GetX38x4SatdDC 0
+	pxor        xmm0,   xmm0
+	movq2dq     xmm0,   mm4
+	punpcklqdq  xmm0,   xmm0
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm6,   xmm0
+	paddw       xmm6,   xmm2 
+%endmacro
+; SSE41_ChromaGetX38x4SatdDC idxReg
+; Accumulates the cost against the 16-byte DC vector stored at
+; [esi+32+16*idx] into xmm6 (plus xmm2, as left by SSE41_GetX38x4SatdH).
+; WARNING: the argument register itself is shifted left by 4 and is NOT
+; restored, so the caller's index register is modified by this macro.
+; The load uses movdqa, so esi+32 must be 16-byte aligned. Clobbers xmm0.
+%macro SSE41_ChromaGetX38x4SatdDC 1
+	shl         %1,     4
+	movdqa      xmm0,   [esi+32+%1]
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm6,   xmm0
+	paddw       xmm6,   xmm2 
+%endmacro
+; SSE41_I16x16GetX38x4Satd idx, off
+; One 8x4 step of the luma 16x16 three-mode search: transform the source
+; rows, then update the V (xmm4), H (xmm5) and DC (xmm6) accumulators.
+; idx selects the H entry (base offset 32 in the temp buffer at esi),
+; off selects the V entry.
+%macro SSE41_I16x16GetX38x4Satd 2
+	SSE41_GetX38x4SatdDec
+	SSE41_GetX38x4SatdV   %1, %2
+	SSE41_GetX38x4SatdH   %1, %2, 32
+	SSE41_I16X16GetX38x4SatdDC 
+%endmacro
+; SSE41_ChromaGetX38x4Satd idxReg, off
+; One 8x4 step of the chroma 8x8 three-mode search: transform the source
+; rows, then update the V (xmm4), H (xmm5) and DC (xmm6) accumulators.
+; The H data is read from base offset 16 in the temp buffer at esi.
+; NOTE: the final DC macro shifts idxReg left by 4 in place, so idxReg is
+; modified on return.
+%macro SSE41_ChromaGetX38x4Satd 2
+	SSE41_GetX38x4SatdDec
+	SSE41_GetX38x4SatdV   %1, %2
+	SSE41_GetX38x4SatdH   %1, %2, 16
+	SSE41_ChromaGetX38x4SatdDC %1
+%endmacro
+; SSE41_HSum8W acc, mul, tmp
+; Horizontal reduction of acc: pmaddwd with mul turns the eight words into
+; four dwords, which are then added together; the total is left in the low
+; dword of acc. The callers in this file pass mul with every word equal
+; to 1, which makes the result the plain sum of the eight words.
+; tmp is clobbered.
+%macro SSE41_HSum8W 3
+	pmaddwd     %1, %2 
+	movhlps     %3, %1 
+	paddd       %1, %3 
+	pshuflw     %3, %1,0Eh 
+	paddd       %1, %3 
+%endmacro
+
+; WelsIntra16x16Combined3Satd_sse41
+; Evaluates three 16x16 intra candidates (V, H, DC) in one pass and returns
+; the smallest cost in eax; the chosen mode index is stored through the
+; fifth argument (0 = V, 1 = H, 2 = DC, per the constants written below).
+; Stack arguments, read after three pushes (first argument at [esp+16]):
+;   [esp+16] -> ecx  block whose top row / left column supply the border
+;   [esp+20] -> edx  stride of that block
+;   [esp+24] -> eax  source block the candidates are compared against
+;   [esp+28] -> ebx  stride of the source block
+;   [esp+32]         pointer receiving the mode index (dword store)
+;   [esp+36]         lambda; 2*lambda is added to the H and DC costs
+;   [esp+40] -> esi  temp buffer, 64 bytes written with movdqa (so it
+;                    must be 16-byte aligned)
+; ebx, esi, edi are saved and restored. mm4 is used, hence WELSEMMS
+; before returning.
+; NOTE(review): WELSEMMS is defined outside this view; presumably it
+; expands to emms -- confirm.
+WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
+WelsIntra16x16Combined3Satd_sse41:
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16]
+	mov    edx,    [esp+20]
+	mov    eax,    [esp+24]
+	mov    ebx,    [esp+28]    
+	mov    esi,    [esp+40] ;temp_satd
+	pxor        xmm4,   xmm4
+	movdqa      xmm5,   [HSumSubDB1]
+	movdqa      xmm6,   [HSumSubDW1]
+	movdqa      xmm7,   [PDW1]
+	; top border: the 16 bytes of the row above the block
+	sub         ecx,    edx
+	movdqu 		xmm0,   [ecx]
+	movhlps		xmm1,   xmm0
+	punpcklqdq  xmm0,   xmm0
+	punpcklqdq  xmm1,   xmm1
+	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+	movdqa      [esi],  xmm0 ;V
+	movdqa      [esi+16], xmm1 
+	; left border: one byte at column -1 of each of the 16 rows
+	add         ecx,    edx
+	pinsrb      xmm0,   byte[ecx-1], 0
+	pinsrb      xmm0,   byte[ecx+edx-1], 1
+	lea         ecx,    [ecx+2*edx]  
+	pinsrb      xmm0,   byte[ecx-1],     2
+	pinsrb      xmm0,   byte[ecx+edx-1], 3
+	lea         ecx,    [ecx+2*edx] 
+	pinsrb      xmm0,   byte[ecx-1],     4
+	pinsrb      xmm0,   byte[ecx+edx-1], 5
+	lea         ecx,    [ecx+2*edx] 
+	pinsrb      xmm0,   byte[ecx-1],     6
+	pinsrb      xmm0,   byte[ecx+edx-1], 7
+	lea         ecx,    [ecx+2*edx]  
+	pinsrb      xmm0,   byte[ecx-1],     8
+	pinsrb      xmm0,   byte[ecx+edx-1], 9
+	lea         ecx,    [ecx+2*edx] 
+	pinsrb      xmm0,   byte[ecx-1],     10
+	pinsrb      xmm0,   byte[ecx+edx-1], 11
+	lea         ecx,    [ecx+2*edx] 
+	pinsrb      xmm0,   byte[ecx-1],     12
+	pinsrb      xmm0,   byte[ecx+edx-1], 13
+	lea         ecx,    [ecx+2*edx] 
+	pinsrb      xmm0,   byte[ecx-1],     14
+	pinsrb      xmm0,   byte[ecx+edx-1], 15
+	movhlps		xmm1,   xmm0
+	punpcklqdq  xmm0,   xmm0
+	punpcklqdq  xmm1,   xmm1
+	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+	movdqa      [esi+32], xmm0 ;H
+	movdqa      [esi+48], xmm1
+	; DC value from the low dword accumulated in xmm4 by the macro above
+	movd        ecx,    xmm4 ;dc
+	add         ecx,    16   ;(sum+16)
+	shr         ecx,    5    ;((sum+16)>>5)
+	shl         ecx,    4    ;
+	movd        mm4,    ecx  ; mm4 copy DC
+	pxor        xmm4,   xmm4 ;V
+	pxor        xmm5,   xmm5 ;H
+	pxor        xmm6,   xmm6 ;DC
+	; ecx = 8x4 step index (0..3), edi = V offset: 0 for the left 8 columns,
+	; 16 for the right 8 columns
+	mov         ecx,    0
+	mov         edi,    0
+.loop16x16_get_satd:    
+.loopStart1:
+	SSE41_I16x16GetX38x4Satd ecx, edi
+	inc          ecx
+	cmp         ecx, 4
+	jl          .loopStart1
+	cmp         edi, 16
+	je          .loop16x16_get_satd_end
+	; second half: restart at the top of the source, 8 pixels to the right
+	mov         eax, [esp+24]
+	add         eax, 8
+	mov         ecx, 0
+	add         edi, 16
+	jmp         .loop16x16_get_satd
+ .loop16x16_get_satd_end:   
+	MMX_DW_1_2REG    xmm0, xmm1 
+	psrlw       xmm4, 1 ;/2
+	psrlw       xmm5, 1 ;/2
+	psrlw       xmm6, 1 ;/2
+	SSE41_HSum8W     xmm4, xmm0, xmm1
+	SSE41_HSum8W     xmm5, xmm0, xmm1
+	SSE41_HSum8W     xmm6, xmm0, xmm1
+	
+	; comparing order: DC H V
+	movd      ebx, xmm6 ;DC
+	movd      edi, xmm5 ;H
+	movd      ecx, xmm4 ;V
+	mov      edx, [esp+36]
+	shl       edx, 1
+	add       edi, edx 		; H  += 2*lambda
+	add       ebx, edx 		; DC += 2*lambda (V gets no penalty)
+	mov       edx, [esp+32]
+	cmp       ebx, edi
+	jge near   not_dc_16x16
+	cmp        ebx, ecx
+	jge near   not_dc_h_16x16
+	
+	; for DC mode
+	mov       dword[edx], 2;I16_PRED_DC
+	mov       eax, ebx 
+	jmp near return_satd_intra_16x16_x3
+not_dc_16x16:
+	; for H mode 
+	cmp       edi, ecx
+	jge near   not_dc_h_16x16
+	mov       dword[edx], 1;I16_PRED_H
+	mov       eax, edi 
+	jmp near return_satd_intra_16x16_x3
+not_dc_h_16x16:
+	; for V mode
+	mov       dword[edx], 0;I16_PRED_V
+	mov       eax, ecx
+return_satd_intra_16x16_x3: 
+	WELSEMMS
+	pop         edi 
+	pop         esi 
+	pop         ebx
+ret
+
+; SSE41_ChromaGetX38x8Satd
+; Processes one 8x8 chroma plane for the three-mode (V / H / DC) search.
+; Register contract on entry:
+;   ecx/edx = block supplying the top row and left column / its stride
+;   eax/ebx = source block / its stride
+;   esi     = temp buffer; 64 bytes are written with movdqa, so it must be
+;             16-byte aligned
+; On exit xmm4 / xmm5 / xmm6 hold the word accumulators for V / H / DC of
+; this plane. ecx and eax are modified; xmm0-xmm3 and xmm7 are clobbered.
+; The inner loop label is macro-local (%%) so that the macro can be
+; expanded more than once without a symbol redefinition.
+%macro SSE41_ChromaGetX38x8Satd 0
+	movdqa      xmm5,   [HSumSubDB1]
+	movdqa      xmm6,   [HSumSubDW1]
+	movdqa      xmm7,   [PDW1]
+	; top border: 8 bytes of the row above the block, duplicated
+	sub         ecx,    edx
+	movq 		xmm0,   [ecx]
+	punpcklqdq  xmm0,   xmm0
+	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
+	movdqa      [esi],  xmm0 ;V
+	; left border: one byte at column -1 of each of the 8 rows
+	add         ecx,    edx
+	pinsrb      xmm0,   byte[ecx-1], 0
+	pinsrb      xmm0,   byte[ecx+edx-1], 1
+	lea         ecx,    [ecx+2*edx]  
+	pinsrb      xmm0,   byte[ecx-1],     2
+	pinsrb      xmm0,   byte[ecx+edx-1], 3
+	lea         ecx,    [ecx+2*edx] 
+	pinsrb      xmm0,   byte[ecx-1],     4
+	pinsrb      xmm0,   byte[ecx+edx-1], 5
+	lea         ecx,    [ecx+2*edx] 
+	pinsrb      xmm0,   byte[ecx-1],     6
+	pinsrb      xmm0,   byte[ecx+edx-1], 7
+	punpcklqdq  xmm0,   xmm0
+	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
+	movdqa      [esi+16], xmm0 ;H
+;(sum+2)>>2
+	movdqa      xmm6,   [PDQ2]
+	movdqa      xmm5,   xmm4
+	punpckhqdq  xmm5,   xmm1    
+	paddd       xmm5,   xmm6
+	psrld       xmm5,   2
+;(sum1+sum2+4)>>3   
+	paddd       xmm6,   xmm6		; rounding term becomes 4
+	paddd       xmm4,   xmm1
+	paddd       xmm4,   xmm6
+	psrld       xmm4,   3
+;satd *16
+	pslld       xmm5,   4
+	pslld       xmm4,   4    
+;temp satd    
+	movdqa      xmm6,   xmm4
+	punpcklqdq  xmm4,   xmm5
+	psllq       xmm4,   32			; shift pair keeps only the low dword
+	psrlq       xmm4,   32			; of each quadword
+	movdqa      [esi+32], xmm4
+	punpckhqdq  xmm5,   xmm6
+	psllq       xmm5,   32
+	psrlq       xmm5,   32
+	movdqa      [esi+48], xmm5
+	
+	pxor        xmm4,   xmm4 ;V
+	pxor        xmm5,   xmm5 ;H
+	pxor        xmm6,   xmm6 ;DC
+	mov         ecx,    0
+	; Two 8x4 steps. SSE41_ChromaGetX38x4Satd shifts ecx left by 4 in place:
+	; step 0 leaves ecx = 0 (-> 1 after inc), step 1 leaves ecx = 16
+	; (-> 17 after inc), which terminates the loop.
+%%loop_chroma_satdx3_cb_cr:    
+	SSE41_ChromaGetX38x4Satd ecx, 0
+	inc             ecx
+	cmp             ecx, 2
+	jl              %%loop_chroma_satdx3_cb_cr
+%endmacro
+
+; SSEReg2MMX src, mmLo, mmHi
+; Spills a 128-bit register into two MMX registers: mmLo receives the low
+; quadword, mmHi the high quadword. src is modified (its high quadword is
+; copied over its low quadword by movhlps).
+%macro SSEReg2MMX 3
+	movdq2q     %2, %1 
+	movhlps     %1, %1 
+	movdq2q     %3, %1 
+%endmacro
+; MMXReg2SSE dst, tmp, mmLo, mmHi
+; Rebuilds a 128-bit value from two MMX registers: dst = mmHi:mmLo.
+; tmp is clobbered (it holds mmHi in its low quadword afterwards).
+%macro MMXReg2SSE 4
+	movq2dq     %1, %3 
+	movq2dq     %2, %4 
+	punpcklqdq  %1, %2
+%endmacro
+;used to reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
+
+; WelsIntraChroma8x8Combined3Satd_sse41
+; Evaluates three 8x8 chroma intra candidates (DC, H, V) over two planes
+; and returns the smallest combined cost in eax; the chosen mode index is
+; stored through the fifth argument (0 = DC, 1 = H, 2 = V, per the
+; constants written below).
+; Stack arguments, read after three pushes (first argument at [esp+16]):
+;   [esp+16] -> ecx  first plane: block supplying the border pixels
+;   [esp+20] -> edx  its stride (reused for the second plane)
+;   [esp+24] -> eax  first plane: source block
+;   [esp+28] -> ebx  its stride (reused for the second plane)
+;   [esp+32]         pointer receiving the mode index (dword store)
+;   [esp+36]         lambda; 2*lambda is added to the H and V costs
+;   [esp+40] -> esi  temp buffer used by SSE41_ChromaGetX38x8Satd
+;   [esp+44]         second plane: block supplying the border pixels
+;   [esp+48]         second plane: source block
+; ebx, esi, edi are saved and restored. mm0-mm3, mm5, mm6 hold the first
+; plane's accumulators between the two passes, hence WELSEMMS on exit.
+; NOTE(review): WELSEMMS is defined outside this view; presumably it
+; expands to emms -- confirm.
+WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
+WelsIntraChroma8x8Combined3Satd_sse41:
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16]
+	mov    edx,    [esp+20]
+	mov    eax,    [esp+24]
+	mov    ebx,    [esp+28]    
+	mov    esi,    [esp+40] ;temp_satd
+	xor    edi,    edi			; edi = pass counter (0: first plane, 1: second)
+loop_chroma_satdx3: 
+	SSE41_ChromaGetX38x8Satd
+	cmp             edi, 1
+	je              loop_chroma_satdx3end
+	inc             edi
+	; park the first plane's V/H/DC accumulators in MMX registers
+	SSEReg2MMX  xmm4, mm0,mm1
+	SSEReg2MMX  xmm5, mm2,mm3
+	SSEReg2MMX  xmm6, mm5,mm6
+	; switch to the second plane
+	mov         ecx,  [esp+44]
+	mov         eax,  [esp+48]
+	jmp         loop_chroma_satdx3
+loop_chroma_satdx3end:    
+	; bring the first plane's accumulators back and add both planes
+	MMXReg2SSE  xmm0, xmm3, mm0, mm1
+	MMXReg2SSE  xmm1, xmm3, mm2, mm3
+	MMXReg2SSE  xmm2, xmm3, mm5, mm6
+	
+	paddw       xmm4, xmm0
+	paddw       xmm5, xmm1
+	paddw       xmm6, xmm2
+	
+	MMX_DW_1_2REG    xmm0, xmm1 
+	psrlw       xmm4, 1 ;/2
+	psrlw       xmm5, 1 ;/2
+	psrlw       xmm6, 1 ;/2
+	SSE41_HSum8W     xmm4, xmm0, xmm1
+	SSE41_HSum8W     xmm5, xmm0, xmm1
+	SSE41_HSum8W     xmm6, xmm0, xmm1
+	; comparing order: DC H V
+	movd      ebx, xmm6 ;DC
+	movd      edi, xmm5 ;H
+	movd      ecx, xmm4 ;V
+	mov       edx, [esp+36]
+	shl       edx, 1
+	add       edi, edx			; H += 2*lambda
+	add       ecx, edx			; V += 2*lambda (DC gets no penalty)
+	mov       edx, [esp+32]
+	cmp       ebx, edi
+	jge near   not_dc_8x8
+	cmp        ebx, ecx
+	jge near   not_dc_h_8x8
+	
+	; for DC mode
+	mov       dword[edx], 0;I8_PRED_DC
+	mov       eax, ebx 
+	jmp near return_satd_intra_8x8_x3
+not_dc_8x8:
+	; for H mode 
+	cmp       edi, ecx
+	jge near   not_dc_h_8x8
+	mov       dword[edx], 1;I8_PRED_H
+	mov       eax, edi 
+	jmp near return_satd_intra_8x8_x3
+not_dc_h_8x8:
+	; for V mode
+	mov       dword[edx], 2;I8_PRED_V
+	mov       eax, ecx
+return_satd_intra_8x8_x3: 
+	WELSEMMS
+	pop         edi 
+	pop         esi 
+	pop         ebx
+ret
+
+	
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 END 
+;
+;***********************************************************************
+; SSSE3_Get16BSadHVDC tmpRow, srcRow
+; Handles one 16-pixel row of the three-mode SAD search.
+;   tmpRow : 16-byte slot whose first byte holds this row's left-border
+;            pixel; it is overwritten with that byte replicated 16 times
+;   srcRow : 16 source pixels (used as an aligned memory operand)
+; Register contract: xmm1 = zero (pshufb control, replicates byte 0),
+; xmm7 = DC prediction row, xmm5 = V prediction row.
+; Accumulators: xmm4 += SAD vs DC, xmm2 += SAD vs V, xmm3 += SAD vs H.
+; Clobbers xmm0 and xmm6.
+%macro SSSE3_Get16BSadHVDC 2
+  movd        xmm6,%1 
+  pshufb      xmm6,xmm1 
+  movdqa      %1,  xmm6
+  movdqa      xmm0,%2 
+  psadbw      xmm0,xmm7 
+  paddw       xmm4,xmm0 
+  movdqa      xmm0,%2
+  psadbw      xmm0,xmm5 
+  paddw       xmm2,xmm0 
+  psadbw      xmm6,%2
+  paddw       xmm3,xmm6 
+%endmacro
+; WelsAddDCValue srcByte, tmpReg, dstDword, sumReg
+; Loads one byte (zero-extended) from srcByte into tmpReg, stores it as a
+; dword to dstDword and adds it to sumReg.
+%macro WelsAddDCValue 4
+    movzx   %2, byte %1
+    mov    %3, %2 
+    add     %4, %2
+%endmacro   
+
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 BEGIN 
+;
+;***********************************************************************
+; WelsIntra16x16Combined3Sad_ssse3
+; Evaluates three 16x16 intra candidates (V, H, DC) with SAD and returns the
+; smallest cost in eax; the chosen mode index is stored through the fifth
+; argument (0 = V, 1 = H, 2 = DC, per the constants written below). The
+; 256-byte temp buffer is left holding the prediction of the chosen mode.
+; Stack arguments, read after three pushes (first argument at [esp+16]):
+;   [esp+16] -> ecx  block whose top row / left column supply the border
+;   [esp+20] -> edx  stride of that block
+;   [esp+24] -> eax  source block (rows are used as aligned 16-byte operands)
+;   [esp+28] -> ebx  stride of the source block
+;   [esp+32]         pointer receiving the mode index (dword store)
+;   [esp+36]         lambda; 2*lambda is added to the H and DC costs
+;   [esp+40] -> edi  temp buffer, 16 rows x 16 bytes, accessed with movdqa
+; The top border row is loaded with movdqa, so it must be 16-byte aligned.
+; ebx, esi, edi are saved and restored. No MMX register is used.
+WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
+WelsIntra16x16Combined3Sad_ssse3:
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16]
+	mov    edx,    [esp+20]
+	mov    edi,    [esp+40] ;temp_sad
+	; xmm5 = top border row (V prediction); eax = sum of its 16 bytes
+	sub    ecx,    edx
+    movdqa      xmm5,[ecx] 
+    pxor        xmm0,xmm0
+    psadbw      xmm0,xmm5 
+    movhlps     xmm1,xmm0 
+    paddw       xmm0,xmm1 
+    movd        eax,xmm0
+     
+    ; left border: store each row's left pixel in the first dword of its
+    ; 16-byte temp slot and add it to the running sum in eax
+    add         ecx,edx 
+    lea         ebx, [edx+2*edx]		; ebx = 3*stride
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    sub        edi, 192				; rewind to the start of the temp buffer
+    ; DC = (sum of 32 border pixels + 16) >> 5, replicated into xmm7
+    add         eax,10h 
+    shr         eax,5 
+    movd        xmm7,eax 
+    pxor        xmm1,xmm1			; zero pshufb control: replicate byte 0
+    pshufb      xmm7,xmm1
+    pxor        xmm4,xmm4 			; DC accumulator
+    pxor        xmm3,xmm3 			; H accumulator
+    pxor        xmm2,xmm2 			; V accumulator
+;sad begin  
+	mov    eax,    [esp+24]
+	mov    ebx,    [esp+28]    
+    lea         esi, [ebx+2*ebx]		; esi = 3*stride
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64  
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64  
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64  
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    
+    ; pack H next to V (H moved up by one dword), then fold the two
+    ; 64-bit halves of each accumulator
+    pslldq      xmm3,4 
+    por         xmm3,xmm2 
+    movhlps     xmm1,xmm3 
+    paddw       xmm3,xmm1 
+    movhlps     xmm0,xmm4
+    paddw       xmm4,xmm0
+; comparing order: DC H V
+	movd        ebx, xmm4 ;DC
+	movd        ecx, xmm3 ;V
+	psrldq      xmm3, 4
+	movd        esi, xmm3 ;H
+	mov         eax, [esp+36] ;lambda
+	shl         eax, 1
+	add         esi, eax 		; H  += 2*lambda
+	add         ebx, eax 		; DC += 2*lambda (V gets no penalty)
+	mov         edx, [esp+32]
+	cmp         ebx, esi
+	jge near   not_dc_16x16_sad
+	cmp        ebx, ecx
+	jge near   not_dc_h_16x16_sad
+	; for DC mode
+	mov       dword[edx], 2;I16_PRED_DC
+	mov       eax, ebx 
+    sub        edi, 192
+    ; fill all 16 temp rows with the DC value
+%assign x 0
+%rep 16
+    movdqa    [edi+16*x], xmm7
+%assign x x+1
+%endrep
+	jmp near return_sad_intra_16x16_x3
+not_dc_16x16_sad:
+	; for H mode 
+	; (the temp buffer already holds the H rows written by SSSE3_Get16BSadHVDC)
+	cmp       esi, ecx
+	jge near   not_dc_h_16x16_sad
+	mov       dword[edx], 1;I16_PRED_H
+	mov       eax, esi 
+	jmp near return_sad_intra_16x16_x3
+not_dc_h_16x16_sad:
+	; for V mode
+	mov       dword[edx], 0;I16_PRED_V
+	mov       eax, ecx
+    sub       edi, 192
+    ; fill all 16 temp rows with the top border row
+%assign x 0
+%rep 16
+    movdqa    [edi+16*x], xmm5
+%assign x x+1
+%endrep
+return_sad_intra_16x16_x3:
+	pop    edi
+	pop    esi
+	pop    ebx
+	ret
+
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 END 
+;
+;***********************************************************************
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 BEGIN 
+;
+;***********************************************************************
+
+;SSE4.1
+; SSE41_GetSatd8x4
+; Processes 4 rows of 8 pixels from two blocks and adds the result to the
+; word accumulator xmm6.
+; Register contract on entry:
+;   eax/ebx = first block pointer / stride,  esi = 3*ebx
+;   ecx/edx = second block pointer / stride, edi = 3*edx
+;   xmm7    = HSumSubDB1 (pmaddubsw multipliers)
+; The pointers are NOT advanced; the caller steps them by 4 rows.
+; Clobbers xmm0-xmm5.
+%macro SSE41_GetSatd8x4 0
+	; rows 0 and 1: duplicate the 8 bytes, form pair sums/differences,
+	; then subtract the second block from the first
+	movq             xmm0, [eax]
+	punpcklqdq       xmm0, xmm0
+	pmaddubsw        xmm0, xmm7
+	movq             xmm1, [eax+ebx]
+	punpcklqdq       xmm1, xmm1
+	pmaddubsw        xmm1, xmm7
+	movq             xmm2, [ecx]
+	punpcklqdq       xmm2, xmm2
+	pmaddubsw        xmm2, xmm7
+	movq             xmm3, [ecx+edx]	
+	punpcklqdq       xmm3, xmm3	
+	pmaddubsw        xmm3, xmm7	
+	psubsw           xmm0, xmm2
+	psubsw           xmm1, xmm3
+	; rows 2 and 3
+	movq             xmm2, [eax+2*ebx]
+	punpcklqdq       xmm2, xmm2
+	pmaddubsw        xmm2, xmm7
+	movq             xmm3, [eax+esi]
+	punpcklqdq       xmm3, xmm3
+	pmaddubsw        xmm3, xmm7
+	movq             xmm4, [ecx+2*edx]
+	punpcklqdq       xmm4, xmm4
+	pmaddubsw        xmm4, xmm7
+	movq             xmm5, [ecx+edi]	
+	punpcklqdq       xmm5, xmm5	
+	pmaddubsw        xmm5, xmm7
+	psubsw           xmm2, xmm4
+	psubsw           xmm3, xmm5
+	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4	
+	pabsw            xmm0, xmm0
+	pabsw            xmm2, xmm2
+	pabsw            xmm1, xmm1
+	pabsw            xmm3, xmm3
+	; regroup the even/odd words of two registers and keep the per-lane
+	; maximum of the two arrangements
+	movdqa           xmm4, xmm3
+	pblendw          xmm3, xmm1, 0xAA
+	pslld            xmm1, 16
+	psrld            xmm4, 16
+	por              xmm1, xmm4
+	pmaxuw           xmm1, xmm3
+	paddw            xmm6, xmm1
+	movdqa           xmm4, xmm0
+	pblendw          xmm0, xmm2, 0xAA
+	pslld            xmm2, 16
+	psrld            xmm4, 16
+	por              xmm2, xmm4
+	pmaxuw           xmm0, xmm2	
+	paddw            xmm6, xmm0
+%endmacro
+
+; SSSE3_SumWHorizon dstReg32, src, tmp1, tmp2
+; Horizontal sum of the eight words of src, delivered in the general-purpose
+; register dstReg32. tmp1 is first set to 1 in every word so that pmaddwd
+; adds adjacent word pairs into dwords (words are treated as signed).
+; src, tmp1 and tmp2 are clobbered.
+%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
+	MMX_DW_1_2REG    %3, %4 
+	pmaddwd     %2, %3 
+	movhlps     %4, %2 
+	paddd       %2, %4 
+	pshuflw     %4, %2,0Eh 
+	paddd       %2, %4 
+	movd		%1, %2 
+%endmacro
+;***********************************************************************
+;
+;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+; Arguments (read after "push ebx", hence the +4 on every offset):
+;   [esp+8]  -> eax  first 4x4 block
+;   [esp+12] -> ebx  stride of the first block
+;   [esp+16] -> ecx  second 4x4 block
+;   [esp+20] -> edx  stride of the second block
+; Result in eax. ebx is saved and restored; xmm0-xmm5 and xmm7 are
+; clobbered.
+; The entry point is aligned to 16 bytes like the other SATD routines in
+; this file.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd4x4_sse41
+align 16
+WelsSampleSatd4x4_sse41:
+	push        ebx  
+	mov         eax,[esp+8] 
+	mov         ebx,[esp+12] 
+	mov         ecx,[esp+16] 
+	mov         edx,[esp+20] 
+	movdqa      xmm4,[HSwapSumSubDB1] 
+	; second block: xmm2 = row0,row0,row1,row1  xmm3 = row2,row2,row3,row3
+	movd        xmm2,[ecx] 
+	movd        xmm5,[ecx+edx] 
+	shufps      xmm2,xmm5,0 
+	movd        xmm3,[ecx+edx*2] 
+	lea         ecx, [edx*2+ecx]
+	movd        xmm5,[ecx+edx] 
+	shufps      xmm3,xmm5,0 
+	; first block, same layout in xmm0 / xmm1
+	movd        xmm0,[eax] 
+	movd        xmm5,[eax+ebx] 
+	shufps      xmm0,xmm5,0 
+	movd        xmm1,[eax+ebx*2] 
+	lea         eax, [ebx*2+eax]
+	movd        xmm5,[eax+ebx] 
+	shufps      xmm1,xmm5,0 
+	; pair sums / pair differences of the bytes, then block difference
+	pmaddubsw   xmm0,xmm4 
+	pmaddubsw   xmm1,xmm4 
+	pmaddubsw   xmm2,xmm4 
+	pmaddubsw   xmm3,xmm4 
+	psubw       xmm0,xmm2 
+	psubw       xmm1,xmm3 
+	; butterfly stages
+	movdqa      xmm2,xmm0 
+	paddw       xmm0,xmm1 
+	psubw       xmm1,xmm2 
+	movdqa      xmm2,xmm0 
+	punpcklqdq  xmm0,xmm1 
+	punpckhqdq  xmm2,xmm1 
+	movdqa      xmm1,xmm0 
+	paddw       xmm0,xmm2 
+	psubw       xmm2,xmm1 
+	; regroup even/odd words, take absolute values and keep the per-lane
+	; maximum of the two arrangements
+	movdqa      xmm1,xmm0 
+	pblendw     xmm0,xmm2,0AAh 
+	pslld       xmm2,16 
+	psrld       xmm1,16 
+	por         xmm2,xmm1 
+	pabsw       xmm0,xmm0 
+	pabsw       xmm2,xmm2 
+	pmaxsw      xmm0,xmm2 
+	SSSE3_SumWHorizon eax, xmm0, xmm5, xmm7
+	pop         ebx  
+	ret 
+ 
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+; Arguments (read after three pushes, first argument at [esp+16]):
+;   [esp+16] -> eax first block, [esp+20] -> ebx its stride,
+;   [esp+24] -> ecx second block, [esp+28] -> edx its stride.
+; Two SSE41_GetSatd8x4 passes (rows 0-3, rows 4-7) accumulate into xmm6,
+; which is then summed horizontally into eax.
+; ebx, esi, edi are saved and restored.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x8_sse41
+align 16
+WelsSampleSatd8x8_sse41:
+	push   ebx
+	push   esi
+	push   edi
+	mov    eax,    [esp+16]
+	mov    ebx,    [esp+20]
+	mov    ecx,    [esp+24]
+	mov    edx,    [esp+28]    
+	movdqa      xmm7, [HSumSubDB1]
+	lea         esi,  [ebx+ebx*2] 		; esi = 3 * first stride
+	lea         edi,  [edx+edx*2] 		; edi = 3 * second stride
+	pxor		xmm6, xmm6				; accumulator
+	SSE41_GetSatd8x4
+	lea			eax,	[eax+4*ebx]
+	lea			ecx,    [ecx+4*edx]
+	SSE41_GetSatd8x4
+	SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
+	pop 		edi
+	pop 		esi
+	pop 		ebx
+	ret
+ 
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+; Arguments (read after four pushes, i.e. pushsize = 16 bytes):
+;   [esp+pushsize+4]  -> eax first block,  [esp+pushsize+8]  -> ebx its stride,
+;   [esp+pushsize+12] -> ecx second block, [esp+pushsize+16] -> edx its stride.
+; Four SSE41_GetSatd8x4 passes (4 rows each) accumulate into xmm6, which is
+; then summed horizontally into eax.
+; ebx, esi, edi, ebp are saved and restored.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x16_sse41
+align 16
+WelsSampleSatd8x16_sse41:
+	push   ebx
+	push   esi
+	push   edi
+	push   ebp
+%define pushsize   16	
+	mov    eax,    [esp+pushsize+4]
+	mov    ebx,    [esp+pushsize+8]
+	mov    ecx,    [esp+pushsize+12]
+	mov    edx,    [esp+pushsize+16]    
+	movdqa      xmm7, [HSumSubDB1]
+	lea         esi,  [ebx+ebx*2] 		; esi = 3 * first stride
+	lea         edi,  [edx+edx*2] 		; edi = 3 * second stride
+	pxor        xmm6, xmm6				; accumulator
+	mov         ebp,    0				; ebp = 8x4 pass counter
+loop_get_satd_8x16:	
+	SSE41_GetSatd8x4
+	lea			eax,  [eax+4*ebx]
+	lea			ecx,  [ecx+4*edx]
+	inc         ebp
+	cmp         ebp,  4
+	jl          loop_get_satd_8x16
+	SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
+	; keep the helper define local to this routine, as the 16x16 variant does
+%undef pushsize
+	pop         ebp
+	pop 		edi
+	pop 		esi
+	pop 		ebx
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+; Arguments (read after three pushes, first argument at [esp+16]):
+;   [esp+16] -> eax first block, [esp+20] -> ebx its stride,
+;   [esp+24] -> ecx second block, [esp+28] -> edx its stride.
+; Four SSE41_GetSatd8x4 passes: left 8 columns (rows 0-3, 4-7), then right
+; 8 columns (rows 0-3, 4-7). All accumulate into xmm6, which is then summed
+; horizontally into eax. ebx, esi, edi are saved and restored.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse41
+align 16
+WelsSampleSatd16x8_sse41:
+	push   ebx
+	push   esi
+	push   edi
+	mov    eax,    [esp+16]
+	mov    ebx,    [esp+20]
+	mov    ecx,    [esp+24]
+	mov    edx,    [esp+28]    
+	movdqa      xmm7, [HSumSubDB1]
+	lea         esi,  [ebx+ebx*2] 		; esi = 3 * first stride
+	lea         edi,  [edx+edx*2] 		; edi = 3 * second stride
+	pxor		xmm6,   xmm6			; accumulator
+	SSE41_GetSatd8x4
+	lea			eax,  [eax+4*ebx]
+	lea			ecx,  [ecx+4*edx]
+	SSE41_GetSatd8x4
+	; reload the original pointers and move 8 pixels to the right
+	mov			eax,    [esp+16]
+	mov			ecx,    [esp+24]
+	add			eax,    8
+	add			ecx,    8
+	SSE41_GetSatd8x4
+	lea			eax,    [eax+4*ebx]
+	lea			ecx,    [ecx+4*edx]
+	SSE41_GetSatd8x4
+	SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
+	pop 		edi
+	pop 		esi
+	pop 		ebx
+	ret
+
+;***********************************************************************
+;16x16 block handled as a left and a right 8-pixel-wide column of four 8x4 groups each.
+;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;cdecl. SSE41_GetSatd8x4 and SSSE3_SumWHorizon are macros defined outside this view.
+;***********************************************************************
+   
+WELS_EXTERN WelsSampleSatd16x16_sse41
+align 16
+WelsSampleSatd16x16_sse41:
+	push   ebx
+	push   esi
+	push   edi
+	push   ebp
+	%define pushsize   16	
+	mov    eax,    [esp+pushsize+4]     ; arg 1 (uint8_t *)
+	mov    ebx,    [esp+pushsize+8]     ; arg 2 (int32_t): row stride applied to eax
+	mov    ecx,    [esp+pushsize+12]    ; arg 3 (uint8_t *)
+	mov    edx,    [esp+pushsize+16]    ; arg 4 (int32_t): row stride applied to ecx
+	movdqa      xmm7, [HSumSubDB1]      ; constant defined outside this view
+	lea         esi,  [ebx+ebx*2]       ; esi = 3 * ebx
+	lea         edi,  [edx+edx*2]       ; edi = 3 * edx
+	pxor		xmm6,   xmm6            ; xmm6 starts at zero and is handed to SSSE3_SumWHorizon below
+	mov         ebp,    0               ; group counter for the left column
+loop_get_satd_16x16_left:	
+	SSE41_GetSatd8x4
+	lea			eax,  [eax+4*ebx]       ; step both pointers down four rows
+	lea			ecx,  [ecx+4*edx]
+	inc         ebp
+	cmp         ebp,  4
+	jl          loop_get_satd_16x16_left
+	mov			eax,    [esp+pushsize+4]    ; reload both base pointers ...
+	mov			ecx,    [esp+pushsize+12]
+	add			eax,    8                   ; ... and move 8 pixels to the right
+	add			ecx,    8
+	mov         ebp,    0               ; group counter for the right column
+loop_get_satd_16x16_right:
+	SSE41_GetSatd8x4
+	lea			eax,  [eax+4*ebx]
+	lea			ecx,  [ecx+4*edx]
+	inc         ebp
+	cmp         ebp,  4
+	jl          loop_get_satd_16x16_right
+	SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7 ; presumably the horizontal sum of xmm6 into eax (the return value)
+	%undef pushsize
+	pop         ebp
+	pop 		edi
+	pop 		esi
+	pop 		ebx
+	ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+%macro SSE2_GetSad2x16 0 ; advance eax/ecx by two rows, then add the SAD of the next two 16-byte rows to xmm0
+	lea    eax,    [eax+2*ebx]  ; note: the pointers move BEFORE the loads
+	lea    ecx,    [ecx+2*edx]
+	movdqu xmm1,   [ecx]        ; ecx side is loaded unaligned
+	MOVDQ  xmm2,   [eax]; [eax] must be 16-byte aligned (MOVDQ is a macro defined outside this view)
+	psadbw xmm1,   xmm2         ; two partial sums, one per 64-bit half
+	paddw  xmm0,   xmm1	
+	movdqu xmm1,   [ecx+edx]
+	MOVDQ  xmm2,   [eax+ebx]
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
+%endmacro
+
+
+%macro SSE2_GetSad4x16 0 ; add the SAD of four 16-byte rows at eax/ecx to xmm7; needs esi = 3*ebx, edi = 3*edx
+	movdqu xmm0,   [ecx]        ; row 0 (ecx side is loaded unaligned)
+	MOVDQ  xmm2,   [eax]
+	psadbw xmm0,   xmm2 
+	paddw  xmm7,   xmm0
+	movdqu xmm1,   [ecx+edx]    ; row 1
+	MOVDQ  xmm2,   [eax+ebx]
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
+	movdqu xmm1,   [ecx+2*edx]  ; row 2
+	MOVDQ  xmm2,   [eax+2*ebx]; [eax] must be 16-byte aligned (MOVDQ is a macro defined outside this view)
+	psadbw xmm1,   xmm2 
+	paddw  xmm7,   xmm1	
+	movdqu xmm1,   [ecx+edi]    ; row 3
+	MOVDQ  xmm2,   [eax+esi]
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1         ; eax and ecx are left unchanged; the caller advances them
+%endmacro
+
+
+%macro SSE2_GetSad8x4 0 ; add the SAD of four 8-byte rows to xmm6; leaves eax and ecx advanced by TWO rows
+	movq   xmm0,   [eax]        ; xmm0 low  = row 0
+	movq   xmm1,   [eax+ebx]    ; xmm1 low  = row 1
+	lea    eax,    [eax+2*ebx]
+	movhps xmm0,   [eax]        ; xmm0 high = row 2
+	movhps xmm1,   [eax+ebx]    ; xmm1 high = row 3
+
+	movq   xmm2,   [ecx]        ; same row layout for the ecx side
+	movq   xmm3,   [ecx+edx]
+	lea    ecx,    [ecx+2*edx]
+	movhps xmm2,   [ecx]
+	movhps xmm3,   [ecx+edx]
+	psadbw xmm0,   xmm2         ; rows 0 and 2
+	psadbw xmm1,   xmm3         ; rows 1 and 3
+	paddw  xmm6,   xmm0
+	paddw  xmm6,   xmm1
+%endmacro
+
+;***********************************************************************
+;Sum of absolute differences over a 16x16 block; cdecl, result in eax.
+;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
+;The first pointer is loaded through MOVDQ and is expected to be 16-byte aligned.
+;In wels the third parameter cannot be assumed aligned, so it is loaded with movdqu.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x16_sse2
+align 16
+WelsSampleSad16x16_sse2:
+	push ebx
+	push edi
+	push esi	
+		
+	%define _STACK_SIZE		12
+	
+	mov eax, [esp+_STACK_SIZE+4 ]   ; arg 1 (uint8_t *)
+	mov	ebx, [esp+_STACK_SIZE+8 ]   ; arg 2 (int32_t): row stride applied to eax
+	lea esi, [3*ebx]                ; esi = 3 * ebx, used by SSE2_GetSad4x16
+	mov ecx, [esp+_STACK_SIZE+12]   ; arg 3 (uint8_t *)
+	mov edx, [esp+_STACK_SIZE+16]	; arg 4 (int32_t): row stride applied to ecx
+	lea edi, [3*edx]	            ; edi = 3 * edx, used by SSE2_GetSad4x16
+	
+	pxor   xmm7,   xmm7             ; xmm7 accumulates the per-half SADs
+	SSE2_GetSad4x16                 ; rows 0-3
+	lea   eax,    [eax+4*ebx]
+	lea   ecx,    [ecx+4*edx]
+	SSE2_GetSad4x16                 ; rows 4-7
+	lea   eax,    [eax+4*ebx]
+	lea   ecx,    [ecx+4*edx]
+	SSE2_GetSad4x16                 ; rows 8-11
+	lea   eax,    [eax+4*ebx]
+	lea   ecx,    [ecx+4*edx]
+	SSE2_GetSad4x16                 ; rows 12-15
+	movhlps xmm0, xmm7              ; bring the upper-half sum down ...
+	paddw xmm0, xmm7                ; ... and add it to the lower-half sum
+	movd eax, xmm0                  ; return value
+	
+	%undef _STACK_SIZE	
+	
+	pop esi
+	pop edi
+	pop ebx
+	ret
+   
+;***********************************************************************
+;Sum of absolute differences over a 16x8 block; cdecl, result in eax.
+;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
+;The first pointer is loaded through MOVDQ and is expected to be 16-byte aligned.
+;In wels the third parameter cannot be assumed aligned, so it is loaded with movdqu.
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x8_sse2
+align 16
+WelsSampleSad16x8_sse2:
+	push   ebx
+	mov    eax,    [esp+8]      ; arg 1 (uint8_t *)
+	mov    ebx,    [esp+12]     ; arg 2 (int32_t): row stride applied to eax
+	mov    ecx,    [esp+16]     ; arg 3 (uint8_t *)
+	mov    edx,    [esp+20]     ; arg 4 (int32_t): row stride applied to ecx
+	movdqu xmm0,   [ecx]        ; rows 0 and 1 are done inline; xmm0 becomes the accumulator
+	MOVDQ  xmm2,   [eax]
+	psadbw xmm0,   xmm2 
+	movdqu xmm1,   [ecx+edx]
+	MOVDQ  xmm2,   [eax+ebx]
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
+
+	SSE2_GetSad2x16             ; rows 2-3 (the macro advances the pointers first)
+	SSE2_GetSad2x16             ; rows 4-5
+	SSE2_GetSad2x16             ; rows 6-7
+
+	movhlps     xmm1, xmm0      ; bring the upper-half sum down ...
+	paddw       xmm0, xmm1      ; ... and add it to the lower-half sum
+	movd        eax,  xmm0      ; return value
+	pop         ebx
+	ret
+
+; WelsSampleSad8x16_sse2: sum of absolute differences over an 8x16 block.
+; cdecl; reads four stack arguments (pointer, stride, pointer, stride); result in eax.
+WELS_EXTERN WelsSampleSad8x16_sse2
+WelsSampleSad8x16_sse2:
+	push   ebx
+	mov    eax,    [esp+8]      ; arg 1: first block pointer
+	mov    ebx,    [esp+12]     ; arg 2: row stride applied to eax
+	mov    ecx,    [esp+16]     ; arg 3: second block pointer
+	mov    edx,    [esp+20]     ; arg 4: row stride applied to ecx
+    pxor   xmm6,   xmm6         ; xmm6 accumulates the per-half SADs
+	; each SSE2_GetSad8x4 leaves eax/ecx two rows further on; the lea pair adds the other two
+	SSE2_GetSad8x4              ; rows 0-3
+    lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+    SSE2_GetSad8x4              ; rows 4-7
+    lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+	SSE2_GetSad8x4              ; rows 8-11
+    lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+    SSE2_GetSad8x4              ; rows 12-15
+
+    movhlps    xmm0, xmm6       ; bring the upper-half sum down ...
+	paddw      xmm0, xmm6       ; ... and add it to the lower-half sum
+	movd       eax,  xmm0       ; return value
+	pop        ebx
+	ret
+
+; CACHE_SPLIT_CHECK clobbers %1 and sets the flags; as used below (width 8, line 64) it is (addr & 0x3f) compared with 56.
+%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
+and    %1,  0x1f|(%3>>1)     ; keep only the low address bits selected by the mask
+cmp    %1,  (32-%2)|(%3>>1)  ; "greater" means the masked offset lies beyond the computed limit
+%endmacro
+
+WELS_EXTERN WelsSampleSad8x8_sse21
+WelsSampleSad8x8_sse21:	; 8x8 SAD; cdecl, four stack arguments (pointer, stride, pointer, stride), result in eax
+    mov    ecx,    [esp+12]	; arg 3: second block pointer
+	mov    edx,    ecx
+    CACHE_SPLIT_CHECK edx, 8, 64
+	jle    near   .pixel_sad_8x8_nsplit	; (ecx & 0x3f) <= 56: an 8-byte row starting at ecx stays inside one 64-byte line
+	push   ebx
+	push   edi
+	mov    eax,    [esp+12]	; arg 1 (two registers pushed, so every offset grows by 8)
+	mov    ebx,    [esp+16]	; arg 2
+    ; split path: each 8-byte row of the second block is rebuilt from two 8-byte-aligned loads
+    pxor   xmm7,   xmm7
+    
+    mov    edi,    ecx
+    and    edi,    0x07	; edi = misalignment of ecx in bytes
+    sub    ecx,    edi   	; ecx rounded down to a multiple of 8
+    mov    edx,    8
+    sub    edx,    edi
+    
+    shl    edi,    3
+    shl    edx,    3
+    movd   xmm5,   edi	; xmm5 = 8 * misalignment: right-shift count in bits
+    movd   xmm6,   edx	; xmm6 = 64 - xmm5: left-shift count in bits
+	mov    edi,    8
+	add    edi,    ecx	; edi = ecx + 8, the next aligned quadword
+    mov    edx,    [esp+24]	; arg 4
+    ; rows 0-1
+    movq   xmm0,   [eax]
+	movhps xmm0,   [eax+ebx]
+		
+	movq   xmm1,   [ecx]
+	movq   xmm2,   [edi]
+	movhps xmm1,   [ecx+edx]
+	movhps xmm2,   [edi+edx]
+	psrlq  xmm1,   xmm5	; drop the bytes that precede the wanted row start
+	psllq  xmm2,   xmm6	; move the bytes from the following quadword to the top
+	por    xmm1,   xmm2	; xmm1 = the two unaligned rows, one per 64-bit half
+	
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+	
+	lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+	lea    edi,    [edi+2*edx]
+	 ; rows 2-3
+    movq   xmm0,   [eax]
+	movhps xmm0,   [eax+ebx]
+		
+	movq   xmm1,   [ecx]
+	movq   xmm2,   [edi]
+	movhps xmm1,   [ecx+edx]
+	movhps xmm2,   [edi+edx]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+	
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+	lea    edi,    [edi+2*edx]
+	 ; rows 4-5
+    movq   xmm0,   [eax]
+	movhps xmm0,   [eax+ebx]
+		
+	movq   xmm1,   [ecx]
+	movq   xmm2,   [edi]
+	movhps xmm1,   [ecx+edx]
+	movhps xmm2,   [edi+edx]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+	
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+	
+	lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+	lea    edi,    [edi+2*edx]
+	 ; rows 6-7
+    movq   xmm0,   [eax]
+	movhps xmm0,   [eax+ebx]
+		
+	movq   xmm1,   [ecx]
+	movq   xmm2,   [edi]
+	movhps xmm1,   [ecx+edx]
+	movhps xmm2,   [edi+edx]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+	
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+	
+    movhlps    xmm0, xmm7	; add the two 64-bit partial sums
+	paddw      xmm0, xmm7
+	movd       eax,  xmm0	; return value
+	pop        edi
+	jmp        .return	; ebx is popped at the shared exit
+.pixel_sad_8x8_nsplit:
+    push   ebx
+    mov    eax,    [esp+8]	; arg 1 (one register pushed)
+	mov    ebx,    [esp+12]	; arg 2
+	mov    edx,    [esp+20]    	; arg 4; ecx still holds arg 3 from the entry
+	pxor   xmm6,   xmm6
+	SSE2_GetSad8x4	; rows 0-3; leaves eax/ecx two rows further on
+    lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+    SSE2_GetSad8x4    	; rows 4-7
+    movhlps    xmm0, xmm6
+	paddw      xmm0, xmm6
+	movd       eax,  xmm0	; return value
+.return:
+	pop        ebx
+	ret
+
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 END 
+;
+;***********************************************************************
+
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 BEGIN 
+;
+;***********************************************************************
+
+; One 16-byte reference row: feeds the four accumulators xmm4..xmm7. Destroys %1 and %4; %2 and %3 are preserved.
+%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address  (source rows above/at/below, loaded reference row, its address)
+	psadbw %1,   %4          ; source row above vs this reference row
+	paddw  xmm5, %1
+	psadbw %4,   %3          ; this reference row vs source row below
+	paddw  xmm4, %4
+	movdqu %4,   [%5-1]      ; reference row shifted one byte left
+	psadbw %4,   %2
+	paddw  xmm6, %4
+	movdqu %4,   [%5+1]      ; reference row shifted one byte right
+	psadbw %4,   %2
+	paddw  xmm7, %4
+%endmacro
+WELS_EXTERN WelsSampleSadFour16x16_sse2
+WelsSampleSadFour16x16_sse2:	; cdecl, five stack arguments; writes four 32-bit SADs of a 16x16 block to the fifth
+	push ebx
+	mov    eax,    [esp+8]     ; arg 1: source block, read with movdqa (must be 16-byte aligned)
+	mov    ebx,    [esp+12]    ; arg 2: source row stride
+	mov    ecx,    [esp+16]    ; arg 3: reference block (pRefMb), read with movdqu
+	mov    edx,    [esp+20]    ; arg 4: reference row stride (i_stride_ref)
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movdqa xmm0,   [eax]       ; source row 0
+	sub    ecx,    edx         ; ecx -> reference row -1
+	movdqu xmm3,   [ecx]
+	psadbw xmm3,   xmm0        ; source row 0 vs reference row -1
+	paddw  xmm4,   xmm3
+	
+	movdqa xmm1,   [eax+ebx]   ; source row 1
+	movdqu xmm3,   [ecx+edx]
+	psadbw xmm3,   xmm1        ; source row 1 vs reference row 0
+	paddw  xmm4,   xmm3
+	
+	movdqu xmm2,   [ecx+edx-1]
+	psadbw xmm2,   xmm0        ; source row 0 vs reference row 0 shifted left
+	paddw  xmm6,   xmm2
+	
+	movdqu xmm3,   [ecx+edx+1]
+	psadbw xmm3,   xmm0        ; source row 0 vs reference row 0 shifted right
+	paddw  xmm7,   xmm3
+	; from here xmm0/xmm1/xmm2 rotate as the three live source rows; each macro call consumes one reference row
+	lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+	movdqa xmm2,   [eax]       ; source row 2
+	movdqu xmm3,   [ecx]       ; reference row 1
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx
+	movdqa xmm0,   [eax+ebx]   ; source row 3
+	movdqu xmm3,   [ecx+edx]   ; reference row 2
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx+edx
+	lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+	movdqa xmm1,   [eax]       ; source row 4
+	movdqu xmm3,   [ecx]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, ecx
+	movdqa xmm2,   [eax+ebx]   ; source row 5
+	movdqu xmm3,   [ecx+edx]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx+edx
+	lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+	movdqa xmm0,   [eax]       ; source row 6
+	movdqu xmm3,   [ecx]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx
+	movdqa xmm1,   [eax+ebx]   ; source row 7
+	movdqu xmm3,   [ecx+edx]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, ecx+edx
+	lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+	movdqa xmm2,   [eax]       ; source row 8
+	movdqu xmm3,   [ecx]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx
+	movdqa xmm0,   [eax+ebx]   ; source row 9
+	movdqu xmm3,   [ecx+edx]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx+edx
+	lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+	movdqa xmm1,   [eax]       ; source row 10
+	movdqu xmm3,   [ecx]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, ecx
+	movdqa xmm2,   [eax+ebx]   ; source row 11
+	movdqu xmm3,   [ecx+edx]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx+edx
+	lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+	movdqa xmm0,   [eax]       ; source row 12
+	movdqu xmm3,   [ecx]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx
+	movdqa xmm1,   [eax+ebx]   ; source row 13
+	movdqu xmm3,   [ecx+edx]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, ecx+edx
+	lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+	movdqa xmm2,   [eax]       ; source row 14
+	movdqu xmm3,   [ecx]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx
+	movdqa xmm0,   [eax+ebx]   ; source row 15
+	movdqu xmm3,   [ecx+edx]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx+edx
+	lea    ecx,    [ecx+2*edx] ; ecx -> reference row 15
+	movdqu xmm3,   [ecx]
+	psadbw xmm2,   xmm3        ; source row 14 vs reference row 15
+	paddw xmm5,   xmm2
+	
+	movdqu xmm2,   [ecx-1]
+	psadbw xmm2,   xmm0        ; source row 15 vs reference row 15 shifted left
+	paddw xmm6,   xmm2
+	
+	movdqu xmm3,   [ecx+1]
+	psadbw xmm3,   xmm0        ; source row 15 vs reference row 15 shifted right
+	paddw xmm7,   xmm3
+	
+	movdqu xmm3,   [ecx+edx]
+	psadbw xmm0,   xmm3        ; source row 15 vs reference row 16
+	paddw xmm5,   xmm0
+	
+	mov        ecx,  [esp+24]  ; arg 5: output pointer, written with movdqa (must be 16-byte aligned)
+	movhlps    xmm0, xmm4      ; fold the upper 64-bit partial sum into the lower one, per accumulator
+	paddw      xmm4, xmm0 
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0 
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0 
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5      ; dwords: [-stride, +stride]
+	punpckldq  xmm6, xmm7      ; dwords: [-1, +1]
+	punpcklqdq xmm4, xmm6      ; output order: -stride, +stride, -1, +1
+	movdqa     [ecx],xmm4
+	pop  ebx
+	ret
+
+; WelsSampleSadFour16x8_sse2: 16x8 SADs against the reference moved by -stride, +stride, -1 and +1 (stored in that order).
+WELS_EXTERN WelsSampleSadFour16x8_sse2
+WelsSampleSadFour16x8_sse2:	; cdecl, five stack arguments; the fifth receives four 32-bit results
+	push ebx
+	push edi
+	mov    eax,    [esp+12]    ; arg 1: source block, read with movdqa (must be 16-byte aligned)
+	mov    ebx,    [esp+16]    ; arg 2: source row stride
+	mov    edi,    [esp+20]    ; arg 3: reference block (pRefMb), read with movdqu
+	mov    edx,    [esp+24]    ; arg 4: reference row stride (i_stride_ref)
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movdqa xmm0,   [eax]       ; source row 0
+	sub    edi,    edx         ; edi -> reference row -1
+	movdqu xmm3,   [edi]
+	psadbw xmm3,   xmm0        ; source row 0 vs reference row -1
+	paddw xmm4,   xmm3
+	
+	movdqa xmm1,   [eax+ebx]   ; source row 1
+	movdqu xmm3,   [edi+edx]
+	psadbw xmm3,   xmm1        ; source row 1 vs reference row 0
+	paddw xmm4,   xmm3
+	
+	movdqu xmm2,   [edi+edx-1]
+	psadbw xmm2,   xmm0        ; source row 0 vs reference row 0 shifted left
+	paddw xmm6,   xmm2
+	
+	movdqu xmm3,   [edi+edx+1]
+	psadbw xmm3,   xmm0        ; source row 0 vs reference row 0 shifted right
+	paddw xmm7,   xmm3
+	; from here xmm0/xmm1/xmm2 rotate as the three live source rows; each macro call consumes one reference row
+	lea    eax,    [eax+2*ebx]
+	lea    edi,    [edi+2*edx]
+	movdqa xmm2,   [eax]       ; source row 2
+	movdqu xmm3,   [edi]       ; reference row 1
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, edi
+	movdqa xmm0,   [eax+ebx]   ; source row 3
+	movdqu xmm3,   [edi+edx]   ; reference row 2
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, edi+edx
+	lea    eax,    [eax+2*ebx]
+	lea    edi,    [edi+2*edx]
+	movdqa xmm1,   [eax]       ; source row 4
+	movdqu xmm3,   [edi]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, edi
+	movdqa xmm2,   [eax+ebx]   ; source row 5
+	movdqu xmm3,   [edi+edx]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, edi+edx
+	lea    eax,    [eax+2*ebx]
+	lea    edi,    [edi+2*edx]
+	movdqa xmm0,   [eax]       ; source row 6
+	movdqu xmm3,   [edi]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, edi
+	movdqa xmm1,   [eax+ebx]   ; source row 7
+	movdqu xmm3,   [edi+edx]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, edi+edx
+	lea    edi,    [edi+2*edx] ; edi -> reference row 7
+	movdqu xmm3,   [edi]
+	psadbw xmm0,   xmm3        ; source row 6 vs reference row 7
+	paddw xmm5,   xmm0
+	
+	movdqu xmm0,   [edi-1]
+	psadbw xmm0,   xmm1        ; source row 7 vs reference row 7 shifted left
+	paddw xmm6,   xmm0
+	
+	movdqu xmm3,   [edi+1]
+	psadbw xmm3,   xmm1        ; source row 7 vs reference row 7 shifted right
+	paddw xmm7,   xmm3
+	
+	movdqu xmm3,   [edi+edx]
+	psadbw xmm1,   xmm3        ; source row 7 vs reference row 8
+	paddw xmm5,   xmm1
+	
+	mov        edi,  [esp+28]  ; arg 5: output pointer, written with movdqa (must be 16-byte aligned)
+	movhlps    xmm0, xmm4      ; fold the upper 64-bit partial sum into the lower one, per accumulator
+	paddw      xmm4, xmm0 
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0 
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0 
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5      ; dwords: [-stride, +stride]
+	punpckldq  xmm6, xmm7      ; dwords: [-1, +1]
+	punpcklqdq xmm4, xmm6      ; output order: -stride, +stride, -1, +1
+	movdqa     [edi],xmm4
+	pop  edi
+	pop  ebx
+	ret
+	
+WELS_EXTERN WelsSampleSadFour8x16_sse2
+WelsSampleSadFour8x16_sse2:	; cdecl, five stack arguments; writes four 32-bit SADs of an 8x16 block to the fifth
+	push ebx
+	push edi
+	mov    eax,    [esp+12]    ; arg 1: source block
+	mov    ebx,    [esp+16]    ; arg 2: source row stride
+	mov    edi,    [esp+20]    ; arg 3: reference block (pRefMb)
+	mov    edx,    [esp+24]    ; arg 4: reference row stride (i_stride_ref)
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movq   xmm0,   [eax]       ; xmm0 = source rows 0 (low half) and 1 (high half)
+	movhps xmm0,   [eax+ebx]
+	sub    edi,    edx         ; edi -> reference row -1
+	movq   xmm3,   [edi]
+	movhps xmm3,   [edi+edx]   ; xmm3 = reference rows -1 and 0
+	psadbw xmm3,   xmm0
+	paddw  xmm4,   xmm3
+	; every stage below repeats the same pattern for the next pair of rows
+	movq   xmm1,  [edi+edx-1]  ; upper row of the pair, shifted left
+	movq   xmm3,  [edi+edx+1]  ; upper row of the pair, shifted right
+	
+	lea    eax,   [eax+2*ebx]
+	lea    edi,   [edi+2*edx]
+	movhps xmm1,  [edi-1]      ; lower row of the pair, shifted left
+	movhps xmm3,  [edi+1]      ; lower row of the pair, shifted right
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+	
+	movq   xmm3,  [edi]        ; reference rows one below the current source pair
+	movhps xmm3,  [edi+edx]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+	
+	movq   xmm0,  [eax]        ; source rows 2-3
+	movhps xmm0,  [eax+ebx]
+	psadbw xmm3,  xmm0         ; the same reference rows are one above the new source pair
+	paddw  xmm4,  xmm3
+	
+	movq   xmm1,  [edi+edx-1]
+	movq   xmm3,  [edi+edx+1]
+	
+	lea    eax,   [eax+2*ebx]
+	lea    edi,   [edi+2*edx]
+	movhps xmm1,  [edi-1]
+	movhps xmm3,  [edi+1]
+	
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+	
+	movq   xmm3,  [edi]
+	movhps xmm3,  [edi+edx]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+	
+	movq   xmm0,  [eax]        ; source rows 4-5
+	movhps xmm0,  [eax+ebx]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+	
+	movq   xmm1,  [edi+edx-1]
+	movq   xmm3,  [edi+edx+1]
+	
+	lea    eax,   [eax+2*ebx]
+	lea    edi,   [edi+2*edx]
+	movhps xmm1,  [edi-1]
+	movhps xmm3,  [edi+1]
+	
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+	
+	movq   xmm3,  [edi]
+	movhps xmm3,  [edi+edx]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+	
+	movq   xmm0,  [eax]        ; source rows 6-7
+	movhps xmm0,  [eax+ebx]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+	
+	movq   xmm1,  [edi+edx-1]
+	movq   xmm3,  [edi+edx+1]
+	
+	lea    eax,   [eax+2*ebx]
+	lea    edi,   [edi+2*edx]
+	movhps xmm1,  [edi-1]
+	movhps xmm3,  [edi+1]
+	
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+	
+	movq   xmm3,  [edi]
+	movhps xmm3,  [edi+edx]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+	
+	movq   xmm0,  [eax]        ; source rows 8-9
+	movhps xmm0,  [eax+ebx]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+	
+	movq   xmm1,  [edi+edx-1]
+	movq   xmm3,  [edi+edx+1]
+	
+	lea    eax,   [eax+2*ebx]
+	lea    edi,   [edi+2*edx]
+	movhps xmm1,  [edi-1]
+	movhps xmm3,  [edi+1]
+	
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+	
+	movq   xmm3,  [edi]
+	movhps xmm3,  [edi+edx]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+	
+	movq   xmm0,  [eax]        ; source rows 10-11
+	movhps xmm0,  [eax+ebx]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+	
+	movq   xmm1,  [edi+edx-1]
+	movq   xmm3,  [edi+edx+1]
+	
+	lea    eax,   [eax+2*ebx]
+	lea    edi,   [edi+2*edx]
+	movhps xmm1,  [edi-1]
+	movhps xmm3,  [edi+1]
+	
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+	
+	movq   xmm3,  [edi]
+	movhps xmm3,  [edi+edx]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+	
+	movq   xmm0,  [eax]        ; source rows 12-13
+	movhps xmm0,  [eax+ebx]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+	
+	movq   xmm1,  [edi+edx-1]
+	movq   xmm3,  [edi+edx+1]
+	
+	lea    eax,   [eax+2*ebx]
+	lea    edi,   [edi+2*edx]
+	movhps xmm1,  [edi-1]
+	movhps xmm3,  [edi+1]
+	
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+	
+	movq   xmm3,  [edi]
+	movhps xmm3,  [edi+edx]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+	
+	movq   xmm0,  [eax]        ; source rows 14-15 (last pair)
+	movhps xmm0,  [eax+ebx]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+	
+	movq   xmm1,  [edi+edx-1]
+	movq   xmm3,  [edi+edx+1]
+	
+	lea    eax,   [eax+2*ebx]
+	lea    edi,   [edi+2*edx]
+	movhps xmm1,  [edi-1]
+	movhps xmm3,  [edi+1]
+	
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+	
+	movq   xmm3,  [edi]        ; reference rows 15 and 16 close the +stride sum
+	movhps xmm3,  [edi+edx]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+	
+	mov        edi,  [esp+28]  ; arg 5: output pointer, written with movdqa (must be 16-byte aligned)
+	movhlps    xmm0, xmm4      ; fold the upper 64-bit partial sum into the lower one, per accumulator
+	paddw      xmm4, xmm0 
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0 
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0 
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5      ; dwords: [-stride, +stride]
+	punpckldq  xmm6, xmm7      ; dwords: [-1, +1]
+	punpcklqdq xmm4, xmm6      ; output order: -stride, +stride, -1, +1
+	movdqa     [edi],xmm4
+	pop  edi
+	pop  ebx
+	ret
+	
+	; WelsSampleSadFour8x8_sse2: 8x8 SADs against the reference moved by -stride, +stride, -1 and +1 (stored in that order).
+WELS_EXTERN WelsSampleSadFour8x8_sse2
+WelsSampleSadFour8x8_sse2:	; cdecl, five stack arguments; the fifth receives four 32-bit results
+	push ebx
+	push edi
+	mov    eax,    [esp+12]    ; arg 1: source block
+	mov    ebx,    [esp+16]    ; arg 2: source row stride
+	mov    edi,    [esp+20]    ; arg 3: reference block (pRefMb)
+	mov    edx,    [esp+24]    ; arg 4: reference row stride (i_stride_ref)
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	movq   xmm0,   [eax]       ; xmm0 = source rows 0 (low half) and 1 (high half)
+	movhps xmm0,   [eax+ebx]
+	sub    edi,    edx         ; edi -> reference row -1
+	movq   xmm3,   [edi]
+	movhps xmm3,   [edi+edx]   ; xmm3 = reference rows -1 and 0
+	psadbw xmm3,   xmm0
+	paddw  xmm4,   xmm3
+	; every stage below repeats the same pattern for the next pair of rows
+	movq   xmm1,  [edi+edx-1]  ; upper row of the pair, shifted left
+	movq   xmm3,  [edi+edx+1]  ; upper row of the pair, shifted right
+	
+	lea    eax,   [eax+2*ebx]
+	lea    edi,   [edi+2*edx]
+	movhps xmm1,  [edi-1]      ; lower row of the pair, shifted left
+	movhps xmm3,  [edi+1]      ; lower row of the pair, shifted right
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+	
+	movq   xmm3,  [edi]        ; reference rows one below the current source pair
+	movhps xmm3,  [edi+edx]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+	
+	movq   xmm0,  [eax]        ; source rows 2-3
+	movhps xmm0,  [eax+ebx]
+	psadbw xmm3,  xmm0         ; the same reference rows are one above the new source pair
+	paddw  xmm4,  xmm3
+	
+	movq   xmm1,  [edi+edx-1]
+	movq   xmm3,  [edi+edx+1]
+	
+	lea    eax,   [eax+2*ebx]
+	lea    edi,   [edi+2*edx]
+	movhps xmm1,  [edi-1]
+	movhps xmm3,  [edi+1]
+	
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+	
+	movq   xmm3,  [edi]
+	movhps xmm3,  [edi+edx]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+	
+	movq   xmm0,  [eax]        ; source rows 4-5
+	movhps xmm0,  [eax+ebx]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+	
+	movq   xmm1,  [edi+edx-1]
+	movq   xmm3,  [edi+edx+1]
+	
+	lea    eax,   [eax+2*ebx]
+	lea    edi,   [edi+2*edx]
+	movhps xmm1,  [edi-1]
+	movhps xmm3,  [edi+1]
+	
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+	
+	movq   xmm3,  [edi]
+	movhps xmm3,  [edi+edx]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+	
+	movq   xmm0,  [eax]        ; source rows 6-7 (last pair)
+	movhps xmm0,  [eax+ebx]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+	
+	
+	movq   xmm1,  [edi+edx-1]
+	movq   xmm3,  [edi+edx+1]
+	
+	lea    eax,   [eax+2*ebx]
+	lea    edi,   [edi+2*edx]
+	movhps xmm1,  [edi-1]
+	movhps xmm3,  [edi+1]
+	
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+	
+	movq   xmm3,  [edi]        ; reference rows 7 and 8 close the +stride sum
+	movhps xmm3,  [edi+edx]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+	
+	mov        edi,  [esp+28]  ; arg 5: output pointer, written with movdqa (must be 16-byte aligned)
+	movhlps    xmm0, xmm4      ; fold the upper 64-bit partial sum into the lower one, per accumulator
+	paddw      xmm4, xmm0 
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0 
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0 
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5      ; dwords: [-stride, +stride]
+	punpckldq  xmm6, xmm7      ; dwords: [-1, +1]
+	punpcklqdq xmm4, xmm6      ; output order: -stride, +stride, -1, +1
+	movdqa     [edi],xmm4
+	pop  edi
+	pop  ebx
+	ret
+	
+WELS_EXTERN WelsSampleSadFour4x4_sse2
+WelsSampleSadFour4x4_sse2:	; cdecl, five stack arguments; writes four 32-bit SADs of a 4x4 block to the fifth
+	push ebx
+	push edi
+	mov    eax,    [esp+12]    ; arg 1: source block
+	mov    ebx,    [esp+16]    ; arg 2: source row stride
+	mov    edi,    [esp+20]    ; arg 3: reference block
+	mov    edx,    [esp+24]    ; arg 4: reference row stride
+	movd   xmm0,   [eax]       ; gather the four 4-byte source rows ...
+	movd   xmm1,   [eax+ebx]
+	lea        eax,    [eax+2*ebx]
+	movd       xmm2,   [eax]
+	movd       xmm3,   [eax+ebx]
+	punpckldq  xmm0, xmm1
+	punpckldq  xmm2, xmm3
+	punpcklqdq xmm0, xmm2      ; ... into xmm0 = source rows 0,1,2,3
+	sub        edi,  edx       ; edi -> reference row -1
+	movd       xmm1, [edi]
+	movd       xmm2, [edi+edx]
+	punpckldq  xmm1, xmm2      ; xmm1 = reference rows -1,0
+	movd       xmm2, [edi+edx-1]   ; reference row 0 shifted left
+	movd       xmm3, [edi+edx+1]   ; reference row 0 shifted right
+	
+	lea        edi,  [edi+2*edx]   ; edi -> reference row 1
+	
+	movd       xmm4, [edi]
+	movd       xmm5, [edi-1]
+	punpckldq  xmm2, xmm5      ; xmm2 = rows 0,1 shifted left
+	movd       xmm5, [edi+1]
+	punpckldq  xmm3, xmm5      ; xmm3 = rows 0,1 shifted right
+	
+	movd       xmm5, [edi+edx]
+	punpckldq  xmm4, xmm5      ; xmm4 = reference rows 1,2
+	
+	punpcklqdq xmm1, xmm4 ;-L  (reference rows -1,0,1,2)
+	
+	movd       xmm5, [edi+edx-1]   ; reference row 2 shifted left
+	movd       xmm6, [edi+edx+1]   ; reference row 2 shifted right
+	
+	lea        edi,  [edi+2*edx]   ; edi -> reference row 3
+	movd       xmm7, [edi-1]
+	punpckldq  xmm5, xmm7
+	punpcklqdq xmm2, xmm5 ;-1  (rows 0..3 shifted left)
+	movd       xmm7, [edi+1]
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm3, xmm6 ;+1  (rows 0..3 shifted right)
+	movd       xmm6, [edi]
+	movd       xmm7, [edi+edx]
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6 ;+L  (reference rows 1,2,3,4)
+	psadbw     xmm1, xmm0      ; one psadbw per candidate covers all 16 bytes
+	psadbw     xmm2, xmm0
+	psadbw     xmm3, xmm0
+	psadbw     xmm4, xmm0
+	
+	movhlps    xmm0, xmm1      ; fold the upper 64-bit partial sum into the lower one, per candidate
+	paddw      xmm1, xmm0
+	movhlps    xmm0, xmm2
+	paddw      xmm2, xmm0
+	movhlps    xmm0, xmm3
+	paddw      xmm3, xmm0
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	mov        edi,  [esp+28]  ; arg 5: output pointer, written with movdqa (must be 16-byte aligned)
+	punpckldq  xmm1, xmm4      ; dwords: [-L, +L]
+	punpckldq  xmm2, xmm3      ; dwords: [-1, +1]
+	punpcklqdq xmm1, xmm2      ; output order: -L, +L, -1, +1
+	movdqa     [edi],xmm1
+	pop  edi
+	pop  ebx
+	ret
+	
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 END
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSad4x4_mmx
+
+align 16
+;***********************************************************************
+;   int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
+;***********************************************************************
+WelsSampleSad4x4_mmx:	; 4x4 sum of absolute differences using the 64-bit mm registers; result in eax
+    push    ebx
+%define pushsize     4
+%define pix1address	 esp+pushsize+4
+%define pix1stride   esp+pushsize+8
+%define pix2address  esp+pushsize+12
+%define pix2stride   esp+pushsize+16
+
+    mov		  eax, [pix1address]    
+    mov		  ebx, [pix1stride ]    
+    mov		  ecx, [pix2address]    
+    mov		  edx, [pix2stride ]    
+
+	movd	  mm0, [eax]        ; pix1 rows 0 and 1 packed into one 8-byte register
+	movd	  mm1, [eax+ebx]
+	punpckldq mm0, mm1
+	
+	movd      mm3, [ecx]        ; pix2 rows 0 and 1
+	movd      mm4, [ecx+edx]
+	punpckldq mm3, mm4
+	psadbw    mm0, mm3          ; SAD of rows 0-1
+	
+	lea       eax, [eax+2*ebx]
+	lea       ecx, [ecx+2*edx]
+	
+	movd      mm1, [eax]        ; pix1 rows 2 and 3
+	movd      mm2, [eax+ebx]
+	punpckldq mm1, mm2
+	
+	movd      mm3, [ecx]        ; pix2 rows 2 and 3
+	movd      mm4, [ecx+edx]
+	punpckldq mm3, mm4
+	psadbw    mm1, mm3          ; SAD of rows 2-3
+	paddw     mm0, mm1
+	
+    movd      eax, mm0          ; return value
+
+	WELSEMMS                    ; macro defined outside this view; presumably issues emms to release the MMX state
+    pop ebx
+%undef pushsize     
+%undef pix1address	
+%undef pix1stride   
+%undef pix2address  
+%undef pix2stride   
+    ret
\ No newline at end of file
--- /dev/null
+++ b/codec/encoder/core/asm/score.asm
@@ -1,0 +1,324 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  score.asm
+;*
+;*  Abstract
+;*      scan/score/count of sse2
+;*
+;*  History
+;*      8/21/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+bits 32
+
+;***********************************************************************
+; Macros 
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+SECTION .rodata align=16
+
+;align 16
+;se2_2 dw 2, 2, 2, 2, 2, 2, 2, 2
+align 16
+sse2_1: dw 1, 1, 1, 1, 1, 1, 1, 1	; not referenced by any routine in this file
+align 16
+sse2_b1: db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1	; not referenced by any routine in this file
+i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 	; read by WelsCalculateSingleCtr4x4_sse2 with index ebx-ecx-1
+align 16
+sse2_plane_inc_minus: dw -7, -6, -5, -4, -3, -2, -1, 0	; not referenced by any routine in this file
+align 16
+sse2_plane_inc: dw 1, 2, 3, 4, 5, 6, 7, 8	; not referenced by any routine in this file
+align 16
+sse2_plane_dec: dw 8, 7, 6, 5, 4, 3, 2, 1	; not referenced by any routine in this file
+align 16
+pb_scanacdc_maska:db 0,1,2,3,8,9,14,15,10,11,4,5,6,7,12,13	; pshufb byte mask for xmm0 in WelsScan4x4DcAc_ssse3
+align 16
+pb_scanacdc_maskb:db 2,3,8,9,10,11,4,5,0,1,6,7,12,13,14,15	; pshufb byte mask for xmm1 in WelsScan4x4DcAc_ssse3
+align 16
+pb_scandc_maska:db 2,3,8,9,14,15,10,11,4,5,6,7,12,13,0,1	; not referenced by any routine in this file
+align 16
+pb_scandc_maskb:db 8,9,10,11,4,5,0,1,6,7,12,13,14,15,128,128	; not referenced by any routine in this file
+
+align 16
+nozero_count_table:	; 256 entries: entry i is the number of set bits in i (used by WelsGetNoneZeroCount_sse2)
+db  0,1,1,2,1,2,2,3,1,2
+db  2,3,2,3,3,4,1,2,2,3
+db  2,3,3,4,2,3,3,4,3,4
+db  4,5,1,2,2,3,2,3,3,4
+db  2,3,3,4,3,4,4,5,2,3
+db  3,4,3,4,4,5,3,4,4,5
+db  4,5,5,6,1,2,2,3,2,3
+db  3,4,2,3,3,4,3,4,4,5
+db  2,3,3,4,3,4,4,5,3,4
+db  4,5,4,5,5,6,2,3,3,4
+db  3,4,4,5,3,4,4,5,4,5
+db  5,6,3,4,4,5,4,5,5,6
+db  4,5,5,6,5,6,6,7,1,2
+db  2,3,2,3,3,4,2,3,3,4
+db  3,4,4,5,2,3,3,4,3,4
+db  4,5,3,4,4,5,4,5,5,6
+db  2,3,3,4,3,4,4,5,3,4
+db  4,5,4,5,5,6,3,4,4,5
+db  4,5,5,6,4,5,5,6,5,6
+db  6,7,2,3,3,4,3,4,4,5
+db  3,4,4,5,4,5,5,6,3,4
+db  4,5,4,5,5,6,4,5,5,6
+db  5,6,6,7,3,4,4,5,4,5
+db  5,6,4,5,5,6,5,6,6,7
+db  4,5,5,6,5,6,6,7,5,6
+db  6,7,6,7,7,8
+
+align 16
+high_mask_table:	; 256 entries; WelsCalculateSingleCtr4x4_sse2 indexes it with bits 8..15 of its non-zero mask
+	db  0, 0, 0, 3, 0, 2, 3, 6, 0, 2
+	db  2, 5, 3, 5, 6, 9, 0, 1, 2, 5
+	db  2, 4, 5, 8, 3, 5, 5, 8, 6, 8
+	db  9,12, 0, 1, 1, 4, 2, 4, 5, 8
+	db  2, 4, 4, 7, 5, 7, 8,11, 3, 4
+	db  5, 8, 5, 7, 8,11, 6, 8, 8,11
+	db  9,11,12,15, 0, 1, 1, 4, 1, 3
+	db  4, 7, 2, 4, 4, 7, 5, 7, 8,11
+	db  2, 3, 4, 7, 4, 6, 7,10, 5, 7
+	db  7,10, 8,10,11,14, 3, 4, 4, 7
+	db  5, 7, 8,11, 5, 7, 7,10, 8,10
+	db 11,14, 6, 7, 8,11, 8,10,11,14
+	db  9,11,11,14,12,14,15,18, 0, 0
+	db  1, 4, 1, 3, 4, 7, 1, 3, 3, 6
+	db  4, 6, 7,10, 2, 3, 4, 7, 4, 6
+	db  7,10, 5, 7, 7,10, 8,10,11,14
+	db  2, 3, 3, 6, 4, 6, 7,10, 4, 6
+	db  6, 9, 7, 9,10,13, 5, 6, 7,10
+	db  7, 9,10,13, 8,10,10,13,11,13
+	db 14,17, 3, 4, 4, 7, 4, 6, 7,10
+	db  5, 7, 7,10, 8,10,11,14, 5, 6
+	db  7,10, 7, 9,10,13, 8,10,10,13
+	db 11,13,14,17, 6, 7, 7,10, 8,10
+	db 11,14, 8,10,10,13,11,13,14,17
+	db  9,10,11,14,11,13,14,17,12,14
+	db 14,17,15,17,18,21
+
+align 16
+low_mask_table:	; 256 entries; WelsCalculateSingleCtr4x4_sse2 indexes it with bits 0..7 of its non-zero mask
+    db  0, 3, 2, 6, 2, 5, 5, 9, 1, 5
+    db  4, 8, 5, 8, 8,12, 1, 4, 4, 8
+    db  4, 7, 7,11, 4, 8, 7,11, 8,11
+    db 11,15, 1, 4, 3, 7, 4, 7, 7,11
+    db  3, 7, 6,10, 7,10,10,14, 4, 7 
+    db  7,11, 7,10,10,14, 7,11,10,14
+    db 11,14,14,18, 0, 4, 3, 7, 3, 6
+    db  6,10, 3, 7, 6,10, 7,10,10,14
+    db  3, 6, 6,10, 6, 9, 9,13, 6,10
+    db  9,13,10,13,13,17, 4, 7, 6,10
+    db  7,10,10,14, 6,10, 9,13,10,13
+    db 13,17, 7,10,10,14,10,13,13,17
+    db 10,14,13,17,14,17,17,21, 0, 3
+    db  3, 7, 3, 6, 6,10, 2, 6, 5, 9
+    db  6, 9, 9,13, 3, 6, 6,10, 6, 9
+    db  9,13, 6,10, 9,13,10,13,13,17
+    db  3, 6, 5, 9, 6, 9, 9,13, 5, 9
+    db  8,12, 9,12,12,16, 6, 9, 9,13
+    db  9,12,12,16, 9,13,12,16,13,16
+    db 16,20, 3, 7, 6,10, 6, 9, 9,13
+    db  6,10, 9,13,10,13,13,17, 6, 9
+    db  9,13, 9,12,12,16, 9,13,12,16
+    db 13,16,16,20, 7,10, 9,13,10,13
+    db 13,17, 9,13,12,16,13,16,16,20
+    db 10,13,13,17,13,16,16,20,13,17
+    db 16,20,17,20,20,24
+
+
+SECTION .text
+
+;***********************************************************************
+;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
+;***********************************************************************
+ALIGN 16
+WELS_EXTERN WelsScan4x4DcAc_sse2
+WelsScan4x4DcAc_sse2:
+	; reorders the 16 words at pDct into level; both are accessed with movdqa, so both must be 16-byte aligned
+	mov        eax, [esp+8]			; arg 2: pDct (source)
+	movdqa     xmm0, [eax]			; 7 6 5 4 3 2 1 0
+	movdqa     xmm1, [eax+16]		; f e d c b a 9 8
+	pextrw     ecx, xmm0, 7			; ecx = 7
+	pextrw     edx, xmm1, 2			; edx = a
+	pextrw     eax, xmm0, 5			; eax = 5
+	pinsrw     xmm1, ecx, 2			; f e d c b 7 9 8
+	pinsrw     xmm0, eax, 7			; 5 6 5 4 3 2 1 0
+	pextrw     ecx, xmm1, 0			; ecx = 8
+	pinsrw     xmm0, ecx, 5			; 5 6 8 4 3 2 1 0
+	pinsrw     xmm1, edx, 0			; f e d c b 7 9 a
+	pshufd     xmm2, xmm0, 0xd8		; 5 6 3 2 8 4 1 0
+	pshufd     xmm3, xmm1, 0xd8		; f e b 7 d c 9 a
+	pshufhw    xmm0, xmm2, 0x93		; 6 3 2 5 8 4 1 0
+	pshuflw    xmm1, xmm3, 0x39		; f e b 7 a d c 9
+	mov        eax,  [esp+4]		; arg 1: level (destination)
+	movdqa     [eax],xmm0
+	movdqa     [eax+16], xmm1
+	ret
+	
+;***********************************************************************
+;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
+;***********************************************************************
+ALIGN 16
+WELS_EXTERN WelsScan4x4DcAc_ssse3
+WelsScan4x4DcAc_ssse3:
+	mov        eax, [esp+8]				; arg 2: pDct (source), read with movdqa
+	movdqa     xmm0, [eax]				; words 0..7
+	movdqa     xmm1, [eax+16]			; words 8..15
+	pextrw		ecx,  xmm0, 7			; ecx = [7]
+	pextrw		eax,  xmm1, 0			; eax = [8]
+	pinsrw		xmm0, eax, 7			; xmm0[7]	=	[8]
+	pinsrw		xmm1, ecx, 0			; xmm1[0]	=	[7]
+	pshufb		xmm1, [pb_scanacdc_maskb]	; after the 7<->8 swap each half only needs a byte shuffle
+	pshufb		xmm0, [pb_scanacdc_maska]	
+	; store both halves to level (movdqa, so level must be 16-byte aligned)
+	mov        eax,  [esp+4]			; arg 1: level (destination)
+	movdqa     [eax],xmm0
+	movdqa     [eax+16], xmm1
+	ret
+;***********************************************************************
+;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
+;***********************************************************************
+ALIGN 16
+WELS_EXTERN WelsScan4x4Ac_sse2
+WelsScan4x4Ac_sse2:
+	mov        eax, [esp+8]     ; arg 2: pDct (source), read with movdqa
+	movdqa     xmm0, [eax]      ; words 0..7
+	movdqa     xmm1, [eax+16]   ; words 8..15
+	movdqa     xmm2, xmm0
+	punpcklqdq xmm0, xmm1       ; low quadwords of both halves
+	punpckhqdq xmm2, xmm1       ; high quadwords of both halves
+	; interleave dwords, then patch individual words through eax/edx
+	movdqa     xmm3, xmm0
+	punpckldq  xmm0, xmm2
+	punpckhdq  xmm3, xmm2
+	pextrw     eax , xmm0, 3
+	pextrw     edx , xmm0, 7
+	pinsrw     xmm0, eax,  7
+	pextrw     eax,  xmm3, 4
+	pinsrw     xmm3, edx,  4
+	pextrw     edx,  xmm3, 0
+	pinsrw     xmm3, eax,  0
+	pinsrw     xmm0, edx,  3
+	; final in-register word shuffles
+	pshufhw    xmm1, xmm0, 0x93
+	pshuflw    xmm2, xmm3, 0x39
+    ; shift the whole 16-word sequence down by one word: the first word is dropped and the last becomes 0
+    movdqa     xmm3, xmm2
+    psrldq     xmm1, 2          ; drop the first word of the low half
+    pslldq     xmm3, 14         ; first word of the high half moves to the top ...
+    por        xmm1, xmm3       ; ... and becomes the last word of the low half
+    psrldq     xmm2, 2          ; high half shifted down; its top word is zero-filled
+	mov        eax,  [esp+4]    ; arg 1: zig_value (destination), written with movdqa
+	movdqa     [eax],xmm1
+	movdqa     [eax+16], xmm2
+	ret
+
+
+;***********************************************************************
+;int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
+;***********************************************************************
+ALIGN 16
+WELS_EXTERN WelsCalculateSingleCtr4x4_sse2 
+WelsCalculateSingleCtr4x4_sse2:
+	push      ebx
+	mov       eax,  [esp+8]     ; arg 1: pDct, 16 words read with movdqa (must be 16-byte aligned)
+	movdqa    xmm0, [eax]
+	movdqa    xmm1, [eax+16]
+	; pack the 16 words to 16 signed bytes (saturation keeps non-zero values non-zero)
+	packsswb  xmm0, xmm1
+
+    pxor      xmm3, xmm3
+    pcmpeqb   xmm0, xmm3        ; 0xff where a coefficient is zero
+    pmovmskb  edx,  xmm0        ; one bit per coefficient
+
+    xor       edx,  0xffff      ; invert: bit i set <=> coefficient i is non-zero
+
+	xor       eax,  eax         ; result accumulates in al
+	mov       ecx,  7           ; scan bits 7 down to 1 for the highest set bit of the low byte
+	mov       ebx,  8           ; scan bits 8 up to 15 for the lowest set bit of the high byte
+.loop_low8_find1:
+	bt        edx,  ecx
+	jc        .loop_high8_find1
+	loop      .loop_low8_find1  ; decrements ecx; bit 0 is never tested, ecx simply ends at 0
+.loop_high8_find1:
+	bt        edx, ebx
+	jc        .find1end
+	inc       ebx
+	cmp       ebx,16
+	jb        .loop_high8_find1 ; ebx ends at 16 when the high byte has no set bit
+.find1end:
+	sub       ebx, ecx
+	sub       ebx, 1            ; ebx = number of positions strictly between the two indices found
+	add       al,  [i_ds_table+ebx]
+	mov       ebx, edx
+	and       edx, 0xff         ; edx = low byte of the mask
+	shr       ebx, 8
+	and       ebx, 0xff         ; ebx = high byte of the mask
+	add       al,  [low_mask_table +edx]    ; precomputed contribution of the low eight coefficients
+	add       al,  [high_mask_table+ebx]    ; precomputed contribution of the high eight coefficients
+
+	pop       ebx
+	ret
+
+
+;***********************************************************************
+; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
+;***********************************************************************
+ALIGN 16
+WELS_EXTERN WelsGetNoneZeroCount_sse2
+WelsGetNoneZeroCount_sse2:
+	mov       eax,  [esp+4]     ; arg 1: level, 16 words read with movdqa (must be 16-byte aligned)
+	movdqa    xmm0, [eax]
+	movdqa    xmm1, [eax+16]
+	pxor      xmm2, xmm2
+	pcmpeqw   xmm0, xmm2        ; 0xffff where a coefficient is zero
+	pcmpeqw   xmm1, xmm2
+	packsswb  xmm1, xmm0        ; 0xffff -> 0xff, 0 -> 0: one byte per coefficient
+	pmovmskb  edx,  xmm1        ; one bit per coefficient, set where it is zero
+	xor       edx,  0xffff      ; invert: set where the coefficient is non-zero
+	mov       ecx,  edx
+	and       edx,  0xff        ; edx = low byte of the mask
+	shr       ecx,  8           ; ecx = high byte of the mask
+;	and       ecx,  0xff	; not needed: bits 16 and above of the mask are already 0
+	xor       eax,  eax	
+	add       al,  [nozero_count_table+ecx] ; set bits in the high byte
+	add       al,  [nozero_count_table+edx] ; plus set bits in the low byte = count of non-zero coefficients
+	ret
+
--- /dev/null
+++ b/codec/encoder/core/asm/vaa.asm
@@ -1,0 +1,403 @@
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	vaa.asm
+;*
+;*	Abstract
+;*      sse2 for pVaa routines
+;*
+;*  History
+;*      04/14/2010	Created
+;*		06/07/2010	Added AnalysisVaaInfoIntra_sse2(ssse3)
+;*		06/10/2010	Tune rc_sad_frame_sse2 and got about 40% improvement
+;*		08/11/2010	Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2 
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+; by comparison, this sequence outperforms the equivalent phaddw (SSSE3) sequence
+%macro SUM_WORD_8x2_SSE2	2	; dst(pSrc), tmp
+	; @sum_8x2 begin: horizontal sum of the eight words w[0..7] in %1; %2 is clobbered
+	pshufd %2, %1, 04Eh	; 01001110 B: swap the two 64-bit halves
+	paddw %1, %2	; for i = 0..3, word i now holds w[i] + w[i+4]
+	pshuflw %2, %1, 04Eh	; 01001110 B: swap the two dwords of the low half
+	paddw %1, %2	; each of the low four words now holds a sum of four source words
+	pshuflw %2, %1, 0B1h	; 10110001 B: swap adjacent words of the low half
+	paddw %1, %2	; each of the low four words now holds the sum of all eight source words
+	; end of @sum_8x2
+%endmacro	; END of SUM_WORD_8x2_SSE2
+
+
+%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4 -- implicit inputs: esi = row 0, ecx/ebx/edx = byte offsets of rows 1/2/3, xmm7 = 0
+	movdqa %1, [esi    ]	; line 0
+	movdqa %2, [esi+ecx]	; line 1
+	movdqa %3, %1
+	punpcklbw %1, xmm7	; line 0, bytes 0..7 zero-extended to words
+	punpckhbw %3, xmm7	; line 0, bytes 8..15 zero-extended to words
+	movdqa %4, %2
+	punpcklbw %4, xmm7	; line 1, bytes 0..7
+	punpckhbw %2, xmm7	; line 1, bytes 8..15
+	paddw %1, %4	; columns 0..7:  line 0 + line 1
+	paddw %2, %3	; columns 8..15: line 0 + line 1
+	movdqa %3, [esi+ebx]	; line 2
+	movdqa %4, [esi+edx]	; line 3
+	movdqa %5, %3
+	punpcklbw %3, xmm7	; line 2, bytes 0..7
+	punpckhbw %5, xmm7	; line 2, bytes 8..15
+	movdqa %6, %4
+	punpcklbw %6, xmm7	; line 3, bytes 0..7
+	punpckhbw %4, xmm7	; line 3, bytes 8..15
+	paddw %3, %6	; columns 0..7:  line 2 + line 3
+	paddw %4, %5	; columns 8..15: line 2 + line 3
+	paddw %1, %3	; block 0, 1
+	paddw %2, %4	; block 2, 3
+	pshufd %3, %1, 0B1h	; swap adjacent dwords
+	pshufd %4, %2, 0B1h
+	paddw %1, %3	; each dword now holds two partial sums (two columns each) of its 4-column block
+	paddw %2, %4
+	movdqa %3, %1
+	movdqa %4, %2
+	pshuflw %5, %1, 0B1h	; swap adjacent words, low half only
+	pshufhw %6, %3, 0B1h	; swap adjacent words, high half only
+	paddw %1, %5	; low four words of %1 = full sum of block 0 (the high half is not used)
+	paddw %3, %6	; high four words of %3 = full sum of block 1 (the low half is not used)
+	pshuflw %5, %2, 0B1h
+	pshufhw %6, %4, 0B1h
+	paddw %2, %5	; low four words of %2 = full sum of block 2
+	paddw %4, %6	; high four words of %4 = full sum of block 3
+	punpcklwd %1, %2	; words: b0, b2, b0, b2, ...
+	punpckhwd %3, %4	; words: b1, b3, b1, b3, ...
+	punpcklwd %1, %3	; words: b0, b1, b2, b3, b0, b1, b2, b3
+	psraw %1, $4	; divide each 16-pixel sum by 16: the low four words of %1 are the averages of blocks 0..3
+%endmacro
+
+%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4 -- implicit inputs: esi = row 0, ecx/ebx/edx = byte offsets of rows 1/2/3, xmm7 = 0
+	movdqa %1, [esi    ]	; line 0
+	movdqa %2, [esi+ecx]	; line 1
+	movdqa %3, %1
+	punpcklbw %1, xmm7	; line 0, bytes 0..7 zero-extended to words
+	punpckhbw %3, xmm7	; line 0, bytes 8..15 zero-extended to words
+	movdqa %4, %2
+	punpcklbw %4, xmm7	; line 1, bytes 0..7
+	punpckhbw %2, xmm7	; line 1, bytes 8..15
+	paddw %1, %4	; columns 0..7:  line 0 + line 1
+	paddw %2, %3	; columns 8..15: line 0 + line 1
+	movdqa %3, [esi+ebx]	; line 2
+	movdqa %4, [esi+edx]	; line 3
+	movdqa %5, %3
+	punpcklbw %3, xmm7	; line 2, bytes 0..7
+	punpckhbw %5, xmm7	; line 2, bytes 8..15
+	movdqa %6, %4
+	punpcklbw %6, xmm7	; line 3, bytes 0..7
+	punpckhbw %4, xmm7	; line 3, bytes 8..15
+	paddw %3, %6	; columns 0..7:  line 2 + line 3
+	paddw %4, %5	; columns 8..15: line 2 + line 3
+	paddw %1, %3	; block 0, 1
+	paddw %2, %4	; block 2, 3
+	phaddw %1, %2	; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
+	phaddw %1, xmm7	; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
+	psraw %1, $4	; divide each 16-pixel sum by 16: low four words of %1 = averages of blocks 0..3, high four words = 0
+%endmacro
+
+
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+;SECTION .rodata align=16
+
+;ALIGN 16
+;pack1_8x2:
+;	dw 1, 1, 1, 1, 1, 1, 1, 1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+; , 6/7/2010
+
+WELS_EXTERN AnalysisVaaInfoIntra_sse2
+;***********************************************************************
+;	int32_t AnalysisVaaInfoIntra_sse2(	uint8_t *pDataY, const int32_t iLineSize );
+;***********************************************************************
+ALIGN 16
+AnalysisVaaInfoIntra_sse2:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; returns sum(avg^2) - ((sum(avg))^2 >> 4) over the averages of the sixteen 4x4 blocks of a 16x16 byte area at pDataY
+	mov ebp, esp
+	and ebp, 0fh	; ebp = number of bytes esp is above a 16-byte boundary
+	sub esp, ebp	; esp is now 16-byte aligned (required by the movdqa loads from [esp] below)
+	sub esp, 32	; 32 bytes of aligned scratch for the sixteen 16-bit block averages
+	%define PUSH_SIZE	52	; 20 (five pushed registers) + 32 (scratch)
+
+	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
+	mov ecx, [esp+ebp+PUSH_SIZE+8]	; iLineSize
+
+	mov ebx, ecx
+	sal ebx, $1			; iLineSize x 2 [ebx]
+	mov edx, ebx
+	add edx, ecx		; iLineSize x 3 [edx]
+	mov eax, ebx
+	sal eax, $1			; iLineSize x 4 [eax]
+
+	pxor xmm7, xmm7	; xmm7 = 0, used by the macro for zero-extension and below for widening
+
+	; unrolled loop: each macro call averages the four 4x4 blocks of one 16x4 strip
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp], xmm0	; averages of blocks 0..3
+
+	lea esi, [esi+eax]	; advance four rows
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+8], xmm0	; averages of blocks 4..7
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+16], xmm0	; averages of blocks 8..11
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+24], xmm0	; averages of blocks 12..15
+
+	movdqa xmm0, [esp]		; block 0~7
+	movdqa xmm1, [esp+16]	; block 8~15
+	movdqa xmm2, xmm0
+	paddw xmm0, xmm1
+	SUM_WORD_8x2_SSE2 xmm0, xmm3	; low word of xmm0 = sum of the 16 averages
+	; sum of squares: square each average as a 16-bit word, widen to dwords, then add everything up
+	pmullw xmm1, xmm1
+	pmullw xmm2, xmm2
+	movdqa xmm3, xmm1
+	movdqa xmm4, xmm2
+	punpcklwd xmm1, xmm7	; zero-extend the squares to 32 bits
+	punpckhwd xmm3, xmm7
+	punpcklwd xmm2, xmm7
+	punpckhwd xmm4, xmm7
+	paddd xmm1, xmm2
+	paddd xmm3, xmm4
+	paddd xmm1, xmm3
+	pshufd xmm2, xmm1, 01Bh	; reverse the four dwords
+	paddd xmm1, xmm2
+	pshufd xmm2, xmm1, 0B1h	; swap adjacent dwords
+	paddd xmm1, xmm2	; every dword of xmm1 = sum of the 16 squared averages
+
+	movd ebx, xmm0
+	and ebx, 0ffffh		; effective low word truncated
+	mov ecx, ebx
+	imul ebx, ecx	; (sum of averages)^2
+	sar ebx, $4	; ... divided by 16
+	movd eax, xmm1
+	sub eax, ebx	; eax = sum(avg^2) - (sum(avg))^2 / 16
+
+	%undef PUSH_SIZE
+	add esp, 32	; release the scratch area
+	add esp, ebp	; undo the alignment adjustment
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN AnalysisVaaInfoIntra_ssse3
+;***********************************************************************
+;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t iLineSize );
+;***********************************************************************
+ALIGN 16
+AnalysisVaaInfoIntra_ssse3:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; returns sum(avg^2) - ((sum(avg))^2 >> 4) over the averages of the sixteen 4x4 blocks of a 16x16 byte area at pDataY
+	mov ebp, esp
+	and ebp, 0fh	; ebp = number of bytes esp is above a 16-byte boundary
+	sub esp, ebp	; esp is now 16-byte aligned (required by the movdqa loads from [esp] below)
+	sub esp, 32	; 32 bytes of aligned scratch for the sixteen 16-bit block averages
+	%define PUSH_SIZE	52	; 20 (five pushed registers) + 32 (scratch)
+
+	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
+	mov ecx, [esp+ebp+PUSH_SIZE+8]	; iLineSize
+
+	mov ebx, ecx
+	sal ebx, $1			; iLineSize x 2 [ebx]
+	mov edx, ebx
+	add edx, ecx		; iLineSize x 3 [edx]
+	mov eax, ebx
+	sal eax, $1			; iLineSize x 4 [eax]
+
+	pxor xmm7, xmm7	; xmm7 = 0, used by the macro for zero-extension/phaddw and below for widening
+
+	; unrolled loop: each macro call averages the four 4x4 blocks of one 16x4 strip (calls alternate between two register sets)
+	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp], xmm0	; averages of blocks 0..3
+
+	lea esi, [esi+eax]	; advance four rows
+	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+	movq [esp+8], xmm1	; averages of blocks 4..7
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+16], xmm0	; averages of blocks 8..11
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+	movq [esp+24], xmm1	; averages of blocks 12..15
+
+	movdqa xmm0, [esp]		; block 0~7
+	movdqa xmm1, [esp+16]	; block 8~15
+	movdqa xmm2, xmm0
+	paddw xmm0, xmm1
+	SUM_WORD_8x2_SSE2 xmm0, xmm3	; better performance than that of phaddw sets
+	; sum of squares: square each average as a 16-bit word, widen to dwords, then add everything up
+	pmullw xmm1, xmm1
+	pmullw xmm2, xmm2
+	movdqa xmm3, xmm1
+	movdqa xmm4, xmm2
+	punpcklwd xmm1, xmm7	; zero-extend the squares to 32 bits
+	punpckhwd xmm3, xmm7
+	punpcklwd xmm2, xmm7
+	punpckhwd xmm4, xmm7
+	paddd xmm1, xmm2
+	paddd xmm3, xmm4
+	paddd xmm1, xmm3
+	pshufd xmm2, xmm1, 01Bh	; reverse the four dwords
+	paddd xmm1, xmm2
+	pshufd xmm2, xmm1, 0B1h	; swap adjacent dwords
+	paddd xmm1, xmm2	; every dword of xmm1 = sum of the 16 squared averages
+
+	movd ebx, xmm0
+	and ebx, 0ffffh		; effective low word truncated
+	mov ecx, ebx
+	imul ebx, ecx	; (sum of averages)^2
+	sar ebx, $4	; ... divided by 16
+	movd eax, xmm1
+	sub eax, ebx	; eax = sum(avg^2) - (sum(avg))^2 / 16
+
+	%undef PUSH_SIZE
+	add esp, 32	; release the scratch area
+	add esp, ebp	; undo the alignment adjustment
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+	
+WELS_EXTERN MdInterAnalysisVaaInfo_sse41
+;***********************************************************************
+;	uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
+;***********************************************************************
+ALIGN 16
+MdInterAnalysisVaaInfo_sse41:
+	mov eax, [esp+4]	; pSad8x8
+	movdqa xmm0, [eax]	; load 4 sad_8x8 (aligned load: pSad8x8 must be 16-byte aligned)
+	pshufd xmm1, xmm0, 01Bh	; reverse the four dwords
+	paddd xmm1, xmm0
+	pshufd xmm2, xmm1, 0B1h	; swap adjacent dwords
+	paddd xmm1, xmm2	; every dword = sum of the four SADs
+	psrad xmm1, 02h		; iAverageSad
+	movdqa xmm2, xmm1
+	psrad xmm2, 06h	; iAverageSad >> 6
+	movdqa xmm3, xmm0	; iSadBlock
+	psrad xmm3, 06h	; iSadBlock >> 6
+	psubd xmm3, xmm2	; per-block difference from the average, both scaled down by 64
+	pmulld xmm3, xmm3	; [comment]: pmulld from SSE4.1 instruction sets	
+	pshufd xmm4, xmm3, 01Bh
+	paddd xmm4, xmm3
+	pshufd xmm3, xmm4, 0B1h
+	paddd xmm3, xmm4	; every dword = sum of the four squared differences
+	movd eax, xmm3
+	cmp eax, 20	; INTER_VARIANCE_SAD_THRESHOLD
+	jb near .threshold_exit	; unsigned compare: below the threshold -> return 15
+	pshufd xmm0, xmm0, 0B1h	; swap adjacent SADs, so result bit 0 refers to SAD[1], bit 1 to SAD[0], bit 2 to SAD[3], bit 3 to SAD[2]
+	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
+	movmskps eax, xmm0	; 4-bit mask of the comparison results, returned in eax
+	ret
+.threshold_exit:	
+	mov eax, 15	; all four mask bits set
+	ret
+
+WELS_EXTERN MdInterAnalysisVaaInfo_sse2
+;***********************************************************************
+;	uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
+;***********************************************************************
+ALIGN 16
+MdInterAnalysisVaaInfo_sse2:
+	mov eax, [esp+4]	; pSad8x8
+	movdqa xmm0, [eax]	; load 4 sad_8x8 (aligned load: pSad8x8 must be 16-byte aligned)
+	pshufd xmm1, xmm0, 01Bh	; reverse the four dwords
+	paddd xmm1, xmm0
+	pshufd xmm2, xmm1, 0B1h	; swap adjacent dwords
+	paddd xmm1, xmm2	; every dword = sum of the four SADs
+	psrad xmm1, 02h		; iAverageSad
+	movdqa xmm2, xmm1
+	psrad xmm2, 06h	; iAverageSad >> 6
+	movdqa xmm3, xmm0	; iSadBlock
+	psrad xmm3, 06h	; iSadBlock >> 6
+	psubd xmm3, xmm2	; per-block difference from the average, both scaled down by 64
+
+	; to replace pmulld functionality as below
+	movdqa xmm2, xmm3	
+	pmuludq xmm2, xmm3	; 64-bit squares of dwords 0 and 2
+	pshufd xmm4, xmm3, 0B1h	; bring dwords 1 and 3 into the even positions
+	pmuludq xmm4, xmm4	; 64-bit squares of dwords 1 and 3
+	movdqa xmm5, xmm2
+	punpckldq xmm5, xmm4	; low dwords of squares 0 and 1 end up in dwords 0 and 1
+	punpckhdq xmm2, xmm4	; low dwords of squares 2 and 3 end up in dwords 0 and 1
+	punpcklqdq xmm5, xmm2	; xmm5 = low 32 bits of the four squares, in order
+
+	pshufd xmm4, xmm5, 01Bh
+	paddd xmm4, xmm5
+	pshufd xmm5, xmm4, 0B1h
+	paddd xmm5, xmm4	; every dword = sum of the four squared differences
+	movd eax, xmm5
+	cmp eax, 20	; INTER_VARIANCE_SAD_THRESHOLD
+	jb near .threshold_exit	; unsigned compare: below the threshold -> return 15
+	pshufd xmm0, xmm0, 0B1h	; swap adjacent SADs, so result bit 0 refers to SAD[1], bit 1 to SAD[0], bit 2 to SAD[3], bit 3 to SAD[2]
+	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
+	movmskps eax, xmm0	; 4-bit mask of the comparison results, returned in eax
+	ret
+.threshold_exit:	
+	mov eax, 15	; all four mask bits set
+	ret
--- /dev/null
+++ b/codec/encoder/core/inc/IWelsVP.h
@@ -1,0 +1,306 @@
+/*!
+ * \copy
+ *     Copyright (c)  2004-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	    :  IWelsVP.h
+ *
+ * \brief	    :  Interface of wels video processor class
+ *
+ * \date        :  2011/01/04
+ *
+ * \description :  1. should support both C and C++ style interfaces
+ *                 2. should take future feature-extension requirements into account
+ *                 3. should be careful about the use of "char" ==>
+ *                     1) value char  : signed char/unsigned char
+ *                     2) string char : char
+ *
+ *************************************************************************************
+ */
+
+#ifndef _IWELSVP_H_
+#define _IWELSVP_H_ 
+
+#if defined(_WIN32)
+#define WELSAPI __stdcall   /* calling convention of the exported entry points on Windows */
+#else
+#define WELSAPI             /* expands to nothing on every other target */
+#endif
+/* Interface version number: major part shifted left by 8 bits, plus the minor part. */
+#define WELSVP_MAJOR_VERSION   1
+#define WELSVP_MINOR_VERSION   1
+#define WELSVP_VERSION         (WELSVP_MINOR_VERSION + (WELSVP_MAJOR_VERSION << 8))
+
+typedef enum  /* result codes of the interface entry points: 0 on success, a negative value per failure kind */
+{
+  RET_SUCCESS      =  0,
+  RET_FAILED       = -1,
+  RET_INVALIDPARAM = -2,
+  RET_OUTOFMEMORY  = -3,
+  RET_NOTSUPPORTED = -4,
+  RET_UNEXPECTED   = -5,
+  RET_NEEDREINIT   = -6
+} EResult;
+
+typedef enum
+{
+  VIDEO_FORMAT_NULL       = 0,    /* invalid format   */
+  /* rgb color formats */
+  VIDEO_FORMAT_RGB        = 1,    /* rgb 24bits       */
+  VIDEO_FORMAT_RGBA       = 2,    /* rgba             */
+  VIDEO_FORMAT_RGB555     = 3,    /* rgb555           */
+  VIDEO_FORMAT_RGB565     = 4,    /* rgb565           */
+  VIDEO_FORMAT_BGR        = 5,    /* bgr 24bits       */
+  VIDEO_FORMAT_BGRA       = 6,    /* bgr 32bits       */
+  VIDEO_FORMAT_ABGR       = 7,    /* abgr             */
+  VIDEO_FORMAT_ARGB       = 8,    /* argb             */
+
+  /* yuv color formats */
+  VIDEO_FORMAT_YUY2       = 20,   /* yuy2             */
+  VIDEO_FORMAT_YVYU       = 21,   /* yvyu             */
+  VIDEO_FORMAT_UYVY       = 22,   /* uyvy             */
+  VIDEO_FORMAT_I420       = 23,   /* yuv 4:2:0 planar */
+  VIDEO_FORMAT_YV12       = 24,   /* yuv 4:2:0 planar */
+  VIDEO_FORMAT_INTERNAL   = 25,   /* Only Used for SVC decoder testbed */
+  VIDEO_FORMAT_NV12       = 26,   /* y planar + uv packed */
+  VIDEO_FORMAT_I422       = 27,   /* yuv 4:2:2 planar */
+  VIDEO_FORMAT_I444       = 28,   /* yuv 4:4:4 planar */
+  VIDEO_FORMAT_YUYV       = 20,   /* yuv 4:2:2 packed; same value as VIDEO_FORMAT_YUY2 */
+
+  /* alternative names: each one shares its value with an entry above */
+  VIDEO_FORMAT_RGB24      = 1,    /* == VIDEO_FORMAT_RGB  */
+  VIDEO_FORMAT_RGB32      = 2,    /* == VIDEO_FORMAT_RGBA */
+  VIDEO_FORMAT_RGB24_INV  = 5,    /* == VIDEO_FORMAT_BGR  */
+  VIDEO_FORMAT_RGB32_INV  = 6,    /* == VIDEO_FORMAT_BGRA */
+  VIDEO_FORMAT_RGB555_INV = 7,    /* same value as VIDEO_FORMAT_ABGR */
+  VIDEO_FORMAT_RGB565_INV = 8,    /* same value as VIDEO_FORMAT_ARGB */
+  VIDEO_FORMAT_YUV2       = 21,   /* == VIDEO_FORMAT_YVYU */
+  VIDEO_FORMAT_420        = 23,   /* == VIDEO_FORMAT_I420 */
+
+  /* bit 31; NOTE(review): presumably a flag OR-ed onto a format to request a vertical flip -- confirm. The value does not fit in int, so strict ISO C compilers may diagnose it. */
+  VIDEO_FORMAT_VFlip      = 0x80000000
+} EVideoFormat;
+
+typedef enum  /* kind of buffer behind an SPixMap (see SPixMap::eProperty) */
+{
+  BUFFER_HOSTMEM = 0,
+  BUFFER_SURFACE = 1
+} EPixMapBufferProperty;
+
+typedef struct  /* rectangle described by its top/left origin and its size */
+{
+  int iRectTop;     /* top edge    */
+  int iRectLeft;    /* left edge   */
+  int iRectWidth;   /* width  -- presumably in pixels, TODO confirm */
+  int iRectHeight;  /* height -- presumably in pixels, TODO confirm */
+} SRect;
+
+typedef struct  /* describes one picture buffer handed to Process() */
+{
+  void*                 pPixel[3];    /* up to three data pointers -- presumably one per plane, TODO confirm */
+  int                   iSizeInBits;  /* NOTE(review): presumably the sample size in bits -- confirm */
+  int                   iStride[3];   /* one stride per pPixel entry */
+  SRect                 sRect;
+  EVideoFormat          eFormat;
+  EPixMapBufferProperty eProperty;    /* possibly unused; removing it would change sizeof(SPixMap) */
+} SPixMap;
+
+typedef enum  /* identifiers of the processing methods; values are consecutive starting at 0 */
+{
+  METHOD_NULL                   = 0,
+  METHOD_COLORSPACE_CONVERT     = 1,  /* not supported yet */
+  METHOD_DENOISE                = 2,
+  METHOD_SCENE_CHANGE_DETECTION = 3,
+  METHOD_DOWNSAMPLE             = 4,
+  METHOD_VAA_STATISTICS         = 5,
+  METHOD_BACKGROUND_DETECTION   = 6,
+  METHOD_ADAPTIVE_QUANT         = 7,
+  METHOD_COMPLEXITY_ANALYSIS    = 8,
+  METHOD_IMAGE_ROTATE           = 9,
+  METHOD_MASK                   = 10
+} EMethods;
+
+//-----------------------------------------------------------------//
+//  Algorithm parameters define
+//-----------------------------------------------------------------//
+
+typedef struct  /* result of scene-change detection */
+{
+  int bSceneChangeFlag;  /* 0: false; 1: true */
+} SSceneChangeResult;
+
+typedef enum  /* degree of scene change, in increasing order */
+{
+  SIMILAR_SCENE        = 0,  /* similar scene        */
+  MEDIUM_CHANGED_SCENE = 1,  /* medium changed scene */
+  LARGE_CHANGED_SCENE  = 2   /* large changed scene  */
+} ESceneChangeIdc;
+
+typedef struct  /* buffers and totals used by the VAA statistics calculation */
+{
+  unsigned char*  pCurY;                /* Y data of current frame */
+  unsigned char*  pRefY;                /* Y data of the reference frame, for difference calculation */
+  int             (*pSad8x8)[4];        /* SAD of 8x8 blocks; the 4 belonging to one 16x16 are stored together */
+  int*            pSsd16x16;            /* sum of squared differences per 16x16 */
+  int*            pSum16x16;            /* sum per 16x16 */
+  int*            pSumOfSquare16x16;    /* sum of squares per 16x16 */
+  int             (*pSumOfDiff8x8)[4];  /* NOTE(review): by its name, sum of differences per 8x8 -- confirm */
+  unsigned char   (*pMad8x8)[4];        /* NOTE(review): by its name, presumably max absolute difference per 8x8 -- confirm */
+  int             iFrameSad;            /* SAD of the whole frame */
+} SVAACalcResult;
+
+typedef struct  /* parameters of the VAA statistics calculation */
+{
+  int             iCalcVar;     /* NOTE(review): presumably a switch for the variance statistics -- confirm */
+  int             iCalcBgd;     /* NOTE(review): presumably a switch for the background-detection statistics -- confirm */
+  int             iCalcSsd;     /* NOTE(review): presumably a switch for the SSD statistics -- confirm */
+  int             iReserved;
+  SVAACalcResult* pCalcResult;
+} SVAACalcParam;
+
+typedef struct  /* data exchanged with background detection (BGD) */
+{
+  signed char*    pBackgroundMbFlag;  /* NOTE(review): presumably one flag per macroblock -- confirm */
+  SVAACalcResult* pCalcRes;
+} SBGDInterface;
+
+typedef enum  /* adaptive-quantization modes */
+{
+  AQ_QUALITY_MODE = 0,  /* Quality mode */
+  AQ_BITRATE_MODE = 1   /* Bitrate mode */
+} EAQModes;
+
+typedef struct  /* a pair of 16-bit (unsigned short) indices */
+{
+  unsigned short uiMotionIndex;
+  unsigned short uiTextureIndex;
+} SMotionTextureUnit;
+
+typedef struct  /* parameters of adaptive quantization */
+{
+  int                 iAdaptiveQuantMode;  /* 0: quality mode, 1: bitrate mode */
+  SVAACalcResult*     pCalcResult;
+  SMotionTextureUnit* pMotionTextureUnit;
+
+  signed char*        pMotionTextureIndexToDeltaQp;
+  double              dAverMotionTextureIndexToDeltaQp;
+} SAdaptiveQuantizationParam;
+
+typedef enum  /* complexity-analysis modes; note that the values are zero and negative */
+{
+  FRAME_SAD =  0,
+  GOM_SAD   = -1,
+  GOM_VAR   = -2
+} EComplexityAnalysisMode;
+
+typedef struct  /* parameters and outputs of complexity analysis */
+{
+  int             iComplexityAnalysisMode;  /* NOTE(review): presumably one of EComplexityAnalysisMode -- confirm */
+  int             iCalcBgd;
+  int             iMbNumInGom;
+  int             iFrameComplexity;
+  int*            pGomComplexity;
+  int*            pGomForegroundBlockNum;
+  signed char*    pBackgroundMbFlag;
+  unsigned int*   uiRefMbType;              /* a pointer, despite the "ui" prefix */
+  SVAACalcResult* pCalcResult;
+} SComplexityAnalysisParam;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef struct  /* C-style interface: a context pointer plus a table of function pointers */
+{
+  void*   pCtx;  /* context handle; every entry below takes a context pointer as its first parameter */
+  EResult (*Init)           (void* pCtx, int iType, void* pCfg);
+  EResult (*Uninit)         (void* pCtx, int iType);
+  EResult (*Flush)          (void* pCtx, int iType);
+  EResult (*Process)        (void* pCtx, int iType, SPixMap* pSrc, SPixMap* dst);
+  EResult (*Get)            (void* pCtx, int iType, void* pParam);
+  EResult (*Set)            (void* pCtx, int iType, void* pParam);
+  EResult (*SpecialFeature) (void* pCtx, int iType, void* pIn, void* pOut);
+} IWelsVPc;
+
+#if defined(__cplusplus) && !defined(CINTERFACE)  /* C++ style interface */
+
+class IWelsVP
+{
+public:
+  virtual ~IWelsVP() {}
+
+  /* Pure-virtual operations, declared in the same order as the function pointers of IWelsVPc. */
+  virtual EResult Init           (int iType, void* pCfg) = 0;
+  virtual EResult Uninit         (int iType) = 0;
+  virtual EResult Flush          (int iType) = 0;
+  virtual EResult Process        (int iType, SPixMap* pSrc, SPixMap* dst) = 0;
+  virtual EResult Get            (int iType, void* pParam) = 0;
+  virtual EResult Set            (int iType, void* pParam) = 0;
+  virtual EResult SpecialFeature (int iType, void* pIn, void* pOut) = 0;
+};
+
+/* Recommended: invoke the interface through these macros for convenience (C++ flavour: p points to an IWelsVP) */
+#define IWelsVPFunc_Init(p, a, b)                  ((p)->Init(a, b))
+#define IWelsVPFunc_Uninit(p, a)                   ((p)->Uninit(a))
+#define IWelsVPFunc_Flush(p, a)                    ((p)->Flush(a))
+#define IWelsVPFunc_Process(p, a, b, c)            ((p)->Process(a, b, c))
+#define IWelsVPFunc_Get(p, a, b)                   ((p)->Get(a, b))
+#define IWelsVPFunc_Set(p, a, b)                   ((p)->Set(a, b))
+#define IWelsVPFunc_SpecialFeature(p, a, b, c)     ((p)->SpecialFeature(a, b, c))
+
+/* C++ interface version: bit 15 set, low 15 bits taken from WELSVP_VERSION */
+#define WELSVP_INTERFACE_VERION                    (0x8000 + (WELSVP_VERSION & 0x7fff))
+#define WELSVP_EXTERNC_BEGIN                       extern "C" {
+#define WELSVP_EXTERNC_END                         }
+
+#else    /* C style interface */
+
+/* Recommended: invoke the C interface (IWelsVPc) through these macros; each passes (p)->pCtx as the context argument */
+#define IWelsVPFunc_Init(p, a, b)                  ((p)->Init((p)->pCtx, a, b))
+#define IWelsVPFunc_Uninit(p, a)                   ((p)->Uninit((p)->pCtx, a))
+#define IWelsVPFunc_Flush(p, a)                    ((p)->Flush((p)->pCtx, a))
+#define IWelsVPFunc_Process(p, a, b, c)            ((p)->Process((p)->pCtx, a, b, c))
+#define IWelsVPFunc_Get(p, a, b)                   ((p)->Get((p)->pCtx, a, b))
+#define IWelsVPFunc_Set(p, a, b)                   ((p)->Set((p)->pCtx, a, b))
+#define IWelsVPFunc_SpecialFeature(p, a, b, c)     ((p)->SpecialFeature((p)->pCtx, a, b, c))
+/* NOTE: p is evaluated twice by each macro above, so it must not have side effects */
+/* C interface version: bit 15 clear */
+#define WELSVP_INTERFACE_VERION                    (0x0001 + (WELSVP_VERSION & 0x7fff)) 
+#define WELSVP_EXTERNC_BEGIN                      
+#define WELSVP_EXTERNC_END                       
+
+#endif
+
+WELSVP_EXTERNC_BEGIN
+EResult WELSAPI CreateVpInterface  (void** ppCtx, int iVersion /* = WELSVP_INTERFACE_VERION */);
+EResult WELSAPI DestroyVpInterface (void* pCtx, int iVersion /* = WELSVP_INTERFACE_VERION */);
+WELSVP_EXTERNC_END
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+#endif // _IWELSVP_H_
+
+
--- /dev/null
+++ b/codec/encoder/core/inc/array_stack_align.h
@@ -1,0 +1,121 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file		array_stack_align.h
+ *
+ * \brief	guarantees the alignment of array data declared on the stack;
+ *			can be extended to arrays of more dimensions if needed
+ *
+ * \date		8/8/2011 Created 
+ *			8/12/2011 functionality implementation for multidimensional array
+ *			8/26/2011 better solution with reducing extra memory used, 
+ *						stack size is adaptively reduced by _tp & _al
+ *
+ *************************************************************************************
+ */
+#ifndef ARRAY_STACK_ALIGN_H__
+#define ARRAY_STACK_ALIGN_H__
+
+#include <assert.h>
+#include <stddef.h>
+#include "typedefs.h"
+/*
+ * ENFORCE_STACK_ALIGN_1D: force 1 dimension local pData aligned in stack
+ * _tp: type
+ * _nm: var name
+ * _sz: size
+ * _al: align bytes
+ * auxiliary var: _nm ## _tEmP
+ * NOTE: _al should be power-of-2 and >= sizeof(_tp), before considering to use such macro
+ */
+
+//#define ENFORCE_STACK_ALIGN_1D(_tp, _nm, _sz, _al) \
+//_tp _nm ## _tEmP[(_sz)+(_al)-1]; \
+//_tp *_nm = _nm ## _tEmP + ((_al)-1); \
+//_nm -= (((int32_t)_nm & ((_al)-1))/sizeof(_tp));
+
+/* Another better solution with reducing extra memory used */
+#define ENFORCE_STACK_ALIGN_1D(_tp, _nm, _sz, _al) \
+assert( ((_al) && !((_al) & ((_al) - 1))) && ((_al) >= sizeof(_tp)) ); /*_al should be power-of-2 and >= sizeof(_tp)*/\
+_tp _nm ## _tEmP[(_sz)+(_al)/sizeof(_tp)-1]; /* _sz elements plus (_al/sizeof(_tp) - 1) elements of slack */\
+_tp *_nm = _nm ## _tEmP + ((_al)/sizeof(_tp)-1); /* highest start that still leaves room for _sz elements */\
+_nm -= (((size_t)_nm & ((_al)-1))/sizeof(_tp)); /* step back to the _al boundary; size_t, not int32_t: a pointer-to-int32_t cast is rejected by C++ compilers where pointers are 64-bit */
+
+/*
+ * ENFORCE_STACK_ALIGN_2D: force 2 dimension local pData aligned in stack
+ * _tp: type
+ * _nm: var name
+ * _cx, _cy: size in x, y dimension
+ * _al: align bytes
+ * auxiliary var: _nm ## _tEmP, _nm ## _tEmP_al
+ * NOTE: _al should be power-of-2 and >= sizeof(_tp), before considering to use such macro
+ */
+
+//#define ENFORCE_STACK_ALIGN_2D(_tp, _nm, _cx, _cy, _al) \
+//_tp _nm ## _tEmP[(_cx)*(_cy)+(_al)-1]; \
+//_tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)-1); \
+//_nm ## _tEmP_al -= (((int32_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); \
+//_tp (*_nm)[(_cy)] = (_tp (*)[(_cy)])_nm ## _tEmP_al;
+
+/* Another better solution with reducing extra memory used */
+#define ENFORCE_STACK_ALIGN_2D(_tp, _nm, _cx, _cy, _al) \
+assert( ((_al) && !((_al) & ((_al) - 1))) && ((_al) >= sizeof(_tp)) ); /*_al should be power-of-2 and >= sizeof(_tp)*/\
+_tp _nm ## _tEmP[(_cx)*(_cy)+(_al)/sizeof(_tp)-1]; /* _cx*_cy elements plus (_al/sizeof(_tp) - 1) elements of slack */\
+_tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)/sizeof(_tp)-1); /* highest start that still leaves room for all elements */\
+_nm ## _tEmP_al -= (((size_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); /* step back to the _al boundary; size_t, not int32_t: a pointer-to-int32_t cast is rejected by C++ compilers where pointers are 64-bit */\
+_tp (*_nm)[(_cy)] = (_tp (*)[(_cy)])_nm ## _tEmP_al; /* view the aligned storage as _nm[_cx][_cy] */
+
+/*
+ * ENFORCE_STACK_ALIGN_3D: force 3 dimension local pData aligned in stack
+ * _tp: type
+ * _nm: var name
+ * _cx, _cy, _cz: size in x, y, z dimension
+ * _al: align bytes
+ * auxiliary var: _nm ## _tEmP, _nm ## _tEmP_al
+ * NOTE: _al should be power-of-2 and >= sizeof(_tp), before considering to use such macro
+ */
+
+//#define ENFORCE_STACK_ALIGN_3D(_tp, _nm, _cx, _cy, _cz, _al) \
+//_tp _nm ## _tEmP[(_cx)*(_cy)*(_cz)+(_al)-1]; \
+//_tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)-1); \
+//_nm ## _tEmP_al -= (((int32_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); \
+//_tp (*_nm)[(_cy)][(_cz)] = (_tp (*)[(_cy)][(_cz)])_nm ## _tEmP_al;
+
+/* Another better solution with reducing extra memory used */
+#define ENFORCE_STACK_ALIGN_3D(_tp, _nm, _cx, _cy, _cz, _al) \
+assert( ((_al) && !((_al) & ((_al) - 1))) && ((_al) >= sizeof(_tp)) ); /*_al should be power-of-2 and >= sizeof(_tp)*/\
+_tp _nm ## _tEmP[(_cx)*(_cy)*(_cz)+(_al)/sizeof(_tp)-1]; /* _cx*_cy*_cz elements plus (_al/sizeof(_tp) - 1) elements of slack */\
+_tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)/sizeof(_tp)-1); /* highest start that still leaves room for all elements */\
+_nm ## _tEmP_al -= (((size_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); /* step back to the _al boundary; size_t, not int32_t: a pointer-to-int32_t cast is rejected by C++ compilers where pointers are 64-bit */\
+_tp (*_nm)[(_cy)][(_cz)] = (_tp (*)[(_cy)][(_cz)])_nm ## _tEmP_al; /* view the aligned storage as _nm[_cx][_cy][_cz] */
+
+#endif//ARRAY_STACK_ALIGN_H__
+
--- /dev/null
+++ b/codec/encoder/core/inc/as264_common.h
@@ -1,0 +1,160 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	as264_common.h
+ *
+ * \brief	common flag definitions
+ *
+ * \date	7/6/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef _AS264_COMMON_H_
+#define _AS264_COMMON_H_
+
+/* debug setting for console
+$(TargetPath)
+-iper 60 -lqp 26 -frin 2 -rc 1 -cf 4 -org desktop.bgra -sw 800 -sh 592 -bf desktop.h264
+.\..\..\..\..\bin
+*/
+
+/****************************************************************************
+ * Algorithm options (these usually change the produced bitrate)
+ ****************************************************************************/
+#define DISABLE_FMO_FEATURE
+
+/****************************************************************************
+ * Optimization options (these do not change the produced bitrate)
+ ****************************************************************************/
+/* #undef X86_ASM */		/* X86_ASM comes from the project preprocessor definitions; undef it here to disable the asm code */
+#define SINGLE_REF_FRAME	/* must be disabled when multiple reference frames are used */
+
+
+#ifdef WELS_TESTBED		/* for SGE testing */
+#define ENABLE_FRAME_DUMP
+
+#if defined(FRAME_INFO_OUTPUT)
+#undef FRAME_INFO_OUTPUT
+#endif /* FRAME_INFO_OUTPUT */
+#endif /* WELS_TESTBED */
+
+
+#ifdef __UNITTEST__		/* for unit tests */
+#if !defined(ENABLE_FRAME_DUMP)
+#define ENABLE_FRAME_DUMP
+#endif /* ENABLE_FRAME_DUMP */
+#endif /* __UNITTEST__ */
+
+//#define ENABLE_PSNR_CALC
+//#define STAT_OUTPUT
+//#define MB_TYPES_CHECK
+//
+//#define FRAME_INFO_OUTPUT
+//#define LAYER_INFO_OUTPUT
+//#define SLICE_INFO_OUTPUT		// useful in multiple slice coding track
+//#define MB_TYPES_INFO_OUTPUT
+
+
+/* Dependency checks between the output macros (order matters: the first section runs before the others) */
+/* @if !FRAME_INFO_OUTPUT: without it, switch off every macro undefined in this section */
+#ifndef FRAME_INFO_OUTPUT
+
+#ifdef ENABLE_PSNR_CALC
+#undef ENABLE_PSNR_CALC
+#endif /* ENABLE_PSNR_CALC */
+
+/* #if defined(STAT_OUTPUT) */
+/* #undef STAT_OUTPUT */
+/* #endif -- STAT_OUTPUT */
+
+#ifdef LAYER_INFO_OUTPUT
+#undef LAYER_INFO_OUTPUT
+#endif /* LAYER_INFO_OUTPUT */
+
+#ifdef SLICE_INFO_OUTPUT
+#undef SLICE_INFO_OUTPUT
+#endif /* SLICE_INFO_OUTPUT */
+
+#ifdef MB_TYPES_INFO_OUTPUT
+#undef MB_TYPES_INFO_OUTPUT
+#endif /* MB_TYPES_INFO_OUTPUT */
+
+#endif /* !FRAME_INFO_OUTPUT */
+
+/* @if SLICE_INFO_OUTPUT: it requires FRAME_INFO_OUTPUT and LAYER_INFO_OUTPUT */
+#ifdef SLICE_INFO_OUTPUT
+
+#ifndef FRAME_INFO_OUTPUT
+#define FRAME_INFO_OUTPUT
+#endif /* FRAME_INFO_OUTPUT */
+
+#ifndef LAYER_INFO_OUTPUT
+#define LAYER_INFO_OUTPUT
+#endif /* LAYER_INFO_OUTPUT */
+
+#endif /* SLICE_INFO_OUTPUT */
+
+#ifdef LAYER_INFO_OUTPUT
+
+#ifndef FRAME_INFO_OUTPUT
+#define FRAME_INFO_OUTPUT
+#endif /* !FRAME_INFO_OUTPUT */
+
+#endif /* LAYER_INFO_OUTPUT */
+
+/* @if MB_TYPES_INFO_OUTPUT: it requires MB_TYPES_CHECK */
+#ifdef MB_TYPES_INFO_OUTPUT
+
+#ifndef MB_TYPES_CHECK
+#define MB_TYPES_CHECK
+#endif /* MB_TYPES_CHECK */
+#endif /* MB_TYPES_INFO_OUTPUT */
+
+/* NOTE: please keep the lines below, including the commented-out ones; enable them to check for memory leaks, monitor memory usage, etc. */
+/* #define MEMORY_CHECK */
+#define MEMORY_MONITOR
+#if defined(MEMORY_CHECK)
+#if !defined(MEMORY_MONITOR)
+#define MEMORY_MONITOR
+#endif /* MEMORY_MONITOR */
+#endif /* MEMORY_CHECK */
+
+/* #define ENABLE_TRACE_FILE */
+#if defined(WIN32) && defined(_DEBUG)
+#if defined(ENABLE_TRACE_FILE)
+#undef ENABLE_TRACE_FILE
+#endif /* ENABLE_TRACE_FILE */
+#endif /* WIN32 && _DEBUG */
+
+
+#endif // _AS264_COMMON_H_
+
--- /dev/null
+++ b/codec/encoder/core/inc/au_set.h
@@ -1,0 +1,139 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	au_set.h
+ *
+ * \brief	Interfaces introduced in Access Unit level based writer
+ *
+ * \date	05/18/2009 Created
+ *			05/21/2009 Added init_sps and init_pps
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELS_ACCESS_UNIT_WRITER_H__
+#define WELS_ACCESS_UNIT_WRITER_H__
+
+#include "bit_stream.h"
+#include "parameter_sets.h"
+#include "param_svc.h"
+
+namespace WelsSVCEnc {
+/*!
+ *************************************************************************************
+ * \brief	to write Sequence Parameter Set (SPS)
+ *
+ * \param	pSps			SWelsSPS to be written
+ * \param	pBitStringAux	bitstream writer auxiliary
+ * \param	pSpsIdDelta		int32_t*; NOTE(review): presumably an offset applied to the SPS id - confirm in the implementation
+ * \return	0 - succeeded, 1 - failed
+ *
+ * \note	Call it in case EWelsNalUnitType is SPS.
+ *************************************************************************************
+ */
+int32_t WelsWriteSpsNal( SWelsSPS *pSps, SBitStringAux *pBitStringAux, int32_t* pSpsIdDelta );
+
+
+/*!
+ *************************************************************************************
+ * \brief	to write SubSet Sequence Parameter Set
+ *
+ * \param	pSubsetSps		subset SPS to be written
+ * \param	pBitStringAux	bitstream writer auxiliary
+ * \param	pSpsIdDelta		int32_t*; NOTE(review): presumably an offset applied to the SPS id - confirm in the implementation
+ * \return	0 - succeeded, 1 - failed
+ *
+ * \note	Call it in case EWelsNalUnitType is SubSet SPS.
+ *************************************************************************************
+ */
+int32_t WelsWriteSubsetSpsSyntax( SSubsetSps *pSubsetSps, SBitStringAux *pBitStringAux , int32_t* pSpsIdDelta );
+
+/*!
+ *************************************************************************************
+ * \brief	to write Picture Parameter Set (PPS)
+ *
+ * \param	pPps			SWelsPPS to be written
+ * \param	pBitStringAux	bitstream writer auxiliary
+ * \param	sPSOVector		SParaSetOffset*; NOTE(review): its role is not visible from this header - confirm in the implementation
+ * \return	0 - succeeded, 1 - failed
+ *
+ * \note	Call it in case EWelsNalUnitType is PPS.
+ *************************************************************************************
+ */
+int32_t WelsWritePpsSyntax( SWelsPPS *pPps, SBitStringAux *pBitStringAux, SParaSetOffset* sPSOVector );
+
+/*!
+ * \brief	initialize pSps based on configurable parameters in svc
+ * \param	pSps					SWelsSPS*
+ * \param	pLayerParam				SDLayerParam*, dependency layer parameter
+ * \param	kuiIntraPeriod			const uint32_t, intra period (units not visible here)
+ * \param	kiNumRefFrame			const int32_t, number of reference frames
+ * \param	kiSpsId					SPS Id
+ * \param	kbEnableFrameCropping	const bool_t, frame cropping switch
+ * \param	bEnableRc				bool_t; presumably the rate control switch - TODO confirm
+ * \return	0 - successful, 1 - failed
+ */
+int32_t WelsInitSps( SWelsSPS *pSps, SDLayerParam *pLayerParam, const uint32_t kuiIntraPeriod, const int32_t kiNumRefFrame,
+					  const uint32_t kiSpsId, const bool_t kbEnableFrameCropping, bool_t bEnableRc );
+
+/*!
+ * \brief	initialize subset pSps based on configurable parameters in svc
+ * \param	pSubsetSps				SSubsetSps*
+ * \param	pLayerParam				SDLayerParam*, dependency layer parameter
+ * \param	kuiIntraPeriod			const uint32_t, intra period (units not visible here)
+ * \param	kiNumRefFrame			const int32_t, number of reference frames
+ * \param	kiSpsId					SPS Id
+ * \param	kbEnableFrameCropping	const bool_t, frame cropping switch
+ * \param	bEnableRc				bool_t; presumably the rate control switch - TODO confirm
+ * \return	0 - successful, 1 - failed
+ */
+int32_t WelsInitSubsetSps( SSubsetSps *pSubsetSps, SDLayerParam *pLayerParam, const uint32_t kuiIntraPeriod, const int32_t kiNumRefFrame,
+							 const uint32_t kiSpsId, const bool_t kbEnableFrameCropping, bool_t bEnableRc );
+
+/*!
+ * \brief	initialize pPps based on configurable parameters and pSps(subset pSps) in svc
+ * \param	pPps							SWelsPPS*
+ * \param	pSps							SWelsSPS*
+ * \param	pSubsetSps						SSubsetSps*
+ * \param	kuiPpsId						PPS Id
+ * \param	kbDeblockingFilterPresentFlag	bool_t
+ * \param	kbUsingSubsetSps				bool_t
+ * \return	0 - successful, 1 - failed
+ */
+int32_t WelsInitPps(	SWelsPPS *pPps,
+						SWelsSPS *pSps,
+						SSubsetSps *pSubsetSps,						
+						const uint32_t kuiPpsId,
+						const bool_t kbDeblockingFilterPresentFlag,
+						const bool_t kbUsingSubsetSps );
+
+}
+#endif//WELS_ACCESS_UNIT_WRITER_H__
--- /dev/null
+++ b/codec/encoder/core/inc/bit_stream.h
@@ -1,0 +1,75 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//bit_stream.h	-	auxiliary data for bit-stream reading / writing
+#ifndef WELS_BIT_STREAM_H__
+#define WELS_BIT_STREAM_H__
+
+#include "typedefs.h"
+//#include "macros.h"
+
+/*
+ *	auxiliary struct for bit-stream reading / writing; every field is set up by InitBits()
+ */
+typedef struct TagBitStringAux {
+	uint8_t		*pBuf;		// start position of the bit-stream buffer
+	uint8_t		*pBufEnd;	// pBuf + length, i.e. one past the last byte of the buffer
+	uint8_t		*pBufPtr;	// current writing position
+	uint32_t    uiCurBits;  // cleared to 0 by InitBits(); presumably holds bits not yet stored at pBufPtr - TODO confirm
+	int32_t		iLeftBits;	// count number of available bits left; InitBits() starts it at 32
+							// NOTE(review): this comment used to give the range [1, 8], which does not match that initial value
+}SBitStringAux;
+
+/*!
+ * \brief	input bits for decoder or initialize bitstream writing in encoder
+ *
+ * \param	pBs		Bit string auxiliary pointer; a NULL pointer is rejected
+ * \param	kpBuf	bit-stream buffer; becomes both the start and the current position
+ * \param	kiSize	size in bits for decoder; size in bytes for encoder (pBufEnd is set to kpBuf + kiSize)
+ *
+ * \return	kiSize on success; failed in -1 return (pBs is NULL)
+ */
+static inline int32_t InitBits( SBitStringAux *pBs, const uint8_t *kpBuf, const int32_t kiSize )
+{
+	uint8_t *ptr = (uint8_t *)kpBuf;
+
+	if ( !pBs ) return -1;	// nothing to initialize: report the documented failure instead of dereferencing NULL
+	pBs->pBuf			= ptr;
+	pBs->pBufPtr		= ptr;
+	pBs->pBufEnd		= ptr + kiSize;
+	pBs->iLeftBits	= 32;
+	pBs->uiCurBits = 0;
+	return kiSize;
+}
+
+
+#endif//WELS_BIT_STREAM_H__
--- /dev/null
+++ b/codec/encoder/core/inc/bundleloader.h
@@ -1,0 +1,153 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef WELS_BOUNDLELOAD_H
+#define WELS_BOUNDLELOAD_H
+
+#if defined(MACOS)
+
+#include <dlfcn.h>
+#include <carbon/carbon.h>
+#include <coreFoundation/CFBundle.h>
+#include <string>
+
+// Stores in lpModulePath the directory (with trailing '/') of the binary image containing this
+// code; if that image has no file extension, four path components are dropped instead of one.
+// Returns 0 on success, -1 on bad arguments or when the image cannot be located, -2 if the path is too short.
+int GetCurrentModulePath(char* lpModulePath, const int iPathMax)
+{
+	if(lpModulePath == NULL || iPathMax <= 0)
+	{
+		return -1;
+	}
+	memset(lpModulePath, 0, iPathMax);
+	Dl_info 	dlInfo;
+	static int  sDummy;	// its address lies inside this image, which is what dladdr() looks up
+	if(0 == dladdr((void*)&sDummy, &dlInfo) || NULL == dlInfo.dli_fname)
+	{
+		return -1;	// lookup failed: dlInfo must not be read
+	}
+
+	char cCurrentPath[PATH_MAX];
+	memset(cCurrentPath, 0, PATH_MAX);
+	strlcpy(cCurrentPath, dlInfo.dli_fname, PATH_MAX);
+
+	// whether is self a framework ?
+	int locateNumber = 1;
+	struct FSRef currentPath;
+	OSStatus iStatus = FSPathMakeRef((unsigned char*)cCurrentPath, &currentPath, NULL);
+	if(noErr == iStatus)
+	{
+		LSItemInfoRecord  info;
+		iStatus = LSCopyItemInfoForRef(&currentPath, kLSRequestExtension, &info);
+		if(noErr == iStatus && NULL == info.extension)
+		{
+			locateNumber = 4;
+		}
+		else if(noErr == iStatus)
+		{
+			CFRelease(info.extension);	// a requested extension string must be released by the caller
+		}
+	}
+	std::string strPath(cCurrentPath);
+	std::string::size_type pos = std::string::npos;	// not int: rfind() returns size_type
+	for(int i = 0; i < locateNumber; i++)
+	{
+		pos = strPath.rfind('/');
+		if(std::string::npos == pos)
+		{
+			return -2;
+		}
+		strPath.erase(pos);
+	}
+	strlcpy(lpModulePath, strPath.c_str(), iPathMax);
+	strlcat(lpModulePath, "/", iPathMax);
+	return 0;
+}
+
+// Creates a CFBundle for the bundle at lpBundlePath; the caller owns the result and gives
+// it back through FreeBundle(). Returns NULL when lpBundlePath is NULL, when the path
+// cannot be converted to an FSRef or a CFURL, or when CFBundleCreate() returns NULL.
+CFBundleRef LoadBundle(const char* lpBundlePath)
+{
+	struct FSRef bundlePath;
+	CFURLRef bundleURL = NULL;
+	CFBundleRef bundleRef = NULL;
+
+	if(NULL == lpBundlePath)
+	{
+		return NULL;
+	}
+
+	// 1.turn the path into a URL
+	if(noErr != FSPathMakeRef((unsigned char*)lpBundlePath, &bundlePath, NULL))
+	{
+		return NULL;
+	}
+	bundleURL = CFURLCreateFromFSRef(kCFAllocatorSystemDefault, &bundlePath);
+	if(NULL == bundleURL)
+	{
+		return NULL;
+	}
+
+	// 2.get bundle ref
+	bundleRef = CFBundleCreate(kCFAllocatorSystemDefault, bundleURL);
+	CFRelease(bundleURL);
+
+	//	CFBundleLoadExecutable(bundleRef) stays disabled: only the reference is created here
+	return bundleRef;
+}
+
+// Releases a bundle reference such as the one returned by LoadBundle(); NULL is ignored. Always returns TRUE.
+Boolean FreeBundle(CFBundleRef bundleRef)
+{
+	//	CFBundleUnloadExecutable(bundleRef) stays disabled
+	if(NULL == bundleRef)
+		return TRUE;
+	CFRelease(bundleRef);
+	return TRUE;
+}
+
+// Looks up lpProcName in bundleRef; returns NULL on NULL arguments, failed string creation or unknown symbol.
+void* GetProcessAddress(CFBundleRef bundleRef, const char* lpProcName)
+{
+	if(NULL == bundleRef || NULL == lpProcName) return NULL;
+	CFStringRef cfProcName = CFStringCreateWithCString(kCFAllocatorSystemDefault, lpProcName, CFStringGetSystemEncoding());
+	if(NULL == cfProcName)
+		return NULL;	// creation failed; handing NULL to CFRelease() below would crash
+	void *processAddress = CFBundleGetFunctionPointerForName(bundleRef, cfProcName);
+	CFRelease(cfProcName);
+	return processAddress;
+}
+#endif
+
+#endif
\ No newline at end of file
--- /dev/null
+++ b/codec/encoder/core/inc/cpu.h
@@ -1,0 +1,78 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	cpu.h
+ *
+ * \brief	CPU feature compatibility detection
+ *
+ * \date	04/29/2009
+ *
+ *************************************************************************************
+ */
+#if !defined(WELS_CPU_DETECTION_H__)
+#define WELS_CPU_DETECTION_H__
+
+#include "typedefs.h"
+
+namespace WelsSVCEnc {
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+// Everything up to the matching #endif is declared only when X86_ASM is defined.
+#if defined(X86_ASM)
+/*
+ *	cpuid support verify routine
+ *  return 0 if cpuid is not supported by cpu
+ */
+int32_t  WelsCPUIdVerify();
+// cpuid query for index uiIndex; pFeatureA..pFeatureD presumably receive EAX..EDX - TODO confirm in the implementation
+void WelsCPUId( uint32_t uiIndex, uint32_t *pFeatureA, uint32_t *pFeatureB, uint32_t *pFeatureC, uint32_t *pFeatureD );
+// AVX / FMA support checks on eax / ecx values (presumably cpuid results - confirm)
+int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx );
+int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx );
+// NOTE(review): presumably executes the EMMS instruction, judging by the name - confirm
+void WelsEmms();
+// presumably returns a mask of the WELS_CPU_* flags from cpu_core.h and writes the logical processor count - confirm
+uint32_t WelsCPUFeatureDetect( int32_t *pNumberOfLogicProcessors );
+
+/*
+ *	clear FPU register states for potential float based calculation, if supported
+ */
+void     WelsCPURestore( const uint32_t kuiCPU );
+
+#endif//X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+}
+#endif//WELS_CPU_DETECTION_H__
--- /dev/null
+++ b/codec/encoder/core/inc/cpu_core.h
@@ -1,0 +1,80 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	cpu_core.h
+ *
+ * \brief	cpu core feature detection
+ *
+ * \date	4/24/2009 Created
+ *
+ *************************************************************************************
+ */
+#if !defined(WELS_CPU_CORE_FEATURE_DETECTION_H__)
+#define WELS_CPU_CORE_FEATURE_DETECTION_H__
+
+/*
+ *	WELS CPU feature flags (each value is a distinct single bit)
+ */
+#define WELS_CPU_MMX        0x00000001    /* mmx */
+#define WELS_CPU_MMXEXT     0x00000002    /* mmx-ext*/
+#define WELS_CPU_SSE        0x00000004    /* sse */
+#define WELS_CPU_SSE2       0x00000008    /* sse 2 */
+#define WELS_CPU_SSE3       0x00000010    /* sse 3 */
+#define WELS_CPU_SSE41      0x00000020    /* sse 4.1 */
+#define WELS_CPU_3DNOW      0x00000040    /* 3dnow! */
+#define WELS_CPU_3DNOWEXT   0x00000080    /* 3dnow! ext */
+#define WELS_CPU_ALTIVEC    0x00000100    /* altivec */
+#define WELS_CPU_SSSE3      0x00000200    /* ssse3 */
+#define WELS_CPU_SSE42      0x00000400    /* sse 4.2 */
+
+/* CPU features application extensive */
+#define WELS_CPU_AVX		0x00000800	/* Advanced Vector eXtensions */
+#define WELS_CPU_FPU		0x00001000	/* x87-FPU on chip */
+#define WELS_CPU_HTT		0x00002000	/* Hyper-Threading Technology (HTT), Multi-threading enabled feature:
+										   physical processor package is capable of supporting more than one logic processor
+										*/
+#define WELS_CPU_CMOV		0x00004000	/* Conditional Move Instructions,
+										   also if x87-FPU is present as indicated by the CPUID.FPU feature bit, then FCOMI and FCMOV are supported
+										*/
+#define WELS_CPU_MOVBE		0x00008000	/* MOVBE instruction */
+#define WELS_CPU_AES		0x00010000	/* AES instruction extensions */
+#define WELS_CPU_FMA		0x00020000	/* AVX VEX FMA instruction sets */
+
+#define WELS_CPU_CACHELINE_16    0x10000000    /* CacheLine Size 16 */
+#define WELS_CPU_CACHELINE_32    0x20000000    /* CacheLine Size 32 */
+#define WELS_CPU_CACHELINE_64    0x40000000    /* CacheLine Size 64 */
+#define WELS_CPU_CACHELINE_128   0x80000000    /* CacheLine Size 128 (bit 31) */
+
+/*
+ *	Interfaces for CPU core feature detection as below
+ */
+
+#endif//WELS_CPU_CORE_FEATURE_DETECTION_H__
--- /dev/null
+++ b/codec/encoder/core/inc/crt_util_safe_x.h
@@ -1,0 +1,393 @@
+/*!
+ * \copy
+ *     Copyright (c)  2010-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	crt_util_safe_x.h
+ *
+ * \brief	Safe CRT like util for cross platforms support
+ *
+ * \date	06/04/2010 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
+#define WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <math.h>
+
+#include <time.h>
+#if defined(WIN32)
+#include <windows.h>
+#include <sys/types.h>
+#include <sys/timeb.h>
+#else
+#include <sys/time.h>
+#include "typedefs.h"
+#endif//WIN32
+
+/*
+ * Safe Lib specific errno codes, used as return values by the wels_*_s helpers below.
+ * These can be added to the errno.h file if desired.
+ */
+#define ESNULLP         ( 400 )       /* null ptr                    */
+#define ESZEROL         ( 401 )       /* length is zero              */
+#define ESLEMIN         ( 402 )       /* length is below min         */
+#define ESLEMAX         ( 403 )       /* length exceeds max          */
+#define ESOVRLP         ( 404 )       /* overlap undefined           */
+#define ESEMPTY         ( 405 )       /* empty string                */
+#define ESNOSPC         ( 406 )       /* not enough space for s2     */
+#define ESUNTERM        ( 407 )       /* unterminated string         */
+#define ESNODIFF        ( 408 )       /* no difference               */
+#define ESNOTFND        ( 409 )       /* not found                   */
+
+/* EOK (success) may or may not be defined in errno.h */
+#ifndef EOK 
+#define EOK   0
+#endif
+
+#if (defined(WIN32) && defined(_MSC_VER) && (_MSC_VER<1500)) || defined(__GNUC__)
+
+/*
+ * Bounded string copy in the style of strncpy_s: copies at most slen characters of src
+ * into dest (capacity dmax bytes) and NUL-terminates the result.
+ * dest   destination buffer
+ * dmax   capacity of dest, terminator included
+ * src    source string
+ * slen   maximum number of characters taken from src
+ * Returns EOK on success, ESNULLP when dest or src is NULL, ESZEROL when dmax or slen is
+ * not positive, ESOVRLP when the copy runs into the other buffer and ESNOSPC when dest is
+ * too small. If a copy that already wrote to dest fails (ESOVRLP / ESNOSPC), dest is reset
+ * to an empty string, or zeroed completely with SAFE_LIB_STR_NULL_SLACK, so that it is never
+ * left as an unterminated fragment; in every other error case dest is not modified.
+ */
+static __inline int wels_strncpy_s( char *dest, int dmax, const char *src, int slen )
+{
+	int orig_dmax;
+    char *orig_dest;
+    const char *overlap_bumper;
+    int result = ESNOSPC;	/* a copy loop that ends on its own ran out of room in dest */
+
+    if (dest == NULL) {
+        return (ESNULLP);
+    }
+
+    if (dmax <= 0) {
+        return (ESZEROL);
+    }
+
+	if (src == NULL) {
+        return (ESNULLP);
+    }
+
+    if (slen <= 0) {
+        return (ESZEROL);
+    }
+
+    /* hold base so that dest can be reset when the copy fails part-way */
+    orig_dmax = dmax;
+    orig_dest = dest;
+
+	if (dest < src) {
+       overlap_bumper = src;
+
+        while (dmax > 0) {
+            if (dest == overlap_bumper) {	/* writing would run into src */
+                result = ESOVRLP;
+                break;
+            }
+
+			if (slen == 0) {
+                /*
+                 * Copying truncated to slen chars.  Note that the TR says to
+                 * copy slen chars plus the null char.  We null the slack.
+                 */
+#ifdef SAFE_LIB_STR_NULL_SLACK
+                while (dmax) { *dest = '\0'; dmax--; dest++; }
+#else
+                *dest = '\0';
+#endif
+                return (EOK);
+			}
+
+            *dest = *src;
+            if (*dest == '\0') {
+#ifdef SAFE_LIB_STR_NULL_SLACK
+                /* null slack */
+                while (dmax) { *dest = '\0'; dmax--; dest++; }
+#endif
+                return (EOK);
+            }
+
+            dmax--;
+            slen--;
+            dest++;
+            src++;
+        }
+    } else {
+        overlap_bumper = dest;
+
+        while (dmax > 0) {
+            if (src == overlap_bumper) {	/* reading would run into dest */
+                result = ESOVRLP;
+                break;
+            }
+
+	    if (slen == 0) {
+                /*
+                 * Copying truncated to slen chars.  Note that the TR says to
+                 * copy slen chars plus the null char.  We null the slack.
+                 */
+#ifdef SAFE_LIB_STR_NULL_SLACK
+                while (dmax) { *dest = '\0'; dmax--; dest++; }
+#else
+                *dest = '\0';
+#endif
+                return (EOK);
+            }
+
+            *dest = *src;
+            if (*dest == '\0') {
+#ifdef SAFE_LIB_STR_NULL_SLACK
+                /* null slack */
+                while (dmax) { *dest = '\0'; dmax--; dest++; }
+#endif
+                return (EOK);
+            }
+
+            dmax--;
+            slen--;
+            dest++;
+            src++;
+        }
+    }
+
+    /* the entire src was not copied: zero the string, unless nothing was written yet */
+    if (dest != orig_dest) {
+#ifdef SAFE_LIB_STR_NULL_SLACK
+        while (orig_dmax) { *orig_dest = '\0'; orig_dmax--; orig_dest++; }
+#else
+        *orig_dest = '\0';
+#endif
+    }
+    return (result);
+}
+
+/*
+ * Bounded string concatenation in the style of strcat_s: appends src to the string
+ * already held in dest (capacity dmax bytes, terminator included).
+ * Returns EOK on success, ESNULLP when dest or src is NULL, ESZEROL when dmax is not
+ * positive, ESUNTERM when dest holds no NUL within dmax bytes, ESOVRLP when the two
+ * buffers run into each other and ESNOSPC when src does not fit. If the append fails
+ * after characters were already written, dest is reset to an empty string (zeroed
+ * completely with SAFE_LIB_STR_NULL_SLACK) instead of being left unterminated; in every
+ * other error case dest is not modified.
+ */
+static __inline int wels_strcat_s(char *dest, int dmax, const char *src)
+{
+	int orig_dmax;
+    char *orig_dest;
+    const char *overlap_bumper;
+    const char *append_start;	/* end of the original dest string, where appending begins */
+    int result = ESNOSPC;		/* an append loop that ends on its own ran out of room */
+
+    if (dest == NULL) {
+        return (ESNULLP);
+    }
+
+    if (src == NULL) {
+        return (ESNULLP);
+    }
+
+    if (dmax <= 0) {
+        return (ESZEROL);
+    }
+
+    /* hold base of dest so that it can be reset when the append fails part-way */
+    orig_dmax = dmax;
+    orig_dest = dest;
+
+    if (dest < src) {
+        overlap_bumper = src;
+
+        /* Find the end of dest */
+        while (*dest != '\0') {
+
+            if (dest == overlap_bumper) {
+                return (ESOVRLP);
+            }
+
+            dest++;
+            dmax--;
+            if (dmax == 0) {
+                return (ESUNTERM);
+            }
+        }
+        append_start = dest;
+
+        while (dmax > 0) {
+            if (dest == overlap_bumper) {
+                result = ESOVRLP;
+                break;
+            }
+
+            *dest = *src;
+            if (*dest == '\0') {
+#ifdef SAFE_LIB_STR_NULL_SLACK
+                /* null slack to clear any data */
+                while (dmax) { *dest = '\0'; dmax--; dest++; }
+#endif
+                return (EOK);
+            }
+
+            dmax--;
+            dest++;
+            src++;
+        }
+    } else {
+        overlap_bumper = dest;
+
+        /* Find the end of dest */
+        while (*dest != '\0') {
+
+            /*
+             * NOTE: no need to check for overlap here since src comes first
+             * in memory and we're not incrementing src here.
+             */
+            dest++;
+            dmax--;
+            if (dmax == 0) {
+                return (ESUNTERM);
+            }
+        }
+        append_start = dest;
+
+        while (dmax > 0) {
+            if (src == overlap_bumper) {
+                result = ESOVRLP;
+                break;
+            }
+
+            *dest = *src;
+            if (*dest == '\0') {
+#ifdef SAFE_LIB_STR_NULL_SLACK
+                /* null slack to clear any data */
+                while (dmax) { *dest = '\0'; dmax--; dest++; }
+#endif
+                return (EOK);
+            }
+
+            dmax--;
+            dest++;
+            src++;
+        }
+    }
+    /* the entire src was not copied: null the string, unless nothing was appended yet */
+    if (dest != append_start) {
+#ifdef SAFE_LIB_STR_NULL_SLACK
+        while (orig_dmax) { *orig_dest = '\0'; orig_dmax--; orig_dest++; }
+#else
+        *orig_dest = '\0';
+#endif
+    }
+    return (result);
+}
+
+/*
+ * Bounded string length in the style of strnlen_s.
+ *
+ * dest   string to measure; NULL yields 0
+ * dmax   maximum number of characters to examine; values <= 0 yield 0
+ *
+ * Returns the number of characters before the terminating NUL, or dmax when no NUL
+ * is found within the first dmax characters.
+ */
+static __inline int wels_strnlen_s(const char *dest, int dmax)
+{
+    int count;
+
+    if (dest == NULL) {
+        return (0);
+    }
+    if (dmax <= 0) {
+        return (0);
+    }
+
+    count = 0;
+    /* test the bound before dereferencing, so that dest[dmax] is never read */
+    while (dmax && *dest) {
+        count++;
+        dmax--;
+        dest++;
+    }
+    return (count);
+}
+
+#endif//(WIN32 && _MSC_VER && _MSC_VER<1500) || __GNUC__
+
+#if defined(WIN32)
+
+// secure CRT names on VS2008 and later; classic CRT names plus the wels_* helpers otherwise
+#if defined(_MSC_VER) && (_MSC_VER >= 1500)	// VS2008
+#define SNPRINTF	_snprintf_s
+#define LOCALTIME	localtime_s
+#define FTIME		_ftime_s
+#define STRNCPY		strncpy_s
+#define STRCAT		strcat_s
+#define STRNLEN		strnlen_s
+#define VSPRINTF	vsprintf_s
+#define FOPEN		fopen_s
+#else	// mainly for VC6; also WIN32 compilers without _MSC_VER (wels_* helpers exist there when __GNUC__ is defined)
+#define SNPRINTF	_snprintf
+#define LOCALTIME	localtime
+#define FTIME		_ftime
+#define STRNCPY		wels_strncpy_s	// override s.t.r.n.c.p.y here for safe
+#define STRCAT		wels_strcat_s	// override s.t.r.c.a.t here for safe
+#define STRNLEN		wels_strnlen_s	// override s.t.r.n.l.e.n here for safe
+#define VSPRINTF	vsprintf
+#define FOPEN		fopen
+#endif//_MSC_VER >= 1500
+// NOTE: FTIME is only mapped for WIN32
+
+#else//!WIN32
+
+#define SNPRINTF	snprintf
+#define LOCALTIME	localtime
+#define STRNCPY		wels_strncpy_s	// override s.t.r.n.c.p.y here for safe
+#define STRCAT		wels_strcat_s	// override s.t.r.c.a.t here for safe
+#define STRNLEN		wels_strnlen_s	// override s.t.r.n.l.e.n here for safe
+#define VSPRINTF	vsprintf
+#define FOPEN		fopen
+
+#endif//WIN32
+
+#endif//WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
--- /dev/null
+++ b/codec/encoder/core/inc/deblocking.h
@@ -1,0 +1,117 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	deblocking.h
+ *
+ * \brief	Interfaces introduced in frame deblocking filtering
+ *
+ * \date	08/03/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELS_DEBLOCKING_H_
+#define WELS_DEBLOCKING_H_
+
+#include "encoder_context.h"
+#include "wels_func_ptr_def.h"
+
+namespace WelsSVCEnc {
+
+
+//struct tagDeblockingFunc;
+
+typedef struct TagDeblockingFilter {	// deblocking filter parameter set (how it is consumed is not visible in this header)
+	uint8_t		*pCsData[3];	// pointers to reconstructed picture data; 3 entries (NOTE(review): presumably Y/U/V planes - confirm)
+	int32_t		iCsStride[3];	// Cs stride; 3 entries (NOTE(review): presumably one per pCsData entry - confirm)
+	int16_t     iMbStride;	// NOTE(review): presumably the stride counted in macroblocks - confirm
+	int8_t		iSliceAlphaC0Offset;	// NOTE(review): presumably the slice alpha (C0) offset of the deblocking filter - confirm
+	int8_t		iSliceBetaOffset;	// NOTE(review): presumably the slice beta offset of the deblocking filter - confirm
+	uint8_t     uiLumaQP;	// luma QP
+	uint8_t     uiChromaQP;	// chroma QP
+	uint8_t     uiFilterIdc;	// NOTE(review): presumably the deblocking filter idc - confirm
+	uint8_t     uiReserved;	// reserved
+}SDeblockingFilter;
+
+// Plain C ("_c") deblocking kernels taking separate X and Y strides. NOTE(review): Lt4/Eq4 presumably mean boundary strength < 4 / == 4 - confirm
+void DeblockLumaLt4_c( uint8_t *pPixY, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
+void DeblockLumaEq4_c( uint8_t *pPixY, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta );
+void DeblockChromaLt4_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
+void DeblockChromaEq4_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta );
+
+// Single-stride variants. NOTE(review): the V / H suffix presumably selects the edge direction - confirm
+void DeblockLumaLt4V_c( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
+void DeblockLumaEq4V_c( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+void DeblockLumaLt4H_c( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
+void DeblockLumaEq4H_c( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+
+void DeblockChromaLt4V_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
+void DeblockChromaEq4V_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+
+void DeblockChromaLt4H_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
+void DeblockChromaEq4H_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+
+// "_sse2" deblocking routines: C linkage, declared only when X86_ASM is defined (NOTE(review): presumably implemented in assembly - confirm)
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+#ifdef  X86_ASM
+void DeblockLumaLt4V_sse2( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc );
+void DeblockLumaEq4V_sse2( uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
+void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
+void DeblockLumaLt4H_sse2(uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc);
+void DeblockLumaEq4H_sse2(uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+#endif//X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+// NOTE(review): presumably installs the deblocking routines into pFunc according to the CPU flags iCpu - confirm
+void DeblockingInit( DeblockingFunc  * pFunc,  int32_t iCpu );
+// Non-zero-count helpers. NOTE(review): WelsBlockFuncInit presumably selects *pfSetNZCZero according to iCpu - confirm
+void WelsNonZeroCount_c(int8_t * pNonZeroCount);
+void WelsBlockFuncInit(PSetNoneZeroCountZeroFunc *pfSetNZCZero,  int32_t iCpu);
+// Deblocking entry point that takes the encoder context
+void PerformDeblockingFilter( sWelsEncCtx *pEnc );
+// Deblocking on a DQ layer: whole frame, or the slice with index kiSliceIdx (NOTE(review): inferred from names - confirm)
+void DeblockingFilterFrameAvcbase( SDqLayer *pCurDq, SWelsFuncPtrList *pFunc );
+void DeblockingFilterSliceAvcbase( SDqLayer *pCurDq, SWelsFuncPtrList *pFunc, const int32_t kiSliceIdx );
+}
+
+#endif
+
+
--- /dev/null
+++ b/codec/encoder/core/inc/decode_mb_aux.h
@@ -1,0 +1,75 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef DECODE_MB_AUX_H
+#define DECODE_MB_AUX_H
+
+#include "typedefs.h"
+#include "macros.h"
+#include "wels_func_ptr_def.h"
+
+namespace WelsSVCEnc {
+void WelsDequantLumaDc4x4(int16_t *pRes, const int32_t kiQp);	// luma DC 4x4 dequantization with QP kiQp (inferred from name - confirm)
+void WelsIHadamard4x4Dc(int16_t* pRes);	// inverse 4x4 Hadamard on DC coefficients (inferred from name - confirm)
+// Setup helpers. NOTE(review): presumably fill pList per iCpuFlags, and write block offsets for the given strides into pBlock - confirm
+void WelsInitReconstructionFuncs( SWelsFuncPtrList *pList, uint32_t  iCpuFlags );
+void WelsGetEncBlockStrideOffset(int32_t *pBlock, const int32_t kiStrideY, const int32_t kiStrideUV);
+// Plain C dequantization routines
+void WelsDequantFour4x4_c(int16_t *pRes, const uint16_t* kpQpTable);
+void WelsDequant4x4_c(int16_t *pRes, const uint16_t* kpQpTable);
+void WelsDequantIHadamard4x4_c(int16_t *pRes, const uint16_t kuiMF);
+void WelsDequantIHadamard2x2Dc( int16_t* pDct, const uint16_t kuiMF);
+// Inverse DCT + reconstruction routines (NOTE(review): presumably prediction plus residual written to pRec/pDst - confirm)
+void WelsIDctT4RecOnMb(uint8_t* pDst, int32_t iDstStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct, PIDctFunc pfIDctFourT4);
+void WelsIDctT4Rec_c( uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct );
+void WelsIDctFourT4Rec_c( uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct );
+void WelsIDctRecI16x16Dc_c(uint8_t *pRec, int32_t iStride, uint8_t *pPred, int32_t iPredStride, int16_t *pDctDc);
+
+// "_sse2" / "_mmx" counterparts of the C routines: C linkage, declared only when X86_ASM is defined
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+#if defined(X86_ASM)
+void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* kpMF);
+void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* kpMF);
+void WelsDequantIHadamard4x4_sse2(int16_t *pRes, const uint16_t kuiMF);
+
+void WelsIDctT4Rec_mmx( uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct );
+void WelsIDctFourT4Rec_sse2( uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct );
+void WelsIDctRecI16x16Dc_sse2(uint8_t *pRec, int32_t iStride, uint8_t *pPrediction, int32_t iPredStride, int16_t *pDctDc);
+#endif//X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+}
+#endif
--- /dev/null
+++ b/codec/encoder/core/inc/dq_map.h
@@ -1,0 +1,56 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	dq_map.h
+ *
+ * \brief	Dependency Quality layer IDC mapping for cross layer selection and jumping.
+ *			DQ layer idc map for svc encoding; possibly a better scheme than the earlier design,
+ *			as it is aware of the idc of the referencing layer and of the successive layer to be coded
+ *
+ * \date	4/22/2009 Created
+ *
+ *************************************************************************************
+ */
+#if !defined(WELS_ENCODER_DEPENDENCY_QUAILITY_IDC_MAP_H__)
+#define WELS_ENCODER_DEPENDENCY_QUAILITY_IDC_MAP_H__
+
+/*
+ *	Dependency Quality (DQ) IDC entry: PPS id, SPS id and spatial id
+ */
+
+typedef struct TagDqIdc
+{
+	uint16_t	iPpsId;			// PPS id (NOTE(review): "i" prefix on an unsigned field)
+	uint8_t	iSpsId;			// SPS id (NOTE(review): "i" prefix on an unsigned field)
+	int8_t		uiSpatialId;	// spatial id (NOTE(review): "ui" prefix on a signed field - confirm the intended signedness)
+}SDqIdc;
+
+#endif//WELS_ENCODER_DEPENDENCY_QUAILITY_IDC_MAP_H__
--- /dev/null
+++ b/codec/encoder/core/inc/encode_mb_aux.h
@@ -1,0 +1,134 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef ENCODE_MB_AUX_H
+#define ENCODE_MB_AUX_H
+
+#include "typedefs.h"
+#include "wels_func_ptr_def.h"
+
+namespace WelsSVCEnc {
+// Plain C scan / Hadamard / quantization / DCT / copy routines of the encoder (declarations only)
+void WelsInitEncodingFuncs( SWelsFuncPtrList *pFuncList, uint32_t  uiCpuFlag );	// NOTE(review): presumably fills pFuncList according to uiCpuFlag - confirm
+int32_t WelsGetNoneZeroCount_c(int16_t* pLevel);	// NOTE(review): presumably the number of non-zero coefficients in pLevel - confirm
+
+/****************************************************************************
+ * Scan and Score functions
+ ****************************************************************************/
+void	WelsScan4x4Ac_c( int16_t* pZigValue, int16_t* pDct );
+void	WelsScan4x4Dc( int16_t* pLevel, int16_t* pDct );
+void	WelsScan4x4DcAc_c( int16_t* pLevel, int16_t *pDct );
+int32_t		WelsCalculateSingleCtr4x4_c( int16_t *pDct);
+
+/****************************************************************************
+ * Hadamard (HDM) and Quant functions
+ ****************************************************************************/
+void WelsHadamardT4Dc_c( int16_t *pLumaDc, int16_t *pDct);
+int32_t WelsHadamardQuant2x2_c(int16_t *pRes, const int16_t kiFF, int16_t iMF, int16_t * pDct, int16_t * pBlock);
+int32_t WelsHadamardQuant2x2Skip_c(int16_t *pRes, int16_t iFF,  int16_t iMF);
+
+void WelsQuant4x4_c(int16_t *pDct, int16_t* pFF,  int16_t *pMF);
+void WelsQuant4x4Dc_c(int16_t *pDct, int16_t iFF,  int16_t iMF);
+void WelsQuantFour4x4_c(int16_t *pDct, int16_t* pFF,  int16_t *pQpTable);
+void WelsQuantFour4x4Max_c(int16_t *pDct, int16_t* pF,  int16_t *pQpTable, int16_t *pMax);
+
+/****************************************************************************
+ * DCT functions
+ ****************************************************************************/
+void WelsDctT4_c( int16_t *pDct, uint8_t *pPixel1, int32_t iStride1, uint8_t *pPixel2, int32_t iStride2 );
+// legacy note: "dct_data is unused here, kept for interface parity with the dct_save functions" - NOTE(review): no dct_data parameter exists in these prototypes
+void WelsDctFourT4_c(int16_t *pDct, uint8_t *pPixel1, int32_t iStride1, uint8_t *pPixel2, int32_t iStride2);
+
+/****************************************************************************
+ * Copy functions
+ ****************************************************************************/
+void WelsCopy4x4( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
+void WelsCopy8x8_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
+void WelsCopy8x16_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
+void WelsCopy16x8_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
+void WelsCopy16x16_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
+
+// "_mmx" / "_sse2" / "_ssse3" counterparts of the C routines: C linkage, declared only when X86_ASM is defined
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#ifdef X86_ASM
+
+int32_t WelsGetNoneZeroCount_sse2(int16_t* pLevel);
+
+/****************************************************************************
+ * Scan and Score functions
+ ****************************************************************************/
+void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct );
+void WelsScan4x4DcAc_ssse3( int16_t* pLevel, int16_t *pDct );
+void WelsScan4x4DcAc_sse2( int16_t* pLevel, int16_t *pDct );
+int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
+
+/****************************************************************************
+ * DCT functions
+ ****************************************************************************/
+void WelsDctT4_mmx( int16_t *pDct,  uint8_t *pPixel1, int32_t iStride1, uint8_t *pPixel2, int32_t iStride2 );
+void WelsDctFourT4_sse2(int16_t *pDct, uint8_t *pPixel1, int32_t iStride1, uint8_t *pPixel2, int32_t iStride2);
+
+/****************************************************************************
+ * Hadamard (HDM) and Quant functions
+ ****************************************************************************/
+int32_t WelsHadamardQuant2x2_mmx(int16_t *pRes, const int16_t kiFF, int16_t iMF, int16_t * pDct, int16_t * pBlock);
+void WelsHadamardT4Dc_sse2( int16_t *pLumaDc, int16_t *pDct);
+int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pRes, int16_t iFF,  int16_t iMF);
+
+void WelsQuant4x4_sse2(int16_t *pDct, int16_t* pFF,  int16_t *pMF);
+void WelsQuant4x4Dc_sse2(int16_t *pDct,  int16_t iFF, int16_t iMF);
+void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* pFF,  int16_t *pMF);
+void WelsQuantFour4x4Max_sse2(int16_t *pDct, int16_t* pFF,  int16_t *pMF, int16_t *pMax);
+
+
+/****************************************************************************
+ * Copy functions for rec
+ ****************************************************************************/
+void WelsCopy8x8_mmx( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
+void WelsCopy8x16_mmx( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );	
+void WelsCopy16x8NotAligned_sse2( uint8_t* Dst, int32_t  iStrideD, uint8_t* Src,int32_t  iStrideS );	
+void WelsCopy16x16_sse2( uint8_t* Dst, int32_t  iStrideD, uint8_t* Src,int32_t  iStrideS );
+void WelsCopy16x16NotAligned_sse2( uint8_t* Dst, int32_t  iStrideD, uint8_t* Src,int32_t  iStrideS );
+#endif//X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+__align16(extern int16_t, g_kiQuantInterFF[58][8] );	// inter quantization FF table: 58 rows of 8 values (declared here, defined elsewhere)
+#define g_iQuantIntraFF (g_kiQuantInterFF +6 )	// intra FF table aliases g_kiQuantInterFF starting at row 6 (leaving 52 rows)
+__align16(extern int16_t, g_kiQuantMF[52][8]) ;	// quantization MF table: 52 rows of 8 values (declared here, defined elsewhere)
+}
+#endif//ENCODE_MB_AUX_H
--- /dev/null
+++ b/codec/encoder/core/inc/encoder.h
@@ -1,0 +1,131 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	encoder.h
+ *
+ * \brief	core encoder
+ *
+ * \date	5/14/2009
+ *
+ *************************************************************************************
+ */
+#if !defined(WELS_CORE_ENCODER_H__)
+#define WELS_CORE_ENCODER_H__
+
+#include "encoder_context.h"
+
+namespace WelsSVCEnc {
+/*!
+ * \brief	request specific memory for SVC
+ * \param	ppCtx		sWelsEncCtx**
+ * \return	successful - 0; otherwise non-zero for failed
+ */
+int32_t RequestMemorySvc( sWelsEncCtx **ppCtx );
+
+/*!
+ * \brief	free memory	in SVC core encoder
+ * \param	ppCtx		sWelsEncCtx**
+ * \return	none
+ */
+void FreeMemorySvc( sWelsEncCtx **ppCtx);
+
+/*!
+ * \brief	initialize function pointers that potentially used in Wels encoding
+ * \param	pFuncList	SWelsFuncPtrList*, function pointer list to initialize
+ * \param	_param		SWelsSvcCodingParam*, SVC coding parameter
+ * \param	uiCpuFlag	uint32_t, CPU flags (NOTE(review): presumably used to pick implementations - confirm)
+ * \return	successful - 0; otherwise non-zero for failed
+ */
+int32_t InitFunctionPointers( SWelsFuncPtrList *pFuncList, SWelsSvcCodingParam *_param, uint32_t  uiCpuFlag );
+
+/*!
+ * \brief	initialize frame coding	
+ */
+void InitFrameCoding( sWelsEncCtx *pEncCtx, const EFrameType keFrameType );
+
+/*!
+ * \brief	decide frame type (NOTE(review): presumably IDR vs. P frame - confirm)
+ * \param	kiSpatialNum	int8_t (NOTE(review): presumably the number of spatial layers - confirm)
+ * \return	EFrameType
+ */
+EFrameType DecideFrameType( sWelsEncCtx *pEncCtx, const int8_t kiSpatialNum );
+
+/*!
+ * \brief	Dump reconstruction for dependency layer
+ */
+extern "C" void DumpDependencyRec( SPicture *pSrcPic, const str_t *kpFileName, const int8_t kiDid );
+
+/*!
+ * \brief	Dump the reconstruction pictures
+ */
+void DumpRecFrame( SPicture *pSrcPic, const str_t *kpFileName );
+
+
+/*!
+ * \brief	encode overall slices data in a frame
+ * \param	pEncCtx			sWelsEncCtx*, encoder context
+ * \param	kiSliceNumCount		count number of slices in a frame
+ * \param	keNalType			EWelsNalUnitType for a frame
+ * \param	keNalIdc			EWelsNalRefIdc for a frame
+ * \return	successful - 0; otherwise non-zero for failed
+ */
+int32_t EncodeFrame(	sWelsEncCtx *pEncCtx,
+					const int32_t kiSliceNumCount,
+					const EWelsNalUnitType keNalType,
+					const EWelsNalRefIdc keNalIdc	);
+
+
+/**********************************************************************************
+ * memzero Function 
+***********************************************************************************/
+void WelsSetMemZero_c(void *pDst, int32_t iSize);	// confirmed_safe_unsafe_usage
+
+// "_sse2" / "_mmx" memory-zeroing and prefetch routines: C linkage, declared only when X86_ASM is defined
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+#ifdef X86_ASM
+void WelsSetMemZeroAligned64_sse2(void *pDst, int32_t iSize);
+void WelsSetMemZeroSize64_mmx(void *pDst, int32_t iSize);
+void WelsSetMemZeroSize8_mmx(void *pDst, int32_t iSize);
+void WelsPrefetchZero_mmx(int8_t const*kpDst);
+#endif//X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+/**********************************************************************************
+ * Function points type
+***********************************************************************************/
+}
+
+#endif//WELS_CORE_ENCODER_H__
--- /dev/null
+++ b/codec/encoder/core/inc/encoder_context.h
@@ -1,0 +1,222 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	encoder_context.h
+ *
+ * \brief	Main pData to be operated over Wels encoder all modules
+ *
+ * \date	2/4/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_ENCODER_CONTEXT_H__
+#define WELS_ENCODER_CONTEXT_H__
+
+#include <stdio.h>
+#include "typedefs.h"
+#include "param_svc.h"
+#include "nal_encap.h"
+#include "picture.h"
+#include "dq_map.h"
+#include "stat.h"
+#include "macros.h"
+#include "rc.h"
+#include "as264_common.h"
+#include "wels_preprocess.h"
+#include "wels_func_ptr_def.h"
+
+#ifdef MT_ENABLED
+#include "mt_defs.h"	// for multiple threading
+#include "WelsThreadLib.h"
+#endif//MT_ENABLED
+
+namespace WelsSVCEnc {
+
+/*
+ *	reference list for each quality layer in SVC
+ */
+typedef struct TagRefList {
+	SPicture					*pShortRefList[1+MAX_SHORT_REF_COUNT];// reference list 0 - short (NOTE(review): presumably short-term references - confirm)
+	SPicture					*pLongRefList[1+MAX_LONG_REF_COUNT];	// reference list 1 - long (NOTE(review): presumably long-term references - confirm)
+	SPicture					*pNextBuffer;	// NOTE(review): presumably the picture buffer to be used next - confirm
+	SPicture					*pRef[1+MAX_REF_PIC_COUNT];	// plus 1 for swap intent
+	uint8_t						uiShortRefCount;	// NOTE(review): presumably the number of entries used in pShortRefList - confirm
+	uint8_t						uiLongRefCount;	// depends on the reference picture module
+} SRefList;
+
+typedef struct TagLTRState{	// long term reference (LTR) state: mark feedback, recovery reference and marking bookkeeping
+	// LTR mark feedback
+	uint32_t		    		uiLtrMarkState;	// LTR mark state, indicate whether there is a LTR mark feedback unsolved
+	int32_t						iLtrMarkFbFrameNum;// the unsolved LTR mark feedback, the marked iFrameNum feedback from decoder
+
+	// LTR used as recovery reference
+	int32_t						iLastRecoverFrameNum; // reserve the last LTR or IDR recover iFrameNum
+	int32_t						iLastCorFrameNumDec; // reserved the last correct position in decoder side, use to select valid LTR to recover or to decide the LTR mark validation
+	int32_t						iCurFrameNumInDec; // current iFrameNum in decoder side, use to select valid LTR to recover or to decide the LTR mark validation
+
+	// LTR mark
+	int32_t						iLTRMarkMode; // direct mark or delay mark
+	int32_t						iLTRMarkSuccessNum; //successful marked num, for mark mode switch
+	int32_t						iCurLtrIdx;// current long term reference index to mark
+	int32_t						iLastLtrIdx;	// NOTE(review): presumably the previously used long term reference index - confirm
+	uint32_t					uiLtrMarkInterval;// the interval from the last long term reference mark	
+		
+	bool_t						bLTRMarkingFlag;	//decide whether current frame marked as LTR
+	bool_t						bLTRMarkEnable; //when LTR is confirmed and the interval is no smaller than the marking period
+	bool_t						bReceivedT0LostFlag;	// indicate whether a t0 lost feedback is received, for LTR recovery
+}SLTRState;
+
+typedef struct TagSpatialPicIndex{	// pairs a source picture with its dependency id
+	SPicture	*pSrc;	// I420 based and after color space converted
+	int32_t		iDid;	// dependency id
+} SSpatialPicIndex;
+
+typedef struct TagStrideTables {	// per-dependency-layer lookup tables: block offsets and MB index maps
+	int32_t		*pStrideDecBlockOffset[MAX_DEPENDENCY_LAYER][2];	// [iDid][tid==0][24 x 4]: luma+chroma= 24 x 4
+	int32_t		*pStrideEncBlockOffset[MAX_DEPENDENCY_LAYER];		// [iDid][24 x 4]: luma+chroma= 24 x 4
+	int16_t		*pMbIndexX[MAX_DEPENDENCY_LAYER];					// [iDid][iMbX]: map for iMbX in each spatial layer coding
+	int16_t		*pMbIndexY[MAX_DEPENDENCY_LAYER];					// [iDid][iMbY]: map for iMbY in each spatial layer coding
+} SStrideTables;
+
+typedef struct TagWelsEncCtx{	// encoder context: the main data operated on by all Wels encoder modules
+	// Input	
+	SWelsSvcCodingParam		*pSvcParam;	// SVC parameter, WelsSVCParamConfig in svc_param_settings.h		
+	SWelsSliceBs			 *pSliceBs;		// bitstream buffering for various slices, [uiSliceIdx]	
+
+	int32_t					*pSadCostMb;	// NOTE(review): presumably the SAD cost of each MB - confirm
+	/* MVD cost tables for Inter MB */
+	uint16_t					*pMvdCostTableInter; //[52];	// adaptive to spatial layers
+	SMVUnitXY					*pMvUnitBlock4x4;	// (*pMvUnitBlock4x4[2])[MB_BLOCK4x4_NUM];	    // stores each 4x4 block's mv unit, the two swap after different d layer
+	int8_t						*pRefIndexBlock4x4;	// (*pRefIndexBlock4x4[2])[MB_BLOCK8x8_NUM];	    // stores each 4x4 block's reference index, the two swap after different d layer
+	int8_t                      *pNonZeroCountBlocks;	// (*pNonZeroCountBlocks)[MB_LUMA_CHROMA_BLOCK4x4_NUM];
+	int8_t                      *pIntra4x4PredModeBlocks;	// (*pIntra4x4PredModeBlocks)[INTRA_4x4_MODE_NUM];  //last byte is not used; the first 4 byte is for the bottom 12,13,14,15 4x4 block intra mode, and 3 byte for (3,7,11)
+	
+	SMB                          **ppMbListD;	// [MAX_DEPENDENCY_LAYER];
+	SStrideTables				*pStrideTab;	// stride tables for internal coding used
+	SWelsFuncPtrList			*pFuncList;	// function pointer list
+
+#if defined(MT_ENABLED)
+	SSliceThreading				*pSliceThreading;	// slice threading state, only present in MT_ENABLED builds
+#endif//MT_ENABLED
+
+	// SSlice context
+	SSliceCtx				*pSliceCtxList;// slice context table for each dependency quality layer
+	// pointers
+	SPicture					*pEncPic;			// pointer to current picture to be encoded
+	SPicture					*pDecPic;			// pointer to current picture being reconstructed
+	SPicture					*pRefPic;			// pointer to current reference picture	
+
+	SDqLayer					*pCurDqLayer;				// DQ layer context used to being encoded currently, for reference base layer to refer: pCurDqLayer->pRefLayer if applicable	
+	SDqLayer					**ppDqLayerList;			// overall DQ layers encoded for storage	
+
+	SRefList					**ppRefPicListExt;		// reference picture list for SVC
+	SPicture					*pRefList0[16];	// NOTE(review): presumably reference list 0 of the frame being coded - confirm
+	SLTRState					*pLtr;//[MAX_DEPENDENCY_LAYER];	
+	
+	// Derived
+	int32_t						iCodingIndex;
+	int32_t						iFrameIndex;			// count how many frames elapsed during coding context currently
+	uint32_t					uiFrameIdxRc;           //only for RC
+	int32_t						iFrameNum;				// current frame number coding
+	int32_t						iPOC;					// frame POC
+	EWelsSliceType				eSliceType;			// currently coding slice type
+	EWelsNalUnitType			eNalType;			// NAL type
+	EWelsNalRefIdc				eNalPriority;		// NAL_Reference_Idc currently
+	EWelsNalRefIdc				eLastNalPriority;	// NAL_Reference_Idc in last frame		
+	uint8_t						iNumRef0;	// NOTE(review): presumably the number of entries used in pRefList0 - confirm ("i" prefix on an unsigned field)
+
+	uint8_t						uiDependencyId;	// Idc of dependency layer to be coded
+	uint8_t						uiTemporalId;	// Idc of temporal layer to be coded
+	bool_t						bNeedPrefixNalFlag;	// whether add prefix nal	
+	bool_t                      bEncCurFrmAsIdrFlag;  
+
+	// Rate control routine	
+	SWelsSvcRc					*pWelsSvcRc;
+	int32_t						iSkipFrameFlag; //_GOM_RC_
+	int32_t						iGlobalQp;		// global qp
+
+	// VAA	
+	SVAAFrameInfo			    *pVaa;		    // VAA information of reference
+	CWelsPreProcess				*pVpp;	
+
+	SWelsSPS							*pSpsArray;		// MAX_SPS_COUNT by standard compatible
+	SWelsSPS							*pSps;
+	SWelsPPS							*pPPSArray;		// MAX_PPS_COUNT by standard compatible
+	SWelsPPS							*pPps;
+	/* SVC only */
+	SSubsetSps					*pSubsetArray;	// MAX_SPS_COUNT by standard compatible
+	SSubsetSps					*pSubsetSps;
+	int32_t						iSpsNum;	// number of SPS used
+	int32_t						iPpsNum;	// number of PPS used
+
+	// Output
+	SWelsEncoderOutput			*pOut;			// for NAL raw data (need allocating memory for sNalList internal)
+	uint8_t						*pFrameBs;		// bitstream buffer of all NALs in a frame
+	int32_t						iFrameBsSize;	// count size of frame bs in bytes allocated
+	int32_t						iPosBsBuffer;	// current writing position of frame bs buffer
+	
+	/* For Downsampling & VAA I420 based source pictures */	
+	SPicture					*pSpatialPic[MAX_DEPENDENCY_LAYER][MAX_TEMPORAL_LEVEL+1+LONG_TERM_REF_NUM];	// need memory requirement with total number of (log2(uiGopSize)+1+1+long_term_ref_num)
+
+	SSpatialPicIndex			sSpatialIndexMap[MAX_DEPENDENCY_LAYER];
+	uint8_t						uiSpatialLayersInTemporal[MAX_DEPENDENCY_LAYER];
+
+	uint8_t                     uiSpatialPicNum[MAX_DEPENDENCY_LAYER];
+    bool_t						bLongTermRefFlag[MAX_DEPENDENCY_LAYER][MAX_TEMPORAL_LEVEL+1/*+LONG_TERM_REF_NUM*/];
+
+	int16_t						iMaxSliceCount;// maximal count number of slices for all layers observation
+	int16_t						iActiveThreadsNum;	// number of threads active so far
+	
+	/*
+	 * DQ layer idc map for svc encoding; possibly a better scheme than the earlier design,
+	 * as it is aware of the idc of the referencing layer and of the successive layer to be coded
+	 */
+	/* SVC only */
+	SDqIdc						*pDqIdcMap;	// overall DQ map of full scalability in specific frame (All full D/T/Q layers involved); pDqIdcMap[dq_index] for each SDqIdc data
+
+	SParaSetOffset				sPSOVector;	
+	CMemoryAlign				*pMemAlign;
+
+#ifdef ENABLE_TRACE_FILE
+	FILE						*pFileLog;		// log file for wels encoder
+	uint32_t					uiSizeLog;		// size of log have been written in file
+
+#endif//ENABLE_TRACE_FILE
+
+#if defined(STAT_OUTPUT)	
+	// overall stat data, refer to SStatData in stat.h, in case avc to use stat[0][0]
+	SStatData					sStatData [ MAX_DEPENDENCY_LAYER ] [ MAX_QUALITY_LEVEL ];
+	SStatSliceInfo				sPerInfo;
+#endif//STAT_OUTPUT	
+
+}sWelsEncCtx/*, *PWelsEncCtx*/;
+}
+#endif//WELS_ENCODER_CONTEXT_H__
--- /dev/null
+++ b/codec/encoder/core/inc/expand_pic.h
@@ -1,0 +1,76 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file		expand_pic.h
+ *
+ * \brief		Interface for expanding reconstructed picture to be used for reference
+ *
+ * \date		06/08/2009
+ *************************************************************************************
+ */
+
+#ifndef EXPAND_PIC_H
+#define EXPAND_PIC_H
+
+#include "typedefs.h"
+#include "picture.h"
+
+namespace WelsSVCEnc {
+typedef void (*PExpandPictureFunc)( uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH );	// same parameter list as the ExpandPicture*_sse2 routines declared below
+// pExpLuma: expander used for luma; pExpChrom[2]: two chroma expanders. NOTE(review): the two chroma slots presumably hold the aligned/unaligned variants -- confirm against the implementation.
+void ExpandReferencingPicture( SPicture *pPic, PExpandPictureFunc pExpLuma, PExpandPictureFunc pExpChrom[2] );
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined(X86_ASM)
+void ExpandPictureLuma_sse2(	uint8_t *pDst,
+								const int32_t kiStride,
+								const int32_t kiPicW,
+								const int32_t kiPicH	);
+void ExpandPictureChromaAlign_sse2(	uint8_t *pDst,
+									const int32_t kiStride,
+									const int32_t kiPicW,
+									const int32_t kiPicH	);
+void ExpandPictureChromaUnalign_sse2(	uint8_t *pDst,
+									const int32_t kiStride,
+									const int32_t kiPicW,
+									const int32_t kiPicH	);
+#endif//X86_ASM
+	
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+void InitExpandPictureFunc( void *pL, const uint32_t kuiCPUFlags );
+}
+#endif//EXPAND_PIC_H
--- /dev/null
+++ b/codec/encoder/core/inc/extern.h
@@ -1,0 +1,119 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	extern.h
+ *
+ * \brief	extern interfaces between core and plus of wels encoder
+ *
+ * \date	4/21/2009 Created
+ *
+ *************************************************************************************
+ */
+#if !defined(WELS_ENCODER_EXTERN_H__)
+#define WELS_ENCODER_EXTERN_H__
+
+#include "typedefs.h"
+#include "encoder_context.h"
+
+namespace WelsSVCEnc {
+
+//#pragma pack()
+
+/*!
+ * \brief	initialize source picture body
+ * \param	kpSrc		SSourcePicture* (passed as const void*)
+ * \param	kiCsp		internal csp format
+ * \param	kiWidth		width of picture in pixels
+ * \param	kiHeight	height of picture in pixels
+ * \return	0 on success; non-zero on failure
+ */
+int32_t InitPic( const void *kpSrc, const int32_t kiCsp, const int32_t kiWidth, const int32_t kiHeight );
+
+/*
+ *	SVC core encoder external interfaces
+ */
+
+/*!
+ * \brief	validity checking of the parameter configuration
+ * \param	pParam		SWelsSvcCodingParam* (passed as void*)
+ * \return	0 on success; non-zero on failure
+ */
+int32_t ParamValidationExt( void *pParam );
+
+// GOM based RC related for uiSliceNum decision
+void GomValidCheck(const int32_t kiMbWidth, const int32_t kiMbHeight, int32_t *pSliceNum);
+
+/*!
+ * \brief	initialize Wels avc encoder core library
+ * \param	ppCtx		sWelsEncCtx**
+ * \param	pPara		SWelsSvcCodingParam*
+ * \return	0 on success; non-zero on failure
+ */
+int32_t WelsInitEncoderExt( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pPara );
+
+/*!
+ * \brief	uninitialize Wels encoder core library
+ * \param	ppCtx		sWelsEncCtx** (address of the context pointer)
+ * \return	none
+ */
+void WelsUninitEncoderExt( sWelsEncCtx **ppCtx );
+
+/*!
+ * \brief	core svc encoding process
+ *
+ * \param	(1st, unnamed)	sWelsEncCtx*, encoder context
+ * \param	pDst			FrameBSInfo* (passed as void*)
+ * \param	kppSrcList		SSourcePicture** source picture list
+ * \param	kiConfiguredLayerNum	originally documented as "=1 in case need_ds = true or >1 in case need_ds = false"
+ * \note	need_ds (whether down sampling is desired) is NOT a parameter of this prototype.
+ *			NOTE(review): the need_ds remarks look stale -- confirm against the implementation
+ * \return	EFrameType (WELS_FRAME_TYPE_IDR/WELS_FRAME_TYPE_I/WELS_FRAME_TYPE_P)
+ */
+int32_t WelsEncoderEncodeExt( sWelsEncCtx *, void *pDst, const SSourcePicture **kppSrcList, const int32_t kiConfiguredLayerNum );
+
+/*
+ * Force coding IDR as follows
+ */
+int32_t ForceCodingIDR( sWelsEncCtx *pCtx );
+
+/*!
+ * \brief	Wels SVC encoder parameters adjustment
+ *			SVC adjustment results in new requirement in memory blocks adjustment
+ */
+int32_t WelsEncoderParamAdjust( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pNew );
+
+int32_t FilterLTRRecoveryRequest(sWelsEncCtx *pCtx,SLTRRecoverRequest* pLTRRecoverRequest);
+
+void FilterLTRMarkingFeedback(sWelsEncCtx *pCtx,SLTRMarkingFeedback* pLTRMarkingFeedback);
+}
+
+#endif//WELS_ENCODER_EXTERN_H__
+
--- /dev/null
+++ b/codec/encoder/core/inc/get_intra_predictor.h
@@ -1,0 +1,129 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	get_intra_predictor.h
+ *
+ * \brief	interfaces for get intra predictor about 16x16, 4x4, chroma.
+ *
+ * \date	4/2/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef GET_INTRA_PREDICTOR_H
+#define GET_INTRA_PREDICTOR_H
+
+#include "typedefs.h"
+#include "wels_func_ptr_def.h"
+
+namespace WelsSVCEnc {
+void WelsI4x4LumaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDcLeft_c(uint8_t *pPred, uint8_t *pRef,  const int32_t kiStride);
+void WelsI4x4LumaPredDcTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDcNA_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+
+void WelsI4x4LumaPredDDL_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDDLTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDDR_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+
+void WelsI4x4LumaPredVR_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredHD_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredVL_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredVLTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredHU_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+
+
+void WelsIChormaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);	// NOTE: "Chorma" in these names is a misspelling of "Chroma" (the X86_ASM variants below are spelled WelsIChromaPred*); left as is because the names are the interface
+void WelsIChormaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsIChormaPredPlane_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsIChormaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsIChormaPredDcLeft_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsIChormaPredDcTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsIChormaPredDcNA_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+
+void WelsI16x16ChormaPredVer(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);	// same "Chorma" spelling; NOTE(review): unlike its neighbours this pair carries no _c suffix
+void WelsI16x16ChormaPredHor(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+
+void WelsI16x16LumaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI16x16LumaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI16x16LumaPredPlane_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI16x16LumaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI16x16LumaPredDcLeft_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI16x16LumaPredDcTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI16x16LumaPredDcNA_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined(X86_ASM)
+void WelsFillingPred8to16_mmx( uint8_t *pPred, uint8_t *pValue );
+void WelsFillingPred8x2to16_mmx( uint8_t *pPred, uint8_t *pValue );
+void WelsFillingPred1to16_mmx( uint8_t *pPred, const uint8_t kuiValue );
+void WelsFillingPred8x2to16_sse2( uint8_t *pPred, uint8_t *pValue );
+void WelsFillingPred1to16_sse2( uint8_t *pPred, const uint8_t kuiValue );
+
+//for intra-prediction ASM functions
+void WelsI16x16LumaPredV_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI16x16LumaPredH_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI16x16LumaPredDc_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI16x16LumaPredPlane_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+
+void WelsIChromaPredH_mmx(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsIChromaPredV_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsIChromaPredDc_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsIChromaPredPlane_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+
+void WelsI4x4LumaPredV_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredH_sse2(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDc_sse2(uint8_t *pPred,uint8_t *pRef,const int32_t kiStride);
+void WelsI4x4LumaPredDDL_mmx(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredDDR_mmx(uint8_t *pPred,uint8_t *pRef,const int32_t kiStride);
+void WelsI4x4LumaPredVR_mmx(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredHD_mmx(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredVL_mmx(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+void WelsI4x4LumaPredHU_mmx(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride);
+#endif//X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+void WelsInitFillingPredFuncs( const uint32_t kuiCpuFlag );
+void WelsInitIntraPredFuncs( SWelsFuncPtrList *pFuncList, const uint32_t kuiCpuFlag );
+
+//#pragma pack()
+}
+#endif //GET_INTRA_PREDICTOR_H
+
+
--- /dev/null
+++ b/codec/encoder/core/inc/ls_defines.h
@@ -1,0 +1,97 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef ___LD_ST_MACROS___
+#define ___LD_ST_MACROS___
+
+#include <string.h>
+#include "typedefs.h"
+
+#ifdef __GNUC__	// GCC-compatible compilers: 16/32-bit accesses go through packed structs, 64-bit ones through memcpy
+
+	struct tagUnaligned_64 { uint64_t l; } __attribute__((packed));	// in this header only the commented-out 64-bit macros refer to it
+	struct tagUnaligned_32 { uint32_t l; } __attribute__((packed));
+	struct tagUnaligned_16 { uint16_t l; } __attribute__((packed));
+	
+	#define LD16(a) (((struct tagUnaligned_16 *) (a))->l)	// load 16 bits from address a
+	#define LD32(a) (((struct tagUnaligned_32 *) (a))->l)	// load 32 bits from address a
+	//#define LD64(a) (((struct tagUnaligned_64 *) (a))->l)
+        inline uint64_t LD64(const void * a)	// load 64 bits from address a (a function replaces the macro form above)
+		{
+			uint64_t v;
+			memcpy(&v, a, sizeof(v));	// confirmed_safe_unsafe_usage
+			return v;
+		}
+	//#define _USE_STRUCT_INT_CVT
+//	#ifdef _USE_STRUCT_INT_CVT
+		#define ST16(a, b) (((struct tagUnaligned_16 *) (a))->l) = (b)	// store 16-bit b at address a; expands to a bare assignment, so use it as a full statement
+		#define ST32(a, b) (((struct tagUnaligned_32 *) (a))->l) = (b)	// store 32-bit b at address a; same caveat as ST16
+		//#define ST64(a, b) (((struct tagUnaligned_64 *) (a))->l) = (b)                               
+                inline void ST64(void * a, uint64_t b)	// store 64-bit b at address a (a function replaces the macro form above)
+				{
+					memcpy(a, &b, sizeof(b));	// confirmed_safe_unsafe_usage
+				}
+//	#else
+//		inline void __ST16(void *dst, uint16_t v) { memcpy(dst, &v, 2); }
+//		inline void __ST32(void *dst, uint32_t v) { memcpy(dst, &v, 4); }
+		//inline void __ST64(void *dst, uint64_t v) { memcpy(dst, &v, 8); }
+//	#endif
+
+#else	// other compilers: plain pointer casts; NOTE(review): this presumes the target tolerates unaligned access through such casts -- confirm
+	
+//#define INTD16(a) (*((int16_t*)(a)))
+//#define INTD32(a) (*((int32_t*)(a)))
+//#define INTD64(a) (*((int64_t*)(a)))
+
+#define LD16(a) (*((uint16_t*)(a)))
+#define LD32(a) (*((uint32_t*)(a)))
+#define LD64(a) (*((uint64_t*)(a)))
+
+#define ST16(a, b) *((uint16_t*)(a)) = (b)
+#define ST32(a, b) *((uint32_t*)(a)) = (b)
+#define ST64(a, b) *((uint64_t*)(a)) = (b)
+
+#endif /* !__GNUC__ */
+
+#ifndef INTD16
+#define INTD16	LD16
+#endif//INTD16
+
+#ifndef INTD32
+#define INTD32	LD32
+#endif//INTD32
+
+#ifndef INTD64
+#define INTD64	LD64
+#endif//INTD64
+
+#endif
--- /dev/null
+++ b/codec/encoder/core/inc/macros.h
@@ -1,0 +1,417 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	macros.h
+ *
+ * \brief	MACRO based tool utilization
+ *
+ * \date	3/13/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_MACRO_UTILIZATIONS_H__
+#define WELS_MACRO_UTILIZATIONS_H__
+
+//#include <math.h>
+#include "typedefs.h"
+
+namespace WelsSVCEnc {
+#if defined(_MSC_VER)
+	#if _MSC_VER <= 1200
+		#define ALLOC_ALLIGN_MEM(name,size,type,alignment) \
+			type name##_storage[size+(alignment)-1]; \
+			type * name = (type *) (((int32_t) name##_storage+(alignment - 1)) & ~((int32_t)(alignment)-1))
+
+		#define ALLOC_ALLIGN_MEM_2(name,sizex,sizey,type,alignment) \
+			type name##_storage[(sizex)*(sizey)+(alignment)-1]; \
+			type * name = (type *) (((int32_t) name##_storage+(alignment - 1)) & ~((int32_t)(alignment)-1))
+	#else //_MSC_VER <= 1200
+		#define ALLOC_ALLIGN_MEM(name,size,type,alignment) \
+			__declspec(align(alignment)) type name[size]
+
+		#define ALLOC_ALLIGN_MEM_2(name,sizex,sizey,type,alignment) \
+		__declspec(align(alignment)) type name[(sizex)*(sizey)]
+	#endif//_MSC_VER <= 1200
+
+#elif defined(__GNUC__)
+
+	#define ALLOC_ALLIGN_MEM(name,size,type,alignment) \
+		type name[size] __attribute__((aligned(alignment)))
+	#define ALLOC_ALLIGN_MEM_2(name,sizex,sizey,type,alignment) \
+		type name[(sizex)*(sizey)] __attribute__((aligned(alignment)))
+		
+#endif//_MSC_VER
+
+
+#if defined(_MSC_VER)
+
+	#if(_MSC_VER < 1700)
+	#define inline	__inline  
+	#endif
+
+    #define __FASTCALL   __fastcall
+	#define ALIGNED_DECLARE( type, var, n ) __declspec(align(n)) type var
+	#define __align8(t,v) __declspec(align(8)) t v
+	#define __align16(t,v) __declspec(align(16)) t v
+#elif defined(__GNUC__)
+#if !defined(MAC_POWERPC)
+    #define __FASTCALL    __attribute__ ((fastcall))
+#else
+	#define __FASTCALL	// mean NULL for mac ppc
+#endif//MAC_POWERPC    
+	#define ALIGNED_DECLARE( type, var, n ) type var __attribute__((aligned(n)))
+	#define __align8(t,v) t v __attribute__ ((aligned (8)))
+	#define __align16(t,v) t v __attribute__ ((aligned (16)))
+#endif//_MSC_VER
+
+#if defined(_MACH_PLATFORM) || defined(__GNUC__)
+#define ALIGNED_DECLARE_MATRIX_2D(name,sizex,sizey,type,alignment) \
+	type name[(sizex)*(sizey)] __attribute__((aligned(alignment)))	// despite the name this declares a flat array of sizex*sizey elements
+#else // any other compiler: MSVC-style __declspec(align) syntax
+#define ALIGNED_DECLARE_MATRIX_2D(name,sizex,sizey,type,alignment) \
+__declspec(align(alignment)) type name[(sizex)*(sizey)]
+#endif//_MACH_PLATFORM || __GNUC__
+
+#if defined(_MACH_PLATFORM) || defined(__GNUC__)
+#define ALIGNED_DECLARE_MATRIX_1D(name,size,type,alignment) \
+	type name[size] __attribute__((aligned(alignment)))	// aligned array of size elements
+#else // any other compiler: MSVC-style __declspec(align) syntax
+#define ALIGNED_DECLARE_MATRIX_1D(name,size,type,alignment) \
+	__declspec(align(alignment)) type name[(size)]
+#endif//_MACH_PLATFORM || __GNUC__
+
+//#if !defined(SIZEOFRGB24)
+//#define SIZEOFRGB24(cx, cy)	(3 * (cx) * (cy))
+//#endif//SIZEOFRGB24
+
+//#if !defined(SIZEOFRGB32)
+//#define SIZEOFRGB32(cx, cy)	(4 * (cx) * (cy))
+//#endif//SIZEOFRGB32
+
+#ifndef	WELS_ALIGN
+#define WELS_ALIGN(x, n)	(((x)+(n)-1)&~((n)-1))	// rounds x up to a multiple of n; the mask arithmetic is only correct when n is a power of two
+#endif//WELS_ALIGN
+
+#ifndef WELS_MAX
+// Branchless maximum: -((x)<(y)) is an all-ones mask when x < y and 0 otherwise, so the XOR
+// turns x into y exactly when y is the larger value. Integer operands only; x and y are each evaluated more than once.
+#define WELS_MAX(x, y)	((x) ^ (((x)^(y))& -((x)<(y))))		// WELS_MAX(x, y)
+#endif//WELS_MAX
+
+#ifndef WELS_MIN
+// Branchless minimum: -((x)<(y)) is an all-ones mask when x < y and 0 otherwise, so the XOR
+// turns y into x exactly when x is the smaller value. Integer operands only; x and y are each evaluated more than once.
+#define WELS_MIN(x, y)	((y) ^ (((x)^(y))& -((x)<(y))))		// WELS_MIN(x, y)
+#endif//WELS_MIN
+
+#ifndef WELS_ROUND
+#define WELS_ROUND(x)	((int32_t)((x)+0.5f+EPSN))	// adds 0.5f+EPSN, then the cast truncates toward zero, so this rounds to nearest only for non-negative x; EPSN comes from outside this header
+#endif//WELS_ROUND
+
+static inline int32_t WELS_CEIL(float v)	// ceiling of v, tolerant to an excess of up to EPSN
+{
+	const int32_t kiWhole = (int32_t)v;	// integer part of v (the cast truncates toward zero)
+	return kiWhole + ((v > EPSN + kiWhole) ? 1 : 0);	// step up only when v exceeds the integer part by more than EPSN
+}
+
+static inline int32_t WELS_FLOOR(float v)	// NOTE: plain cast, i.e. truncation toward zero -- equals floor(v) only for v >= 0 or integral v
+{
+	return (int32_t)v;		
+}
+
+
+#define WELS_NON_ZERO_COUNT_AVERAGE(iC,iA,iB) {	/* iC = rounded-up mean of iA and iB, where the value -1 is treated as "no value" */ \
+	(iC) = (iA) + (iB) + 1;	/* with exactly one operand at -1 this already equals the other operand */ \
+	(iC) >>= (int32_t)( (iA) != -1 && (iB) != -1 );	/* halve only when neither operand is -1 */ \
+	(iC) += ( (iA) == -1 && (iB) == -1 );	/* both -1: the sum is -1, lift it to 0 */ \
+}	// arguments are parenthesized so expression arguments expand correctly; each argument is evaluated several times
+
+/*
+ * log base 2 of v and ceil/floor extension
+ */
+
+// Ceiling of log2(v) for v >= 1: the number of bits needed to represent v - 1.
+// NOTE: v == 0 wraps to 0xFFFFFFFF in the subtraction below and therefore yields 32.
+static inline int32_t WELS_CEILLOG2( uint32_t v )
+{
+	int32_t iBits = 0;
+	for( uint32_t uiRest = v - 1; uiRest != 0; uiRest >>= 1 )
+	{
+		++iBits;
+	}
+	return iBits;
+}
+
+// Floor of log2(v), i.e. the index of the highest set bit; returns 0 for both v == 0 and v == 1.
+static inline int32_t WELS_FLOORLOG2( uint32_t v )
+{
+	int32_t iBits = 0;
+	for( ; v > 1; v >>= 1 )
+	{
+		++iBits;
+	}
+	return iBits;
+}
+
+// Floor of log2(v): counts the right shifts needed until nothing is left above bit 0.
+// Returns 0 for both v == 0 and v == 1 (the same results WELS_FLOORLOG2 produces).
+static inline int32_t WELS_LOG2( uint32_t v )
+{
+	int32_t iShifts = 0;
+	for( v >>= 1; v != 0; v >>= 1 )
+		++iShifts;
+
+	return iShifts;
+}
+
+static inline BOOL_T WELS_POWER2_IF( uint32_t v )	// non-zero result when exactly one bit of v is set
+{
+	return ( (v != 0) && ((v & (v - 1)) == 0) );	// v & (v-1) clears the lowest set bit; zero is excluded explicitly
+}
+
+static inline int32_t WELS_MEDIAN(int32_t x,  int32_t y, int32_t z)	// branchless median of three; relies on >>31 producing a sign mask and on the differences not overflowing
+{
+	int32_t t = (x-y)&((x-y)>>31);	// t = x-y when x < y, else 0
+	x -= t;	// x now holds the larger of the original x, y
+	y += t;	// y now holds the smaller of the original x, y
+	y -= (y-z)&((y-z)>>31);	// y = max(y, z)
+	y += (x-y)&((x-y)>>31);	// y = min(x, y), which is the median
+	return y;
+}
+
+#ifndef BUTTERFLY1x2
+#define BUTTERFLY1x2(b) (((b)<<8) | (b))	// copies b into bits 15..8 as well as 7..0 (assumes b fits in 8 bits; no cast is applied)
+#endif//BUTTERFLY1x2
+
+#ifndef BUTTERFLY2x4
+#define BUTTERFLY2x4(wd) (((uint32_t)(wd)<<16) |(wd))	// copies wd into the upper and lower 16 bits of a 32-bit value (assumes wd fits in 16 bits)
+#endif//BUTTERFLY2x4
+
+#ifndef BUTTERFLY4x8
+#define BUTTERFLY4x8(dw) (((uint64_t)(dw)<<32) | (dw))	// copies dw into the upper and lower 32 bits of a 64-bit value (assumes dw fits in 32 bits)
+#endif//BUTTERFLY4x8
+
+//when RS accumulation, should clip rs among range of [-255, 255]
+#ifndef CLIP_RS
+#define CLIP_RS( value ) ( WELS_MAX( WELS_MIN( value, 255 ), -255 ) )
+#endif //CLIP_RS
+
+//#ifndef NEG_NUM
+//#define NEG_NUM( num ) (1+(~(num)))
+//#endif// NEG_NUM
+
+#ifndef WELS_CLIP1	// clamp x to the range [0, 255]
+#define WELS_CLIP1(x) (((x) & ~255) ? (((x) < 0) ? 0 : 255) : (x))	// out of range: negatives give 0, larger values give 255 (the old shift form gave -1 until narrowed to 8 bits); x is evaluated more than once
+#endif//WELS_CLIP1
+
+#ifndef WELS_SIGN
+#define WELS_SIGN(a) ((int32_t)(a) >> 31)	// General: (a)>>(sizeof(int)*CHAR_BIT-1), CHAR_BIT= the number of bits per byte (normally 8)
+#endif //WELS_SIGN
+
+static inline int32_t WELS_ABS(int32_t a)	// absolute value of a without a branch
+{
+	const int32_t kiMask = WELS_SIGN(a);	// 0 when a >= 0, -1 (all ones) when a < 0
+	return ((a ^ kiMask) - kiMask);	// a < 0: flip every bit, then add one; a >= 0: unchanged
+}
+
+// wels_tostring
+//#ifndef wels_tostring
+//#define wels_tostring(s)	#s
+//#endif //wels_tostring
+
+// WELS_CLIP3
+#ifndef WELS_CLIP3
+#define WELS_CLIP3(x, y, z)		((x) < (y) ? (y) : ((x) > (z) ? (z) : (x)))
+#endif //WELS_CLIP3
+
+#define CLIP3_QP_0_51(q)		WELS_CLIP3(q, 0, 51)	// ((q) < (0) ? (0) : ((q) > (51) ? (51) : (q)))
+
+// Bitwise routines
+// Tests a single bit of a 32-bit word and returns true when it is set.
+//	n: word to inspect;  b: bit order, only its low 5 bits are used (0..31)
+static inline bool_t BITWISE_ENABLED(const uint32_t n, const uint8_t b)
+{
+	const uint8_t bit = (b&0x1f);	// maximal bit position 31 for uint32_t 4 bytes
+#if defined(WORDS_BIGENDIAN)
+	/* 
+	 * 31 .. 24, 23 .. 16, 15 .. 8, 7 .. 0	(position actually tested in n)
+	 * 7 .. 0, 15 .. 8, 23 .. 16, 31 .. 24	(requested bit order)
+	 */	
+	const uint8_t map = 24+((bit&7)<<1)-bit;	// BIG_ENDIAN map
+	return ((n >> map) & 1u)?true:false;	// BIG_ENDIAN; shifting n itself avoids 1<<31 on a signed int
+#else
+	return ((n >> bit) & 1u)?true:false;	// LITTLE_ENDIAN; shifting n itself avoids 1<<31 on a signed int
+#endif//WORDS_BIGENDIAN
+}
+
+#define   CALC_BI_STRIDE(width,bitcount)  (((((width) * (bitcount)) + 31) & ~31) >> 3)	// bytes per row: width*bitcount bits rounded up to a multiple of 32, divided by 8; arguments are parenthesized so expressions such as (w+1) expand correctly
+
+//////////////////////////////////////////////////////////
+
+#ifdef    WORDS_BIGENDIAN
+// ENDIAN_FIX: with WORDS_BIGENDIAN defined the value is returned unchanged; otherwise its four bytes are reversed.
+static inline uint32_t ENDIAN_FIX(uint32_t x)
+{
+    return x;
+}
+
+#else 
+// Builds without WORDS_BIGENDIAN: reverse the byte order of x.
+
+#ifdef    _MSC_VER
+static inline uint32_t ENDIAN_FIX(uint32_t x)	// MSVC inline assembly; NOTE(review): __asm blocks are only accepted by the 32-bit x86 MSVC compiler -- confirm before building for x64/ARM
+{
+    __asm
+    {
+        mov   eax,  x
+		bswap   eax
+		mov   x,    eax
+    }
+    return x;
+}
+#else  // GCC
+static inline uint32_t ENDIAN_FIX(uint32_t x)
+{
+#ifdef X86_ARCH
+	__asm__ __volatile__("bswap %0":"+r"(x));	// bswap instruction applied in place to the register holding x
+#else
+    x = ((x & 0xff000000)>> 24) | ((x & 0xff0000) >> 8) |
+        ((x & 0xff00) << 8) | ((x&0xff) << 24);	// portable byte reversal with masks and shifts
+#endif
+	return x;
+}
+
+
+#endif//_MSC_VER
+
+#endif//WORDS_BIGENDIAN
+
+// wels_swap16
+
+// wels_swap32
+
+// sad, satd, avg might being in other header
+
+/*
+ * Description: to check variable validation and return the specified result
+ *	result:		value to be return
+ *	case_if:	negative condition to be verified
+ */
+#ifndef WELS_VERIFY_RETURN_IF
+#define WELS_VERIFY_RETURN_IF(result, case_if) \
+	if ( case_if ){ \
+		return result; \
+	}
+#endif//#if WELS_VERIFY_RETURN_IF
+
+/*
+ *	Description: when case_if holds, run proc and then return result
+ *		from the enclosing function.
+ *	 result:	value to be returned
+ *	 case_if:	negative (failure) condition to be verified
+ *	 proc:		process to perform before returning
+ */
+#ifndef WELS_VERIFY_RETURN_PROC_IF
+#define WELS_VERIFY_RETURN_PROC_IF(result, case_if, proc) \
+	if ( case_if ){ \
+		proc;	\
+		return result;	\
+	}
+#endif//#if WELS_VERIFY_RETURN_PROC_IF
+
+/*
+ * Description:	when case_if holds, return from the enclosing (void) function
+ *	case_if:	negative (failure) condition to be verified
+ *	return:		NONE
+ */
+#ifndef WELS_VERIFY_IF
+#define WELS_VERIFY_IF(case_if) \
+	if ( case_if ){ \
+		return; \
+	}
+#endif//#if WELS_VERIFY_IF
+
+/*
+ * Description:	when case_if holds, run proc and then return from the enclosing (void) function
+ *	case_if:	negative (failure) condition to be verified
+ *	proc:		process to perform before returning
+ *	return:		NONE
+ */
+#ifndef WELS_VERIFY_PROC_IF
+#define WELS_VERIFY_PROC_IF(case_if, proc) \
+	if ( case_if ){ \
+		proc; \
+		return; \
+	}
+#endif//#if WELS_VERIFY_PROC_IF
+
+/*
+ * Description: safely free a pointer: if p is not NULL, call free_fn(p) and reset p to NULL
+ *  p:			pointer to be destroyed (must be an lvalue, it is assigned NULL)
+ *	free_fn:	free function (or function pointer) to call; NOTE: the expansion already ends with ';'
+ */
+#ifndef WELS_SAFE_FREE_P
+#define WELS_SAFE_FREE_P(p, free_fn) \
+	do{ \
+		if ( NULL != (p) ){ \
+			free_fn( (p) ); \
+			(p) = NULL; \
+		} \
+	}while( 0 );
+#endif//#if WELS_SAFE_FREE_P
+
+/*
+ * Description: safely free an array of pointers: every non-NULL element is released with free_fn and
+ *				set to NULL, then the array itself is released and set to NULL; nothing happens if arr is NULL
+ *	arr:		pointer to an array, something like "**p" (must be an lvalue);  num: number of elements in array
+ *  free_fn:	free function (or function pointer); NOTE: the expansion already ends with ';'
+ */
+#ifndef WELS_SAFE_FREE_ARR
+#define WELS_SAFE_FREE_ARR(arr, num, free_fn) \
+	do{ \
+		if ( NULL != (arr) ){ \
+			int32_t iidx = 0; \
+			while( iidx < (num) ){ /* num is parenthesized so expression arguments compare as a whole */ \
+				if ( NULL != (arr)[iidx] ){ \
+					free_fn( (arr)[iidx] ); \
+					(arr)[iidx] = NULL; \
+				} \
+				++ iidx; \
+			} \
+			free_fn((arr)); \
+			(arr) = NULL; \
+		} \
+	}while( 0 );
+#endif//#if WELS_SAFE_FREE_ARR
+
+}
+
+#endif//WELS_MACRO_UTILIZATIONS_H__
--- /dev/null
+++ b/codec/encoder/core/inc/mb_cache.h
@@ -1,0 +1,148 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//mb_cache.h
+#ifndef WELS_MACROBLOCK_CACHE_H__
+#define WELS_MACROBLOCK_CACHE_H__
+
+#include "typedefs.h"
+#include "wels_common_basis.h"
+#include "wels_const.h"
+#include "macros.h"
+
+namespace WelsSVCEnc {
+//#pragma pack(1)
+
+/*
+ *	MB Cache information, such one cache should be defined within a slice
+ */
+/*
+ * Cache for Luma				Cache for Chroma(Cb, Cr)
+ *	
+ *	TL T T T T					TL T T
+ *	 L - - - -					 L - -
+ *	 L - - - -					 L - - TR
+ *	 L - - - -
+ *   L - - - - TR
+ *
+ */
+
+////////////////////////mapping scan index////////////////////////
+
+extern const uint8_t g_kuiSmb4AddrIn256[16];
+extern const uint8_t g_kuiMbCountScan4Idx[24];
+extern const uint8_t g_kuiCache30ScanIdx[16];
+extern const uint8_t g_kuiCache12_8x8RefIdx[4];
+extern const uint8_t g_kuiCache48CountScan4Idx[24];
+
+typedef	struct TagDCTCoeff
+{
+	// 16 entries of 16 coefficients each
+	int16_t iLumaBlock[16][16]; //based on block4x4 luma DC/AC
+	// I_16x16 DC (formerly declared through ALIGNED_DECLARE with 16-byte alignment)
+	int16_t iLumaI16x16Dc[16];
+	// 8 entries of 16 coefficients each; NOTE(review): presumably 4 blocks for Cb followed by 4 for Cr -- confirm
+	int16_t iChromaBlock[8][16]; //based on block4x4  chroma DC/AC
+	int16_t iChromaDc[2][4];	// chroma DC (formerly declared through ALIGNED_DECLARE with 16-byte alignment)
+}SDCTCoeff ;
+
+typedef struct TagMbCache{
+	//the following data is promised to be aligned to 16 bytes
+	ALIGNED_DECLARE(SMVComponentUnit, sMvComponents, 16);
+	
+	ALIGNED_DECLARE_MATRIX_1D(iNonZeroCoeffCount, 48, int8_t, 16);	// Cache line size
+	// 	int8_t		iNonZeroCoeffCount[6 * 8];	// Right luma, Chroma(Left Top Cb, Left btm Cr); must be followed by iIntraPredMode!
+	ALIGNED_DECLARE_MATRIX_1D(iIntraPredMode, 48, int8_t, 16);	
+	//	must directly follow iNonZeroCoeffCount!
+	
+	int32_t     iSadCost[4];			//avail 1; unavail 0
+	SMVUnitXY  sMbMvp[MB_BLOCK8x8_NUM];// for write bs
+
+	//for residual decoding (recovery) at the side of Encoder
+	int16_t *pCoeffLevel;		// temp
+	//malloc memory for prediction
+	uint8_t* pSkipMb;	
+
+	//ALIGNED_DECLARE(uint8_t, pMemPredMb[2][256],  16);//One: Best I_16x16 Luma and refine frac_pixel pBuffer; another: PingPong I_8x8&&Inter Cb + Cr
+	uint8_t *pMemPredMb;
+	uint8_t* pMemPredLuma;// inter && intra share same pointer; 
+	//ALIGNED_DECLARE(uint8_t, pMemPredChroma[2][64*2], 16); //another PingPong pBuffer: Best Cb + Cr; 
+	uint8_t *pMemPredChroma;// inter && intra share same pointer;
+	uint8_t* pBestPredIntraChroma; //Cb:0~63;   Cr:64~127
+
+	//ALIGNED_DECLARE(uint8_t, pMemPredBlk4[2][16], 16); //I_4x4
+	uint8_t *pMemPredBlk4;		
+
+	uint8_t* pBestPredI4x4Blk4;//I_4x4
+
+	//ALIGNED_DECLARE(uint8_t, pBufferInterPredMe[4][400], 16);//inter type pBuffer for ME h & v & hv
+	uint8_t *pBufferInterPredMe;    // [4][400] is enough because only h&v or v&hv or h&hv. but if both h&v&hv is needed when 8 quart pixel, future we have to use [5][400].
+
+	//not in scan4[] order, stored simply in memory order
+	//ALIGNED_DECLARE(bool_t, pPrevIntra4x4PredModeFlag[16], 16);//if 1, means no rem_intra4x4_pred_mode; if 0, means rem_intra4x4_pred_mode != 0
+	bool_t *pPrevIntra4x4PredModeFlag;
+	//ALIGNED_DECLARE(int8_t, pRemIntra4x4PredModeFlag[16], 16);//-1 as default; if pPrevIntra4x4PredModeFlag==0, 
+	//pRemIntra4x4PredModeFlag or added by 1 is the best pred_mode
+	int8_t *pRemIntra4x4PredModeFlag;
+
+	int32_t     iSadCostSkip[4];	     //avail 1; unavail 0
+	bool_t      bMbTypeSkip[4];         //1: skip; 0: non-skip  
+	int32_t     *pEncSad;
+
+	//for residual encoding at the side of Encoder
+	SDCTCoeff *pDct;
+
+	uint8_t      uiNeighborIntra; // LEFT_MB_POS:0x01, TOP_MB_POS:0x02, TOPLEFT_MB_POS = 0x04 ,TOPRIGHT_MB_POS = 0x08;
+	uint8_t uiLumaI16x16Mode;
+	uint8_t uiChmaI8x8Mode;
+
+	bool_t		bCollocatedPredFlag;//denotes whether current MB is collocated predicted (MV==0).
+	uint32_t	uiRefMbType;
+
+	struct
+	{
+		/* pointer of current mb location in original frame */
+		uint8_t *pEncMb[3];		
+		/* pointer of current mb location in recovery frame */
+		uint8_t *pDecMb[3];		
+		/* pointer of co-located mb location in reference frame */
+		uint8_t *pRefMb[3];	
+		//for SVC
+		uint8_t	*pCsMb[3];//locating current mb's CS in whole frame
+//		int16_t *p_rs[3];//locating current mb's RS	in whole frame
+
+	} SPicData;
+}SMbCache;
+
+}//end of namespace
+
+#endif//WELS_MACROBLOCK_CACHE_H__
--- /dev/null
+++ b/codec/encoder/core/inc/mc.h
@@ -1,0 +1,86 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//mc.h: motion compensation (MC) function declarations
+#ifndef WELS_MC_H__
+#define WELS_MC_H__
+
+#include <string.h>
+#include "typedefs.h"
+#include "wels_const.h"
+#include "macros.h"
+#include "wels_func_ptr_def.h"
+
+/////////////////////luma MC////////////////////////// 
+//x y means dx(mv[0] & 3) and dy(mv[1] & 3)
+
+namespace WelsSVCEnc {
+void WelsInitMcFuncs( SWelsFuncPtrList *pFuncList, uint32_t uiCpuFlag );	// presumably installs the MC function pointers into pFuncList, choosing implementations by uiCpuFlag -- TODO confirm in the definition
+
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+//***************************************************************************//
+//                       MMXEXT and SSE2 definition                          //
+//***************************************************************************//
+#if defined(X86_ASM)	// prototypes only: no bodies in this header (presumably x86 assembly, given this guard and the C linkage)
+void McChromaWidthEq4_mmx( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, const uint8_t *kpABCD,int32_t iHeigh );
+void McCopyWidthEq4_mmx ( uint8_t *, int32_t, uint8_t *, int32_t, int32_t );
+void McCopyWidthEq8_mmx( uint8_t *, int32_t, uint8_t *, int32_t, int32_t );
+void PixelAvgWidthEq8_mmx( uint8_t *,  int32_t, uint8_t *, int32_t, uint8_t *, int32_t, int32_t  );
+// SSE2 variants. Where parameters are unnamed the prototype does not show their roles -- check the definition before calling.
+void McHorVer20_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,int32_t iWidth, int32_t iHeight);
+void McHorVer02_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,int32_t iWidth, int32_t iHeight);
+void McHorVer22HorFirst_sse2(uint8_t * pSrc,int32_t iSrcStride,uint8_t * pTap,int32_t iTapStride,int32_t iWidth,int32_t iHeight);	
+void McHorVer22VerLastAlign_sse2(uint8_t * pTap, int32_t iTapStride, uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight);
+void McHorVer22VerLastUnAlign_sse2(uint8_t * pTap, int32_t iTapStride, uint8_t * pDst,int32_t iDstStride,int32_t iWidth,int32_t iHeight);
+void McChromaWidthEq8_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, const uint8_t *kpABCD, int32_t iHeigh );
+void McCopyWidthEq16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, int32_t );
+void McHorVer20WidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight );
+void McHorVer02WidthEq8_sse2(uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer22Width8HorFirst_sse2( uint8_t*pSrc, int32_t iSrcStride, uint8_t* pTap,	int32_t iTapStride,int32_t iHeight);
+void PixelAvgWidthEq16_sse2( uint8_t *,  int32_t, uint8_t *, int32_t, uint8_t *, int32_t, int32_t  );
+// NOTE(review): "iHeigh" in the McChroma* prototypes looks like a typo for iHeight; harmless in a declaration.
+// SSSE3 variants
+void PixelAvgWidthEq16_ssse3( uint8_t *,  int32_t, uint8_t *, int32_t, uint8_t *, int32_t, int32_t  );
+void McChromaWidthEq8_ssse3( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, const uint8_t *kpABCD, int32_t iHeigh );
+
+
+#endif //X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+}
+#endif//WELS_MC_H__
--- /dev/null
+++ b/codec/encoder/core/inc/md.h
@@ -1,0 +1,168 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	md.h
+ *
+ * \brief	mode decision 
+ *
+ * \date	2009.5.14 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_MACROBLOCK_MODE_DECISION_H__
+#define WELS_MACROBLOCK_MODE_DECISION_H__
+
+#include "svc_motion_estimate.h"
+#include "svc_enc_macroblock.h"
+#include "encode_mb_aux.h"
+#include "wels_func_ptr_def.h"
+
+namespace WelsSVCEnc {
+#define ME_REFINE_BUF_STRIDE       32
+#define ME_REFINE_BUF_WIDTH_BLK4   8
+#define ME_REFINE_BUF_WIDTH_BLK8   16
+#define ME_REFINE_BUF_STRIDE_BLK4  160	// numerically 5 * ME_REFINE_BUF_STRIDE -- relation presumed, TODO confirm
+#define ME_REFINE_BUF_STRIDE_BLK8  320	// numerically 10 * ME_REFINE_BUF_STRIDE -- relation presumed, TODO confirm
+	// half-pixel refinement result codes; each trailing comment gives the (x, y) offset, presumably in quarter-pel units since a half-pel step is written as 2
+#define REFINE_ME_NO_BEST_HALF_PIXEL 0 //( 0,  0)
+#define REFINE_ME_HALF_PIXEL_LEFT    3 //(-2,  0)
+#define REFINE_ME_HALF_PIXEL_RIGHT   4 //( 2,  0)
+#define REFINE_ME_HALF_PIXEL_TOP     1 //( 0, -2)
+#define REFINE_ME_HALF_PIXEL_BOTTOM  2 //( 0,  2)
+	// quarter-pixel refinement result codes, same (x, y) notation; note they start at 1, not 0
+#define ME_NO_BEST_QUAR_PIXEL 1 //( 0,  0) or best half pixel
+#define ME_QUAR_PIXEL_LEFT    2 //(-1,  0)
+#define ME_QUAR_PIXEL_RIGHT   3 //( 1,  0)
+#define ME_QUAR_PIXEL_TOP     4 //( 0, -1)
+#define ME_QUAR_PIXEL_BOTTOM  5 //( 0,  1)
+// the literal below equals the sum of the two "no best" codes (0 + 1); keep it in sync if either code changes
+#define NO_BEST_FRAC_PIX   1 // REFINE_ME_NO_BEST_HALF_PIXEL + ME_NO_BEST_QUAR_PIXEL
+
+extern const int32_t g_kiQpCostTable[52];	// declared here, defined elsewhere; 52 entries, presumably one per QP value -- TODO confirm
+extern const int8_t g_kiMapModeI16x16[7];	// declared here, defined elsewhere
+//extern const int8_t g_kiMapModeI4x4[14];
+extern const int8_t g_kiMapModeIntraChroma[7];	
+
+/////////////////////////////
+
+// if we want to keep total sizeof(SWelsMD) <= 256, we may need to separate three members of SWelsME.
+typedef struct TagWelsMD
+{
+    int32_t			iLambda;
+	uint16_t		*pMvdCost;	// presumably points into the table filled by MvdCostInit() -- TODO confirm
+
+	int32_t			iCostLuma;
+    int32_t			iCostChroma;//satd+lambda(best_pred_mode) //i_sad_chroma;
+	int32_t			iSadPredMb; 
+
+    uint8_t			uiRef; //uiRefIndex appointed by Encoder, used for MC
+    bool_t			bMdUsingSad;
+    uint16_t		uiReserved;	// not described here; presumably padding after uiRef and bMdUsingSad -- TODO confirm
+
+	int32_t			iCostSkipMb;
+    int32_t			iSadPredSkip;
+    
+	//NO B frame in our Wels, we can ignore list1
+
+	struct 
+	{		
+		SWelsME			sMe16x16;		//adjust each SWelsME for 8 D-word!
+		SWelsME			sMe8x8[4];		// one entry per 8x8 partition of the 16x16 MB
+		SWelsME			sMe16x8[2];		// one entry per 16x8 partition
+		SWelsME			sMe8x16[2];				
+//		SMVUnitXY		i_mvbs[MB_BLOCK8x8_NUM];	//scaled MVB
+	} sMe;    
+
+}SWelsMD;
+
+typedef struct TagMeRefinePointer	// set of buffer pointers passed to InitMeRefinePointer() and MeRefineFracPixel()
+{
+	uint8_t* pHalfPixH;	// presumably the horizontal half-pixel buffer -- TODO confirm
+	uint8_t* pHalfPixV;	// presumably the vertical half-pixel buffer -- TODO confirm
+	uint8_t* pHalfPixHV;	// presumably the diagonal (h+v) half-pixel buffer -- TODO confirm
+
+	uint8_t* pQuarPixBest;
+	uint8_t* pQuarPixTmp; 
+
+} SMeRefinePointer;
+
+static void md_intra_init(sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb);	// NOTE(review): static (internal linkage) prototype in a header -- each including file gets its own declaration and must define it itself; confirm this is still needed here
+static void md_inter_init(sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb);	// NOTE(review): same concern as md_intra_init
+
+void FillNeighborCacheIntra(SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth/*, bool_t constrained_intra_pred_flag*/);
+void FillNeighborCacheInterWithoutBGD(SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth, int8_t *pVaaBgMbFlag); //BGD spatial func
+void FillNeighborCacheInterWithBGD(SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth, int8_t *pVaaBgMbFlag);
+void InitFillNeighborCacheInterFunc( SWelsFuncPtrList *pFuncList, const int32_t kiFlag );	// presumably selects one of the two FillNeighborCacheInter* variants above by kiFlag -- TODO confirm
+
+void MvdCostInit( uint16_t* pMvdCostInter, const int32_t kiMvdSz );
+// pSadPred is an output pointer; the roles of the two cache arrays are not visible from this prototype
+void PredictSad( int8_t* pRefIndexCache, int32_t* pSadCostCache, int32_t uiRef, int32_t * pSadPred );
+
+// iSadPredSkip is an output pointer despite its "i" prefix
+void PredictSadSkip( int8_t* pRefIndexCache, bool_t* pMbSkipCache, int32_t* pSadCostCache, int32_t uiRef, int32_t * iSadPredSkip );
+
+//  set up so that the pfGetVarianceFromIntraVaa function pointer adapts to CPU features, 6/7/2010
+void InitIntraAnalysisVaaInfo( SWelsFuncPtrList *pFuncList, const uint32_t kuiCpuFlag );
+BOOL_T MdIntraAnalysisVaaInfo( sWelsEncCtx* pEncCtx, uint8_t* pEncMb );	// NOTE(review): returns BOOL_T while the rest of this header uses bool_t -- confirm both typedefs are intended
+// C implementation; MdInterAnalysisVaaInfo_sse2/_sse41 with the same signature are declared under X86_ASM below
+uint8_t MdInterAnalysisVaaInfo_c( int32_t *pSad8x8 );
+
+
+void InitMeRefinePointer(SMeRefinePointer* pMeRefine, SMbCache* pMbCache, int32_t iStride);
+void MeRefineFracPixel(sWelsEncCtx* pEncCtx, uint8_t* pMemPredInterMb, SWelsME* pMe,
+						  SMeRefinePointer* pMeRefine, int32_t iWidth, int32_t iHeight);
+								 
+void InitBlkStrideWithRef(int32_t* pBlkStride, const int32_t kiStrideRef);
+// C implementation; UpdateMbMv_sse2 with the same signature is declared under X86_ASM below
+void UpdateMbMv_c( SMVUnitXY *pMvBuffer, const SMVUnitXY ksMv );
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined(X86_ASM)
+// prototypes only (C linkage); the bodies are not in this header, presumably x86 assembly -- TODO confirm
+//  for pfGetVarianceFromIntraVaa SIMD optimization, 6/7/2010
+int32_t AnalysisVaaInfoIntra_sse2 (	uint8_t *pDataY, const int32_t kiLineSize );
+int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t kiLineSize );
+uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 );	// same signature as MdInterAnalysisVaaInfo_c
+uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 );	// same signature as MdInterAnalysisVaaInfo_c
+void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY ksMv );	// same signature as UpdateMbMv_c
+
+#endif//X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+}
+#endif//WELS_MACROBLOCK_MODE_DECISION_H__
+
--- /dev/null
+++ b/codec/encoder/core/inc/measure_time.h
@@ -1,0 +1,107 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	measure_time.h
+ *
+ * \brief	time cost measure utilization
+ *
+ * \date	04/28/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_TIME_COST_MEASURE_UTIL_H__
+#define WELS_TIME_COST_MEASURE_UTIL_H__
+
+#include <stdlib.h>
+
+#if !(defined(_MSC_VER) || defined(__MINGW32__))
+#include <sys/time.h>
+#else
+#include "typedefs.h"
+//#include <sys/types.h>
+#include <sys/timeb.h>
+#endif
+#include <time.h>
+#if defined(WIN32)
+#include <windows.h>
+//#include <mmsystem.h>	// need static lib winmm.lib for link for such windows 95/98 mm timer
+#endif//#if WIN32
+
+/*!
+ * \brief	read a clock for time cost measurement; intended to be used as the difference between two calls
+ * \param	void
+ * \return	time in microseconds; the zero point depends on the platform branch taken below, so only differences are meaningful
+ */
+// NOTE(review): int64_t is used on every branch but typedefs.h is only included for MSVC/MinGW above -- confirm another include provides it elsewhere.
+static inline int64_t WelsTime()
+{
+#if !(defined(_MSC_VER) || defined(__MINGW32__))
+	struct timeval tv_date;
+	
+	gettimeofday( &tv_date, NULL );
+	return( (int64_t) tv_date.tv_sec * 1000000 + (int64_t) tv_date.tv_usec );	// seconds scaled to microseconds, plus the microsecond part
+#else
+#if defined (WIN32)	
+	static int64_t iMeasureTimeFreq = 0;	// cached counter frequency; 0 means not queried yet. NOTE(review): this lazy init is not synchronised
+//	static BOOL_T support_high_resolution_perf_flag = TRUE;
+	int64_t iMeasureTimeCur = 0;
+	int64_t iResult = 0;	
+	if ( 0 == iMeasureTimeFreq ){
+		// Per MSDN minimum supported OS is Windows 2000 Professional/Server above for high-resolution performance counter
+		/*BOOL_T ret = */QueryPerformanceFrequency((LARGE_INTEGER *)&iMeasureTimeFreq);	// return value deliberately ignored; the zero check below covers a failed query
+//		if ( !ret )	// the installed hardware can not support a high-resolution performance counter, we have to use others instead for well feature
+//		{
+//			support_high_resolution_perf_flag	= FALSE;			
+//		}
+		if ( !iMeasureTimeFreq )	// guard the division below against a zero frequency
+			iMeasureTimeFreq = 1;
+	}
+//	if ( support_high_resolution_perf_flag )
+//	{
+		QueryPerformanceCounter((LARGE_INTEGER *)&iMeasureTimeCur);
+		iResult = (int64_t)((double)iMeasureTimeCur * 1e6 / (double)iMeasureTimeFreq + 0.5);	// ticks -> microseconds, rounded to nearest via + 0.5
+//	}
+//	else
+//	{
+//		iResult = timeGetTime() * 1000;	// 10 ms precision		
+//	}	
+	return iResult;
+	
+#else
+	struct _timeb tb;
+	
+	_ftime(&tb);
+	return ((int64_t)tb.time * (1000) + (int64_t)tb.millitm) * (1000);	// built from seconds and milliseconds, so the result only has millisecond resolution
+#endif//#if WIN32
+#endif//!(defined(_MSC_VER) || defined(__MINGW32__))
+}
+
+#endif//WELS_TIME_COST_MEASURE_UTIL_H__
--- /dev/null
+++ b/codec/encoder/core/inc/memory_align.h
@@ -1,0 +1,80 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#if !defined(WELS_ENCODER_MEMORY_ALIGN_H__)
+#define WELS_ENCODER_MEMORY_ALIGN_H__
+
+#include "typedefs.h"
+#include "as264_common.h"
+#ifdef MEMORY_CHECK
+#include <stdio.h>
+#endif//MEMORY_CHECK
+
+namespace WelsSVCEnc {
+
+#define MEMORY_REQUEST_ALIGN_BYTES	0 // or a power of two (2^n), e.g. 0x04
+
+class CMemoryAlign	// allocator object configured with a cache line size; allocation calls carry a caller-supplied tag string
+{
+public:
+	CMemoryAlign( const uint32_t kuiCacheLineSize );
+	virtual ~CMemoryAlign();
+
+	void* WelsMallocz( const uint32_t kuiSize, const str_t *kpTag );	// presumably the zero-initialising variant of WelsMalloc (from the "z" suffix) -- TODO confirm
+	void* WelsMalloc( const uint32_t kuiSize, const str_t *kpTag );
+	void WelsFree( void* pPointer, const str_t *kpTag );
+	const uint32_t WelsGetCacheLineSize() const;
+#if defined(MEMORY_MONITOR)
+	const uint32_t WelsGetMemoryUsage() const;	// only available in MEMORY_MONITOR builds
+#endif//MEMORY_MONITOR
+
+private:
+	// copy constructor and copy assignment are declared private so outside code cannot copy the object (added to fix klocwork scan issues)
+	CMemoryAlign( const CMemoryAlign& kcMa );           
+	CMemoryAlign& operator=( const CMemoryAlign& kcMa );
+
+protected:
+	uint32_t	m_nCacheLineSize;
+
+#ifdef MEMORY_MONITOR
+	uint32_t	m_nMemoryUsageInBytes;
+#endif//MEMORY_MONITOR
+
+#ifdef MEMORY_CHECK
+	FILE*		m_fpMemChkPoint;	// FILE comes from <stdio.h>, which this header includes under the same MEMORY_CHECK guard
+	uint32_t	m_nCountRequestNum;
+#endif//MEMORY_CHECK
+};
+
+}
+
+#endif//WELS_ENCODER_MEMORY_ALIGN_H__
--- /dev/null
+++ b/codec/encoder/core/inc/mt_defs.h
@@ -1,0 +1,227 @@
+/*!
+ * \copy
+ *     Copyright (c)  2010-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	mt_defs.h
+ *
+ * \brief	Main macros for multiple threading implementation
+ *
+ * \date	2/26/2010 Created
+ *
+ *************************************************************************************
+ */
+#if !defined(MULTIPLE_THREADING_DEFINES_H__)
+#define MULTIPLE_THREADING_DEFINES_H__
+
+#include "typedefs.h"
+#include "codec_app_def.h"
+#include "wels_const.h"
+#include "WelsThreadLib.h"
+
+/*
+ *	Dynamic Slicing Assignment (DSA); several of the switches below are #undef'd again further down when MT_ENABLED is not defined
+ */
+#define DYNAMIC_SLICE_ASSIGN
+/*
+ *	Try to balance the dynamic slicing across threads based on historical slicing complexity results,
+ *	valid only when DYNAMIC_SLICE_ASSIGN is enabled. When this is disabled, a step interval slicing map is used for DSA
+ */
+#define TRY_SLICING_BALANCE
+/*
+ *	not absolute balancing: tolerance conditions are applied to the dynamic adjustment
+ */
+#define NOT_ABSOLUTE_BALANCING
+/*
+ *  use the root mean square error of the slice complexity ratios for balancing
+ */
+#define USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING
+
+/*
+ *  Required because ASO cannot be supported on GPU based decoders
+ */
+#define RASTER_SCAN_ORDER_PACKING	// mutually exclusive with Arbitrary Slice Ordering (ASO)
+
+/*
+ *	Parallel slice bs output without using memcpy
+ *  NOTE: might not be applicable for the SVC 2.0/2.1 client application layer implementation,
+ *	because the bs of the various slices needs to be contiguous within a layer packing
+ */
+//#define PACKING_ONE_SLICE_PER_LAYER	// means packing only one slice per pLayerBs; disabled at SVC 2.0/2.1 in case of Multi-Threading (MT) & Multi-Slice (MS)
+
+//#define FIXED_PARTITION_ASSIGN	// for dynamic slicing parallelization; means the same partition number is used in P or I slices
+
+/*
+ * PACKING_ONE_SLICE_PER_LAYER must be disabled if RASTER_SCAN_ORDER_PACKING is enabled:
+ * PACKING_ONE_SLICE_PER_LAYER might potentially introduce out-of-order slice packing into the layer info for the application layer
+ */
+#if defined(RASTER_SCAN_ORDER_PACKING)
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+#undef PACKING_ONE_SLICE_PER_LAYER
+#endif//PACKING_ONE_SLICE_PER_LAYER
+#endif//RASTER_SCAN_ORDER_PACKING
+
+/*
+ *	MT_DEBUG: output MT related traces into a log file
+ */
+//#define MT_DEBUG
+//#define ENABLE_TRACE_MT
+
+#ifdef MT_ENABLED
+// defined only in multi-threading builds
+#define DYNAMIC_DETECT_CPU_CORES
+
+//#if defined(WIN32)
+//#define BIND_CPU_CORES_TO_THREADS	// if it is not defined here, load is balanced across cpu cores automatically
+//#endif//WIN32
+
+#else
+// intentionally empty: nothing is defined here when MT_ENABLED is off
+#endif//MT_ENABLED
+
+/*
+ * Enforce the dependencies between the MT related macros
+ */
+// TRY_SLICING_BALANCE requires DYNAMIC_SLICE_ASSIGN
+#if !defined(DYNAMIC_SLICE_ASSIGN)
+
+#if defined(TRY_SLICING_BALANCE)
+#undef TRY_SLICING_BALANCE
+#endif//TRY_SLICING_BALANCE
+
+#endif//!DYNAMIC_SLICE_ASSIGN
+// the two balancing refinements require both DYNAMIC_SLICE_ASSIGN and TRY_SLICING_BALANCE
+#if !defined(DYNAMIC_SLICE_ASSIGN) || !defined(TRY_SLICING_BALANCE)
+
+#if defined(NOT_ABSOLUTE_BALANCING)
+#undef NOT_ABSOLUTE_BALANCING
+#endif//NOT_ABSOLUTE_BALANCING
+
+#if defined(USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING)
+#undef USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING
+#endif//USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING
+
+#endif//!DYNAMIC_SLICE_ASSIGN || !TRY_SLICING_BALANCE
+// without MT_ENABLED every switch is removed; the balancing macros are undefined again here because the check above ran before DYNAMIC_SLICE_ASSIGN was undefined
+#if !defined(MT_ENABLED)
+
+#if defined(DYNAMIC_SLICE_ASSIGN)
+#undef DYNAMIC_SLICE_ASSIGN
+#endif//DYNAMIC_SLICE_ASSIGN
+#if defined(TRY_SLICING_BALANCE)
+#undef TRY_SLICING_BALANCE
+#endif//TRY_SLICING_BALANCE
+#if defined(MT_DEBUG)
+#undef MT_DEBUG
+#endif//MT_DEBUG
+#if defined(ENABLE_TRACE_MT)
+#undef ENABLE_TRACE_MT
+#endif//ENABLE_TRACE_MT
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+#undef PACKING_ONE_SLICE_PER_LAYER
+#endif//PACKING_ONE_SLICE_PER_LAYER
+#ifdef NOT_ABSOLUTE_BALANCING
+#undef NOT_ABSOLUTE_BALANCING
+#endif//NOT_ABSOLUTE_BALANCING
+#ifdef USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING
+#undef USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING
+#endif//USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING
+
+#endif//!MT_ENABLED
+
+
+#ifdef NOT_ABSOLUTE_BALANCING	// only still defined here if it survived the dependency checks above
+#ifdef USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING	// RMSE thresholds; the CORE8/4/2 suffixes presumably refer to the CPU core count -- TODO confirm
+#define THRESHOLD_RMSE_CORE8	0.0320f	// v1.1: 0.0320f; v1.0: 0.02f
+#define THRESHOLD_RMSE_CORE4	0.0215f	// v1.1: 0.0215f; v1.0: 0.03f
+#define THRESHOLD_RMSE_CORE2	0.0200f	// v1.1: 0.0200f; v1.0: 0.04f
+#else
+#define TOLERANT_BALANCING_RATIO_LOSS	0.08f	// tolerance used by the two bounds below
+#define TOLERANT_BALANCING_RATIO_LOWER(n)	((1.0f-TOLERANT_BALANCING_RATIO_LOSS)/(n))	// (1 - 0.08) / n, i.e. 8% below an even 1/n share
+#define TOLERANT_BALANCING_RATIO_UPPER(n)	((1.0f+TOLERANT_BALANCING_RATIO_LOSS)/(n))	// (1 + 0.08) / n, i.e. 8% above an even 1/n share
+#endif//USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING
+#endif//NOT_ABSOLUTE_BALANCING
+
+typedef struct TagSliceThreadPrivateData {
+	void		*pWelsPEncCtx;	// untyped pointer; presumably the encoder context -- TODO confirm what callers store here
+	SLayerBSInfo	*pLayerBs;
+	int32_t		iSliceIndex;	// slice index, zero based								
+	int32_t		iThreadIndex;	// thread index, zero based
+
+	// for dynamic slicing mode: half-open MB index range [iStartMbIndex, iEndMbIndex)
+	int32_t		iStartMbIndex;	// inclusive
+	int32_t		iEndMbIndex;	// exclusive
+} SSliceThreadPrivateData;
+
+typedef struct TagSliceThreading 
+{
+	SSliceThreadPrivateData	*pThreadPEncCtx;// thread context, [iThreadIdx]
+	WELS_THREAD_HANDLE			*pThreadHandles;// thread handles, [iThreadIdx]
+#ifdef WIN32	// WIN32: single pointers to the events; other platforms: fixed arrays of MAX_THREADS_NUM event pointers
+	WELS_EVENT					*pSliceCodedEvent;// events for slice coded state, [iThreadIdx]
+	WELS_EVENT					*pReadySliceCodingEvent;	// events for slice coding ready, [iThreadIdx]
+	WELS_EVENT					*pFinSliceCodingEvent;	// notify slice coding thread is done
+	WELS_EVENT					*pExitEncodeEvent;			// event for exit encoding event
+#else
+	WELS_EVENT*					pSliceCodedEvent[MAX_THREADS_NUM];// events for slice coded state, [iThreadIdx]
+	WELS_EVENT*					pReadySliceCodingEvent[MAX_THREADS_NUM];	// events for slice coding ready, [iThreadIdx]
+#endif//WIN32
+// note: pFinSliceCodingEvent and pExitEncodeEvent exist only in the WIN32 layout above
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+#if defined(__GNUC__)
+	WELS_THREAD_HANDLE			*pUpdateMbListThrdHandles;	// thread handles for update mb list thread, [iThreadIdx]
+#endif//__GNUC__
+#ifdef WIN32
+	WELS_EVENT					*pUpdateMbListEvent;		// signal to update mb list neighbor for various slices
+	WELS_EVENT					*pFinUpdateMbListEvent;	// signal to indicate finish updating mb list
+#else
+	WELS_EVENT*					pUpdateMbListEvent[MAX_THREADS_NUM];		// signal to update mb list neighbor for various slices
+	WELS_EVENT*					pFinUpdateMbListEvent[MAX_THREADS_NUM];	// signal to indicate finish updating mb list	
+#endif//WIN32
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+
+	WELS_MUTEX					mutexSliceNumUpdate;	// for dynamic slicing mode MT
+
+#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
+	uint32_t					*pSliceConsumeTime[MAX_DEPENDENCY_LAYER];	// consuming time for each slice, [iSpatialIdx][uiSliceIdx]
+#endif//DYNAMIC_SLICE_ASSIGN || MT_DEBUG
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+	float						*pSliceComplexRatio[MAX_DEPENDENCY_LAYER];	// presumably indexed [iSpatialIdx][uiSliceIdx] like pSliceConsumeTime -- TODO confirm
+#endif//DYNAMIC_SLICE_ASSIGN && TRY_SLICING_BALANCE
+
+#ifdef MT_DEBUG
+	FILE						*pFSliceDiff;	// file handle for debug; NOTE(review): this header does not include <stdio.h> itself
+#endif//MT_DEBUG
+
+#ifdef PACKING_ONE_SLICE_PER_LAYER
+	uint32_t					*pCountBsSizeInPartition;
+#endif//PACKING_ONE_SLICE_PER_LAYER
+} SSliceThreading;
+
+#endif//MULTIPLE_THREADING_DEFINES_H__
--- /dev/null
+++ b/codec/encoder/core/inc/mv_pred.h
@@ -1,0 +1,142 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	mv_pred.h
+ *
+ * \brief	Get MV predictor and update motion vector of mb cache
+ *
+ * \date	05/22/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELS_MV_PRED_H__
+#define WELS_MV_PRED_H__
+
+
+#include "svc_enc_macroblock.h"
+#include "mb_cache.h"
+
+namespace WelsSVCEnc {
+/*!
+ * \brief   update pMv and uiRefIndex cache for current MB, only for P_16x16 (SKIP inclusive)
+ * \param 	
+ * \param 	
+ */
+
+/*!
+ * \brief   update pMv and uiRefIndex cache for current MB and pMbCache, only for P_16x16 (SKIP inclusive)
+ * \param 	pMbCache,pCurMb	MB cache and current MB whose motion info is updated (per the brief)
+ * \param 	kiRef,pMv	presumably the reference index and motion vector to store -- TODO confirm
+ */
+void UpdateP16x16MotionInfo(SMbCache* pMbCache, SMB* pCurMb, const int8_t kiRef, SMVUnitXY* pMv);//for encoder
+
+/*!
+ * \brief   update pMv and uiRefIndex cache for current MB and pMbCache, only for P_16x8
+ * \param 	pMbCache,pCurMb	MB cache and current MB whose motion info is updated (per the brief)
+ * \param 	kiPartIdx,kiRef,pMv	presumably the partition index, reference index and motion vector to store -- TODO confirm
+ */
+void UpdateP16x8MotionInfo(SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef, SMVUnitXY* pMv);
+
+/*!
+ * \brief   update pMv and uiRefIndex cache for current MB and pMbCache, only for P_8x16
+ * \param 	pMbCache,pCurMb	MB cache and current MB whose motion info is updated (per the brief)
+ * \param 	kiPartIdx,kiRef,pMv	presumably the partition index, reference index and motion vector to store -- TODO confirm
+ */
+void update_P8x16_motion_info(SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef, SMVUnitXY* pMv);	// NOTE(review): snake_case name differs from the Update*MotionInfo siblings
+
+/*!
+ * \brief   update pMv and uiRefIndex cache for current MB and pMbCache, only for P_8x8
+ * \param 	pMbCache,pCurMb	MB cache and current MB whose motion info is updated (per the brief)
+ * \param 	kiPartIdx,kiRef,pMv	presumably the partition index, reference index and motion vector to store -- TODO confirm
+ */
+void UpdateP8x8MotionInfo(SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef, SMVUnitXY* pMv);
+
+/*!
+ * \brief   get the motion predictor for 4*4 or 8*8 or 16*16 block
+ * \param 	kpMvComp,iPartIdx,iPartW,iRef	inputs; presumably MV component data, partition index, partition width and reference index -- TODO confirm
+ * \param 	sMvp	output mvp_x and mvp_y
+ */
+void PredMv(const SMVComponentUnit* kpMvComp, int8_t iPartIdx, int8_t iPartW, int32_t iRef, SMVUnitXY* sMvp);
+
+
+/*!
+ * \brief   get the motion predictor for SKIP MB
+ * \param 	pMbCache	MB cache; presumably the source of the neighbor data used for the prediction -- TODO confirm
+ * \param 	sMvp	output mvp_x and mvp_y
+ */
+void PredSkipMv(SMbCache* pMbCache, SMVUnitXY* sMvp);
+
+
+/*!
+ * \brief   get the motion predictor for inter16x8 MB
+ * \param 	pMbCache,iPartIdx,iRef	inputs; presumably the MB cache, the 16x8 partition index and the reference index -- TODO confirm
+ * \param 	sMvp	output mvp_x and mvp_y
+ */
+void PredInter16x8Mv(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* sMvp);
+
+
+/*!
+ * \brief   get the motion predictor for inter8x16 MB
+ * \param 	pMbCache,iPartIdx,iRef	inputs; presumably the MB cache, the 8x16 partition index and the reference index -- TODO confirm
+ * \param 	sMvp	output mvp_x and mvp_y
+ */
+void PredInter8x16Mv(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* sMvp);
+
+//=========================update motion info(MV and ref_idx) into Mb_cache==========================
+/*!
+ * \brief   only update pMv cache for current MB, only for P_16x16
+ * \param 	
+ * \param 	
+ */
+//void update_p16x16_motion2cache(SMbCache* pMbCache, int8_t pRef, SMVUnitXY* pMv);
+
+/*!
+ * \brief   only update pMv cache for current MB, only for P_16x8
+ * \param 	pMbCache	MB cache that is updated (per the brief)
+ * \param 	iPartIdx,iRef,pMv	presumably the partition index, reference index and motion vector to store -- TODO confirm
+ */
+void UpdateP16x8Motion2Cache(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv);
+
+/*!
+ * \brief   only update pMv cache for current MB, only for P_8x16
+ * \param 	pMbCache	MB cache that is updated (per the brief)
+ * \param 	iPartIdx,iRef,pMv	presumably the partition index, reference index and motion vector to store -- TODO confirm
+ */
+void UpdateP8x16Motion2Cache(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv);
+/*!
+ * \brief   only update pMv cache for current MB, only for P_8x8
+ * \param 	pMbCache	MB cache that is updated (per the brief)
+ * \param 	iPartIdx,iRef,pMv	presumably the partition index, reference index and motion vector to store -- TODO confirm
+ */
+void UpdateP8x8Motion2Cache(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv);
+}
+#endif//WELS_MV_PRED_H__
--- /dev/null
+++ b/codec/encoder/core/inc/nal_encap.h
@@ -1,0 +1,148 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	nal_encap.h
+ *
+ * \brief	NAL pRawNal pData encapsulation
+ *
+ * \date	2/4/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_NAL_UNIT_ENCAPSULATION_H__
+#define WELS_NAL_UNIT_ENCAPSULATION_H__
+
+#include "typedefs.h"
+#include "bit_stream.h"
+#include "nal_prefix.h"
+
+//SBitStringAux
+namespace WelsSVCEnc {
+
+/*
+ *	Raw NAL unit (payload buffer, payload size and header fields), AVC/SVC compatible
+ */
+typedef struct TagWelsNalRaw {
+	uint8_t				*pRawData;		// raw NAL payload holding the slice data
+	int32_t				iPayloadSize;		// size of the raw payload in pRawData
+	
+	SNalUnitHeaderExt		sNalExt;		// NAL header information (base header plus the extension fields of SNalUnitHeaderExt)
+
+}SWelsNalRaw;
+
+/*
+ *	Encoder major output data
+ */
+typedef struct TagWelsEncoderOutput {	
+	uint8_t				*pBsBuffer;			// overall bitstream buffer allocated for a coded picture; intended to be reused
+	uint32_t			uiSize;				// size of the buffer allocation above
+
+	SBitStringAux		sBsWrite;
+	
+//	SWelsNalRaw		raw_nals[MAX_DEPENDENCY_LAYER*2+MAX_DEPENDENCY_LAYER*MAX_QUALITY_LEVEL]; // AVC: max up to SPS+PPS+max_slice_idc (2 + 8) for FMO;
+	SWelsNalRaw		*sNalList;			// NAL list, adaptive for AVC/SVC in case of single slice, multiple slices or FMO
+	int32_t				iCountNals;			// number of NALs in the list
+	// (sizing note kept from the disabled fixed-size array above) SVC: num_sps (MAX_D) + num_pps (MAX_D) + num_vcl (MAX_D * MAX_Q)
+	int32_t				iNalIndex;			// NAL currently being coded, 0 based
+	
+//	BOOL_T				bAnnexBFlag;		// annexeb flag, to figure it pOut the packetization mode whether need 4 bytes (0 0 0 1) of start code prefix
+}SWelsEncoderOutput;
+
+//#define MT_DEBUG_BS_WR	0	// for MT debugging if needed
+
+typedef struct TagWelsSliceBs {	// per-slice bitstream output state
+	uint8_t				*pBs;				// output bitstream, pBitStringAux not needed for slice 0 due to no dependency of pFrameBs available
+	uint32_t			uiBsPos;				// position within the output bitstream
+	uint8_t				*pBsBuffer;			// overall bitstream buffer allocated for a coded slice; intended to be reused
+	uint32_t			uiSize;				// size of the buffer allocation above
+	
+	SBitStringAux		sBsWrite;
+		
+	SWelsNalRaw		sNalList[2];		// NAL list: PREFIX NAL (if applicable) + SLICE NAL
+//	int32_t				iCountNals;			// count number of NAL in list
+	int32_t				iNalLen[2];			// one length per sNalList entry (presumably the encoded NAL length -- confirm)
+	int32_t				iNalIndex;			// NAL currently being coded, 0 based
+	
+//	BOOL_T				bAnnexBFlag;		// annexeb flag, to figure it pOut the packetization mode whether need 4 bytes (0 0 0 1) of start code prefix
+#if MT_DEBUG_BS_WR
+	BOOL_T				bSliceCodedFlag;	// only present in MT_DEBUG_BS_WR builds
+#endif//MT_DEBUG_BS_WR
+}SWelsSliceBs;
+
+/*!
+ * \brief	load and initialize a raw NAL in pEncoderOuput for NAL unit type kiType (EWelsNalUnitType) and nal_ref_idc kiNalRefIdc (EWelsNalRefIdc)
+ */
+void WelsLoadNal( SWelsEncoderOutput *pEncoderOuput, const int32_t/*EWelsNalUnitType*/ kiType, const int32_t/*EWelsNalRefIdc*/ kiNalRefIdc );
+
+/*!
+ * \brief	unload the raw NAL of pEncoderOuput (presumably the one set up by WelsLoadNal -- confirm against the implementation)
+ */
+void WelsUnloadNal( SWelsEncoderOutput *pEncoderOuput );
+
+/*!
+ * \brief	load and initialize a raw NAL in the per-slice output pSliceBs for NAL unit type kiType (EWelsNalUnitType) and nal_ref_idc kiNalRefIdc (EWelsNalRefIdc)
+ */
+void WelsLoadNalForSlice( SWelsSliceBs *pSliceBs, const int32_t/*EWelsNalUnitType*/ kiType, const int32_t/*EWelsNalRefIdc*/ kiNalRefIdc );
+
+/*!
+ * \brief	unload the raw NAL of pSliceBs (presumably the one set up by WelsLoadNalForSlice -- confirm against the implementation)
+ */
+void WelsUnloadNalForSlice( SWelsSliceBs *pSliceBs );
+
+/*!
+ * \brief	encode NAL with emulation forbidden three bytes checking
+ * \param	pRawNal			raw NAL data to encode
+ * \param	pDst			destination NAL data
+ * \param	pDstLen		length of destination NAL output
+ * \note	the signature has no annexeb parameter; the stale entry for it was dropped from this list
+ * \return	length of pDst NAL
+ */
+int32_t WelsEncodeNal( SWelsNalRaw *pRawNal, void *pDst, int32_t *pDstLen );
+
+/*!
+ * \brief	encode a nal into a buffer for any type of NAL, involving WelsEncodeNal introduced in AVC
+ *
+ * \param	pRawNal			raw NAL data to encode
+ * \param	pNalHeaderExt	pointer of SNalUnitHeaderExt
+ * \param	pDst			destination NAL data
+ * \param	pDstLen		length of destination NAL output
+ * \note	the signature has no annexeb parameter; the stale entry for it was dropped from this list
+ *
+ * \return	length of pDst NAL
+ */
+int32_t WelsEncodeNalExt( SWelsNalRaw *pRawNal, void *pNalHeaderExt, void *pDst, int32_t *pDstLen );
+
+/*!
+ * \brief	write prefix nal through pBitStringAux, given the nal_ref_idc value (keNalRefIdc) and the IDR flag (kbIdrFlag)
+ */
+int32_t WelsWriteSVCPrefixNal( SBitStringAux *pBitStringAux, const int32_t keNalRefIdc,const bool_t kbIdrFlag );
+}
+#endif//WELS_NAL_UNIT_ENCAPSULATION_H__
--- /dev/null
+++ b/codec/encoder/core/inc/nal_prefix.h
@@ -1,0 +1,64 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//nal_prefix.h	-	definitions for NAL Unit Header(/Ext) and PrefixNALUnit
+#ifndef WELS_NAL_UNIT_PREFIX_H__
+#define WELS_NAL_UNIT_PREFIX_H__
+
+#include "typedefs.h"
+#include "wels_common_basis.h"
+#include "slice.h"
+
+namespace WelsSVCEnc {
+///////////////////////////////////NAL Unit prefix/headers///////////////////////////////////
+
+/* NAL Unit Header in AVC, refer to Page 56 in JVT X201wcm */
+typedef struct TagNalUnitHeader{
+	uint8_t		uiForbiddenZeroBit;	// presumably the forbidden_zero_bit syntax element -- confirm
+	uint8_t		uiNalRefIdc;		// presumably the nal_ref_idc syntax element -- confirm
+	EWelsNalUnitType	eNalUnitType;	// NAL unit type
+	uint8_t		uiReservedOneByte;		
+}SNalUnitHeader, *PNalUnitHeader;
+
+/* NAL Unit Header in scalable extension syntax, refer to Page 390 in JVT X201wcm */
+typedef struct TagNalUnitHeaderExt{
+	SNalUnitHeader	sNalHeader;	// base (AVC) NAL unit header
+	
+	bool_t		bIdrFlag;
+	uint8_t		uiDependencyId;
+	uint8_t		uiTemporalId;
+	bool_t		bDiscardableFlag;
+	
+
+}SNalUnitHeaderExt, *PNalUnitHeaderExt;
+}
+#endif//WELS_NAL_UNIT_PREFIX_H__
--- /dev/null
+++ b/codec/encoder/core/inc/param_svc.h
@@ -1,0 +1,483 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	param_svc.h
+ *
+ * \brief	Configurable parameters in H.264/SVC Encoder
+ *
+ * \date	4/20/2009 Created
+ *
+ *************************************************************************************
+ */
+#if !defined(WELS_ENCODER_PARAMETER_SVC_H__)
+#define WELS_ENCODER_PARAMETER_SVC_H__
+
+#include <string.h>
+#include <math.h>
+#include "typedefs.h"
+#include "codec_def.h"
+#include "macros.h"
+#include "wels_const.h"
+#include "wels_common_basis.h"
+#include "rc.h"
+#include "svc_enc_slice_segment.h"
+#include "as264_common.h"
+
+namespace WelsSVCEnc {
+
+#define   INVALID_TEMPORAL_ID   ((uint8_t)0xff)
+
+extern const uint8_t   g_kuiTemporalIdListTable[MAX_TEMPORAL_LEVEL][MAX_GOP_SIZE + 1];
+
+/*!
+* \brief	compute log2(upper/base) when that ratio is, within tolerance, a power of two
+* \param	base	denominator of the ratio
+* \param	upper	numerator of the ratio
+* \return	the rounded base-2 logarithm, or UINT_MAX when it is not within 0.0001 of an integer
+*/
+static __inline uint32_t GetLogFactor( real32_t base, real32_t upper )
+{
+	const double kdTolerance	= 0.0001;
+	const double kdExponent		= log10(1.0 * upper / base) / log10(2.0);
+	const double kdNearest		= floor( kdExponent + 0.5 );	
+
+	if( !(kdExponent < kdNearest+kdTolerance && kdNearest < kdExponent+kdTolerance) )
+	{
+		return UINT_MAX;	// exponent is not close enough to an integer (also taken for NaN)
+	}
+	return (uint32_t)(kdNearest);
+}
+
+/*
+ *	Dependency Layer Parameter (one entry per dependency/spatial layer)
+ */
+typedef struct TagDLayerParam {
+	int32_t		iActualWidth;			// input source picture actual width
+	int32_t		iActualHeight;			// input source picture actual height
+	int32_t		iFrameWidth;			// frame width; SetActualPicResolution() replaces it by WELS_ALIGN(iActualWidth, MB_WIDTH_LUMA)
+	int32_t		iFrameHeight;			// frame height; SetActualPicResolution() replaces it by WELS_ALIGN(iActualHeight, MB_HEIGHT_LUMA)
+
+	int32_t		iSpatialBitrate;		// target bitrate for current spatial layer
+
+	/* temporal settings related, filled in by DetermineTemporalSettings() */
+	int32_t		iTemporalResolution;	// sum of the two GetLogFactor() results (input/output rate and max/input rate)
+	int32_t		iDecompositionStages;	// log2(GOP size) minus iTemporalResolution
+	uint8_t     uiCodingIdx2TemporalId[(1<<MAX_TEMPORAL_LEVEL)+1];	// temporal id per frame index in a GOP, INVALID_TEMPORAL_ID where none was assigned
+
+	uint8_t		uiProfileIdc;			// value of profile IDC (0 for auto-detection)	
+
+	int8_t		iHighestTemporalId;		// largest temporal id stored in uiCodingIdx2TemporalId
+	//	uint8_t		uiDependencyId;
+	int8_t      iDLayerQp;				// set to SVC_QUALITY_BASE_QP by ParamTranscode()
+
+	SMulSliceOption sMso;	// multiple slice options
+
+	float		fInputFrameRate;		// input frame rate
+	float		fOutputFrameRate;		// output frame rate
+
+#ifdef ENABLE_FRAME_DUMP
+	str_t		sRecFileName[MAX_FNAME_LEN];	// file to be constructed
+#endif//ENABLE_FRAME_DUMP	
+} SDLayerParam;
+
+/*
+ *	Cisco OpenH264 Encoder Parameter Configuration
+ */
+typedef struct TagWelsSvcCodingParam {	
+	SDLayerParam	sDependencyLayers[MAX_DEPENDENCY_LAYER];	// per-layer settings; the first iNumDependencyLayer entries are used
+
+	/* General */
+#ifdef ENABLE_TRACE_FILE
+    str_t			sTracePath[MAX_FNAME_LEN];		// log file for wels encoder
+#endif
+
+	uint32_t	uiGopSize;			// GOP size (at maximal frame rate: 16)
+	uint32_t	uiIntraPeriod;		// intra period (multiple of GOP size as desired)
+	int32_t		iNumRefFrame;		// number of reference frame used
+
+	int32_t     iActualPicWidth;    //   actual input picture width
+	int32_t     iActualPicHeight;   //   actual input picture height
+
+	struct {
+		int32_t iLeft;
+		int32_t iTop;
+		int32_t iWidth;
+		int32_t iHeight;
+	}SUsedPicRect;	// the rect in input picture that encoder actually used
+
+	str_t       *pCurPath; // record current lib path such as:/pData/pData/com.wels.enc/lib/ 
+
+	float		fMaxFrameRate;		// maximal frame rate [Hz / fps]
+	int32_t		iInputCsp;			// color space of input sequence	
+	uint32_t	uiFrameToBeCoded;	// frame to be encoded (at input frame rate)	
+
+	int32_t		iTargetBitrate;			// overall target bitrate introduced in RC module	
+	int16_t		iMultipleThreadIdc;		// 0: auto (dynamic, decided inside the encoder); 1: multiple threads disabled; > 1: number of threads
+	int16_t		iCountThreadsNum;			// derived from disable_multiple_slice_idc (=0 or >1)
+
+	int32_t		iLTRRefNum;			// LONG_TERM_REF_NUM when long term reference is enabled, otherwise 0 (see ParamTranscode())
+	uint32_t    uiLtrMarkPeriod;	// the min distance of two long term references
+
+	bool_t		bDeblockingParallelFlag;	// deblocking filter parallelization control flag
+	bool_t		bMgsT0OnlyStrategy; //MGS_T0_only_strategy
+    bool_t		bEnableSSEI;		
+	bool_t		bEnableFrameCroppingFlag;	// enable frame cropping flag: TRUE always in application
+	
+	bool_t		bEnableCropPic;			// enable cropping source picture. , 8/25/2010
+											// FALSE: Streaming Video Sharing; TRUE: Video Conferencing Meeting;
+	int8_t		iDecompStages;		// GOP size dependency		
+
+	/* Deblocking loop filter */
+	int8_t		iLoopFilterDisableIdc;	// 0: on, 1: off, 2: on except for slice boundaries
+	int8_t		iLoopFilterAlphaC0Offset;// AlphaOffset: valid range [-6, 6], default 0
+
+	int8_t		iLoopFilterBetaOffset;	// BetaOffset:	valid range [-6, 6], default 0	
+	int8_t		iInterLayerLoopFilterDisableIdc; // Employed based upon inter-layer, same comment as above
+	int8_t		iInterLayerLoopFilterAlphaC0Offset;	// InterLayerLoopFilterAlphaC0Offset
+	int8_t		iInterLayerLoopFilterBetaOffset;	// InterLayerLoopFilterBetaOffset
+
+	/* Rate Control */
+	bool_t		bEnableRc;
+	int8_t		iRCMode;	
+	int8_t		iPaddingFlag;
+	/* denoise control */
+	bool_t      bEnableDenoise;				
+
+	/* scene change detection control */
+	bool_t      bEnableSceneChangeDetect;	 
+	/* background detection control */
+	bool_t		bEnableBackgroundDetection; 
+	/* adaptive quantization control */
+	bool_t		bEnableAdaptiveQuant;	         
+	/* long term reference control */
+	bool_t      bEnableLongTermReference;
+
+	/* pSps pPps id addition control */
+	bool_t      bEnableSpsPpsIdAddition;
+	/* Layer definition */
+	bool_t		bPrefixNalAddingCtrl;
+	int8_t		iNumDependencyLayer;	// number of dependency(Spatial/CGS) layers used to be encoded
+	int8_t		iNumTemporalLayer;		// number of temporal layer specified
+    
+
+    
+public:
+	TagWelsSvcCodingParam(const bool_t kbEnableRc = true)	// constructing the object applies FillDefault()
+	{
+		FillDefault( kbEnableRc );
+	}
+	~TagWelsSvcCodingParam()	{}
+
+	void FillDefault( const bool_t kbEnableRc )
+	{	// every scalar option receives its built-in default; sDependencyLayers[] is not touched here
+		/* input picture: size and used rectangle are unknown until configured */
+		iActualPicWidth			= 0;
+		iActualPicHeight		= 0;
+		SUsedPicRect.iHeight	= 0;
+		SUsedPicRect.iWidth		= 0;
+		SUsedPicRect.iTop		= 0;
+		SUsedPicRect.iLeft		= 0;
+		iInputCsp				= videoFormatI420;	// default input color space
+		pCurPath				= NULL;				// no library path recorded yet
+		/* sequence structure */
+		uiGopSize				= 1;
+		uiIntraPeriod			= 0;
+		iNumRefFrame			= MIN_REF_PIC_COUNT;
+		iDecompStages			= 0;				// depends on GOP size, revised later
+		fMaxFrameRate			= MAX_FRAME_RATE;	// [Hz / fps]
+		uiFrameToBeCoded		= (uint32_t)-1;		// counted at input frame rate
+		iNumDependencyLayer		= 0;				// no spatial/CGS layer configured yet
+		iNumTemporalLayer		= 0;				// no temporal layer configured yet
+
+		/* threading: 0 = auto, 1 = multiple threads disabled, > 1 = number of threads */
+#ifdef MT_ENABLED
+		iMultipleThreadIdc		= 0;
+#else
+		iMultipleThreadIdc		= 1;
+#endif//MT_ENABLED
+		iCountThreadsNum		= 1;
+		bDeblockingParallelFlag	= false;
+
+		/* long term reference */
+		bEnableLongTermReference	= false;
+		iLTRRefNum					= 0;
+		uiLtrMarkPeriod				= 30;	// min distance of two long term references
+
+		/* deblocking loop filter (idc 0: on, 1: off, 2: on except for slice boundaries; offsets in [-6, 6]) */
+		iLoopFilterDisableIdc				= 1;
+		iLoopFilterAlphaC0Offset			= 0;
+		iLoopFilterBetaOffset				= 0;
+		iInterLayerLoopFilterDisableIdc		= 1;
+		iInterLayerLoopFilterAlphaC0Offset	= 0;
+		iInterLayerLoopFilterBetaOffset		= 0;
+
+		/* rate control */
+		bEnableRc		= kbEnableRc;
+		iRCMode			= 0;
+		iPaddingFlag	= 0;
+		iTargetBitrate	= 0;	// overall target bitrate used by the RC module
+
+		/* feature switches */
+		bMgsT0OnlyStrategy			= true;		// MGS only at T0 frames
+		bEnableSSEI					= true;
+		bEnableFrameCroppingFlag	= true;		// frame cropping flag
+		bEnableCropPic				= true;		// FALSE: Streaming Video Sharing; TRUE: Video Conferencing Meeting
+		bEnableDenoise				= false;	// denoise
+		bEnableSceneChangeDetect	= true;		// scene change detection
+		bEnableBackgroundDetection	= true;		// background detection
+		bEnableAdaptiveQuant		= true;		// adaptive quantization
+		bEnableSpsPpsIdAddition		= true;		// SPS/PPS id addition
+		bPrefixNalAddingCtrl		= true;		// prefix NAL adding
+	}
+
+	// Import the caller's SVCEncodingParam into this structure. Clamped values (fFrameRate, iTemporalLayerNum, per-layer fFrameRate) are also written back into pCodingParam. Always returns 0.
+	int32_t ParamTranscode( SVCEncodingParam& pCodingParam, const bool_t kbEnableRc = true ) {
+		pCodingParam.fFrameRate		= WELS_CLIP3(pCodingParam.fFrameRate, MIN_FRAME_RATE, MAX_FRAME_RATE);
+		iInputCsp		= pCodingParam.iInputCsp;		// color space of input sequence	
+		uiFrameToBeCoded	= (uint32_t)-1;		// frame to be encoded (at input frame rate), -1 dependents on length of input sequence
+
+		iActualPicWidth   = pCodingParam.iPicWidth;
+		iActualPicHeight  = pCodingParam.iPicHeight; 
+
+		SUsedPicRect.iLeft = 0;
+		SUsedPicRect.iTop  = 0;
+		SUsedPicRect.iWidth = ((iActualPicWidth >> 1) << 1);	// drop the lowest bit: used width is even
+		SUsedPicRect.iHeight = ((iActualPicHeight >> 1) << 1);	// drop the lowest bit: used height is even
+
+		/* Deblocking loop filter */
+#ifdef MT_ENABLED
+		iLoopFilterDisableIdc	= 2;//pCodingParam.iLoopFilterDisableIdc;	// 0: on, 1: off, 2: on except for slice boundaries, 
+#else
+		iLoopFilterDisableIdc	= 0;	// 0: on, 1: off, 2: on except for slice boundaries
+#endif
+		iLoopFilterAlphaC0Offset= 0;	// AlphaOffset: valid range [-6, 6], default 0
+		iLoopFilterBetaOffset	= 0;	// BetaOffset:	valid range [-6, 6], default 0
+		iInterLayerLoopFilterDisableIdc	= iLoopFilterDisableIdc;	// Employed based upon inter-layer, same comment as above
+		iInterLayerLoopFilterAlphaC0Offset= 0;
+		iInterLayerLoopFilterBetaOffset	= 0;
+
+		bEnableFrameCroppingFlag	= true;
+
+		/* Rate Control */
+		bEnableRc			= kbEnableRc;
+		if (pCodingParam.iRCMode != RC_MODE0 && pCodingParam.iRCMode != RC_MODE1)
+			iRCMode = RC_MODE1;	// any other requested mode falls back to RC_MODE1
+		else
+			iRCMode = pCodingParam.iRCMode;    // rc mode
+		iPaddingFlag= pCodingParam.iPaddingFlag;
+
+		iTargetBitrate		= pCodingParam.iTargetBitrate;	// target bitrate
+
+		/* Denoise Control */
+		bEnableDenoise = pCodingParam.bEnableDenoise ? true : false;    // Denoise Control  // only support 0 or 1 now  	
+
+		/* Scene change detection control */
+		bEnableSceneChangeDetect	= true;	   
+
+		/* Background detection Control */
+		bEnableBackgroundDetection = pCodingParam.bEnableBackgroundDetection ? true : false; 
+
+		/* Adaptive quantization control */
+		bEnableAdaptiveQuant	= pCodingParam.bEnableAdaptiveQuant ? true : false;	   
+
+		/* Enable cropping source picture */
+		bEnableCropPic	= pCodingParam.bEnableCropPic ? true : false;
+
+		/* Enable long term reference */
+		bEnableLongTermReference	= pCodingParam.bEnableLongTermReference ? true : false;
+		uiLtrMarkPeriod = pCodingParam.iLtrMarkPeriod;
+
+		/* For ssei information */
+		bEnableSSEI		= true;
+
+		/* Layer definition */
+		iNumDependencyLayer	= (int8_t)WELS_CLIP3(pCodingParam.iSpatialLayerNum, 1, MAX_DEPENDENCY_LAYER); // number of dependency(Spatial/CGS) layers used to be encoded
+		pCodingParam.iTemporalLayerNum = (int8_t)WELS_CLIP3(pCodingParam.iTemporalLayerNum, 1, MAX_TEMPORAL_LEVEL);	// safe valid iTemporalLayerNum		
+		iNumTemporalLayer		= (int8_t)pCodingParam.iTemporalLayerNum;//(int8_t)WELS_CLIP3(pCodingParam.iTemporalLayerNum, 1, MAX_TEMPORAL_LEVEL);// number of temporal layer specified		
+
+		uiGopSize			= 1 << (iNumTemporalLayer-1);	// Override GOP size based temporal layer (always a power of two)
+		iDecompStages		= iNumTemporalLayer-1;	// WELS_LOG2( uiGopSize );// GOP size dependency
+		uiIntraPeriod		= pCodingParam.iIntraPeriod;// intra period (multiple of GOP size as desired)
+		if ( uiIntraPeriod == (uint32_t)(-1) )
+			uiIntraPeriod = 0;	// -1 is stored as intra period 0
+		else if ( uiIntraPeriod & (uiGopSize-1) )	// not a multiple of GOP size: any bit below the power-of-two GOP size is set
+			uiIntraPeriod = ((uiIntraPeriod+uiGopSize-1) / uiGopSize) * uiGopSize;	// round up to the next multiple
+
+		iLTRRefNum = bEnableLongTermReference ? LONG_TERM_REF_NUM : 0;
+		iNumRefFrame		= ((uiGopSize>>1)>1)?((uiGopSize>>1)+iLTRRefNum):(MIN_REF_PIC_COUNT+iLTRRefNum);
+		iNumRefFrame		= WELS_CLIP3( iNumRefFrame, MIN_REF_PIC_COUNT, MAX_REFERENCE_PICTURE_COUNT_NUM );	
+
+		// (uiLtrMarkPeriod is assigned once, together with the long term reference flag)
+
+		bPrefixNalAddingCtrl	= pCodingParam.bPrefixNalAddingCtrl;	
+		
+		bEnableSpsPpsIdAddition = pCodingParam.bEnableSpsPpsIdAddition;//For SVC meeting application, to avoid mosaic issue caused by cross-IDR reference. 
+		                                                               //SHOULD enable this feature.  
+
+		SDLayerParam *pDlp		= &sDependencyLayers[0];
+		float fMaxFr			= .0f;	// highest per-layer input frame rate seen so far
+		uint8_t uiProfileIdc		= PRO_BASELINE;	// first layer is baseline, later layers scalable baseline
+		int8_t iIdxSpatial	= 0;
+		while(iIdxSpatial < iNumDependencyLayer)
+		{
+			pDlp->uiProfileIdc		= uiProfileIdc;	
+
+			pCodingParam.sSpatialLayers[iIdxSpatial].fFrameRate	= WELS_CLIP3(pCodingParam.sSpatialLayers[iIdxSpatial].fFrameRate, MIN_FRAME_RATE, pCodingParam.fFrameRate);
+			pDlp->fInputFrameRate	= 
+			pDlp->fOutputFrameRate	= WELS_CLIP3(pCodingParam.sSpatialLayers[iIdxSpatial].fFrameRate, MIN_FRAME_RATE, MAX_FRAME_RATE);
+			if (pDlp->fInputFrameRate > fMaxFr+EPSN)
+				fMaxFr = pDlp->fInputFrameRate;
+
+#ifdef ENABLE_FRAME_DUMP
+			pDlp->sRecFileName[0]	= '\0';	// file to be constructed
+#endif//ENABLE_FRAME_DUMP
+			pDlp->iFrameWidth		= pCodingParam.sSpatialLayers[iIdxSpatial].iVideoWidth;	// frame width
+			pDlp->iFrameHeight		= pCodingParam.sSpatialLayers[iIdxSpatial].iVideoHeight;// frame height
+			pDlp->iSpatialBitrate	= pCodingParam.sSpatialLayers[iIdxSpatial].iSpatialBitrate;	// target bitrate for current spatial layer
+
+
+			//multi slice
+			pDlp->sMso.uiSliceMode = (SliceMode)pCodingParam.sSpatialLayers[iIdxSpatial].sSliceCfg.uiSliceMode;
+			pDlp->sMso.sSliceArgument.uiSliceSizeConstraint 
+				= (uint32_t)(pCodingParam.sSpatialLayers[iIdxSpatial].sSliceCfg.sSliceArgument.uiSliceSizeConstraint);
+			pDlp->sMso.sSliceArgument.iSliceNum 
+				= pCodingParam.sSpatialLayers[iIdxSpatial].sSliceCfg.sSliceArgument.uiSliceNum;
+			const int32_t kiLesserSliceNum = ((MAX_SLICES_NUM < MAX_SLICES_NUM_TMP) ? MAX_SLICES_NUM : MAX_SLICES_NUM_TMP);  // copy no more entries than the smaller of the two limits
+			memcpy(pDlp->sMso.sSliceArgument.uiSliceMbNum, pCodingParam.sSpatialLayers[iIdxSpatial].sSliceCfg.sSliceArgument.uiSliceMbNum,	// confirmed_safe_unsafe_usage
+				kiLesserSliceNum * sizeof(uint32_t) ) ;
+
+			pDlp->iDLayerQp = SVC_QUALITY_BASE_QP;
+
+			uiProfileIdc	= PRO_SCALABLE_BASELINE;
+			++ pDlp;
+			++ iIdxSpatial;
+		}
+
+		fMaxFrameRate	= fMaxFr;
+
+		SetActualPicResolution();	// records actual sizes and aligns the frame sizes of all configured layers
+
+		return 0;
+	}
+
+	// Store each layer's configured frame size as its actual size, then replace the frame size by
+	// WELS_ALIGN(actual, MB_WIDTH_LUMA / MB_HEIGHT_LUMA); assumes all spatial layers share one width/height ratio.
+	void SetActualPicResolution()
+	{
+		// Layers are visited from the highest configured index down to 0.
+		// The element pointer is formed only inside the loop, so no out-of-range
+		// address is computed when iNumDependencyLayer is 0 (the FillDefault() value).
+		for (int32_t iSpatialIdx = iNumDependencyLayer-1; iSpatialIdx >= 0; iSpatialIdx -- )
+		{
+			SDLayerParam *pDlayer	= &sDependencyLayers[iSpatialIdx];
+
+			pDlayer->iActualWidth = pDlayer->iFrameWidth;
+			pDlayer->iActualHeight = pDlayer->iFrameHeight;
+			pDlayer->iFrameWidth = WELS_ALIGN(pDlayer->iActualWidth, MB_WIDTH_LUMA);
+			pDlayer->iFrameHeight = WELS_ALIGN(pDlayer->iActualHeight, MB_HEIGHT_LUMA);
+		}
+	}
+
+	/*!
+	* \brief	determine key coding tables for temporal scalability, uiProfileIdc etc. for each spatial layer
+	* \note	takes no arguments; works on this SWelsSvcCodingParam using the known GOP size and the max, input and output frame rate of each spatial layer
+	* \return	NONE (caller should ensure valid parameters before this procedure)
+	*/
+	void DetermineTemporalSettings()
+	{		
+		const int32_t iDecStages		= WELS_LOG2( uiGopSize );	// log2(uiGopSize); also the row index into g_kuiTemporalIdListTable
+		const uint8_t *pTemporalIdList	= &g_kuiTemporalIdListTable[iDecStages][0];
+		SDLayerParam *pDlp				= &sDependencyLayers[0];
+		uint8_t uiProfileIdc				= PRO_BASELINE;	// first layer is baseline, later layers scalable baseline
+		int8_t i						= 0;
+
+		while (i < iNumDependencyLayer )
+		{
+			// NOTE(review): GetLogFactor() returns UINT_MAX when a rate ratio is not close to a power of two;
+			// the sum, shift and subtractions below are not guarded against that value.
+			const uint32_t kuiLogFactorInOutRate	= GetLogFactor(pDlp->fOutputFrameRate, pDlp->fInputFrameRate);
+			const uint32_t kuiLogFactorMaxInRate	= GetLogFactor(pDlp->fInputFrameRate, fMaxFrameRate);
+			int32_t iNotCodedMask= 0;
+			int8_t iMaxTemporalId = 0;
+
+			memset(pDlp->uiCodingIdx2TemporalId, INVALID_TEMPORAL_ID, sizeof(pDlp->uiCodingIdx2TemporalId));	// start with every entry invalid
+			pDlp->uiProfileIdc = uiProfileIdc;	// PRO_BASELINE, PRO_SCALABLE_BASELINE;			
+
+			iNotCodedMask	= (1 << (kuiLogFactorInOutRate + kuiLogFactorMaxInRate)) - 1;	// low bits that must be zero for a frame index to get a temporal id
+			for (uint32_t uiFrameIdx = 0; uiFrameIdx <= uiGopSize; ++ uiFrameIdx){						
+				if( 0 == (uiFrameIdx & iNotCodedMask) ) {				
+					const int8_t kiTemporalId = pTemporalIdList[uiFrameIdx];						
+					pDlp->uiCodingIdx2TemporalId[uiFrameIdx] = kiTemporalId;
+					if ( kiTemporalId > iMaxTemporalId )
+					{
+						iMaxTemporalId = kiTemporalId;
+					}
+				}
+			}
+
+			pDlp->iHighestTemporalId	= iMaxTemporalId;
+			pDlp->iTemporalResolution	= kuiLogFactorMaxInRate + kuiLogFactorInOutRate;
+			pDlp->iDecompositionStages	= iDecStages - kuiLogFactorMaxInRate - kuiLogFactorInOutRate;
+
+			uiProfileIdc	= PRO_SCALABLE_BASELINE;		
+			++ pDlp;
+			++ i;
+		}
+		iDecompStages = (int8_t)iDecStages;
+	}
+
+} SWelsSvcCodingParam;
+
+// Release *pParam through pMa and reset it to NULL; returns 0 when released, 1 when pParam, *pParam or pMa is NULL.
+static inline int32_t FreeCodingParam( SWelsSvcCodingParam **pParam, CMemoryAlign *pMa ) {
+	const int32_t kiNothingToFree = ( NULL == pParam || NULL == *pParam || NULL == pMa );
+	if ( kiNothingToFree ) return 1;
+	pMa->WelsFree(*pParam, "SWelsSvcCodingParam");
+	*pParam = NULL;
+	return 0;
+}
+
+// Replace *pParam by a newly allocated SWelsSvcCodingParam block (a previous one is released first); no constructor call is made here.
+// Returns 0 on success, 1 on a NULL pParam/pMa, a layer count outside [1, MAX_SPATIAL_LAYER_NUM] or a failed allocation.
+static inline int32_t AllocCodingParam( SWelsSvcCodingParam **pParam, CMemoryAlign *pMa, const int32_t kiRequestNumSpatial )
+{
+	if ( NULL == pParam || NULL == pMa )
+		return 1;
+	if ( kiRequestNumSpatial < 1 || kiRequestNumSpatial > MAX_SPATIAL_LAYER_NUM )
+		return 1;
+	if ( NULL != *pParam )
+		FreeCodingParam( pParam, pMa );	// leaves *pParam == NULL
+	void *pFresh = pMa->WelsMalloc(sizeof(SWelsSvcCodingParam), "SWelsSvcCodingParam");
+	if ( NULL != pFresh ) *pParam = (SWelsSvcCodingParam *)pFresh;
+	return ( NULL != pFresh ) ? 0 : 1;
+}
+
+}//end of namespace WelsSVCEnc
+
+#endif//WELS_ENCODER_PARAMETER_SVC_H__
--- /dev/null
+++ b/codec/encoder/core/inc/parameter_sets.h
@@ -1,0 +1,165 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef WELS_PARAMETER_SETS_H__
+#define WELS_PARAMETER_SETS_H__
+
+#include "typedefs.h"
+#include "wels_const.h"
+#include "wels_common_basis.h"
+
+namespace WelsSVCEnc {
+//#pragma pack(1)
+
+/* Sequence Parameter Set, refer to Page 57 in JVT X201wcm (only the fields this encoder keeps are active; the rest stay commented out) */
+typedef struct TagWelsSPS{
+	uint32_t	uiSpsId;
+	int16_t		iMbWidth;		// presumably picture width in macroblocks -- confirm
+	int16_t		iMbHeight;		// presumably picture height in macroblocks -- confirm
+	uint32_t	uiLog2MaxFrameNum;
+//	uint32_t	uiPocType;
+	/* POC type 0 */
+	int32_t		iLog2MaxPocLsb;
+	/* POC type 1 */
+//	int32_t		iOffsetForNonRefPic;
+
+//	int32_t		iOffsetForTopToBottomField;
+//	int32_t		iNumRefFramesInPocCycle;
+//	int8_t		iOffsetForRefFrame[256];	
+	SCropOffset	sFrameCrop;
+	int16_t		iNumRefFrames;	
+//	uint32_t	uiNumUnitsInTick;
+//	uint32_t	uiTimeScale;
+	
+	uint8_t		uiProfileIdc;
+	uint8_t		iLevelIdc;
+//	uint8_t		uiChromaFormatIdc;
+//	uint8_t		uiChromaArrayType;		//support =1
+	
+//	uint8_t		uiBitDepthLuma;         //=8, only used in decoder, encoder in general_***; it can be removed when removed general up_sample
+//	uint8_t		uiBitDepthChroma;		//=8
+	/* TO BE CONTINUED: POC type 1 */
+//	bool_t		bDeltaPicOrderAlwaysZeroFlag;	
+//	bool_t		bGapsInFrameNumValueAllowedFlag;	//=true
+
+//	bool_t		bFrameMbsOnlyFlag;
+//	bool_t		bMbaffFlag;	// MB Adaptive Frame Field
+//	bool_t		bDirect8x8InferenceFlag;
+	bool_t		bFrameCroppingFlag;
+
+//	bool_t		bVuiParamPresentFlag;
+//	bool_t		bTimingInfoPresentFlag;
+//	bool_t		bFixedFrameRateFlag;
+
+	bool_t		bConstraintSet0Flag;
+	bool_t		bConstraintSet1Flag;
+	bool_t		bConstraintSet2Flag;
+
+//	bool_t		bConstraintSet3Flag;		// reintroduce constraint_set3_flag instead of reserved filling bytes here
+//	bool_t		bSeparateColorPlaneFlag;  // =false,: only used in decoder, encoder in general_***; it can be removed when removed general up_sample
+	
+}SWelsSPS, *PWelsSPS;
+
+
+/* Sequence Parameter Set SVC extension syntax, refer to Page 391 in JVT X201wcm (unused fields stay commented out) */
+typedef struct TagSpsSvcExt{
+//	SCropOffset	sSeqScaledRefLayer;
+	
+	uint8_t		iExtendedSpatialScalability;	// ESS
+//	uint8_t		uiChromaPhaseXPlus1Flag;
+//	uint8_t		uiChromaPhaseYPlus1;
+//	uint8_t		uiSeqRefLayerChromaPhaseXPlus1Flag;
+//	uint8_t		uiSeqRefLayerChromaPhaseYPlus1;
+//	bool_t		bInterLayerDeblockingFilterCtrlPresentFlag;
+	bool_t		bSeqTcoeffLevelPredFlag;
+	bool_t		bAdaptiveTcoeffLevelPredFlag;
+	bool_t		bSliceHeaderRestrictionFlag;	
+}SSpsSvcExt, *PSpsSvcExt;
+
+/* Subset sequence parameter set syntax, refer to Page 391 in JVT X201wcm */
+typedef struct TagSubsetSps{	
+	SWelsSPS		pSps;		// embedded SPS; a struct value despite the 'p' prefix
+	SSpsSvcExt	sSpsSvcExt;	// SVC extension part
+
+//	bool_t		bSvcVuiParamPresentFlag;	
+//	bool_t		bAdditionalExtension2Flag;
+//	bool_t		bAdditionalExtension2DataFlag;
+}SSubsetSps, *PSubsetSps;
+
+/* Picture parameter set syntax, refer to Page 59 in JVT X201wcm */
+typedef struct TagWelsPPS{
+	uint32_t	iSpsId;
+	uint32_t	iPpsId;
+		
+#if !defined(DISABLE_FMO_FEATURE)
+	uint32_t	uiNumSliceGroups;
+	uint32_t	uiSliceGroupMapType;
+	/* uiSliceGroupMapType = 0 */
+	uint32_t	uiRunLength[MAX_SLICEGROUP_IDS];
+	/* uiSliceGroupMapType = 2 */
+	uint32_t	uiTopLeft[MAX_SLICEGROUP_IDS];
+	uint32_t	uiBottomRight[MAX_SLICEGROUP_IDS];
+	/* uiSliceGroupMapType = 3, 4 or 5 */
+	/* (the line above was duplicated in the original; it presumably covers both fields below -- confirm) */
+	bool_t		bSliceGroupChangeDirectionFlag;
+	uint32_t	uiSliceGroupChangeRate;
+	/* uiSliceGroupMapType = 6 */
+	uint32_t	uiPicSizeInMapUnits;
+	uint32_t	uiSliceGroupId[MAX_SLICEGROUP_IDS];
+#endif//!DISABLE_FMO_FEATURE
+	
+//	uint32_t	uiNumRefIdxL0Active;
+//	uint32_t	uiNumRefIdxL1Active;
+	
+	int8_t		iPicInitQp;
+	int8_t		iPicInitQs;
+	uint8_t		uiChromaQpIndexOffset;	
+	
+	/* potential application for High profile */
+//	int32_t		iSecondChromaQpIndexOffset;
+//	/* potential application for High profile */
+
+//	bool_t		bPicOrderPresentFlag;
+	
+	bool_t		bDeblockingFilterControlPresentFlag;
+	
+//	bool_t		bConstainedIntraPredFlag;
+//	bool_t		bRedundantPicCntPresentFlag;
+//	bool_t		bWeightedPredFlag;
+//	uint8_t		uiWeightedBiPredIdc;
+	
+} SWelsPPS, *PWelsPPPS;	// NOTE(review): the pointer typedef is spelled PWelsPPPS (three P's); left as is since other code may use it
+
+//#pragma pack()
+}
+
+#endif //WELS_PARAMETER_SETS_H__
--- /dev/null
+++ b/codec/encoder/core/inc/picture.h
@@ -1,0 +1,98 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//picture.h	-	reconstruction picture/ reference picture/ residual picture are declared here
+#ifndef WELS_PICTURE_H__
+#define WELS_PICTURE_H__
+
+#include "typedefs.h"
+#include "as264_common.h"
+#include "wels_common_basis.h"
+//#pragma pack(1)
+
+namespace WelsSVCEnc {
+
+/*
+ *	Reconstructed picture definition
+ *	Used to represent a reference picture and also the resulting reconstructed picture for output
+ */
+typedef struct TagPicture{
+	/************************************payload pData*********************************/
+	uint8_t		*pBuffer;		// pointer to the first allocated byte (base address of the picture allocation)
+	uint8_t		*pData[3];		// pointers to the three picture planes respectively
+	int32_t		iLineSize[3];	// line size of each of the three picture planes respectively
+
+	// picture information
+	/*******************************from other standard syntax****************************/
+	/* from pSps */
+	int32_t		iWidthInPixel;	// picture width in pixels
+	int32_t		iHeightInPixel;// picture height in pixels
+	int32_t		iPictureType;	// taken from sSliceHeader(): eSliceType
+	int32_t		iFramePoc;		// frame POC
+
+	real32_t	fFrameRate;   // MOVE - NOTE(review): presumably a reminder to relocate this field; confirm
+	int32_t		iFrameNum;		// frame number; used for reference picture management
+
+	uint32_t	*uiRefMbType;	// sized for iMbWidth*iMbHeight; a pointer despite the 'ui' prefix
+	uint8_t		*pRefMbQp;		// sized for iMbWidth*iMbHeight
+
+	int32_t     *pMbSkipSad;   // sized for iMbWidth*iMbHeight
+
+	SMVUnitXY	*sMvList;		// a pointer despite the 's' prefix
+
+	/*******************************self-defined fields for misc use****************************/
+	int32_t		iMarkFrameNum;
+	int32_t		iLongTermPicNum;
+
+	bool_t		bUsedAsRef;						// used for reference picture management
+	bool_t		bIsLongRef;	// long term reference frame flag; used for reference picture management
+	uint8_t		uiRecieveConfirmed;	// (sic) identifier spelling kept as declared
+	uint8_t		uiTemporalId;
+	uint8_t		uiSpatialId;	
+}SPicture;	
+
+/*
+ *	Residual Picture
+ */
+//typedef struct Rs_Picture_s{
+//	int16_t		*pBuffer[4];		// base pBuffer
+//	int16_t		*pData[4];		// pData pBuffer
+//	int32_t		real_linesize[4];// actual iLineSize of picture planes respectively
+//	int32_t		used_linesize[4];// iLineSize of picture planes respectively used currently
+//	int32_t		planes;			// planes of YUV
+//}Rs_Picture_t;
+
+//#pragma pack()
+
+}	// end of namespace WelsSVCEnc {
+
+#endif//WELS_PICTURE_H__
--- /dev/null
+++ b/codec/encoder/core/inc/picture_handle.h
@@ -1,0 +1,72 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	picture_handle.h
+ *
+ * \brief	picture pData handling
+ *
+ * \date	5/20/2009 Created
+ *
+ *************************************************************************************/
+#if !defined(WELS_ENCODER_PICTURE_HANDLE_H__)
+#define WELS_ENCODER_PICTURE_HANDLE_H__
+
+#include "picture.h"
+#include "typedefs.h"
+#include "memory_align.h"
+
+namespace WelsSVCEnc {
+/*!
+ * \brief	alloc picture pData with borders for each plane based on width and height of picture
+ * \param	pMa				CMemoryAlign instance - presumably the allocator used for the picture buffers; TODO confirm
+ * \param	kiWidth			width of picture in pixels
+ * \param	kiHeight		height of picture in pixels
+ * \param	bNeedMbInfo		NOTE(review): presumably requests allocation of the per-MB info arrays as well - confirm in the implementation
+ * \return	successful if effective picture pointer returned, otherwise failed with NULL
+ */
+SPicture *AllocPicture( CMemoryAlign *pMa, const int32_t kiWidth, const int32_t kiHeight, bool_t bNeedMbInfo );
+
+/*!
+ * \brief	free picture pData planes; pMa is the CMemoryAlign instance supplied by the caller
+ * \param	ppPic	address of the picture pointer to be destroyed
+ * \return	none
+ */
+void FreePicture( CMemoryAlign *pMa, SPicture **ppPic );
+
+/*!
+* \brief	exchange two picture pData planes
+* \param	ppPic1		address of the pointer to picture 1
+* \param	ppPic2		address of the pointer to picture 2
+* \return	none
+*/
+void WelsExchangeSpatialPictures( SPicture **ppPic1, SPicture **ppPic2 );
+}
+#endif//WELS_ENCODER_PICTURE_HANDLE_H__
--- /dev/null
+++ b/codec/encoder/core/inc/property.h
@@ -1,0 +1,80 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	property.h
+ *
+ * \brief	CODE name, library module and corresponding version are included
+ *
+ * \date	03/10/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_DECODER_PROPERTY_H__
+#define WELS_DECODER_PROPERTY_H__
+
+#include "typedefs.h"
+
+namespace WelsSVCEnc {
+
+
+/*!
+ * \brief	get code name
+ * \param	pBuf	caller-provided buffer that receives the code name
+ * \param	iSize	overall size of pBuf
+ * \return	actual size of pBuf used; 0 returned on failure
+ */
+int32_t GetCodeName(str_t *pBuf, int32_t iSize);
+
+/*!
+ * \brief	get library/module name
+ * \param	pBuf	caller-provided buffer that receives the module name
+ * \param	iSize	overall size of pBuf
+ * \return	actual size of pBuf used; 0 returned on failure
+ */
+int32_t GetLibName(str_t *pBuf, int32_t iSize);
+
+/*!
+ * \brief	get version number
+ * \param	pBuf	caller-provided buffer that receives the version number
+ * \param	iSize	overall size of pBuf
+ * \return	actual size of pBuf used; 0 returned on failure
+ */
+int32_t GetVerNum(str_t *pBuf, int32_t iSize);
+
+/*!
+ * \brief	get identification information
+ * \param	pBuf	caller-provided buffer that receives the identification information
+ * \param	iSize	overall size of pBuf
+ * \return	actual size of pBuf used; 0 returned on failure
+ */
+int32_t GetIdentInfo(str_t *pBuf, int32_t iSize);
+}
+#endif//WELS_DECODER_PROPERTY_H__
--- /dev/null
+++ b/codec/encoder/core/inc/rc.h
@@ -1,0 +1,236 @@
+/*!
+ * \copy
+ *     Copyright (c)  2004-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ *  rc.h
+ *
+ *  Abstract
+ *      Include file for ratectl.c
+ *
+ *  History
+ *      9/8/2004 Created
+ *    12/26/2011 Modified
+ *
+ *
+ *************************************************************************/
+#ifndef _RC_H
+#define _RC_H
+
+
+#include "codec_app_def.h"
+#include "svc_enc_macroblock.h"
+#include "slice.h"
+
+namespace WelsSVCEnc {
+// trace switch (defined as 1 here)
+#define GOM_TRACE_FLAG 1
+// frame-skip switch (defined as 1 here)
+#define SKIP_FRAME_FLAG      1
+
+#define    WELS_RC_DISABLE        0	// NOTE(review): presumably an iModule value for WelsRcInitModule() - confirm
+#define    WELS_RC_GOM            1	// NOTE(review): presumably an iModule value for WelsRcInitModule() - confirm
+
+typedef enum
+{
+	RC_MODE0,	// quality mode (enumerator value 0)
+	RC_MODE1,   // bitrate mode (enumerator value 1)
+}RC_MODES;
+
+enum {
+	// virtual GOP size (also the length of SWelsSvcRc::iTlOfFrames)
+	VGOP_SIZE             = 8,
+
+	// QP information
+	GOM_MIN_QP_MODE       = 12,
+	GOM_MAX_QP_MODE       = 36,
+    MIN_IDR_QP            = 26,
+    MAX_IDR_QP            = 32,
+    DELTA_QP              = 2,
+    DELTA_QP_BGD_THD      = 3,
+
+	// frame skip constants (NOTE(review): the 90P/180P/360P/720P suffixes presumably name resolution classes - confirm)
+    SKIP_QP_90P           = 24,
+    SKIP_QP_180P          = 24,
+    SKIP_QP_360P          = 31,
+    SKIP_QP_720P          = 31,
+    LAST_FRAME_QP_RANGE_UPPER_MODE0  = 3,
+	LAST_FRAME_QP_RANGE_LOWER_MODE0  = 2,
+    LAST_FRAME_QP_RANGE_UPPER_MODE1  = 5,
+	LAST_FRAME_QP_RANGE_LOWER_MODE1  = 3,
+
+	MB_WIDTH_THRESHOLD_90P   = 15,
+	MB_WIDTH_THRESHOLD_180P  = 30,
+	MB_WIDTH_THRESHOLD_360P  = 60,
+
+	// Mode 0 parameters (RC_MODE0 is commented as the quality mode)
+	GOM_ROW_MODE0_90P     = 2,
+	GOM_ROW_MODE0_180P    = 2,
+	GOM_ROW_MODE0_360P    = 4,
+	GOM_ROW_MODE0_720P    = 4,
+    QP_RANGE_MODE0        = 3,
+
+	// Mode 1 parameters (RC_MODE1 is commented as the bitrate mode)
+	GOM_ROW_MODE1_90P     = 1,
+	GOM_ROW_MODE1_180P    = 1,
+	GOM_ROW_MODE1_360P    = 2,
+	GOM_ROW_MODE1_720P    = 2,
+    QP_RANGE_UPPER_MODE1  = 9,
+	QP_RANGE_LOWER_MODE1  = 4,
+    QP_RANGE_INTRA_MODE1  = 3,
+};
+
+// bits allocation (the ratio/range macros below expand to double literals)
+#define MAX_BITS_VARY_PERCENTAGE 100 // bits vary range in percentage
+#define VGOP_BITS_PERCENTAGE_DIFF 5
+#define IDR_BITRATE_RATIO  4.0
+#define FRAME_iTargetBits_VARY_RANGE 0.5
+// R-Q model
+#define LINEAR_MODEL_DECAY_FACTOR 0.8
+#define FRAME_CMPLX_RATIO_RANGE 0.1
+#define SMOOTH_FACTOR_MIN_VALUE 0.02
+//#define VGOP_BITS_MIN_RATIO 0.8
+// frame skip and padding
+#define SKIP_RATIO  0.5
+#define PADDING_BUFFER_RATIO 0.5
+#define PADDING_THRESHOLD    0.05
+
+typedef struct TagRCSlicing	// NOTE(review): presumably per-slice rate-control state (SWelsSvcRc holds a pointer to it as pSlicingOverRc) - confirm
+{
+	int32_t   iComplexityIndexSlice;
+	int32_t   iCalculatedQpSlice;
+	int32_t   iStartMbSlice;
+	int32_t   iEndMbSlice;
+	int32_t   iTotalQpSlice;
+	int32_t   iTotalMbSlice;
+	int32_t   iTargetBitsSlice;
+	int32_t   iBsPosSlice;
+	int32_t   iFrameBitsSlice;
+	int32_t   iGomBitsSlice;
+	int32_t   iGomTargetBits;
+	//int32_t   gom_coded_mb;
+} SRCSlicing;
+
+typedef struct TagRCTemporal	// NOTE(review): presumably per-temporal-layer rate-control state (SWelsSvcRc holds a pointer to it as pTemporalOverRc) - confirm
+{
+	int32_t   iMinBitsTl;
+	int32_t   iMaxBitsTl;
+	double    dTlayerWeight;
+	int32_t   iGopBitsDq;
+	// P frame level R-Q model
+	double    dLinearCmplx;
+	int32_t   iPFrameNum;
+	int32_t   iFrameCmplxMean;
+
+} SRCTemporal;
+
+typedef struct TagWelsRc{	// rate-control state; the typedef name is SWelsSvcRc
+	int32_t   iRcVaryPercentage;
+	double    dRcVaryRatio;
+
+	int32_t   iInitialQp; // initial QP
+	int32_t   iBitRate;
+	int32_t   iPreviousBitrate;
+	int32_t   iPreviousGopSize;
+	double    fFrameRate;	// declared double despite the 'f' prefix
+	double    dBitsPerFrame;
+	double    dPreviousFps;
+
+	// bits allocation and status
+	int32_t   iRemainingBits;
+	int32_t   iTargetBits;
+
+	int32_t   iIdrNum;
+	int32_t   iIntraComplexity;
+	int32_t   iIntraMbCount;
+
+	int8_t    iTlOfFrames[VGOP_SIZE];	// one entry per frame position of the virtual GOP
+	double    dRemainingWeights;
+	int32_t   iFrameDqBits;
+
+	double    *pGomComplexity;
+	int32_t	  *pGomForegroundBlockNum;
+	int32_t   *pCurrentFrameGomSad;
+	int32_t   *pGomCost;
+
+	int32_t   iAverageFrameQp;
+	int32_t   iNumberMbFrame;
+	int32_t   iNumberMbGom;
+	int32_t	  iSliceNum;
+	int32_t   iGomSize;
+
+	int32_t   iSkipFrameNum;
+	int32_t   iFrameCodedInVGop;
+	int32_t   iSkipFrameInVGop;
+	int32_t   iGopNumberInVGop;
+	int32_t   iGopIndexInVGop;
+
+	int32_t   iSkipQpValue;
+	int32_t   iQpRangeUpperInFrame;
+	int32_t   iQpRangeLowerInFrame;
+	int32_t   iMinQp;
+	int32_t   iMaxQp;
+	//int32_t   delta_adaptive_qp;
+	double    dSkipBufferRatio;
+
+	double    dQStep;
+	int32_t   iFrameDeltaQpUpper;
+	int32_t   iFrameDeltaQpLower;
+	int32_t   iLastCalculatedQScale;
+
+	// for frame skip and padding
+	int32_t   iBufferSizeSkip;
+	int32_t   iBufferFullnessSkip;
+	int32_t   iBufferSizePadding;
+	int32_t   iBufferFullnessPadding;
+	int32_t   iPaddingSize;
+	int32_t   iPaddingBitrateStat;
+
+	SRCSlicing	*pSlicingOverRc;
+	SRCTemporal *pTemporalOverRc;
+}SWelsSvcRc; 
+
+typedef  void (*PWelsRCPictureInitFunc) (void *pCtx);	// callback types for the SWelsRcFunc table; pCtx is passed untyped (void *)
+typedef  void (*PWelsRCPictureInfoUpdateFunc) (void *pCtx, int32_t iLayerSize);
+typedef  void (*PWelsRCMBInfoUpdateFunc)(void *pCtx, SMB * pCurMb, int32_t iCostLuma, SSlice *pSlice);
+typedef  void (*PWelsRCMBInitFunc)(void *pCtx, SMB * pCurMb, SSlice *pSlice);
+
+typedef  struct  WelsRcFunc_s	// table of rate-control callbacks, one member per callback type
+{
+    PWelsRCPictureInitFunc			pfWelsRcPictureInit;
+	PWelsRCPictureInfoUpdateFunc	pfWelsRcPictureInfoUpdate;
+	PWelsRCMBInitFunc				pfWelsRcMbInit;
+	PWelsRCMBInfoUpdateFunc			pfWelsRcMbInfoUpdate;
+} SWelsRcFunc;
+
+void WelsRcInitModule(void *pCtx,  int32_t iModule);	// NOTE(review): iModule presumably takes WELS_RC_DISABLE / WELS_RC_GOM - confirm
+void WelsRcFreeMemory(void *pCtx);	// pCtx is untyped here, as in WelsRcInitModule
+
+}
+#endif //_RC_H
--- /dev/null
+++ b/codec/encoder/core/inc/ref_list_mgr_svc.h
@@ -1,0 +1,108 @@
+/*!
+ * \copy
+ *     Copyright (c)  2008-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ *  ref_list_mgr_svc.h
+ *
+ *  Abstract
+ *      Interface for managing reference picture in svc encoder side
+ *
+ *  History
+ *		09/01/2008 Created
+ *		08/07/2009 Ported
+ *
+ *****************************************************************************/
+#if !defined(REFERENCE_PICTURE_LIST_MANAGEMENT_SVC_H__)
+#define REFERENCE_PICTURE_LIST_MANAGEMENT_SVC_H__
+
+#include "typedefs.h"
+#include "encoder_context.h"
+#include "codec_app_def.h"
+
+namespace WelsSVCEnc {
+typedef enum
+{
+	RECIEVE_UNKOWN = 0,	// (sic) enumerator spellings kept as declared
+	RECIEVE_SUCCESS = 1,
+	RECIEVE_FAILED = 2,
+}LTR_MARKING_RECEIVE_STATE;
+
+typedef enum
+{
+	LTR_DIRECT_MARK = 0,	// long-term reference (LTR) marking process modes
+	LTR_DELAY_MARK = 1,
+}LTR_MARKING_PROCESS_MODE;
+
+typedef enum
+{
+	FRAME_NUM_EQUAL    = 0x01,	// each enumerator is a distinct single bit
+	FRAME_NUM_BIGGER   = 0x02,
+	FRAME_NUM_SMALLER  = 0x04,
+	FRAME_NUM_OVER_MAX = 0x08,
+}COMPARE_FRAME_NUM;
+
+/*
+*	reset LTR marking, recovery and feedback state to default
+*/
+void ResetLtrState(SLTRState* pLtr );
+/*
+ *	reset reference picture list
+ */
+void WelsResetRefList( sWelsEncCtx *pCtx );
+
+/*
+ *	update reference picture list (NOTE(review): meaning of the BOOL_T result is not stated here - confirm)
+ */
+BOOL_T WelsUpdateRefList( sWelsEncCtx *pCtx );	
+/*
+ *	build reference picture list (NOTE(review): meaning of the BOOL_T result is not stated here - confirm)
+ */
+BOOL_T WelsBuildRefList( sWelsEncCtx *pCtx, const int32_t kiPOC );
+
+/*
+ *	update reference-base related syntax
+ */
+void WelsUpdateRefSyntax( sWelsEncCtx *pCtx, const int32_t kiPOC, const int32_t kiFrameType );
+
+
+/*
+* check whether the current mark iFrameNum is used in the LTR list or not (returns bool_t, unlike the BOOL_T functions above)
+*/
+bool_t CheckCurMarkFrameNumUsed(sWelsEncCtx *pCtx);
+/*
+*	decide whether the current frame includes a long term reference mark and update the long term reference mark syntax
+*/
+void WelsMarkPic( sWelsEncCtx *pCtx);
+
+#ifdef LONG_TERM_REF_DUMP
+void dump_ref(sWelsEncCtx* ctx);	// declared only when LONG_TERM_REF_DUMP is defined
+#endif
+}
+#endif//REFERENCE_PICTURE_LIST_MANAGEMENT_SVC_H__
--- /dev/null
+++ b/codec/encoder/core/inc/sample.h
@@ -1,0 +1,123 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef _SAMPLE_H_
+#define _SAMPLE_H_
+
+#include "typedefs.h"
+#include "wels_func_ptr_def.h"
+
+namespace WelsSVCEnc {
+enum
+{
+    BLOCK_16x16 = 0,	// block-size identifiers; 8x4 and 4x8 are commented out below
+    BLOCK_16x8  = 1,
+    BLOCK_8x16  = 2,
+    BLOCK_8x8   = 3,
+    BLOCK_4x4   = 4,
+//    BLOCK_8x4   = 5,
+//    BLOCK_4x8   = 6,
+};
+
+//===================SAD ("_c" versions; NOTE(review): unnamed args presumably follow the sample1, stride1, sample2, stride2 order named on the SadFour prototypes below)=====================//
+int32_t WelsSampleSad16x16_c( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSad16x8_c( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSad8x16_c( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSad8x8_c( uint8_t *, int32_t, uint8_t *, int32_t );
+//int32_t WelsSampleSad8x4( uint8_t *, int32_t, uint8_t *, int32_t );
+//int32_t WelsSampleSad4x8( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSad4x4_c( uint8_t *, int32_t, uint8_t *, int32_t );
+
+//======================SATD ("_c" versions, same argument list as the SAD functions)======================//
+int32_t WelsSampleSatd16x16_c( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSatd16x8_c( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSatd8x16_c( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSatd8x8_c( uint8_t *, int32_t, uint8_t *, int32_t );
+//int32_t WelsSampleSatd8x4( uint8_t *, int32_t, uint8_t *, int32_t );
+//int32_t WelsSampleSatd4x8( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSatd4x4_c( uint8_t *, int32_t, uint8_t *, int32_t );
+
+void WelsSampleSadFour16x16_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad); // results are returned through pSad (no return value)
+void WelsSampleSadFour16x8_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad);
+void WelsSampleSadFour8x16_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad);
+void WelsSampleSadFour8x8_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad);
+void WelsSampleSadFour4x4_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad);
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined (X86_ASM)
+
+int32_t WelsSampleSad4x4_mmx( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSad8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t);
+int32_t WelsSampleSad8x8_sse21( uint8_t *, int32_t, uint8_t * , int32_t); // NOTE(review): suffix is "sse21", unlike the "sse2" siblings - must match the symbol defined outside this header
+
+void WelsSampleSadFour16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, int32_t* );
+void WelsSampleSadFour16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, int32_t* );
+void WelsSampleSadFour8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, int32_t* );
+void WelsSampleSadFour8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, int32_t* );
+void WelsSampleSadFour4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t, int32_t* );
+
+int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t, uint8_t*, int32_t*, int32_t, int32_t, int32_t ); // NOTE(review): "Smple" (sic) - must match the symbol defined outside this header
+
+int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t);
+int32_t WelsSampleSatd8x16_sse41( uint8_t * , int32_t, uint8_t *, int32_t);
+int32_t WelsSampleSatd16x8_sse41( uint8_t * , int32_t, uint8_t *, int32_t);
+int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t);
+int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+
+int32_t WelsIntra16x16Combined3Satd_sse41(uint8_t *, int32_t, uint8_t *, int32_t, int32_t*, int32_t, uint8_t*);
+int32_t WelsIntra16x16Combined3Sad_ssse3(uint8_t *, int32_t, uint8_t *, int32_t, int32_t*, int32_t, uint8_t*);
+int32_t WelsIntraChroma8x8Combined3Satd_sse41( uint8_t *, int32_t, uint8_t *, int32_t, int32_t*, int32_t, uint8_t*,uint8_t*,uint8_t*);
+int32_t WelsIntraChroma8x8Combined3Sad_ssse3( uint8_t *, int32_t, uint8_t *, int32_t, int32_t*, int32_t, uint8_t*,uint8_t*,uint8_t*);
+
+
+#endif//X86_ASM
+
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+void WelsInitSampleSadFunc( SWelsFuncPtrList *pFuncList, uint32_t uiCpuFlag );	// NOTE(review): presumably fills pFuncList according to uiCpuFlag - confirm in the implementation
+
+}
+
+#endif //_SAMPLE_H_
--- /dev/null
+++ b/codec/encoder/core/inc/set_mb_syn_cavlc.h
@@ -1,0 +1,100 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	set_mb_syn_cavlc.h
+ *
+ * \brief	Setting all syntax elements of mb and encoding residual with cavlc
+ *
+ * \date	05/19/2009 Created
+ *
+ *************************************************************************************
+ */
+ 
+#ifndef SET_MB_SYN_CAVLC_H_
+#define SET_MB_SYN_CAVLC_H_
+
+#include "typedefs.h"
+#include "bit_stream.h"
+
+namespace WelsSVCEnc {
+//#pragma pack(1)
+
+
+
+enum EResidualProperty{	// kinds of residual block
+    LUMA_DC     = 0,
+	LUMA_AC     = 1,
+	LUMA_4x4    = 2,
+	CHROMA_DC   = 3, 
+	CHROMA_AC   = 4    
+};
+
+
+#define LUMA_DC_AC    0x04	// NOTE(review): same numeric value as CHROMA_AC in EResidualProperty - confirm the two are never mixed
+
+typedef  int32_t  (*PCavlcParamCalFunc) ( int16_t * pCoff, uint8_t * pRun, int16_t * pLevel, int32_t * pTotalCoeffs, int32_t iEndIdx);
+
+typedef  struct TagCoeffFunc 	// holder for the CAVLC parameter-calculation function pointer
+{
+	PCavlcParamCalFunc    pfCavlcParamCal;
+} SCoeffFunc;
+
+/*  For CAVLC: declaration only, the object is defined outside this header   */
+extern SCoeffFunc    sCoeffFunc;
+
+typedef struct TagCavlcTableItem	// one CAVLC table entry
+{
+	uint16_t uiBits;
+	uint8_t  uiLen;
+	uint8_t  uiSuffixLength;
+} SCavlcTableItem;
+
+void  InitCoeffFunc( const uint32_t uiCpuFlag );	// NOTE(review): presumably sets up sCoeffFunc according to uiCpuFlag - confirm
+
+void  InitCavlcTable();	// takes no arguments and returns nothing
+
+void  WriteBlockResidualCavlc( int16_t *pCoffLevel, int32_t iEndIdx, int32_t iCalRunLevelFlag, int32_t iResidualProperty, int8_t iNC, SBitStringAux *pBs );	// NOTE(review): iResidualProperty presumably takes EResidualProperty values - confirm
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#ifdef  X86_ASM
+int32_t CavlcParamCal_sse2(int16_t*pCoffLevel, uint8_t* pRun, int16_t *pLevel, int32_t * pTotalCoeffs , int32_t iEndIdx); // same parameter types and return type as PCavlcParamCalFunc
+#endif
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+}
+//#pragma pack()
+#endif
--- /dev/null
+++ b/codec/encoder/core/inc/slice.h
@@ -1,0 +1,185 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//wels_slice.h
+#ifndef WELS_SLICE_H__
+#define WELS_SLICE_H__
+
+#include "typedefs.h"
+#include "wels_const.h"
+#include "wels_common_basis.h"
+#include "mb_cache.h"
+#include "picture.h"
+#include "parameter_sets.h"
+#include "svc_enc_slice_segment.h"
+#include "bit_stream.h"
+
+
+namespace WelsSVCEnc {
+
+/*******************************sub struct of slice header****************************/
+
+
+/*
+ *	Reference picture list reordering syntax, refer to page 64 in JVT X201wcm
+ */
+typedef struct TagRefPicListReorderSyntax {
+	struct
+	{
+		uint32_t	uiAbsDiffPicNumMinus1; //range per the original note: [4, (1<<pSps->uiLog2MaxFrameNum)-1], {p104, JVT-X201wcm1}
+		                                     //NOTE(review): the old remark that int8_t cannot cover the range looks stale, the field is uint32_t here.
+		uint16_t	iLongTermPicNum;
+		uint16_t	uiReorderingOfPicNumsIdc; //kept as uint16_t so that two uint16_t members pack into one (u)int32_t.
+	} SReorderingSyntax[MAX_REFERENCE_REORDER_COUNT_NUM];	// MAX_REF_PIC_COUNT
+}SRefPicListReorderSyntax;
+
+		
+/* Decoded reference picture marking syntax, refer to Page 66 in JVT X201wcm: an array of marking entries plus a count and three flags */
+typedef struct TagRefPicMarking {
+	struct
+	{
+		int32_t	iMmcoType;
+		int32_t iShortFrameNum;
+		int32_t	iDiffOfPicNum;
+		int32_t	iLongTermPicNum;
+		int32_t	iLongTermFrameIdx;
+		int32_t	iMaxLongTermFrameIdx;
+	} SMmcoRef[MAX_REFERENCE_MMCO_COUNT_NUM];	// MAX_MMCO_COUNT
+	
+	//	int32_t		mmco_index;
+	uint8_t		uiMmcoCount;	// NOTE(review): presumably the number of SMmcoRef entries in use - confirm
+	bool_t		bNoOutputOfPriorPicsFlag;
+	bool_t		bLongTermRefFlag;
+	bool_t		bAdaptiveRefPicMarkingModeFlag;	
+} SRefPicMarking;
+
+
+/* Header of slice syntax elements, refer to Page 63 in JVT X201wcm; commented-out members are kept exactly as in the original */
+typedef struct TagSliceHeader{	
+	/*****************************slice header syntax and generated****************************/
+	int32_t		iFirstMbInSlice;		
+//	uint32_t	pic_parameter_set_id;
+	int32_t		iFrameNum;	
+	int32_t		iPicOrderCntLsb;
+    
+//	int32_t		delta_pic_order_cnt_bottom;
+//	int32_t		delta_pic_order_cnt[2];
+//	int32_t		redundant_pic_cnt;
+		
+	EWelsSliceType	eSliceType;
+	uint8_t		uiNumRefIdxL0Active;			// list 0 only, see the disabled l1 member below
+	//int32_t		num_ref_idx_l1_active_minus1	//B frame is not supported
+	uint8_t		uiRefCount;
+	//Ref_Pic				*ref_pic;
+	uint8_t		uiRefIndex;	// exact reference picture index for slice	
+	
+	int8_t		iSliceQpDelta;
+//	int32_t		slice_qp;	
+//	int32_t		slice_qs_delta;		// For SP/SI slices
+	uint8_t		uiDisableDeblockingFilterIdc;
+	int8_t		iSliceAlphaC0Offset;
+	int8_t		iSliceBetaOffset;
+#if !defined(DISABLE_FMO_FEATURE)
+	int32_t		iSliceGroupChangeCycle;
+#endif//!DISABLE_FMO_FEATURE
+
+	SWelsSPS			*pSps;
+	SWelsPPS			*pPps;
+	int32_t		iSpsId;
+	int32_t		iPpsId;
+
+	uint16_t    uiIdrPicId;	
+//	uint8_t		color_plane_id;//from?
+
+	bool_t		bNumRefIdxActiveOverrideFlag;
+//	bool_t		field_pic_flag;		//not supported in base profile
+//	bool_t		bottom_field_flag;		//not supported in base profile
+	uint8_t		uiPadding1Bytes;	// NOTE(review): presumably a filler byte for alignment - confirm
+
+	SRefPicMarking		sRefMarking;	// Decoded reference picture marking syntax
+
+	SRefPicListReorderSyntax	sRefReordering;	// Reference picture list reordering syntax
+}SSliceHeader, *PSliceHeader;
+
+
+/* Slice header in scalable extension syntax, refer to Page 394 in JVT X201wcm */
+typedef struct TagSliceHeaderExt{	
+	SSliceHeader	sSliceHeader;	// the plain slice header is embedded as the first member
+
+	SSubsetSps	*pSubsetSps;
+	
+	uint32_t	uiNumMbsInSlice;	
+	
+	bool_t		bStoreRefBasePicFlag;	
+	bool_t		bConstrainedIntraResamplingFlag;	
+	bool_t		bSliceSkipFlag;
+	
+	bool_t		bAdaptiveBaseModeFlag;
+	bool_t		bDefaultBaseModeFlag;
+	bool_t		bAdaptiveMotionPredFlag;
+	bool_t		bDefaultMotionPredFlag;
+
+	bool_t		bAdaptiveResidualPredFlag;
+	bool_t		bDefaultResidualPredFlag;
+	bool_t		bTcoeffLevelPredFlag;		
+	uint8_t		uiDisableInterLayerDeblockingFilterIdc;
+	
+}SSliceHeaderExt, *PSliceHeaderExt;
+
+/* Per-slice encoder state: MB cache, bit stream handle, slice header (with extension) and MV / QP bookkeeping */
+typedef struct TagSlice{	
+	// mainly for multiple threads imp.
+	SMbCache	sMbCacheInfo;	// MBCache is introduced within slice dependency
+	SBitStringAux *pSliceBsa;
+
+	/*******************************sSliceHeader****************************/
+	SSliceHeaderExt	sSliceHeaderExt;	
+
+
+	SMVUnitXY	sMvMin;
+	SMVUnitXY	sMvMax;	
+	SMVUnitXY	sMvc[5];
+	uint8_t		uiMvcNum;	// NOTE(review): presumably the number of sMvc entries in use - confirm
+	uint8_t		sScaleShift;	// NOTE(review): 's' prefix on a uint8_t member, unlike the 'ui' prefix used by its neighbours
+
+	uint8_t		uiSliceIdx;
+	bool_t		bSliceHeaderExtFlag; // Indicate which slice header is used, avc or ext?	
+	uint8_t		uiLastMbQp;		// stored qp for last mb coded, maybe more efficient for mb skip detection etc.
+
+	bool_t		bDynamicSlicingSliceSizeCtrlFlag;
+	uint8_t		uiAssumeLog2BytePerMb;
+	uint8_t		uiReservedFillByte;	// reserved to meet 4 bytes alignment
+}SSlice, *PSlice;
+
+}
+//#pragma pack()
+#endif//WELS_SLICE_H__
--- /dev/null
+++ b/codec/encoder/core/inc/slice_multi_threading.h
@@ -1,0 +1,127 @@
+/*!
+ * \copy
+ *     Copyright (c)  2010-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	slice_multi_threading.h
+ *
+ * \brief	slice based multiple threading
+ *
+ * \date	04/16/2010 Created
+ *
+ *************************************************************************************
+ */
+#ifndef SVC_SLICE_MULTIPLE_THREADING_H__
+#define SVC_SLICE_MULTIPLE_THREADING_H__
+
+#if defined(MT_ENABLED)
+
+#include "typedefs.h"
+#include "codec_app_def.h"
+#include "param_svc.h"
+#include "encoder_context.h"
+#include "svc_enc_frame.h"
+#include "svc_enc_macroblock.h"
+#include "svc_enc_slice_segment.h"
+#include "WelsThreadLib.h"
+
+namespace WelsSVCEnc {	// slice based multi-threading interface; only declarations, all of them inside #if defined(MT_ENABLED)
+void UpdateMbListNeighborParallel(	SSliceCtx *pSliceCtx,
+										SMB *pMbList,
+										const int32_t kiSliceIdc	);
+
+void CalcSliceComplexRatio( void *pRatio, SSliceCtx *pSliceCtx, uint32_t *pSliceConsume );
+
+#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN) && defined(NOT_ABSOLUTE_BALANCING)
+int32_t NeedDynamicAdjust( void *pConsumeTime, const int32_t kiSliceNum );
+#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN && NOT_ABSOLUTE_BALANCING
+
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+void DynamicAdjustSlicing(	sWelsEncCtx *pCtx,
+								SDqLayer *pCurDqLayer,
+								void *pComplexRatio,
+								int32_t iCurDid );
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+
+#ifdef PACKING_ONE_SLICE_PER_LAYER
+void reset_env_mt( sWelsEncCtx *pCtx );
+#endif//PACKING_ONE_SLICE_PER_LAYER
+
+// resource set-up / tear-down; both take the address of the context pointer
+int32_t RequestMtResource( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pParam, const int32_t kiCountBsLen, const int32_t kiTargetSpatialBsSize );
+
+void ReleaseMtResource( sWelsEncCtx **ppCtx );
+
+int32_t AppendSliceToFrameBs( sWelsEncCtx *pCtx, SLayerBSInfo *pLbi, const int32_t kiSliceCount );
+int32_t WriteSliceToFrameBs( sWelsEncCtx *pCtx, SLayerBSInfo *pLbi, uint8_t *pFrameBsBuffer, const int32_t kiSliceIdx );
+
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+#if defined(__GNUC__)
+WELS_THREAD_ROUTINE_TYPE UpdateMbListThreadProc( void *arg );
+#endif//__GNUC__
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+
+WELS_THREAD_ROUTINE_TYPE CodingSliceThreadProc( void *arg );
+
+int32_t CreateSliceThreads( sWelsEncCtx *pCtx );
+
+#ifdef PACKING_ONE_SLICE_PER_LAYER
+void ResetCountBsSizeInPartitions( uint32_t *pCountBsSizeList, const int32_t kiPartitionCnt );
+#endif//PACKING_ONE_SLICE_PER_LAYER
+// the event list parameter is WELS_EVENT* on WIN32 and WELS_EVENT** otherwise; the remaining parameters are identical
+#ifdef WIN32
+int32_t FiredSliceThreads( SSliceThreadPrivateData *pPriData, WELS_EVENT *pEventsList, SLayerBSInfo *pLayerBsInfo, const uint32_t kuiNumThreads/*, int32_t *iLayerNum*/, SSliceCtx *pSliceCtx, const BOOL_T kbIsDynamicSlicingMode );
+#else
+int32_t FiredSliceThreads( SSliceThreadPrivateData *pPriData, WELS_EVENT **ppEventsList, SLayerBSInfo *pLayerBsInfo, const uint32_t kuiNumThreads/*, int32_t *iLayerNum*/, SSliceCtx *pSliceCtx, const BOOL_T kbIsDynamicSlicingMode );
+#endif//WIN32
+
+int32_t DynamicDetectCpuCores();
+
+#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN)
+
+int32_t AdjustBaseLayer( sWelsEncCtx *pCtx );
+int32_t AdjustEnhanceLayer( sWelsEncCtx *pCtx, int32_t iCurDid );
+
+#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN
+// NOTE(review): the MT_ENABLED test below is redundant, this whole namespace already sits inside #if defined(MT_ENABLED)
+#if defined(MT_ENABLED)
+
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE) && defined(MT_DEBUG)
+void TrackSliceComplexities( sWelsEncCtx *pCtx, const int32_t kiCurDid );
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE) && defined(MT_DEBUG)
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(MT_DEBUG)
+void TrackSliceConsumeTime( sWelsEncCtx *pCtx, int32_t *pDidList, const int32_t kiSpatialNum );
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(MT_DEBUG)
+
+#endif//MT_ENABLED
+}
+#endif//MT_ENABLED
+
+#endif//SVC_SLICE_MULTIPLE_THREADING_H__
+
--- /dev/null
+++ b/codec/encoder/core/inc/stat.h
@@ -1,0 +1,98 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	stat.h
+ *
+ * \brief	statistical data information
+ *
+ * \date	4/22/2009 Created
+ *
+ *************************************************************************************
+ */
+#if !defined(WELS_ENCODER_STATISTICAL_DATA_H__)
+#define WELS_ENCODER_STATISTICAL_DATA_H__
+
+/*
+ *	Stat quality: three arrays of five real32_t values (rYPsnr, rUPsnr, rVPsnr)
+ */
+typedef struct TagStatQuality {
+	// NOTE(review): presumably per-plane PSNR with one slot per layer; what the 5 slots index is not visible here - confirm
+	real32_t	rYPsnr[5];
+	real32_t	rUPsnr[5];
+	real32_t	rVPsnr[5];
+
+} SStatQuality;
+
+/*
+ *	Stat complexity data; every member is compiled in only when FME_TEST is defined
+ */
+typedef struct TagComplexityStat {
+
+#ifdef FME_TEST
+	int32_t		cost_time;
+	int32_t		me_time;
+	int32_t		mvp_time;
+	int32_t		mvb_time;
+#endif
+
+	// any else? (without FME_TEST the struct has no members)
+
+} SComplexityStat;
+
+/*
+ *	Stat slice details information
+ */
+typedef struct TagStatSliceInfo {
+	
+	/* per slice info; NOTE(review): the meaning of the 5 and 18 array bounds is not visible here - confirm */
+	int32_t		iSliceCount[5];
+	int32_t		iSliceSize [5];
+	int32_t		iMbCount   [5][18];
+
+} SStatSliceInfo;
+
+/*
+ *	For overall statistical data: aggregates the quality, complexity and slice statistics
+ */
+typedef struct TagStatData {
+
+	// Quality
+	SStatQuality		sQualityStat;
+	
+	// Complexity
+	SComplexityStat		sComplexityStat;
+
+	// Slice information output
+	SStatSliceInfo		sSliceData;	
+
+} SStatData;
+
+#endif//WELS_ENCODER_STATISTICAL_DATA_H__
--- /dev/null
+++ b/codec/encoder/core/inc/svc_base_layer_md.h
@@ -1,0 +1,99 @@
+/*!
+ * \copy
+ *     Copyright (c)  2010-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_base_layer_md.h
+ *
+ * \brief	mode decision 
+ *
+ * \date	2009.08.10 Created
+ *
+ *************************************************************************************
+ */
+#ifndef SVC_BASE_LAYER_MACROBLOCK_MODE_DECISION_H__
+#define SVC_BASE_LAYER_MACROBLOCK_MODE_DECISION_H__
+
+#include "md.h"
+#include "mb_cache.h"
+
+namespace WelsSVCEnc {	// mode decision (md) prototypes; declarations only
+void WelsMdIntraInit(sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache *pMbCache, const int32_t kiSliceFirstMbXY );
+int32_t WelsMdI16x16(SWelsFuncPtrList *pFunc, SDqLayer *pCurDqLayer, SMbCache *pMbCache, int32_t iLambda);
+int32_t WelsMdIntraChroma(SWelsFuncPtrList *pFunc, SDqLayer *pCurDqLayer, SMbCache *pMbCache, int32_t iLambda);
+// NOTE(review): the void* pEnc / pMd parameters below presumably carry sWelsEncCtx* / SWelsMD* as in the typed prototypes - confirm
+int32_t WelsMdI4x4(void* pEnc,void* pMd, SMB* pCurMb, SMbCache *pMbCache);
+int32_t WelsMdI4x4Fast(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache);
+
+int32_t WelsMdIntraFinePartition(void* pEncCtx, void* pWelsMd, SMB* pCurMb, SMbCache *pMbCache);
+int32_t WelsMdIntraFinePartitionVaa(void* pEncCtx, void* pWelsMd, SMB* pCurMb, SMbCache *pMbCache);
+
+void WelsMdIntraMb(sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache *pMbCache);
+
+void WelsMdBackgroundMbEnc(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache, SSlice *pSlice, bool_t bSkipMbFlag);
+BOOL_T WelsMdPSkipEnc(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache);
+int32_t WelsMdP16x16(SWelsFuncPtrList *pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice *pSlice, SMB* pCurMb);
+
+int32_t WelsMdP16x8(SWelsFuncPtrList *pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice *pSlice);
+int32_t WelsMdP8x16(SWelsFuncPtrList *pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice *pSlice);
+int32_t WelsMdP8x8(SWelsFuncPtrList *pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice *pSlice);
+/*static*/  void WelsMdInterInit( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb, const int32_t kiSliceFirstMbXY );
+/*static*/ void WelsMdInterFinePartition(void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb, int32_t bestCost);
+/*static*/ void WelsMdInterFinePartitionVaa( void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb, int32_t bestCost );
+void WelsMdInterMbRefinement(sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache *pMbCache);
+BOOL_T WelsMdFirstIntraMode(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache);
+//BOOL_T svc_md_first_intra_mode_constrained(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache);
+void WelsMdInterMb(void* pEncCtx, void* pWelsMd, SSlice *pSlice, SMB* pCurMb );
+
+//both used in BL and EL (start of the group; it is closed by the "end of" marker further down)
+//void wels_md_inter_init ( SWelsMD* pMd, const uint8_t ref_idx, const bool_t is_highest_dlayer_flag );
+// the two Judge... and the two Update... declarations below share one signature each
+bool_t WelsMdInterJudgeBGDPskip         ( void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache, BOOL_T* bKeepSkip );
+bool_t WelsMdInterJudgeBGDPskipFalse( void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache, BOOL_T* bKeepSkip );
+
+void WelsMdInterUpdateBGDInfo          ( SDqLayer* pCurLayer,  SMB* pCurMb, const bool_t kbCollocatedPredFlag, const int32_t kiRefPictureType );
+void WelsMdInterUpdateBGDInfoNULL ( SDqLayer* pCurLayer,  SMB* pCurMb, const bool_t kbCollocatedPredFlag, const int32_t kiRefPictureType );
+// NOTE(review): both bool_t and BOOL_T appear as flag / return types in this header
+bool_t WelsMdInterJudgePskip( sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache, BOOL_T bTrySkip );
+void WelsMdInterUpdatePskip( SDqLayer* pCurDqLayer, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache );
+void WelsMdInterDecidedPskip( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache );
+
+void WelsMdInterDoubleCheckPskip( SMB* pCurMb, SMbCache *pMbCache );
+void WelsMdInterEncode( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache );
+
+void WelsMdInterSaveSadAndRefMbType( Mb_Type* pRefMbTypeList, SMbCache * pMbCache, const SMB*  kpCurMb, const SWelsMD* kpMd );
+
+void WelsMdInterSecondaryModesEnc( sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache, const BOOL_T kbSkip );
+void WelsMdIntraSecondaryModesEnc( sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache *pMbCache );
+//end of: both used in BL and EL
+
+//typedef void (*MD_INTRA_MB_BASE) (sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb);
+
+}
+#endif//SVC_BASE_LAYER_MACROBLOCK_MODE_DECISION_H__
--- /dev/null
+++ b/codec/encoder/core/inc/svc_config.h
@@ -1,0 +1,45 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef SVC_CONFIGURATION_H_
+#define SVC_CONFIGURATION_H_
+
+#include "as264_common.h"
+/* On WIN32 builds make sure CODEC_TRACE_WIN32 is defined (left untouched when it already is). */
+#if defined (WIN32)
+#if !defined CODEC_TRACE_WIN32
+#define CODEC_TRACE_WIN32
+#endif//CODEC_TRACE_WIN32
+#endif//WIN32
+
+#endif//SVC_CONFIGURATION_H_
+
--- /dev/null
+++ b/codec/encoder/core/inc/svc_enc_frame.h
@@ -1,0 +1,113 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//wels_svc_layer.h
+#ifndef WELS_SVC_EXTENSION_LAYER_H__
+#define WELS_SVC_EXTENSION_LAYER_H__
+
+#include "typedefs.h"
+#include "wels_const.h"
+#include "wels_common_basis.h"
+#include "parameter_sets.h"
+#include "nal_prefix.h"
+#include "slice.h"
+#include "picture.h"
+#include "svc_enc_macroblock.h"
+#include "bit_stream.h"
+
+
+#include "svc_enc_slice_segment.h"
+namespace WelsSVCEnc {
+/*
+ *	Need fine adjust below structure later for SVC extension optimization
+ */
+
+
+/*
+ *	Frame level in SVC DQLayer instead.
+ *	Dependency-Quality layer structure definition for SVC extension of H.264/AVC
+ */
+
+///////////////////////////////////DQ Layer level///////////////////////////////////
+// forward declaration: SLayerInfo and struct TagDqLayer itself refer to SDqLayer before the struct body is complete
+typedef struct TagDqLayer	SDqLayer;
+typedef SDqLayer *			pDqLayer;
+/* Per-layer header information: NAL header extension plus pointers to the slices and parameter sets in use */
+typedef struct TagLayerInfo{
+	SNalUnitHeaderExt		sNalHeaderExt;
+	SSlice					*pSliceInLayer;// Here SSlice identify to Frame on concept, [iSliceIndex], need memory block external side	for MT
+	SSubsetSps				*pSubsetSpsP;	// current pSubsetSps used, memory alloc in external
+	SWelsSPS						*pSpsP;		// current pSps based avc used, memory alloc in external
+	SWelsPPS						*pPpsP;		// current pPps used
+} SLayerInfo;
+/* Layer Representation: picture buffers, MB geometry, deblocking settings and reference pointers of one DQ layer */
+struct TagDqLayer{
+	SLayerInfo				sLayerInfo;
+	
+	uint8_t					*pCsData[3];	// pointers to reconstructed picture data (NOTE(review): 3 entries, presumably Y/U/V - confirm)
+	int32_t					iCsStride[3];	// Cs stride
+
+	uint8_t					*pEncData[3];	// picture data to be encoded in current layer
+	int32_t					iEncStride[3];	// picture data stride
+
+	SMB*					sMbDataP;		// pointer to mb of mbAddr equal to 0 in slice, mb_data_ptr = mb_base_ptr + (1+iMbStride).	
+	int16_t					iMbWidth;		// MB width of this picture, equal to pSps.iMbWidth
+	int16_t					iMbHeight;		// MB height of this picture, equal to pSps.iMbHeight;
+
+	bool_t					bBaseLayerAvailableFlag;	// whether base layer is available for prediction?
+	uint8_t					iLoopFilterDisableIdc;	// 0: on, 1: off, 2: on except for slice boundaries
+	int8_t					iLoopFilterAlphaC0Offset;// AlphaOffset: valid range [-6, 6], default 0
+	int8_t					iLoopFilterBetaOffset;	// BetaOffset:	valid range [-6, 6], default 0
+	uint8_t				    uiDisableInterLayerDeblockingFilterIdc;
+	int8_t					iInterLayerSliceAlphaC0Offset;
+	int8_t					iInterLayerSliceBetaOffset;	
+	bool_t					bDeblockingParallelFlag; //parallel_deblocking_flag
+
+	SPicture				*pRefPic;			// reference picture pointer
+	SPicture				*pDecPic;			// reconstruction picture pointer for layer
+
+	SSliceCtx			*pSliceEncCtx;	// current slice context
+	
+	int32_t					*pNumSliceCodedOfPartition;		// for dynamic slicing mode
+	int32_t					*pLastCodedMbIdxOfPartition;	// for dynamic slicing mode
+	int32_t					*pLastMbIdxOfPartition;			// for dynamic slicing mode
+
+	SDqLayer				*pRefLayer;		// pointer to referencing dq_layer of current layer to be decoded	
+
+};
+
+///////////////////////////////////////////////////////////////////////
+
+// frame structure for svc: plain alias of SDqLayer, no additional members
+typedef SDqLayer	SWelsSvcFrame;
+}
+#endif//WELS_SVC_EXTENSION_LAYER_H__
--- /dev/null
+++ b/codec/encoder/core/inc/svc_enc_golomb.h
@@ -1,0 +1,278 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_enc_golomb.h
+ *
+ * \brief	Exponential Golomb entropy coding/decoding routine
+ *
+ * \date	03/13/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_EXPONENTIAL_GOLOMB_ENTROPY_CODING_H__
+#define WELS_EXPONENTIAL_GOLOMB_ENTROPY_CODING_H__
+
+#include "typedefs.h"
+#include "bit_stream.h"
+#include "macros.h"
+
+namespace WelsSVCEnc {
+
+/************************************************************************/
+/* GOLOMB CODING FOR WELS ENCODER                                       */
+/************************************************************************/
+
+/*
+ *	Exponential Golomb codes encoding routines
+ */
+/* Cache the bit stream state of pBs in the locals pBufPtr, uiCurBits and iLeftBits; pBs is parenthesized so any pointer expression may be passed. */
+#define    CAVLC_BS_INIT( pBs )  \
+	uint8_t  * pBufPtr = (pBs)->pBufPtr; \
+	uint32_t   uiCurBits = (pBs)->uiCurBits; \
+	int32_t    iLeftBits = (pBs)->iLeftBits; 
+/* Write the locals pBufPtr, uiCurBits and iLeftBits back into pBs; pBs is parenthesized so any pointer expression may be passed. */
+#define    CAVLC_BS_UNINIT( pBs ) \
+	(pBs)->pBufPtr = pBufPtr;  \
+	(pBs)->uiCurBits = uiCurBits;  \
+	(pBs)->iLeftBits = iLeftBits;
+/* Append the n low-order bits of v to the cached state (see CAVLC_BS_INIT); n is modified when a word is completed. Mask uses 1u: 1<<31 overflows int. */
+#define    CAVLC_BS_WRITE( n,  v ) \
+	{  \
+	if ( (n) < iLeftBits ) {\
+	    uiCurBits = (uiCurBits<<(n))|(v);\
+		iLeftBits -= (n);\
+	}\
+	else {\
+	    (n) -= iLeftBits;\
+		uiCurBits = (uiCurBits<<iLeftBits) | ((v)>>(n));\
+		*((uint32_t*)pBufPtr) = ENDIAN_FIX(uiCurBits);\
+		pBufPtr += 4;\
+		uiCurBits = (v) & ((1u<<(n))-1);\
+		iLeftBits = 32 - (n);\
+	}\
+	} ;  
+/* 256-entry table defined outside this header; BsSizeUE/BsWriteUE index it by value and use the entry as a bit count. */
+extern const uint32_t g_uiGolombUELength[256];
+
+
+/*
+ *	Get size of unsigned exp golomb codes: values below 256 are read straight from
+ *	g_uiGolombUELength, larger ones first fold kiValue+1 down to a single byte.
+ */
+static inline uint32_t BsSizeUE( const uint32_t kiValue )
+{
+	uint32_t uiReduced = kiValue+1;
+	uint32_t uiHalfLen = 0;
+
+	if ( kiValue < 256 )
+	{
+		return g_uiGolombUELength[kiValue];
+	}
+
+	// fold the upper 16 bits, then the next 8, counting the bits shifted out
+	if ( uiReduced >> 16 )
+	{
+		uiReduced >>= 16;
+		uiHalfLen = 16;
+	}
+	if ( uiReduced > 0xff )
+	{
+		uiReduced >>= 8;
+		uiHalfLen += 8;
+	}
+
+	// the table is indexed with (folded value - 1); half of that entry is added
+	uiHalfLen += g_uiGolombUELength[uiReduced-1] >> 1;
+
+	return (uiHalfLen<<1) + 1;
+}
+
+/*
+ *	Get size of signed exp golomb codes: zero takes 1 bit, a positive value v is
+ *	mapped to (v<<1)-1 and a negative one to (-v)<<1 before BsSizeUE is consulted.
+ */
+static inline uint32_t BsSizeSE( const int32_t kiValue )
+{
+	uint32_t uiMapped;
+
+	if ( kiValue == 0 )
+	{
+		return 1;
+	}
+
+	// signed -> unsigned mapping, computed in int and then stored as uint32_t
+	if ( kiValue > 0 )
+		uiMapped = (kiValue<<1) - 1;
+	else
+		uiMapped = ((-kiValue)<<1);
+
+	return BsSizeUE( uiMapped );
+}
+
+/*
+ *	Get size of truncated exp golomb codes. NOTE(review): placeholder - ignores both arguments and always returns 0; confirm no caller needs a real size.
+ */
+static inline int32_t BsSizeTE( const int32_t kiX, const int32_t kiValue )
+{
+	return 0;
+}
+
+/* Append the n low-order bits of kuiValue to pBs; each completed 32-bit word is stored through ENDIAN_FIX and pBufPtr moves on by 4. */
+/* kuiValue is not masked to n bits in the first branch; n minus the free bit count must stay below 32. Always returns 0. */
+static inline int32_t BsWriteBits( SBitStringAux *pBs, int32_t n, const uint32_t kuiValue )
+{  
+	if( n < pBs->iLeftBits ){
+		pBs->uiCurBits = (pBs->uiCurBits<<n) | kuiValue;
+		pBs->iLeftBits -= n;	
+	} else {
+		n -= pBs->iLeftBits;	// bits that spill over into the next word
+		pBs->uiCurBits = ((pBs->iLeftBits < 32) ? (pBs->uiCurBits<<pBs->iLeftBits) : 0) | (kuiValue>>n);	// a shift by 32 is undefined
+		*((uint32_t*)pBs->pBufPtr) = ENDIAN_FIX(pBs->uiCurBits);		
+		pBs->pBufPtr += 4;
+		pBs->uiCurBits = kuiValue & ((1u<<n)-1);	// 1u: (1<<31) would overflow a signed int
+		pBs->iLeftBits = 32 - n;
+	}
+	return 0;
+}
+
+/*
+ *	Write 1 bit: kuiValue is handed to BsWriteBits unchanged, the result is always 0
+ */
+static inline int32_t BsWriteOneBit( SBitStringAux *pBs, const uint32_t kuiValue )
+{
+	const int32_t kiBitCount = 1;
+	(void) BsWriteBits( pBs, kiBitCount, kuiValue );
+	return 0;
+}
+/* Store the pending bits left-aligned in one 32-bit word, advance pBufPtr by 4 - iLeftBits/8 and reset the accumulator to empty. */
+static inline void BsFlush(SBitStringAux * pBs)
+{
+	const uint32_t kuiPending = (pBs->iLeftBits < 32) ? (pBs->uiCurBits << pBs->iLeftBits) : 0;	// shift by 32 is undefined; nothing pending -> 0
+    *(uint32_t*)pBs->pBufPtr = ENDIAN_FIX(kuiPending);
+	pBs->pBufPtr += 4 - pBs->iLeftBits/8;
+	pBs->iLeftBits = 32;
+	pBs->uiCurBits = 0;	//  for future writing safe, 5/19/2010
+}
+
+/*
+ *	Write unsigned exp golomb codes: kuiValue+1 is written with BsWriteBits in a
+ *	field whose width is derived from g_uiGolombUELength.
+ */
+static inline void BsWriteUE( SBitStringAux *pBs, const uint32_t kuiValue )
+{
+	const uint32_t kuiCodeNum = kuiValue + 1;
+	uint32_t uiReduced = kuiCodeNum;	// code number folded down to a single byte
+	uint32_t uiHalfLen = 0;
+
+	if ( kuiValue < 256 )
+	{
+		// small values: the table entry is used directly as the width
+		BsWriteBits( pBs, g_uiGolombUELength[kuiValue], kuiCodeNum );
+		return;
+	}
+
+	if ( uiReduced >> 16 )
+	{
+		uiReduced >>= 16;
+		uiHalfLen = 16;
+	}
+	if ( uiReduced > 0xff )
+	{
+		uiReduced >>= 8;
+		uiHalfLen += 8;
+	}
+	uiHalfLen += g_uiGolombUELength[uiReduced-1] >> 1;
+
+	BsWriteBits( pBs, (uiHalfLen<<1) + 1, kuiCodeNum );
+}
+
+/*
+ *	Write signed exp golomb codes: zero is written as a single 1 bit, any other
+ *	value is mapped to an unsigned one and passed to BsWriteUE.
+ */
+static inline void BsWriteSE( SBitStringAux *pBs, int32_t iValue )
+{	
+	uint32_t uiMapped;
+
+	if ( iValue == 0 )
+	{
+		BsWriteOneBit( pBs, 1 );
+		return;
+	}
+
+	// positive v -> (v<<1)-1, negative v -> (-v)<<1
+	if ( iValue > 0 )
+		uiMapped = (iValue<<1) - 1;
+	else
+		uiMapped = ((-iValue)<<1);
+
+	BsWriteUE( pBs, uiMapped );
+}
+
+/*
+ *	Write truncated exp golomb codes: with kiX == 1 the single bit !kuiValue is
+ *	written, for every other kiX the value goes out through BsWriteUE.
+ */
+static inline void BsWriteTE( SBitStringAux *pBs, const int32_t kiX, const uint32_t kuiValue )
+{
+	if ( kiX != 1 )
+	{
+		BsWriteUE( pBs, kuiValue );
+		return;
+	}
+
+	BsWriteOneBit( pBs, !kuiValue );
+}
+
+
+/*
+ *	Write RBSP trailing bits: one bit with value 1, then BsFlush
+ */
+static inline void BsRbspTrailingBits( SBitStringAux *pBs )
+{
+	BsWriteBits( pBs, 1, 1 );	// the call BsWriteOneBit( pBs, 1 ) forwards to
+	BsFlush( pBs );
+}
+
+/* True when the low three bits of iLeftBits are all zero, i.e. the free bit count is a multiple of 8. */
+static inline BOOL_T   BsCheckByteAlign( SBitStringAux * pBs)
+{
+	return ( 0 == (pBs->iLeftBits & 0x7) );
+}
+
+/* Write position: distance from pBuf to pBufPtr shifted left by 3, plus the 32 - iLeftBits bits held in the accumulator. */
+static inline int32_t BsGetBitsPos( SBitStringAux *pBs )
+{
+	return ( 32 - pBs->iLeftBits + ((pBs->pBufPtr - pBs->pBuf) << 3) );
+}
+
+}
+#endif//WELS_EXPONENTIAL_GOLOMB_ENTROPY_CODING_H__
--- /dev/null
+++ b/codec/encoder/core/inc/svc_enc_macroblock.h
@@ -1,0 +1,77 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+// svc_enc_macroblock.h
+#ifndef WELS_MACROBLOCK_H__
+#define WELS_MACROBLOCK_H__
+
+#include "typedefs.h"
+#include "wels_const.h"
+#include "wels_common_basis.h"
+#include "macros.h"
+
+namespace WelsSVCEnc {
+
+//struct Mb_s;
+
+/* MB syntax and context, refer to Page 399 in JVT X201wcm */
+// Keep this most essential per-MB data structure at 64 bytes to match the cache line size; at that size the order of the members should be negligible.
+// Please take care whenever the size of the MB structure is modified.
+typedef struct TagMB{	
+	/*************************mb_layer() syntax and generated********************************/
+	/*mb_layer():*/
+	Mb_Type		uiMbType;	// including MB detailed partition type, number and type of reference list
+	int16_t		iMbXY;		// offset position of MB top left point based	
+	int16_t		iMbX;		// position of MB in horizontal axis
+	int16_t		iMbY;		// position of MB in vertical axis
+
+	uint8_t		uiNeighborAvail;	// avail && same_slice: LEFT_MB_POS:0x01, TOP_MB_POS:0x02, TOPRIGHT_MB_POS = 0x04 ,TOPLEFT_MB_POS = 0x08;
+	uint8_t		uiCbp;	// NOTE(review): presumably the coded block pattern of the MB -- confirm
+
+	SMVUnitXY	*sMv;	// pointer only, the storage lives outside this structure; presumably the MB's motion vectors -- confirm
+	int8_t		*pRefIndex;	// pointer only, the storage lives outside this structure; presumably the MB's reference indices -- confirm
+
+	int32_t     *pSadCost;				// mb sad. set to 0 for intra mb
+	int8_t      *pIntra4x4PredMode;	// [MB_BLOCK4x4_NUM]
+	int8_t      *pNonZeroCount;		// [MB_LUMA_CHROMA_BLOCK4x4_NUM]
+
+	SMVUnitXY	sP16x16Mv;	// NOTE(review): by its name the motion vector of the 16x16 partition -- confirm
+
+	uint8_t		uiLumaQp;		// uiLumaQp: pPps->iInitialQp + sSliceHeader->delta_qp + mb->dquant.
+	uint8_t		uiChromaQp;	// QP used for chroma; its derivation is not visible in this header
+	uint8_t		uiSliceIdc;	// AVC: pFirstMbInSlice?; SVC: (pFirstMbInSlice << 7) | ((uiDependencyId << 4) | uiQualityId);
+	uint8_t		reserved_filling_bytes[1];	// filling byte reserved to keep the structure aligned to 4 bytes; original note: higher cache hit on less structure size by 2 cache lines (2 * 64 bytes) once hit
+}SMB, *PMb;
+
+}
+
+#endif//WELS_MACROBLOCK_H__
--- /dev/null
+++ b/codec/encoder/core/inc/svc_enc_slice_segment.h
@@ -1,0 +1,225 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_enc_slice_segment.h
+ *
+ * \brief	SSlice segment routine (Single slice/multiple slice/fmo arrangement exclusive)
+ *
+ * \date	2/4/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_SLICE_SEGMENT_H__
+#define WELS_SLICE_SEGMENT_H__
+
+#include "typedefs.h"
+#include "macros.h"
+#include "as264_common.h"
+#include "memory_align.h"
+
+#include "codec_app_def.h"
+namespace WelsSVCEnc {
+/*! 
+ * \brief	Slice mode: SliceMode is the 16-bit storage type, SliceModeEnum names its values
+ */
+typedef uint16_t SliceMode;
+typedef enum{
+	SM_SINGLE_SLICE         = 0,	// single slice in frame
+	SM_FIXEDSLCNUM_SLICE	= 1,	// NOTE(review): by name, a fixed number of slices -- confirm
+	SM_RASTER_SLICE			= 2,	// NOTE(review): by name, slices of raster-ordered MBs -- confirm
+	SM_ROWMB_SLICE			= 3,	// NOTE(review): by name, slices made of MB rows -- confirm
+	SM_DYN_SLICE			= 4,	// NOTE(review): by name, dynamic slicing -- confirm
+	SM_RESERVED				= 5
+}SliceModeEnum;
+
+
+// NOTE:
+// If PREFIX_NALs are used in the base layer (iDid=0, qid=0), MAX_SLICES_NUM will be half of MAX_NAL_UNITS_IN_LAYER in the ST case, or in the MT case without PACKING_ONE_SLICE_PER_LAYER. (NOTE(review): the macro below divides by 3, not 2 -- confirm which is intended.)
+// In the MT case with PACKING_ONE_SLICE_PER_LAYER, MAX_SLICES_NUM should not exceed MAX_LAYER_NUM_OF_FRAME.
+// For AVC cases, the maximal resolution we can support is up to (?x1024) for the SM_ROWMB_SLICE slice mode.
+// A finer solution for MAX_SLICES_NUM would use a variable instead of a macro, adapting to any resolution combined with any multiple-slice mode.
+#define SAVED_NALUNIT_NUM			( (MAX_SPATIAL_LAYER_NUM*MAX_QUALITY_LAYER_NUM) + 1 + MAX_SPATIAL_LAYER_NUM ) // SPS/PPS + SEI/SSEI + PADDING_NAL
+#define MAX_SLICES_NUM				( ( MAX_NAL_UNITS_IN_LAYER - SAVED_NALUNIT_NUM ) / 3 )	// MAX_SLICES_NUM is also constrained by the implementation: the uiSliceIdc stored in SSliceCtx.pOverallMbMap has to fit the byte range
+#define AVERSLICENUM_CONSTRAINT		(MAX_SLICES_NUM)			// used in sNalList initialization
+
+#define MIN_NUM_MB_PER_SLICE		48							// (128/16 * 96/16): the lowest resolution addressed for multiple slicing is 128x96 and above
+
+#define DEFAULT_MAXPACKETSIZE_CONSTRAINT	(1200)		//in bytes
+//#define MINPACKETSIZE_CONSTRAINT			(1200)
+
+#define AVER_MARGIN_BYTES						( 100 ) // safety margin in bytes
+#define JUMPPACKETSIZE_CONSTRAINT(max_byte)			( (max_byte) - AVER_MARGIN_BYTES ) // in bytes; max_byte is parenthesized so that expression arguments (e.g. a ?: expression) expand correctly
+#define JUMPPACKETSIZE_JUDGE(len,mb_idx,max_byte)	 ( (len) > JUMPPACKETSIZE_CONSTRAINT(max_byte) ) // true once len exceeds max_byte minus the margin; early test variant: ( (mb_idx+1)%40/*16slice for compare*/ == 0 )
+// mb_idx is only there for early tests and can be omitted in optimization
+
+typedef struct TagSliceArgument{
+	uint32_t			uiSliceMbNum[MAX_SLICES_NUM];   // this array is checked to decide the specific slicing, see note; presumably the MB count of each slice -- confirm
+	uint32_t			uiSliceSizeConstraint;	// presumably in bytes, as for the field of the same name in SSliceCtx -- confirm
+	int16_t				iSliceNum;	// number of slices
+} SSliceArgument;
+
+typedef struct TagMulSliceOption{ // slicing interface exposed to the application layer
+	SSliceArgument		sSliceArgument; // which elements of this structure actually take effect depends on uiSliceMode
+	SliceMode			uiSliceMode;	// a SliceModeEnum value
+} SMulSliceOption;
+
+/*! 
+ * \brief	Slice context
+ */
+/* Single/multiple slices */	
+typedef struct SlicepEncCtx_s{
+	SliceMode		uiSliceMode;			/* a SliceModeEnum value: SM_SINGLE_SLICE (0) is a single slice in frame, the other modes are multiple slices in frame */
+	int16_t			iMbWidth;			/* width of picture in MBs */
+	int16_t			iMbHeight;			/* height of picture in MBs */
+	int16_t			iSliceNumInFrame;	/* number of slices in frame */
+	int32_t			iMbNumInFrame;	/* number of MBs in frame */
+	uint8_t			*pOverallMbMap;	/* overall MB map in frame, stores the virtual slice idc */	
+	int16_t			*pFirstMbInSlice;	/* first MB address (top-left based) of every slice respectively */
+	int32_t			*pCountMbNumInSlice;	/* number of MBs in every slice respectively */
+	uint32_t		uiSliceSizeConstraint;/* in bytes */
+	int32_t			iMaxSliceNumConstraint;/* constraint on the maximal number of slices */
+} SSliceCtx;
+
+
+typedef struct TagDynamicSlicingStack{	// NOTE(review): looks like a snapshot of the bit-stream writer state taken during dynamic slicing -- confirm with the users of this structure
+	int32_t		iStartPos;	
+	int32_t		iCurrentPos;	
+
+	uint8_t		*pBsStackBufPtr;	// current writing position	
+	uint32_t    uiBsStackCurBits;  // NOTE(review): presumably a saved copy of the writer's uiCurBits -- confirm
+	int32_t		iBsStackLeftBits;	// NOTE(review): presumably a saved copy of the writer's iLeftBits -- confirm
+
+	int32_t		iMbSkipRunStack;	// NOTE(review): presumably a saved MB skip-run counter -- confirm
+} SDynamicSlicingStack;
+
+/*!
+ * \brief	Initialize Wels SSlice context (Single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context to be initialized
+ * \param	pMa				memory aligner (CMemoryAlign) -- presumably used for the allocations of the context, confirm
+ * \param	bFmoUseFlag	flag of using fmo
+ * \param	iMbWidth		MB width 
+ * \param	iMbHeight		MB height
+ * \param	pMulSliceOption	slice mode and argument for multiple slice if it is applicable
+ * \param	pPpsArg			argument for pPps parameter
+ *
+ * \return	0 - successful; non-zero - failed;
+ */
+int32_t InitSlicePEncCtx( SSliceCtx *pSliceCtx,
+						    CMemoryAlign *pMa,
+						    bool_t bFmoUseFlag,
+							int32_t iMbWidth,
+							int32_t iMbHeight,
+							SMulSliceOption *pMulSliceOption,
+							void *pPpsArg );
+
+
+/*!
+ * \brief	Uninitialize Wels SSlice context (Single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context to be uninitialized
+ * \param	pMa				memory aligner (CMemoryAlign) -- presumably the one given to InitSlicePEncCtx, confirm
+ * \return	NONE;
+ */
+void UninitSlicePEncCtx( SSliceCtx *pSliceCtx, CMemoryAlign *pMa );
+
+/*!
+ * \brief	Get the slice idc for the given kiMbXY (applies to single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context
+ * \param	kiMbXY			MB xy index (int16_t here, while SSliceCtx::iMbNumInFrame is int32_t)
+ *
+ * \return	uiSliceIdc - successful; (uint8_t)(-1) - failed;
+ */
+uint8_t WelsMbToSliceIdc( SSliceCtx *pSliceCtx, const int16_t kiMbXY );
+
+/*!
+ * \brief	Get the first mb of the slice/slice_group kiSliceIdc (applies to single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context
+ * \param	kiSliceIdc		slice idc
+ *
+ * \return	first_mb - successful; -1 - failed;
+ */
+int32_t WelsGetFirstMbOfSlice( SSliceCtx *pSliceCtx, const int32_t kiSliceIdc );
+
+/*!
+ * \brief	Get the successive mb to be processed in the slice/slice_group of kiMbXY (applies to single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context
+ * \param	kiMbXY			MB xy index
+ *
+ * \return	next_mb - successful; -1 - failed;
+ */
+int32_t WelsGetNextMbOfSlice( SSliceCtx *pSliceCtx, const int16_t kiMbXY );
+
+/*!
+ * \brief	Get the previous mb to be processed in the slice/slice_group of kiMbXY (applies to single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context
+ * \param	kiMbXY			MB xy index
+ *
+ * \return	prev_mb - successful; -1 - failed;
+ */
+int32_t WelsGetPrevMbOfSlice( SSliceCtx *pSliceCtx, const int16_t kiMbXY );
+
+/*!
+ * \brief	Get the number of mbs in the slice/slice_group kiSliceIdc (applies to single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context
+ * \param	kiSliceIdc		slice/slice_group idc
+ *
+ * \return	count_num_of_mb - successful; -1 - failed;
+ */
+int32_t WelsGetNumMbInSlice( SSliceCtx *pSliceCtx, const int32_t kiSliceIdc );
+
+/*!
+ *	Get slice count for multiple slice segment
+ *	GetInitialSliceNum takes the picture size in MBs plus the slicing option, GetCurrentSliceNum takes an existing slice context
+ */
+int32_t GetInitialSliceNum( const int32_t kiMbWidth, const int32_t kiMbHeight, SMulSliceOption* pMso );
+int32_t GetCurrentSliceNum( const SSliceCtx *kpSliceCtx );
+
+// parameter validity checks
+int32_t DynamicMaxSliceNumConstraint( uint32_t uiMaximumNum, int32_t uiConsumedNum, uint32_t uiDulplicateTimes  );
+
+bool_t CheckFixedSliceNumMultiSliceSetting( const int32_t kiMbNumInFrame,  SSliceArgument * pSliceArg );
+bool_t CheckRasterMultiSliceSetting( const int32_t kiMbNumInFrame, SSliceArgument * pSliceArg );
+bool_t CheckRowMbMultiSliceSetting( const int32_t kiMbWidth,  SSliceArgument * pSliceArg );
+
+void GomValidCheckSliceNum( const int32_t kiMbWidth, const int32_t kiMbHeight, int32_t *pSliceNum );
+void GomValidCheckSliceMbNum( const int32_t kiMbWidth, const int32_t kiMbHeight,  SSliceArgument * pSliceArg );
+// end of parameter validity checks
+
+int32_t DynamicAdjustSlicePEncCtxAll(	SSliceCtx *pSliceCtx,
+											int32_t *pRunLength	);
+}
+#endif//WELS_SLICE_SEGMENT_H__
--- /dev/null
+++ b/codec/encoder/core/inc/svc_encode_mb.h
@@ -1,0 +1,64 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_encode_mb.h
+ *
+ * \brief	interface for mb encoding
+ *
+ * \date	5/21/2009 Created
+ *
+ *************************************************************************************
+ */
+#if !defined(ENCODE_MB_H)
+#define ENCODE_MB_H
+
+
+#include "typedefs.h"
+#include "wels_common_basis.h"
+#include "slice.h"
+#include "bit_stream.h"
+#include "encoder_context.h"
+#include "wels_func_ptr_def.h"
+
+namespace WelsSVCEnc {
+void	WelsDctMb(int16_t* pRs, uint8_t* pEncMb, int32_t iEncStride, uint8_t* pBestPred, PDctFunc pfDctFourT4);
+// NOTE(review): by their names these cover intra 16x16 luma, intra 4x4 luma, inter luma, chroma and P-skip reconstruction -- confirm against the implementation
+void	WelsEncRecI16x16Y(sWelsEncCtx *pEncCtx, SMB *pCurMb, SMbCache *pMbCache);
+void	WelsEncRecI4x4Y( sWelsEncCtx *pEncCtx, SMB *pCurMb, SMbCache *pMbCache, uint8_t uiI4x4Idx);
+void	WelsEncInterY(SWelsFuncPtrList *func, SMB * pCurMb, SMbCache *pMbCache);
+void    WelsEncRecUV(SWelsFuncPtrList *func, SMB * pCurMb, SMbCache *pMbCache, int16_t * pRs, int32_t iUV);
+void    WelsRecPskip(SDqLayer *pCurDq, SWelsFuncPtrList *pFunc, SMB * pCurMb, SMbCache *pMbCache);
+// NOTE(review): by their names, P-skip trials for luma (Y) and chroma (UV); the meaning of the BOOL_T result is not visible in this header
+BOOL_T	WelsTryPYskip(sWelsEncCtx * pEncCtx, SMB * pCurMb, SMbCache *pMbCache);
+BOOL_T    WelsTryPUVskip(sWelsEncCtx * pEncCtx, SMB * pCurMb, SMbCache *pMbCache, int32_t iUV);
+}
+#endif
+
--- /dev/null
+++ b/codec/encoder/core/inc/svc_encode_slice.h
@@ -1,0 +1,100 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_encode_slice.h
+ *
+ * \brief	svc encoding slice 
+ *
+ * \date	2009.07.27 Created
+ *
+ *************************************************************************************
+ */
+#ifndef SVC_ENCODE_SLICE_H__
+#define SVC_ENCODE_SLICE_H__
+
+#include "encoder_context.h"
+#include "as264_common.h"
+#include "svc_enc_macroblock.h"
+#include "mb_cache.h"
+
+namespace WelsSVCEnc {
+#if defined(MB_TYPES_CHECK)	// the MB type counter is only declared when MB_TYPES_CHECK is defined
+void WelsCountMbType(int32_t (*iMbCount)[18], const EWelsSliceType eSt, const SMB* pMb);
+#endif
+
+// NOTE(review): by name, refreshes the non-zero-count cache in pMbCache for pMb -- confirm
+void UpdateNonZeroCountCache(SMB *pMb, SMbCache *pMbCache);
+
+//for P SSlice (intra part + inter part, MB level)
+void OutputPMbWithoutConstructCsRsNoCopy( sWelsEncCtx *pEncCtx, SDqLayer* pDq, SSlice *pSlice, SMB* pMb );
+// slice header (and extension) initialization
+void WelsSliceHeaderScalExtInit( SDqLayer* pCurLayer, SSlice *pSlice );
+void WelsSliceHeaderExtInit( sWelsEncCtx* pEncCtx, SDqLayer* pCurLayer, SSlice *pSlice );
+// slice header (and extension) writing into the bit stream pBs
+void WelsSliceHeaderWrite( SBitStringAux* pBs, SDqLayer* pCurLayer, SSlice *pSlice, uint32_t uiPpsIdBasis );
+void WelsSliceHeaderExtWrite( SBitStringAux* pBs, SDqLayer* pCurLayer, SSlice *pSlice, uint32_t uiPpsIdBasis );
+
+//===================MB-level encode====================//
+void WelsInterMbEncode( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb );//only for inter part
+//for I SSlice (only intra part, MB level)
+void WelsIMbChromaEncode( sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache *pMbCache );
+//for P SSlice (intra part + inter part, MB level)
+void WelsPMbChromaEncode( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb );
+
+
+//===================Slice-level encode====================//
+//encapsulation func: store base rec, highest Dependency Layer(only one quality) rec, single layer rec
+void WelsPSliceMdEnc( sWelsEncCtx* pEncCtx, SSlice *pSlice,  const bool_t kbIsHighestDlayerFlag );
+void WelsPSliceMdEncDynamic( sWelsEncCtx* pEncCtx, SSlice *pSlice,  const bool_t kbIsHighestDlayerFlag );
+
+//encapsulation func: store base rec, highest Dependency Layer(only one quality) rec, single layer rec
+void WelsISliceMdEnc( sWelsEncCtx* pEncCtx, SSlice *pSlice );	// for intra non-dynamic slice
+void WelsISliceMdEncDynamic( sWelsEncCtx* pEncCtx, SSlice *pSlice );	// for intra dynamic slice
+
+void WelsCodePSlice( sWelsEncCtx* pEncCtx, SSlice *pSlice );
+void WelsCodePOverDynamicSlice( sWelsEncCtx* pEncCtx, SSlice *pSlice );
+
+void WelsCodeOneSlice( sWelsEncCtx* pEncCtx, const int32_t kiSliceIdx, const int32_t/*EWelsNalUnitType*/ keNalType/*, bool_t bNewLayer*/ );
+// NOTE(review): by name, sets up the slice encoding function pointers according to the CPU feature flags -- confirm
+void WelsInitSliceEncodingFuncs( uint32_t uiCpuFlag );
+
+void UpdateMbNeighbourInfoForNextSlice(	SSliceCtx *pSliceCtx,
+											 SMB *pMbList,
+											 const int32_t kiNextSliceFirstMbIdx,
+											 const int32_t kiLastMbIdxInPartition );
+void AddSliceBoundary(sWelsEncCtx* pEncCtx, SSlice * pCurSlice, SSliceCtx *pSliceCtx, SMB* pCurMb, int32_t iNextSliceFirstMbIdx, const int32_t kiLastMbIdxInPartition );
+void WelsMdInterMbLoop( sWelsEncCtx* pEncCtx, SSlice *pSlice, void* pMd, const int32_t kiSliceFirstMbXY );	// for inter non-dynamic slice
+void WelsMdInterMbLoopOverDynamicSlice( sWelsEncCtx* pEncCtx, SSlice *pSlice, void* pMd, const int32_t kiSliceFirstMbXY );	// for inter dynamic slice
+
+// pEncCtx and pSlice are passed untyped (void*); pDss is the dynamic slicing stack (SDynamicSlicingStack)
+BOOL_T DynSlcJudgeSliceBoundaryStepBack(void *pEncCtx, void *pSlice, SSliceCtx *pSliceCtx, SMB* pCurMb, SDynamicSlicingStack* pDss );
+}
+#endif //SVC_ENCODE_SLICE_H__
--- /dev/null
+++ b/codec/encoder/core/inc/svc_mode_decision.h
@@ -1,0 +1,61 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_mode_decision.h
+ *
+ * \brief	SVC Spatial Enhancement Layer MD
+ *
+ * \date	2009.7.29 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef SVC_MODE_DECISION_H
+#define SVC_MODE_DECISION_H
+#include "encoder_context.h"
+#include "svc_enc_macroblock.h"
+#include "md.h"
+
+
+namespace WelsSVCEnc {
+////////////////////////
+// INTERFACE, called by svc_encode_slice.c
+///////////////////////
+
+// NOILP ILFMD ENTRANCE -- NOTE(review): the NOILP/ILFMD abbreviations are not explained in this header; pEnc and pMd below are passed untyped (void*)
+void WelsMdSpatialelInterMbIlfmdNoilp( sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice *pSlice, SMB* pCurMb, const Mb_Type kuiRefMbType);
+void WelsMdInterMbEnhancelayer( void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache );
+// NOTE(review): by name, GetRefMb returns the reference MB that belongs to pCurMb and SetMvBaseEnhancelayer derives the base MV from it -- confirm
+SMB* GetRefMb( SDqLayer *pCurLayer, SMB *pCurMb );
+void SetMvBaseEnhancelayer( SWelsMD* pMd, SMB *pCurMb, const SMB *kpRefMb );
+}
+#endif //SVC_MODE_DECISION_H
+
--- /dev/null
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -1,0 +1,131 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_motion_estimate.h
+ *
+ * \brief	Interfaces introduced in svc mb motion estimation
+ *
+ * \date	08/11/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef SVC_MOTION_ESTIMATE_
+#define SVC_MOTION_ESTIMATE_
+
+#include "typedefs.h"
+#include "encoder_context.h"
+#include "wels_func_ptr_def.h"
+
+namespace WelsSVCEnc {
+#define MV_RANGE (64)	// NOTE(review): by name the motion vector search range; the unit is not visible here
+#define	ITERATIVE_TIMES	(16)	// NOTE(review): by name the iteration count of the iterative search -- confirm
+#define	BASE_MV_MB_NMB	((2*(MV_RANGE+ITERATIVE_TIMES)/MB_WIDTH_LUMA)-1)	// (2 * (64 + 16) / MB_WIDTH_LUMA) - 1; MB_WIDTH_LUMA is defined elsewhere
+
+union SadPredISatdUnit{	// both members share the same 16 bits of storage
+	uint16_t	uiSadPred;
+	uint16_t	uiSatd;    // reuses the sad_pred storage as temporary satd data
+};
+typedef struct TagWelsME {
+    /* input */
+	uint16_t					*pMvdCost;	// NOTE(review): presumably the cost table that COST_MVD indexes -- confirm
+    union SadPredISatdUnit	uSadPredISatd; // the sad_pred storage is reused as temporary data
+	uint16_t					uiSadCost;  //used by ME and RC 
+    uint16_t					uiSatdCost; /* satd + lm * nbits */
+    uint8_t						uiPixel;   /* PIXEL_WxH */
+    uint8_t						uiReserved;
+	
+    uint8_t						*pEncMb;	// NOTE(review): by name, pixels of the MB being encoded -- confirm
+    uint8_t						*pRefMb;	// NOTE(review): by name, pixels of the reference MB -- confirm
+
+	SMVUnitXY					sMvp;	// NOTE(review): by name, the motion vector predictor -- confirm
+	SMVUnitXY					sMvBase;
+	/* output */
+    SMVUnitXY					sMv;	// the only member listed as output
+}SWelsME;
+
+#define  COST_MVD(table, mx, my)  ((table)[mx] + (table)[my])	// sum of the table entries at mx and my; table is parenthesized so that pointer expressions expand correctly
+
+
+
+/*!
+ * \brief	BL mb motion estimate search (declared below in a Satd and a Sad variant)
+ *
+ * \param	pFuncList	function pointer list
+ * \param	pLplayer, pLpme, pLpslice	untyped (void*) -- presumably the layer, the Wels me information and the slice; confirm with the implementation
+ *
+ * \return	NONE
+ */
+void WelsMotionEstimateSearchSatd(SWelsFuncPtrList *pFuncList, void* pLplayer, void* pLpme, void* pLpslice );
+
+void WelsMotionEstimateSearchSad(SWelsFuncPtrList *pFuncList, void* pLplayer, void* pLpme, void* pLpslice );
+
+
+
+/*!
+ * \brief	BL mb motion estimate initial point testing
+ * NOTE(review): no declaration follows this comment block in this header; it looks stale -- confirm and remove
+ * \param	enc			Wels encoder context
+ * \param	m	        Wels me information
+ * \param	mv_range	search range in motion estimate
+ * \param	point	    the best match point in motion estimation
+ *
+ * \return	NONE
+ */
+
+
+/*!
+ * \brief	EL mb motion estimate initial point testing
+ *
+ * \param	pFuncList	function pointer list
+ * \param	pMe	        Wels me information
+ * \param	pSlice		slice -- NOTE(review): its role is not visible in this header, confirm with the implementation
+ * \param	kiStrideEnc	stride -- presumably of the picture being encoded, confirm
+ * \param	kiStrideRef	stride -- presumably of the reference picture, confirm
+ * \return	NONE
+ */
+
+void WelsMotionEstimateInitialPoint(SWelsFuncPtrList *pFuncList, SWelsME *pMe, SSlice *pSlice, const int32_t kiStrideEnc, const int32_t kiStrideRef );
+
+/*!
+ * \brief	mb iterative motion estimate search
+ *
+ * \param	pFuncList	function pointer list
+ * \param	pMe	        Wels me information
+ * \param	kiStrideEnc, kiStrideRef	strides -- presumably of the encoded and the reference picture, confirm
+ * \param	pRef		NOTE(review): by name a pointer into the reference picture -- confirm
+ * \return	NONE
+ */
+void WelsMotionEstimateIterativeSearch( SWelsFuncPtrList *pFuncList, SWelsME *pMe, const int32_t kiStrideEnc, const int32_t kiStrideRef, uint8_t *pRef );
+
+bool_t WelsMeSadCostSelect( int32_t *pSadCost, const uint16_t *kpMvdCost, int32_t *pBestCost, const int32_t kiDx, const int32_t kiDy, int32_t *pIx, int32_t *pIy);
+
+}
+#endif
--- /dev/null
+++ b/codec/encoder/core/inc/svc_set_mb_syn_cavlc.h
@@ -1,0 +1,66 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_set_mb_syn_cavlc.h
+ *
+ * \brief	Setting all syntax elements of mb and decoding residual with cavlc
+ *
+ * \date	2009.8.12 Created 
+ *
+ *************************************************************************************
+ */
+ 
+#ifndef SVC_SET_MB_SYN_CAVLC_H_
+#define SVC_SET_MB_SYN_CAVLC_H_
+
+#include "typedefs.h"
+#include "wels_common_basis.h"
+#include "encoder_context.h"
+#include "md.h"
+
+#include "set_mb_syn_cavlc.h"
+
+namespace WelsSVCEnc {
+//#pragma pack(1)
+// NOTE(review): by its name and parameters, writes the residual of pCurMb into the bit stream pBs -- confirm
+void WelsWriteMbResidual( SMbCache* sMbCacheInfo, SMB *pCurMb, SBitStringAux *pBs );
+
+// for enhancement-layer CAVLC writing
+void WelsSpatialWriteSubMbPred( sWelsEncCtx *pEncCtx, SSlice *pSlice, SMB *pCurMb );
+
+void WelsSpatialWriteMbPred( sWelsEncCtx *pEncCtx, SSlice *pSlice, SMB *pCurMb );
+
+// for base-layer CAVLC writing
+void WelsSpatialWriteMbSyn( sWelsEncCtx *pEncCtx, SSlice *pSlice, SMB *pCurMb );
+
+//#pragma pack()
+}
+#endif
--- /dev/null
+++ b/codec/encoder/core/inc/trace.h
@@ -1,0 +1,68 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef TRACE_H_
+#define TRACE_H_
+
+#include "typedefs.h"
+#include "bit_stream.h"
+#include <stdio.h>
+
+// Trace helpers: each macro prints through fprintf with %d conversions and then flushes fp. NOTE(review): the expansions are bare { } blocks, so "TRACE_X(...);" leaves an empty statement behind and cannot sit between an if and its else.
+#define TRACE_MB_ID(fp, value) \
+			{\
+			fprintf(fp, "----------MB index:	%d ----------\n", value);\
+			fflush(fp);\
+			}
+#define TRACE_FRAME_ID(fp, value) \
+			{\
+			fprintf(fp, "----------Frame index:	%d ----------\n", value);\
+			fflush(fp);\
+			}
+// TRACE_VALUE prints "(value)" and TRACE_VALUE_2 prints "(value1,value2)", each followed by a newline
+#define TRACE_VALUE(fp, value) \
+			{\
+				fprintf(fp, "(%d)\n", value);\
+				fflush(fp);\
+			}
+#define TRACE_VALUE_2(fp, value1, value2) \
+			{\
+			fprintf(fp, "(%d,%d)\n", value1, value2);\
+			fflush(fp);\
+			}
+// NOTE(review): what TraceName prints for pName is not visible in this header
+void TraceName(FILE *pFp, int8_t *pName, SBitStringAux *pBs);
+// takes a start and an end position plus the bit stream; NOTE(review): the unit (bits or bytes) is not visible here
+void TraceBits(FILE *pFp, uint32_t uiStart, uint32_t uiEnd, SBitStringAux *pBs);
+
+
+#endif
--- /dev/null
+++ b/codec/encoder/core/inc/typedefs.h
@@ -1,0 +1,88 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+// typedefs.h
+#ifndef WELS_TYPE_DEFINES_H__
+#define WELS_TYPE_DEFINES_H__
+
+#include <limits.h>
+
+////////////////////////////////////////////////////////////////////////////
+// NOTICE : ALL internal implementations MUST use ONLY the data types defined
+//          below; the sole exception is the interface files !!!!!
+////////////////////////////////////////////////////////////////////////////
+
+#ifndef  _MSC_VER
+// Non-MSVC toolchains: the fixed-width integer types come from the standard header.
+#include <stdint.h>
+
+#else
+// MSVC: the same fixed-width names are declared by hand from its built-in types.
+// FIXME:     all signed types should be declared explicitly, for example, int8_t should be declared as signed char.
+typedef signed char      int8_t  ;
+typedef unsigned char    uint8_t ;
+typedef short            int16_t ;
+typedef unsigned short   uint16_t;
+typedef int              int32_t ;
+typedef unsigned int     uint32_t;
+typedef __int64          int64_t ;
+typedef unsigned __int64 uint64_t;
+
+#endif // _MSC_VER defined
+
+// FIXME:     all string types should be declared explicitly as char.
+typedef char      str_t;
+typedef float     real32_t;	// alias for float (presumably 32-bit on the supported targets -- confirm)
+// EPSN: float tolerance of 1e-6; a definition made by an earlier header is discarded so this value always applies.
+#ifdef EPSN
+#undef EPSN
+#endif//EPSN
+#define EPSN	  (0.000001f) // (1e-6)	// desired float precision
+// Fallback for translation units in which no earlier header has defined NULL.
+#ifndef NULL
+#define NULL 0
+#endif
+
+
+typedef bool bool_t;	// needs 'bool' to be a known type here: C++, or C with <stdbool.h> already included
+typedef int32_t BOOL_T;	// 32-bit integer flag, distinct from bool_t
+// FALSE / TRUE are int32_t constants (same type as BOOL_T); definitions made by earlier headers are kept.
+#ifndef FALSE
+#define FALSE   ((int32_t)0)
+#endif//FALSE
+
+#ifndef TRUE
+#define TRUE    ((int32_t)1)
+#endif//TRUE
+
+
+#endif //WELS_TYPE_DEFINES_H__
--- /dev/null
+++ b/codec/encoder/core/inc/utils.h
@@ -1,0 +1,186 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \brief	Tool kits for encoder
+ *		( malloc, realloc, free, log output and PSNR calculation and so on )
+ *
+ * \date	03/10/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_UTILS_H__
+#define WELS_UTILS_H__
+
+#include <stdarg.h>
+#include "typedefs.h"
+
+namespace WelsSVCEnc {
+
+
+/*
+ *	Log output routines
+ */
+
+typedef int32_t	iWelsLogLevel;	// alias for int32_t (presumably meant to hold the level values below -- confirm)
+enum{
+	WELS_LOG_QUIET		= 0x00,		// Quiet mode
+	WELS_LOG_ERROR		= 1 << 0,	// Error log level
+	WELS_LOG_WARNING	= 1 << 1,	// Warning log level
+	WELS_LOG_INFO		= 1 << 2,	// Information log level
+	WELS_LOG_DEBUG		= 1 << 3,	// Debug log level
+	WELS_LOG_RESV		= 1 << 4,	// Reserved log level
+	WELS_LOG_LEVEL_COUNT= 5,		// number of single-bit levels above (ERROR .. RESV)
+	WELS_LOG_DEFAULT	= WELS_LOG_ERROR | WELS_LOG_WARNING | WELS_LOG_INFO | WELS_LOG_DEBUG	// Default log level in Wels codec
+};
+
+/*
+ *	Function pointer types for the pluggable tool sets (log output, PSNR)
+ */
+// wels log output: receives the context pointer, a level, a format string and the matching va_list
+typedef void (*PWelsLogCallbackFunc)(void *pCtx, const int32_t iLevel, const str_t *kpFmt, va_list argv);
+
+// wels psnr calc: two pictures (pointer + stride each) followed by a width and a height
+typedef real32_t (*PWelsPsnrFunc)(	const void *kpTarPic,
+										const int32_t kiTarStride,
+										const void *kpRefPic,
+										const int32_t kiRefStride,
+										const int32_t kiWidth,
+										const int32_t kiHeight	);
+
+extern PWelsLogCallbackFunc	wlog;	// log callback pointer defined elsewhere (presumably what WelsSetLogCallback installs -- confirm)
+// Variadic logging entry point; under GCC-compatible compilers kpFmt is also checked printf-style against the arguments.
+#ifdef __GNUC__
+extern void WelsLog(void *pCtx, int32_t iLevel, const str_t *kpFmt, ...) __attribute__ ((__format__ (__printf__, 3, 4)));
+#else
+extern void WelsLog(void *pCtx, int32_t iLevel, const str_t *kpFmt, ...);
+#endif
+// Array of log tag strings; its length and indexing scheme are not visible in this header.
+extern const str_t *g_sWelsLogTags[];
+
+/*!
+ *************************************************************************************
+ * \brief	System trace log output in Wels
+ *
+ * \param	pCtx	instance pointer
+ * \param	kiLevel	log level ( WELS_LOG_QUIET, ERROR, WARNING, INFO, DEBUG )
+ * \param	kpFmtStr	format string to expand
+ * \param 	argv	argument list matching kpFmtStr
+ *
+ * \return	NONE
+ *
+ * \note	both functions have the PWelsLogCallbackFunc signature; WelsLogNil is presumably a sink that discards the message -- confirm
+ *************************************************************************************
+ */
+void WelsLogDefault( void *pCtx, const int32_t kiLevel, const str_t *kpFmtStr, va_list argv );
+void WelsLogNil( void *pCtx, const int32_t kiLevel, const str_t *kpFmtStr, va_list argv );
+
+
+/*!
+ *************************************************************************************
+ * \brief	set the log level from an external call
+ *
+ * \param	kiLevel	log level to apply (presumably a combination of the WELS_LOG_* values -- confirm)
+ *
+ * \return	NONE
+ *
+ * \note	allows the log level to be controlled dynamically
+ *************************************************************************************
+ */
+void WelsSetLogLevel( const int32_t kiLevel );
+
+/*!
+ *************************************************************************************
+ * \brief	get the log level from an external call
+ *
+ * \param	N/A
+ *
+ * \return	log level currently used inside the codec
+ *
+ * \note	lets the caller query the log level the codec is applying
+ *************************************************************************************
+ */
+int32_t WelsGetLogLevel( void );
+
+/*!
+ *************************************************************************************
+ * \brief	set the log callback from an external call
+ *
+ * \param	_log	log output routine of type PWelsLogCallbackFunc
+ *
+ * \return	NONE
+ *
+ * \note	WelsLogDefault and WelsLogNil declared in this header have the required signature
+ *************************************************************************************
+ */
+void WelsSetLogCallback( PWelsLogCallbackFunc _log );
+
+/*!
+*************************************************************************************
+* \brief	reopen the log file once the current path has been set
+*
+* \param	pCtx		context pointer
+* \param	pCurPath	current path string (non-const; whether it is modified is not visible here)
+*
+* \return	NONE
+*
+* \note	N/A
+*************************************************************************************
+*/
+void WelsReopenTraceFile( void *pCtx, str_t *pCurPath );
+
+/*
+ *	PSNR calculation routines
+ */
+/*!
+ *************************************************************************************
+ * \brief	PSNR calculation utility in Wels
+ *
+ * \param	kpTarPic		target picture samples to be measured
+ * \param	kiTarStride	stride of the target picture data buffer
+ * \param 	kpRefPic		base reference picture samples
+ * \param	kiRefStride	stride of the reference picture data buffer
+ * \param	kiWidth		picture width in pixels
+ * \param	kiHeight		picture height in pixels
+ *
+ * \return	actual PSNR result;
+ *
+ * \note	parameter list is identical to the PWelsPsnrFunc pointer type declared in this header
+ *************************************************************************************
+ */
+real32_t WelsCalcPsnr(	const void *kpTarPic,
+							const int32_t kiTarStride,
+							const void *kpRefPic,
+							const int32_t kiRefStride,
+							const int32_t kiWidth,
+							const int32_t kiHeight );
+
+}
+#endif//WELS_UTILS_H__
--- /dev/null
+++ b/codec/encoder/core/inc/vlc_encoder.h
@@ -1,0 +1,97 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef WELS_VLC_ENCODER_H__
+#define WELS_VLC_ENCODER_H__
+
+#include "bit_stream.h"
+#include "svc_enc_golomb.h"
+
+/************************************************************************/
+/* VLC FOR WELS ENCODER                                                 */
+/************************************************************************/
+
+namespace WelsSVCEnc {
+
+//g_kuiVlcCoeffToken[uiNc][total-coeff][trailing-ones][0--value, 1--bit count]
+extern const uint8_t g_kuiVlcCoeffToken[5][17][4][2];
+extern const uint8_t g_kuiVlcLevelPrefix[15][2];	// not referenced by the inline writers in this header
+//g_kuiVlcTotalZeros[tzVlcIndex][uiTotalZeros][0--value, 1--bit count]
+extern const uint8_t g_kuiVlcTotalZeros[16][16][2];
+extern const uint8_t g_kuiVlcTotalZerosChromaDc[4][4][2];	// read by WriteTotalZerosChromaDc as [uiTotalCoeff][uiTotalZeros]
+//add for mgs
+extern const uint8_t g_kuiVlcTotalZerosChromaDc422[8][8][2];
+//g_kuiVlcRunBefore[zeros-left][run-before][0--value, 1--bit count]
+extern const uint8_t g_kuiVlcRunBefore[8][15][2];
+extern const ALIGNED_DECLARE(uint8_t, g_kuiEncNcMapTable[18], 16);	// maps uiNc to the first index of g_kuiVlcCoeffToken
+// NOTE(review): 17 is the last slot of g_kuiEncNcMapTable[18]; presumably the nC value reserved for chroma DC -- confirm.
+#define    CHROMA_DC_NC_OFFSET       17
+
+static inline int32_t WriteTotalCoeffTrailingones( SBitStringAux *pBs, uint8_t uiNc, uint8_t uiTotalCoeff, uint8_t uiTrailingOnes ) {
+	// uiNc is first mapped to a row of g_kuiVlcCoeffToken; the entry found there is { value, bit count }.
+	const uint8_t kuiTableRow	= g_kuiEncNcMapTable[uiNc];
+	const uint8_t kuiCodeValue	= g_kuiVlcCoeffToken[kuiTableRow][uiTotalCoeff][uiTrailingOnes][0];
+	return BsWriteBits( pBs, g_kuiVlcCoeffToken[kuiTableRow][uiTotalCoeff][uiTrailingOnes][1], kuiCodeValue );
+}
+
+static inline int32_t WriteTotalcoeffTrailingonesChroma( SBitStringAux *pBs, uint8_t uiTotalCoeff, uint8_t uiTrailingOnes ) {
+	const uint8_t kuiBitCount	= g_kuiVlcCoeffToken[4][uiTotalCoeff][uiTrailingOnes][1];	// table row 4 is hard-wired here
+	const uint8_t kuiCodeValue	= g_kuiVlcCoeffToken[4][uiTotalCoeff][uiTrailingOnes][0];
+	return BsWriteBits( pBs, kuiBitCount, kuiCodeValue );
+}
+
+// kuiZeroCount = level_prefix; written as (kuiZeroCount + 1) bits holding the value 1. Always returns 0.
+static inline int32_t WriteLevelPrefix( SBitStringAux *pBs, const uint32_t kuiZeroCount ) {
+	// NOTE(review): the BsWriteBits result is dropped here, unlike the other writers in this header.
+	(void) BsWriteBits( pBs, kuiZeroCount + 1, 1 );
+	return 0;
+}
+
+static inline int32_t WriteTotalZeros( SBitStringAux *pBs, uint32_t uiTotalCoeff, uint32_t uiTotalZeros ) {
+	const uint8_t kuiBitCount	= g_kuiVlcTotalZeros[uiTotalCoeff][uiTotalZeros][1];	// entry layout: [0] value, [1] bit count
+	const uint8_t kuiCodeValue	= g_kuiVlcTotalZeros[uiTotalCoeff][uiTotalZeros][0];
+	return BsWriteBits( pBs, kuiBitCount, kuiCodeValue );
+}
+
+static inline int32_t WriteTotalZerosChromaDc( SBitStringAux *pBs, uint32_t uiTotalCoeff, uint32_t uiTotalZeros ) {
+	const uint8_t kuiBitCount	= g_kuiVlcTotalZerosChromaDc[uiTotalCoeff][uiTotalZeros][1];	// second argument of BsWriteBits
+	const uint8_t kuiCodeValue	= g_kuiVlcTotalZerosChromaDc[uiTotalCoeff][uiTotalZeros][0];	// third argument of BsWriteBits
+	return BsWriteBits( pBs, kuiBitCount, kuiCodeValue );
+}
+
+static inline int32_t WriteRunBefore( SBitStringAux *pBs, uint8_t uiZeroLeft, uint8_t uiRunBefore ) {
+	const uint8_t kuiBitCount	= g_kuiVlcRunBefore[uiZeroLeft][uiRunBefore][1];	// entry layout: [0] value, [1] bit count
+	const uint8_t kuiCodeValue	= g_kuiVlcRunBefore[uiZeroLeft][uiRunBefore][0];
+	return BsWriteBits( pBs, kuiBitCount, kuiCodeValue );
+}
+}
+#endif
--- /dev/null
+++ b/codec/encoder/core/inc/wels_common_basis.h
@@ -1,0 +1,399 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//wels_common_basis.h
+#ifndef WELS_COMMON_BASIS_H__
+#define WELS_COMMON_BASIS_H__
+
+#include "typedefs.h"
+#include "macros.h"
+
+#include "wels_const.h" 
+
+
+namespace WelsSVCEnc {
+/* Commonly used tables, defined elsewhere; both have 52 rows/entries. */
+// NOTE(review): 52 presumably spans QP 0..51 -- confirm against the table definitions.
+extern const  ALIGNED_DECLARE(uint16_t, g_kuiDequantCoeff[52][8], 16);
+extern const uint8_t g_kuiChromaQpTable[52];
+
+/* Profile IDC: numeric profile identifiers. */
+// NOTE(review): H.264 Annex A assigns 244 to High 4:4:4 Predictive and 44 to CAVLC 4:4:4 Intra; the two 4:4:4 names below are kept as found -- confirm before use.
+enum EProfileIdc{
+	PRO_BASELINE	= 66,
+	PRO_MAIN		= 77,
+	PRO_EXTENDED	= 88,
+	PRO_HIGH		= 100,
+	PRO_HIGH10		= 110,
+	PRO_HIGH422		= 122,
+	PRO_HIGH444		= 144,
+	PRO_CAVLC444	= 244,
+	/* scalable profiles */
+	PRO_SCALABLE_BASELINE	= 83,
+	PRO_SCALABLE_HIGH		= 86	/* no trailing comma: not accepted before C++11 */
+};
+
+/*
+ *	NAL Unit Type (5 Bits): all 32 values, 0..31, are enumerated
+ */
+enum EWelsNalUnitType
+{
+	NAL_UNIT_UNSPEC_0			= 0,
+	NAL_UNIT_CODED_SLICE		= 1,
+	NAL_UNIT_CODED_SLICE_DPA	= 2,
+	NAL_UNIT_CODED_SLICE_DPB	= 3,
+	NAL_UNIT_CODED_SLICE_DPC	= 4,
+	NAL_UNIT_CODED_SLICE_IDR	= 5,
+	NAL_UNIT_SEI				= 6,
+	NAL_UNIT_SPS				= 7,
+	NAL_UNIT_PPS				= 8,
+	NAL_UNIT_AU_DELIMITER		= 9,
+	NAL_UNIT_END_OF_SEQ			= 10,
+	NAL_UNIT_END_OF_STR			= 11,
+	NAL_UNIT_FILLER_DATA		= 12,
+	NAL_UNIT_SPS_EXT			= 13,
+	NAL_UNIT_PREFIX				= 14,	// SVC NAL type, see IS_NEW_INTRODUCED_SVC_NAL
+	NAL_UNIT_SUBSET_SPS			= 15,	// counted as a parameter set by IS_PARAM_SETS_NALS
+	NAL_UNIT_RESV_16			= 16,
+	NAL_UNIT_RESV_17			= 17,
+	NAL_UNIT_RESV_18			= 18,
+	NAL_UNIT_AUX_CODED_SLICE	= 19,
+	NAL_UNIT_CODED_SLICE_EXT	= 20,	// SVC NAL type, see IS_NEW_INTRODUCED_SVC_NAL
+	NAL_UNIT_RESV_21			= 21,
+	NAL_UNIT_RESV_22			= 22,
+	NAL_UNIT_RESV_23			= 23,
+	NAL_UNIT_UNSPEC_24			= 24,
+	NAL_UNIT_UNSPEC_25			= 25,
+	NAL_UNIT_UNSPEC_26			= 26,
+	NAL_UNIT_UNSPEC_27			= 27,
+	NAL_UNIT_UNSPEC_28			= 28,
+	NAL_UNIT_UNSPEC_29			= 29,
+	NAL_UNIT_UNSPEC_30			= 30,
+	NAL_UNIT_UNSPEC_31			= 31
+};
+
+/*
+ *	NAL Reference IDC (2 Bits): priority from lowest (0) to highest (3)
+ */
+
+enum EWelsNalRefIdc
+{
+	NRI_PRI_LOWEST	= 0,
+	NRI_PRI_LOW		= 1,
+	NRI_PRI_HIGH	= 2,
+	NRI_PRI_HIGHEST	= 3
+};
+
+/*
+ * VCL type: classification stored in g_keTypeMap for each NAL unit type
+ */
+
+enum EVclType{
+	NON_VCL			= 0,
+	VCL				= 1,	// the value IS_VCL_NAL tests for
+	NOT_APP			= 2		// presumably "not applicable" -- confirm
+};
+
+/*
+ *	vcl type map for given NAL unit type and corresponding H264 type (0: AVC; 1: SVC).
+ */
+extern const EVclType g_keTypeMap[32][2];
+// NAL-type predicates. Several of them expand (t) more than once, so pass an expression without side effects.
+#define IS_VCL_NAL(t, ext_idx)			(g_keTypeMap[t][ext_idx] == VCL)
+#define IS_PARAM_SETS_NALS(t)			( (t) == NAL_UNIT_SPS || (t) == NAL_UNIT_PPS || (t) == NAL_UNIT_SUBSET_SPS )
+#define IS_SPS_NAL(t)					( (t) == NAL_UNIT_SPS )
+#define IS_SUBSET_SPS_NAL(t)			( (t) == NAL_UNIT_SUBSET_SPS )
+#define IS_PPS_NAL(t)					( (t) == NAL_UNIT_PPS )
+#define IS_SEI_NAL(t)					( (t) == NAL_UNIT_SEI )
+#define IS_PREFIX_NAL(t)				( (t) == NAL_UNIT_PREFIX )
+#define IS_SUBSET_SPS_USED(t)			( (t) == NAL_UNIT_SUBSET_SPS || (t) == NAL_UNIT_CODED_SLICE_EXT )
+#define IS_VCL_NAL_AVC_BASE(t)			( (t) == NAL_UNIT_CODED_SLICE || (t) == NAL_UNIT_CODED_SLICE_IDR )
+#define IS_NEW_INTRODUCED_SVC_NAL(t)	( (t) == NAL_UNIT_PREFIX || (t) == NAL_UNIT_CODED_SLICE_EXT )
+
+/*
+ *	Frame types used in internal encoder (logic level based)
+ */
+enum EFrameType{
+	WELS_FRAME_TYPE_AUTO	= 0x0000,	/* Let encoder engine choose the proper type, RDO or scene change based */
+	WELS_FRAME_TYPE_IDR		= 0x0001,	/* IDR, I frame with parameter sets */
+	WELS_FRAME_TYPE_I		= 0x0002,	/* I Frame */
+	WELS_FRAME_TYPE_P		= 0x0003,	/* P Frame */
+	WELS_FRAME_TYPE_B		= 0x0004,	/* B Frame */
+	WELS_FRAME_TYPE_SKIP	= 0x0008	/* skip; values 1..4 above are consecutive numbers, not independent bit flags */
+};
+
+/* Base slice types
+ * A slice type above 9 is invalid.
+ * A slice type above 4 has to be folded back onto the values below before use.
+ * NOTE(review): the original note folds by "minus 4"; H.264 maps 5..9 onto 0..4 (minus 5) -- confirm.
+ */
+
+enum EWelsSliceType
+{
+	P_SLICE	= 0,
+	B_SLICE	= 1,
+	I_SLICE	= 2,
+	SP_SLICE= 3,
+	SI_SLICE= 4,
+	UNKNOWN_SLICE= 5
+};
+
+/* Slice types in the scalable extension; each entry's comment lists the two slice type numbers that map to it. */
+enum ESliceTypeExt{
+	EP_SLICE = 0,	// EP_SLICE: 0, 5
+	EB_SLICE = 1,	// EB_SLICE: 1, 6
+	EI_SLICE = 2	// EI_SLICE: 2, 7
+};
+
+/* List Index: reference list selector */
+enum EListIndex{
+	LIST_0	= 0,
+	LIST_1	= 1,
+	LIST_A	= 2	// presumably the number of lists ("all") -- confirm
+};
+
+
+struct SMVUnitXY {			// two int16_t fields: 4 bytes each unit
+	int16_t		iMvX;
+	int16_t		iMvY;
+
+	// Stores (_v0 - _v1), component by component, into this object and returns a reference to it.
+	SMVUnitXY& sDeltaMv ( const SMVUnitXY& _v0, const SMVUnitXY& _v1 ) {
+		iMvX = static_cast<int16_t>( _v0.iMvX - _v1.iMvX );
+		iMvY = static_cast<int16_t>( _v0.iMvY - _v1.iMvY );
+		return *this;
+	}
+};
+
+typedef struct TagMVComponentUnit{		// one instance per list (LIST_0 / LIST_1)
+	SMVUnitXY	sMotionVectorCache[5*6-1];			// Luma only: 5 x 6 - 1 = 29 D-Words (one 4-byte SMVUnitXY each)
+	int8_t		iRefIndexCache[5 * 6];			// Luma only: 5 x 6 = 30 bytes
+}SMVComponentUnit, *PMVComponentUnit;
+
+
+typedef struct TagParaSetOffsetVariable{	
+	int32_t 	iParaSetIdDelta[MAX_DQ_LAYER_NUM/*+1*/];	// delta between SPS_ID_in_bs and sps_id_in_encoder for each dq-layer; may be negative
+															// no extra +1 needed because there is no MGS or FMO case so far
+	bool_t		bUsedParaSetIdInBs[MAX_PPS_COUNT];	// marks each used SPS_ID with 1
+	uint32_t	uiNextParaSetIdToUseInBs;					// next SPS_ID_in_bs to use, shared by all layers
+}SParaSetOffsetVariable;
+
+typedef struct TagParaSetOffset{
+	//in PSO design, "sParaSetOffsetVariable" records the parameters from before the current IDR, and needs to be stacked and recovered across IDR
+	SParaSetOffsetVariable   sParaSetOffsetVariable[PARA_SET_TYPE]; //PARA_SET_TYPE=3; paraset_type = 0: AVC_SPS; =1: Subset_SPS; =2: PPS	
+	//in PSO design, "bPpsIdMappingIntoSubsetsps" uses the current parameters of the current IDR period
+	bool_t                  bPpsIdMappingIntoSubsetsps[MAX_DQ_LAYER_NUM/*+1*/];	// no extra +1 needed because there is no MGS or FMO case so far
+	uint16_t	            uiIdrPicId;		// IDR picture id: [0, 65535], this one is used for LTR!! Can we just NOT put this into the SParaSetOffset structure?!!
+#if _DEBUG // this member exists only when _DEBUG is non-zero, so the struct layout differs between build flavours
+	bool_t                  bEnableSpsPpsIdAddition;
+#endif
+}SParaSetOffset;
+
+
+
+/* Motion Vector components: index of each component */
+enum EMvComp{
+	MV_X	= 0,
+	MV_Y	= 1,
+	MV_A	= 2	// presumably the component count ("all") -- confirm
+};
+
+/* Chroma Components: index of each chroma plane */
+
+enum EChromaComp{
+	CHROMA_CB	= 0,
+	CHROMA_CR	= 1,
+	CHROMA_A	= 2	// presumably the component count ("all") -- confirm
+};
+
+/* Position Offset structure: one crop offset per picture edge (units are not stated in this header) */
+typedef struct TagCropOffset{
+	int16_t	iCropLeft;
+    int16_t	iCropRight;
+	int16_t	iCropTop;
+	int16_t	iCropBottom;
+}SCropOffset;
+
+
+/* Transform Type: named by block size, plus PCM */
+
+enum ETransType{
+	T_4x4	= 0,
+	T_8x8	= 1,
+	T_16x16	= 2,
+	T_PCM	= 3
+};
+
+enum EMbPosition // one bit per neighbouring-MB position, so values can be OR-ed (see MB_ON_PIC_BOUNDRY)
+{
+    LEFT_MB_POS     = 0x01,	// A
+    TOP_MB_POS      = 0x02,	// B
+    TOPRIGHT_MB_POS = 0x04,	// C
+	TOPLEFT_MB_POS	= 0x08,	// D,
+	RIGHT_MB_POS	= 0x10,	// this and the next three positions were added for reuse by intra up-sampling
+	BOTTOM_MB_POS	= 0x20,	// 
+	BOTTOMRIGHT_MB_POS = 0x40,	// 
+	BOTTOMLEFT_MB_POS	= 0x80,	//
+	MB_POS_A  = 0x100	// first value past the eight position bits (presumably an end marker -- confirm)
+};
+#define MB_ON_PIC_BOUNDRY			(RIGHT_MB_POS|BOTTOM_MB_POS|LEFT_MB_POS|TOP_MB_POS)
+
+/* MB Type & Sub-MB Type */
+typedef uint32_t Mb_Type;
+// Bit index of each EMbPosition flag: (1 << MB_x_BIT) equals the matching x_MB_POS value.
+#define	MB_LEFT_BIT			0// added for use in intra up-sampling
+#define	MB_TOP_BIT			1
+#define	MB_TOPRIGHT_BIT		2
+#define	MB_TOPLEFT_BIT		3
+#define	MB_RIGHT_BIT		4
+#define	MB_BOTTOM_BIT		5
+#define	MB_BTMRIGHT_BIT		6
+#define	MB_BTMLEFT_BIT		7
+
+
+/* AVC types: one bit per macroblock type / attribute */
+#define MB_TYPE_INTRA4x4		0x00000001
+#define MB_TYPE_INTRA16x16		0x00000002
+#define MB_TYPE_INTRA_PCM		0x00000004
+#define MB_TYPE_16x16			0x00000008
+#define MB_TYPE_16x8			0x00000010
+#define MB_TYPE_8x16			0x00000020
+#define MB_TYPE_8x8				0x00000040
+#define MB_TYPE_8x8_REF0		0x00000080
+
+#define MB_TYPE_SKIP			0x00000100
+#define MB_TYPE_P0L0			0x00000200
+#define MB_TYPE_P1L0			0x00000400
+#define MB_TYPE_P0L1			0x00000800
+#define MB_TYPE_P1L1			0x00001000
+#define MB_TYPE_L0				(MB_TYPE_P0L0 | MB_TYPE_P1L0)
+#define MB_TYPE_L1				(MB_TYPE_P0L1 | MB_TYPE_P1L1)
+#define MB_TYPE_L0L1			(MB_TYPE_L0   | MB_TYPE_L1)
+#define MB_TYPE_QUANT			0x00002000
+#define MB_TYPE_CBP				0x00004000
+/* SVC extension types */
+#define MB_TYPE_INTRA_BL		0x00008000// I_BL new MB type derived H.264 SVC specific
+
+#define MB_TYPE_BACKGROUND		0x00010000  // conditional BG skip_mb
+
+// Group masks built from the single-bit types above.
+#define MB_TYPE_INTRA			(MB_TYPE_INTRA4x4 | MB_TYPE_INTRA16x16 | MB_TYPE_INTRA_PCM)
+#define MB_TYPE_INTER			(MB_TYPE_16x16 | MB_TYPE_16x8 | MB_TYPE_8x16 | MB_TYPE_8x8 | MB_TYPE_8x8_REF0)
+#define SUB_TYPE_8x8			(MB_TYPE_8x8 | MB_TYPE_8x8_REF0)
+
+#define MB_TYPE_UNAVAILABLE		0xFF000000
+#define REF_NOT_AVAIL    -2   
+#define REF_NOT_IN_LIST -1    //intra
+#define	REF_PIC_REORDER_DEFAULT	TRUE
+
+#define IS_INTRA4x4(type) ( MB_TYPE_INTRA4x4 == (type) )	// exact match: false when any other bit is also set
+#define IS_INTRA16x16(type) ( MB_TYPE_INTRA16x16 == (type) )	// exact match
+#define IS_INTRA(type) ((type)&MB_TYPE_INTRA)	// mask test: yields the masked bits, not 0/1
+#define IS_INTER(type) ((type)&MB_TYPE_INTER)	// mask test
+
+#define IS_SKIP(type) ( (type) == MB_TYPE_SKIP )	// exact match
+#define IS_SVC_INTER(type) ( IS_INTER(type) || IS_SKIP(type) )	// expands (type) twice
+#define IS_I_BL(type) ( (type) == MB_TYPE_INTRA_BL )	// exact match
+#define IS_SVC_INTRA(type) ( IS_I_BL(type) || IS_INTRA(type) )	// expands (type) twice
+#define IS_SUB8x8(type) ((type)&SUB_TYPE_8x8)	// mask test
+#define IS_Inter_8x8(type) ( (type) == MB_TYPE_8x8)	// exact match
+
+
+
+enum{	// consecutive indices 0..6, one per MB mode (not the MB_TYPE_* bit masks)
+	Intra4x4			= 0,
+	Intra16x16			= 1,
+	Inter16x16			= 2,
+	Inter16x8			= 3,
+	Inter8x16			= 4,
+	Inter8x8			= 5,
+	PSkip				= 6
+};
+
+
+/*
+ *	Memory Management Control Operation (MMCO) codes, numbered 0..6
+ */
+enum EMmcoCode{
+	MMCO_END			=0,
+	MMCO_SHORT2UNUSED	=1,
+	MMCO_LONG2UNUSED	=2,
+	MMCO_SHORT2LONG		=3,
+	MMCO_SET_MAX_LONG	=4,
+	MMCO_RESET			=5,
+	MMCO_LONG			=6
+};
+
+/////////intra16x16  Luma prediction modes
+#define I16_PRED_INVALID   -1
+#define I16_PRED_V       0
+#define I16_PRED_H       1
+#define I16_PRED_DC      2
+#define I16_PRED_P       3
+// DC variants (presumably chosen by which neighbours are available -- confirm)
+#define I16_PRED_DC_L    4
+#define I16_PRED_DC_T    5
+#define I16_PRED_DC_128  6
+#define I16_PRED_DC_A  7	// one past the last mode above (presumably the mode count -- confirm)
+//////////intra4x4   Luma prediction modes
+#define I4_PRED_INVALID    0	// NOTE(review): same value as I4_PRED_V, whereas I16_PRED_INVALID and C_PRED_INVALID are -1 -- confirm intent
+#define I4_PRED_V        0
+#define I4_PRED_H        1
+#define I4_PRED_DC       2
+#define I4_PRED_DDL      3 //diagonal_down_left
+#define I4_PRED_DDR      4 //diagonal_down_right
+#define I4_PRED_VR       5 //vertical_right
+#define I4_PRED_HD       6 //horizon_down
+#define I4_PRED_VL       7 //vertical_left
+#define I4_PRED_HU       8 //horizon_up
+
+#define I4_PRED_DC_L     9
+#define I4_PRED_DC_T     10
+#define I4_PRED_DC_128   11
+
+#define I4_PRED_DDL_TOP  12 //top-right samples replaced by padding with the rightmost pixel of the top row
+#define I4_PRED_VL_TOP   13 //top-right samples replaced by padding with the rightmost pixel of the top row
+#define I4_PRED_A   14	// one past the last mode above (presumably the mode count -- confirm)
+
+//////////intra Chroma prediction modes (DC is 0 and V is 2 here, unlike the I16 numbering where V is 0 and DC is 2)
+#define C_PRED_INVALID   -1
+#define C_PRED_DC        0
+#define C_PRED_H         1
+#define C_PRED_V         2
+#define C_PRED_P         3
+
+#define C_PRED_DC_L      4
+#define C_PRED_DC_T      5
+#define C_PRED_DC_128    6 
+#define C_PRED_A    7 	// one past the last mode above (presumably the mode count -- confirm)
+}
+#endif//WELS_COMMON_BASIS_H__
--- /dev/null
+++ b/codec/encoder/core/inc/wels_const.h
@@ -1,0 +1,188 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+//wels_const.h
+#ifndef WELS_CONSTANCE_H__
+#define WELS_CONSTANCE_H__
+
+#include "as264_common.h"	//  to communicate with specific macros there, 3/18/2010
+#include "codec_app_def.h"
+
+/* When defined, MAX_DEPENDENCY_LAYER / MAX_QUALITY_LEVEL in this header take the MAX_SPATIAL_LAYER_NUM / MAX_QUALITY_LAYER_NUM values instead of the built-in 8 / 16. */
+#define NUM_SPATIAL_LAYERS_CONSTRAINT
+#define NUM_QUALITY_LAYERS_CONSTRAINT
+
+
+// Miscellaneous sizing information; each value may be overridden by defining the macro before this header
+#ifndef MAX_FNAME_LEN
+#define MAX_FNAME_LEN		256	// maximal length of file name in str_t size
+#endif//MAX_FNAME_LEN
+
+#ifndef WELS_LOG_BUF_SIZE
+#define WELS_LOG_BUF_SIZE	4096
+#endif//WELS_LOG_BUF_SIZE
+
+#ifndef MAX_TRACE_LOG_SIZE
+#define MAX_TRACE_LOG_SIZE	(50 * (1<<20))	// max trace log size: 50 MB; the log is overwritten once the file exceeds this size
+#endif//MAX_TRACE_LOG_SIZE
+
+/* MB width in pixels for specified colorspace I420 usually used in codec */
+#define MB_WIDTH_LUMA		16
+#define MB_WIDTH_CHROMA		(MB_WIDTH_LUMA>>1)
+/* MB height in pixels for specified colorspace I420 usually used in codec */
+#define MB_HEIGHT_LUMA		16
+#define MB_HEIGHT_CHROMA	(MB_HEIGHT_LUMA>>1)
+
+/* Some list size */
+#define MB_COEFF_LIST_SIZE	(256+((MB_WIDTH_CHROMA*MB_HEIGHT_CHROMA)<<1))	// 256 + 2 * (8 * 8) = 384
+#define MB_REQ_LUMA_CACHE_SIZE	40	// 8x5 Size of MB cache only luma component required to store
+#define MB_REQ_ALL_CACHE_SIZE	48	// 8x6 Size of MB cache all components required to store
+#define MB_LUMA_CACHE_SIZE		26	// 5x5+1
+#define MB_CHROMA_CACHE_SIZE	10	// 3x3+1
+
+#define MB_PARTITION_SIZE		4	// Macroblock partition size in 8x8 sub-blocks
+#define MB_SUB_PARTITION_SIZE	4	// Sub partition size in a 8x8 sub-block
+#define MB_BLOCK4x4_NUM				16
+#define INTRA_4x4_MODE_NUM		8	
+#define MB_BLOCK8x8_NUM				4
+#define MB_LUMA_CHROMA_BLOCK4x4_NUM  24
+
+#define NAL_UNIT_HEADER_SVC_EXT_SIZE	3	// Size of NAL unit header for SVC extension in byte
+
+#define MAX_SPS_COUNT			32	// Count number of SPS
+#define MAX_PPS_COUNT_LIMITED 	57// limit the max ID of PPS because of known limitation of receiver endpoints
+#define MAX_PPS_COUNT 			(MAX_PPS_COUNT_LIMITED)//in Standard is 256	// Count number of PPS
+
+#define PARA_SET_TYPE			3 // number of parameter-set kinds indexed by the three values below
+#define PARA_SET_TYPE_AVCSPS	0 
+#define PARA_SET_TYPE_SUBSETSPS	1 
+#define PARA_SET_TYPE_PPS		2
+
+#define MAX_FRAME_RATE			30	// maximal frame rate to support
+#define MIN_FRAME_RATE			1	// minimal frame rate need support
+
+#define SVC_QUALITY_BASE_QP		26
+#define SVC_QUALITY_DELTA_QP	(-3)
+
+#define MAX_SLICEGROUP_IDS		8	// Count number of Slice Groups
+#define MAX_THREADS_NUM			4	// assume to support up to 4 logical cores(threads)
+
+#define ALIGN_RBSP_LEN_FIX		4
+
+#define PADDING_LENGTH			32 // reference extension
+#define INTPEL_NEEDED_MARGIN	(3)  // for safe sub-pel MC
+
+#define I420_PLANES				3
+
+// Guards against the coded bitstream outgrowing its buffer when the actual compression ratio is worse than 2:1 (compress_ratio=i420_base_picture_size/actual_size_of_coded_bs).
+// Pictures of SubQcif size or larger are assumed to reach 2:1, so the normal threshold is used for them.
+// Very small pictures, e.g. 16x16, usually compress worse than 2:1, which would violate a frame bitstream buffer sized for the normal case.
+// SubQcif (128x96) therefore serves as the resolution threshold that separates the normal cases from the abnormal ones.
+#define COMPRESS_RATION_NORMAL_THR			(0.5f)	// 0.5f, 0.375f, 0.25f
+#define COMPRESS_RATION_ABNORMAL_THR		(1.0f)	// ensure (1.0f >= COMPRESS_RATION_ABNORMAL_THR > COMPRESS_RATION_NORMAL_THR)
+#define RESOLUTION_NORMAL_CX_THR			(128)
+#define RESOLUTION_NORMAL_CY_THR			(96)
+#define COMPRESS_RATIO_DECIDED_BY_RESOLUTION(_cx, _cy)	\
+	(((_cx) >= RESOLUTION_NORMAL_CX_THR && (_cy) >= RESOLUTION_NORMAL_CY_THR) ? \
+	COMPRESS_RATION_NORMAL_THR :	\
+	COMPRESS_RATION_ABNORMAL_THR)
+
+#if !defined(SSEI_BUFFER_SIZE)
+#define SSEI_BUFFER_SIZE	128	// default, used unless defined before this header (units presumably bytes -- confirm)
+#endif//SSEI_BUFFER_SIZE
+
+#if !defined(SPS_BUFFER_SIZE)
+#define SPS_BUFFER_SIZE		32	// default, overridable in the same way
+#endif//SPS_BUFFER_SIZE
+
+#if !defined(PPS_BUFFER_SIZE)
+#define PPS_BUFFER_SIZE		16	// default, overridable in the same way
+#endif//PPS_BUFFER_SIZE
+
+
+#if defined(NUM_SPATIAL_LAYERS_CONSTRAINT)
+#define MAX_DEPENDENCY_LAYER		MAX_SPATIAL_LAYER_NUM	// Maximal dependency layer
+#else
+#define MAX_DEPENDENCY_LAYER		8	// Maximal dependency layer
+#endif//NUM_SPATIAL_LAYERS_CONSTRAINT
+
+//The max temporal level supported is equal to or less than MAX_TEMPORAL_LAYER_NUM defined @ codec_app_def.h
+#define MAX_TEMPORAL_LEVEL		MAX_TEMPORAL_LAYER_NUM	// Maximal temporal level
+
+#if defined(NUM_QUALITY_LAYERS_CONSTRAINT)
+#define MAX_QUALITY_LEVEL		MAX_QUALITY_LAYER_NUM		// Maximal quality level
+#else
+#define MAX_QUALITY_LEVEL		16	// Maximal quality level
+#endif//NUM_QUALITY_LAYERS_CONSTRAINT
+// MAX_GOP_SIZE is always re-derived here from MAX_TEMPORAL_LEVEL; any earlier definition is discarded.
+#if defined(MAX_GOP_SIZE)
+#undef MAX_GOP_SIZE
+#endif//MAX_GOP_SIZE
+#define MAX_GOP_SIZE	(1<<(MAX_TEMPORAL_LEVEL-1))
+
+#define MAX_SHORT_REF_COUNT		(MAX_GOP_SIZE>>1) // 16 in standard, maximal count number of short reference pictures
+#define LONG_TERM_REF_NUM       2
+#define MAX_LONG_REF_COUNT		2 // 16 in standard, maximal count number of long reference pictures
+#define MAX_REF_PIC_COUNT		16 // 32 in standard, maximal Short + Long reference pictures
+#define MIN_REF_PIC_COUNT		1		// minimal count number of reference pictures, 1 short + 2 key reference based?
+//#define TOTAL_REF_MINUS_HALF_GOP	1	// last t0 in last gop
+#define MAX_MMCO_COUNT			66
+
+// reduced counts for the reference picture functionality, adjusted to the encoder design
+#define MAX_REFERENCE_MMCO_COUNT_NUM		4	// adjusted MAX_MMCO_COUNT(66 in standard) definition per encoder design
+#define MAX_REFERENCE_REORDER_COUNT_NUM		2	// adjusted MAX_REF_PIC_COUNT(32 in standard) for reference reordering definition per encoder design
+#define MAX_REFERENCE_PICTURE_COUNT_NUM		(MAX_SHORT_REF_COUNT+MAX_LONG_REF_COUNT)	// <= MAX_REF_PIC_COUNT, memory saved if <
+
+#define BASE_QUALITY_ID			0
+#define BASE_DEPENDENCY_ID		0
+#define BASE_DQ_ID				0
+#define MAX_DQ_ID				((uint8_t)-1)	// 255
+#define MAX_DQ_LAYER_NUM		(MAX_DEPENDENCY_LAYER/**MAX_QUALITY_LEVEL*/)
+
+#define UNAVAILABLE_DQ_ID		((uint8_t)(-1))	// 255, the same value as MAX_DQ_ID
+#define LAYER_NUM_EXCHANGEABLE	2
+
+#define MAX_NAL_UNIT_NUM_IN_AU	256	// predefined maximal number of NAL Units in an access unit
+#define MAX_ACCESS_UINT_CAPACITY	(1<<20)	// Maximal AU capacity in bytes: 1024 KB predefined ("UINT" in the name is a misspelling of UNIT, kept for existing users)
+#define MAX_ACCESS_UNIT_CACHE_NUM	2	// Maximal Access Unit(AU) cache number to be processed, denote current AU and the next coming AU.
+enum{	// indices for the MAX_ACCESS_UNIT_CACHE_NUM (2) cached access units
+	CUR_AU_IDX	= 0,			// index symbol for current access unit
+	SUC_AU_IDX	= 1				// index symbol for successive access unit
+};
+
+enum {	// MB kinds: base, and enhancement with or without AVC rewrite (usage is not visible in this header)
+	BASE_MB = 0,
+		AVC_REWRITE_ENHANCE_MB = 1,
+		NON_AVC_REWRITE_ENHANCE_MB =2
+};
+
+#endif//WELS_CONSTANCE_H__
--- /dev/null
+++ b/codec/encoder/core/inc/wels_func_ptr_def.h
@@ -1,0 +1,226 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+// wels_func_ptr_def.h
+#ifndef WELS_ENCODER_FUNCTION_POINTERS_DEFINITION_H_
+#define WELS_ENCODER_FUNCTION_POINTERS_DEFINITION_H_
+
+#include "typedefs.h"
+#include "wels_common_basis.h"
+#include "svc_enc_macroblock.h"
+#include "mb_cache.h"
+#include "slice.h"
+#include "svc_enc_slice_segment.h"
+#include "svc_enc_frame.h"
+#include "expand_pic.h"
+#include "rc.h"
+
+namespace WelsSVCEnc {
+
+// Forward declaration: PMotionSearchFunc (declared further down) takes a SWelsFuncPtrList*
+// before struct TagWelsFuncPointerList itself is defined.
+typedef struct TagWelsFuncPointerList SWelsFuncPtrList;
+
+// The typedefs below only fix call signatures. What each routine computes is decided by
+// whichever implementation is stored in the matching SWelsFuncPtrList member.
+
+// Memory clearing and forward transform signatures
+typedef void (*PSetMemoryZero)(void *pDst, int32_t iSize);
+typedef void (*PDctFunc)( int16_t *pDct, uint8_t *pSample1, int32_t iStride1, uint8_t *pSample2, int32_t iStride2 );
+
+// Block copy, inverse transform, dequantization and non-zero counting signatures
+typedef void (*PCopyFunc)( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS );
+typedef void (*PIDctFunc)(uint8_t *pRec, int32_t iStride, uint8_t *pPred, int32_t iPredStride, int16_t *pRes);
+typedef void (*PDeQuantizationFunc)(int16_t *pRes, const uint16_t* kpQpTable);
+typedef void (*PDeQuantizationHadamardFunc)(int16_t *pRes, const uint16_t kuiMF);
+typedef int32_t (*PGetNoneZeroCountFunc)(int16_t *pLevel);
+
+// Coefficient scan and single-coefficient check signatures
+typedef void (*PScanFunc)(int16_t* pLevel, int16_t *pDct);
+typedef int32_t (*PCalculateSingleCtrFunc)(int16_t *pDct);
+
+// Hadamard transform and quantization signatures
+// (pFF/pMF are passed as tables, iFF/iMF as single values)
+typedef void (*PTransformHadamard4x4Func)( int16_t *pLumaDc, int16_t *pDct);
+typedef void (*PQuantizationFunc)(int16_t *pDct, int16_t* pFF,  int16_t *pMF);
+typedef void (*PQuantizationMaxFunc)(int16_t *pDct, int16_t* pFF,  int16_t *pMF, int16_t *pMax);
+typedef void (*PQuantizationDcFunc)(int16_t *pDct, int16_t iFF,  int16_t iMF);
+typedef BOOL_T (*PQuantizationSkipFunc)(int16_t *pDct, int16_t iFF,  int16_t iMF);
+typedef int32_t (*PQuantizationHadamardFunc)(int16_t *pRes, const int16_t kiFF, int16_t iMF, int16_t * pDct, int16_t * pBlock);
+
+// Motion compensation signatures; the motion vector is passed by value
+typedef void (*PWelsMcFunc) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+							  SMVUnitXY mv, int32_t iWidth, int32_t iHeight);
+
+// Half-pel variant takes width and height; quarter-pel variant takes height only
+typedef void (*PWelsLumaHalfpelMcFunc) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, 
+                                   int32_t iWidth, int32_t iHeight);
+typedef void (*PWelsLumaQuarpelMcFunc) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+// Three (pointer, int32_t) pairs plus one trailing int32_t; parameters are unnamed in this header
+typedef void (*PWelsSampleAveragingFunc) ( uint8_t *, int32_t, uint8_t *, int32_t, uint8_t *, int32_t, int32_t );
+
+// Motion compensation function table.
+typedef struct TagMcFunc{
+	// luma half-pel entries (Hor/Ver/Cen per the member names) sharing one signature
+	PWelsLumaHalfpelMcFunc      pfLumaHalfpelHor;
+	PWelsLumaHalfpelMcFunc      pfLumaHalfpelVer;
+	PWelsLumaHalfpelMcFunc      pfLumaHalfpelCen;
+	PWelsMcFunc                         pfChromaMc;
+
+	// pointers to function pointers; presumably each refers to an array owned elsewhere
+	// whose length is not visible in this header - TODO confirm
+	PWelsLumaQuarpelMcFunc     *pfLumaQuarpelMc;
+	PWelsSampleAveragingFunc   *pfSampleAveraging;
+}SMcFunc;
+
+// Deblocking filter signatures. The LT4 variants take an extra int8_t* (iTc) that the
+// EQ4 variants do not; chroma variants take separate Cb and Cr sample pointers.
+typedef void (*PLumaDeblockingLT4Func)( uint8_t *iSampleY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *iTc );
+typedef void (*PLumaDeblockingEQ4Func)(  uint8_t *iSampleY, int32_t iStride, int32_t iAlpha, int32_t iBeta );
+typedef void (*PChromaDeblockingLT4Func)( uint8_t *iSampleCb, uint8_t *iSampleCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *iTc );
+typedef void (*PChromaDeblockingEQ4Func)(  uint8_t *iSampleCb, uint8_t *iSampleCr, int32_t iStride, int32_t iAlpha, int32_t iBeta  );
+
+// Deblocking function table: one LT4 and one EQ4 entry per direction (Ver/Hor) for luma
+// and for chroma.
+typedef struct tagDeblockingFunc {
+	PLumaDeblockingLT4Func    pfLumaDeblockingLT4Ver;
+	PLumaDeblockingEQ4Func    pfLumaDeblockingEQ4Ver;
+	PLumaDeblockingLT4Func    pfLumaDeblockingLT4Hor;
+	PLumaDeblockingEQ4Func    pfLumaDeblockingEQ4Hor;
+
+	PChromaDeblockingLT4Func  pfChromaDeblockingLT4Ver;
+	PChromaDeblockingEQ4Func  pfChromaDeblockingEQ4Ver;
+	PChromaDeblockingLT4Func  pfChromaDeblockingLT4Hor;
+	// NOTE(review): "Deblockin" is missing its final 'g' compared with the sibling members;
+	// renaming requires updating every user of this member.
+	PChromaDeblockingEQ4Func  pfChromaDeblockinEQ4Hor;
+} DeblockingFunc;
+
+typedef  void (*PSetNoneZeroCountZeroFunc) (int8_t * pNonZeroCount );
+
+// Mode decision signatures. The encoder context and mode-decision state are passed as void*.
+typedef int32_t (*PIntraFineMdFunc)(void* pEncCtx, void * pWelsMd, SMB* pCurMb, SMbCache *pMbCache); 
+typedef void (*PInterFineMdFunc)(void* pEncCtx, void* pWelsMd, SSlice *slice, SMB* pCurMb, int32_t bestCost );
+typedef BOOL_T (*PInterMdFirstIntraModeFunc)(void* pEncCtx, void* pWelsMd, SMB* pCurMb, SMbCache *pMbCache);
+
+typedef void (*PMotionSearchFunc) ( SWelsFuncPtrList *pFuncList, void* pCurDqLayer, void* pMe, void* pSlice );// original author's note, apparently a plan to replace the void* parameters with their real types: "here after reset all function pointers, will set as right parameter type"
+typedef void (*PFillInterNeighborCacheFunc) (SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth, int8_t *pVaaBgMbFlag);
+typedef void (*PAccumulateSadFunc) (uint32_t *pSumDiff, int32_t *pGomForegroundBlockNum, int32_t *iSad8x8, int8_t *pVaaBgMbFlag);//for RC
+typedef BOOL_T (*PDynamicSlicingStepBackFunc)	( void* pEncCtx, void* pSlice, SSliceCtx *pSliceCtx, SMB* pCurMb, SDynamicSlicingStack *pDynamicSlicingStack );// 2010.8.17
+
+// NOTE(review): this typedef returns bool_t while its siblings above return BOOL_T.
+typedef bool_t (*PInterMdBackgroundDecisionFunc) ( void* pEncCtx, void* pWelsMd, SSlice *slice, SMB* pCurMb, SMbCache *pMbCache, BOOL_T* pKeepPskip );
+typedef void (*PInterMdBackgroundInfoUpdateFunc) ( SDqLayer* pCurLayer,  SMB* pCurMb, const bool_t bFlag, const int32_t kiRefPictureType );
+
+typedef void (*PInterMdFunc) ( void* pEncCtx, void* pWelsMd, SSlice *slice, SMB* pCurMb, SMbCache *pMbCache );
+
+// Sample cost and combined intra prediction signatures; parameters are unnamed in this
+// header, so their roles must be taken from the implementations.
+typedef int32_t  (*PSampleSadSatdCostFunc) ( uint8_t *, int32_t, uint8_t *, int32_t );
+typedef void (*PSample4SadCostFunc) ( uint8_t *, int32_t, uint8_t*, int32_t, int32_t* );
+typedef int32_t (*PIntraPred4x4Combined3Func)(uint8_t *, int32_t, uint8_t *, int32_t, uint8_t *, int32_t *, int32_t, int32_t, int32_t);
+typedef int32_t (*PIntraPred16x16Combined3Func)(uint8_t *, int32_t, uint8_t *, int32_t, int32_t*, int32_t, uint8_t*);
+typedef int32_t (*PIntraPred8x8Combined3Func)(uint8_t *, int32_t, uint8_t *, int32_t, int32_t*, int32_t, uint8_t*,uint8_t*,uint8_t*);
+// Number of entries in the per-block-type cost tables below (the original note records
+// that it used to be 7).
+#define     MAX_BLOCK_TYPE 5 // prev 7
+// Sample cost function table: SAD/SATD/4-way-SAD entries per block type plus combined
+// intra prediction cost entries.
+typedef struct TagSampleDealingFunc {
+	PSampleSadSatdCostFunc            pfSampleSad[MAX_BLOCK_TYPE];
+	PSampleSadSatdCostFunc            pfSampleSatd[MAX_BLOCK_TYPE];
+	PSample4SadCostFunc                 pfSample4Sad[MAX_BLOCK_TYPE];
+	PIntraPred4x4Combined3Func      pfIntra4x4Combined3Satd;
+	PIntraPred16x16Combined3Func  pfIntra16x16Combined3Satd;
+	PIntraPred16x16Combined3Func  pfIntra16x16Combined3Sad;
+	PIntraPred8x8Combined3Func      pfIntra8x8Combined3Satd;
+	PIntraPred8x8Combined3Func      pfIntra8x8Combined3Sad;
+
+	// Pointers to function pointers of the SAD/SATD cost type; presumably aimed at one of
+	// the arrays above to select the metric for mode decision / motion estimation - TODO confirm
+	PSampleSadSatdCostFunc            *pfMdCost;
+	PSampleSadSatdCostFunc            *pfMeCost;
+	// Same types as the *Satd / *Sad members above, without a metric suffix
+	PIntraPred16x16Combined3Func   pfIntra16x16Combined3;
+	PIntraPred8x8Combined3Func       pfIntra8x8Combined3;
+	PIntraPred4x4Combined3Func       pfIntra4x4Combined3;
+} SSampleDealingFunc;
+// Intra prediction signature: prediction buffer, reference samples and a stride.
+typedef void (*PGetIntraPredFunc )(uint8_t *pPrediction, uint8_t *pRef, const int32_t kiStride);
+
+// VAA helper signatures and motion vector update signature (the new MV is passed by value).
+typedef int32_t (*PGetVarianceFromIntraVaaFunc)( uint8_t *pSampelY, const int32_t kiStride );
+typedef uint8_t (*PGetMbSignFromInterVaaFunc)( int32_t *pSad8x8 );
+typedef void (*PUpdateMbMvFunc)( SMVUnitXY *pMvUnit, const SMVUnitXY ksMv );
+
+// Encoder function pointer list (typedef'd earlier as SWelsFuncPtrList). Gathers the
+// function pointers and nested function tables declared in this header, plus the
+// picture-expansion and rate-control entries whose types come from other headers.
+// Trailing file names are the original authors' notes on where a member is used.
+struct TagWelsFuncPointerList
+{
+	PExpandPictureFunc			pfExpandLumaPicture;
+	PExpandPictureFunc			pfExpandChromaPicture[2];// 0: for chroma unalignment && width_uv >= 16; 1: for chroma alignment && width_uv >= 16;
+    	
+    PFillInterNeighborCacheFunc       pfFillInterNeighborCache;
+
+	PGetVarianceFromIntraVaaFunc	pfGetVarianceFromIntraVaa;
+	PGetMbSignFromInterVaaFunc	pfGetMbSignFromInterVaa;
+	PUpdateMbMvFunc					    pfUpdateMbMv;
+	PInterMdFirstIntraModeFunc      pfFirstIntraMode; //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
+	PIntraFineMdFunc                     pfIntraFineMd;          //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
+	PInterFineMdFunc                     pfInterFineMd;          //svc_encode_slice.c svc_base_layer_md.c
+	PInterMdFunc                           pfInterMd;
+
+	PInterMdBackgroundDecisionFunc          pfInterMdBackgroundDecision;
+	PInterMdBackgroundInfoUpdateFunc      pfInterMdBackgroundInfoUpdate;
+
+	SMcFunc				        sMcFuncs;
+	SSampleDealingFunc     sSampleDealingFuncs;
+	// Intra predictor tables; the array bounds (I16_PRED_DC_A, I4_PRED_A, C_PRED_A) are
+	// defined outside this header.
+	PGetIntraPredFunc 		pfGetLumaI16x16Pred[I16_PRED_DC_A];
+	PGetIntraPredFunc 		pfGetLumaI4x4Pred[I4_PRED_A];		
+	PGetIntraPredFunc 		pfGetChromaPred[C_PRED_A];		
+	PMotionSearchFunc	    pfMotionSearch; //svc_encode_slice.c svc_mode_decision.c svc_enhance_layer_md.c svc_base_layer_md.c
+
+	PCopyFunc      pfCopy16x16Aligned;		//svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
+	PCopyFunc      pfCopy16x16NotAligned;	//md.c
+	PCopyFunc      pfCopy8x8Aligned;		//svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c md.c
+	PCopyFunc	  pfCopy16x8NotAligned;	//for MeRefineFracPixel 16x8 based
+	PCopyFunc	  pfCopy8x16Aligned;		//for MeRefineFracPixel 8x16 based
+
+	//svc_encode_mb.c encode_mb_aux.c
+	PDctFunc					pfDctT4;
+	PDctFunc    		        pfDctFourT4;
+
+	PCalculateSingleCtrFunc				pfCalculateSingleCtr4x4;     
+	PScanFunc				pfScan4x4;		//DC/AC
+    PScanFunc				pfScan4x4Ac;
+
+	PQuantizationFunc				        pfQuantization4x4;       
+	PQuantizationFunc				        pfQuantizationFour4x4;  
+    PQuantizationDcFunc			        pfQuantizationDc4x4; 
+	PQuantizationMaxFunc		        pfQuantizationFour4x4Max; 
+	PQuantizationHadamardFunc		pfQuantizationHadamard2x2;
+	PQuantizationSkipFunc		        pfQuantizationHadamard2x2Skip;
+
+	PTransformHadamard4x4Func	 pfTransformHadamard4x4Dc;
+
+	PGetNoneZeroCountFunc		      pfGetNoneZeroCount;
+
+	PDeQuantizationFunc				      pfDequantization4x4;  
+	PDeQuantizationFunc			          pfDequantizationFour4x4; 
+	PDeQuantizationHadamardFunc	  pfDequantizationIHadamard4x4;
+	PIDctFunc				                      pfIDctFourT4;
+	PIDctFunc				                      pfIDctT4;
+	PIDctFunc				                      pfIDctI16x16Dc;
+
+	
+
+	// OPTI: if MT under diff uiSliceMode, need change here
+	//PDynamicSlicingStepBackFunc	dynslc_funcpointer_stepback;//svc_encode_slice.c 
+	//DYNSLC_LNGTH_CRTL		dynslc_funcpointer_slcsize_ctrl;
+    
+    /* For Deblocking */
+	DeblockingFunc                         pfDeblocking;
+	PSetNoneZeroCountZeroFunc     pfSetNZCZero;
+
+	// Rate control: nested table from rc.h plus the SAD accumulation hook
+	SWelsRcFunc					    pfRc;
+	PAccumulateSadFunc         pfAccumulateSadForRc;
+
+    PSetMemoryZero				pfSetMemZeroSize8;			// for sizes that are a multiple of 8
+	PSetMemoryZero				pfSetMemZeroSize64Aligned16;			// for sizes that are a multiple of 64, address aligned to 16
+	PSetMemoryZero				pfSetMemZeroSize64;			// for sizes that are a multiple of 64, address alignment unknown
+};
+
+}	//end of namespace WelsSVCEnc {
+
+#endif//WELS_ENCODER_FUNCTION_POINTERS_DEFINITION_H_
--- /dev/null
+++ b/codec/encoder/core/inc/wels_preprocess.h
@@ -1,0 +1,153 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	wels_preprocess.h
+ *
+ * \brief	interface of video pre-process plugins
+ *
+ * \date	03/15/2011
+ *
+ * \description : this class provides a single interface over the video pre-processing
+ *                implementations, such as denoising and colorspace conversion.
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELS_PREPROCESS_H
+#define WELS_PREPROCESS_H
+
+#include "typedefs.h"
+#include "picture.h"
+#include "wels_const.h"
+#include "IWelsVP.h"
+#include "param_svc.h"
+
+namespace WelsSVCEnc {
+
+// Scaled input picture plus one width/height entry per dependency layer.
+// NOTE(review): pScaledInputPicture ownership is not visible in this header.
+typedef  struct
+{
+	SPicture	*pScaledInputPicture;
+	int32_t		iScaledWidth[MAX_DEPENDENCY_LAYER];
+	int32_t     iScaledHeight[MAX_DEPENDENCY_LAYER];
+} Scaled_Picture;
+
+// Per-frame video analysis (VAA) state: analysis parameter/result structures, picture
+// geometry, current/reference plane pointers and frame-level flags.
+typedef struct 
+{
+	SVAACalcResult		sVaaCalcInfo;
+	SAdaptiveQuantizationParam sAdaptiveQuantParam;
+	SComplexityAnalysisParam sComplexityAnalysisParam;
+
+	int32_t			iPicWidth;			// maximal iWidth of picture in samples for svc coding
+	int32_t			iPicHeight;			// maximal iHeight of picture in samples for svc coding
+	int32_t         iPicStride;         // luma stride
+	int32_t			iPicStrideUV;		// NOTE(review): presumably the chroma (U/V) stride
+
+	uint8_t         *pRefY; // reference picture, Y plane
+	uint8_t         *pCurY; // current picture, Y plane
+	uint8_t         *pRefU; // reference picture, U plane
+	uint8_t         *pCurU; // current picture, U plane
+	uint8_t         *pRefV; // reference picture, V plane
+	uint8_t         *pCurV; // current picture, V plane
+
+	int8_t			*pVaaBackgroundMbFlag;
+	uint8_t         uiValidLongTermPicIdx;
+	uint8_t         uiMarkLongTermPicIdx;
+
+	bool_t          bSceneChangeFlag;
+	bool_t          bIdrPeriodFlag;
+} SVAAFrameInfo;
+
+// Helper that creates and destroys an interface object through an opaque library handle.
+// NOTE(review): the member names suggest m_pVpLib is a handle to a video processing
+// library resolved by name via QueryFunction; confirm in the implementation.
+class CWelsLib
+{
+public:
+	CWelsLib(void *pEncCtx);
+	virtual  ~CWelsLib();	
+
+	// Both return an int32_t status; CreateIface receives the address of the pointer to fill in
+	int32_t CreateIface(void **pEncCtx);
+	int32_t DestroyIface(void *pEncCtx);
+
+protected:
+	// Looks up an entry by name and returns it as an untyped pointer
+	void *QueryFunction(const str_t *pName);
+
+private:
+	void *m_pVpLib;
+	void *m_pInterface[2];
+};
+
+// Encoder-side pre-processing front end. The public entry points take the encoder
+// context as void*; the private helpers cover scaling/padding, denoising, scene change
+// detection, VAA / background detection / adaptive quantization and colorspace
+// conversion (per their names; behavior is defined in the implementation file).
+class CWelsPreProcess
+{
+public:
+	CWelsPreProcess(void *pEncCtx);
+	virtual  ~CWelsPreProcess();
+
+public:
+	// Only Step1 and Step3 are declared; there is no WelsPreprocessStep2 in this class.
+	int32_t WelsPreprocessReset ( void *pEncCtx );
+	int32_t WelsPreprocessStep1( void *pEncCtx, const SSourcePicture **kppSrcPicList, const int32_t kiConfiguredLayerNum );
+	int32_t WelsPreprocessStep3( void *pEncCtx, const int32_t kiDIdx );
+
+private:
+	int32_t WelsPreprocessCreate();
+	int32_t WelsPreprocessDestroy();
+	int32_t InitLastSpatialPictures( void *pEncCtx );
+
+private:
+	// NOTE(review): the parameter below is named m_sScaledPicture, the same as the data
+	// member declared at the end of this class; inside the definition the parameter would
+	// hide the member if the same name is used there.
+	int32_t SingleLayerPreprocess( void *pEncCtx, const SSourcePicture *kpSrc, Scaled_Picture * m_sScaledPicture );
+	int32_t MultiLayerPreprocess( void *pEncCtx, const SSourcePicture **kppSrcPicList, const int32_t kiSpatialNum );
+
+	void	BilateralDenoising ( SPicture *pSrc, const int32_t iWidth, const int32_t iHeight );
+	bool_t  DetectSceneChange( SPicture *pCurPicture, SPicture *pRefPicture );
+	int32_t DownsamplePadding( SPicture *pSrc, SPicture *pDstPic,  int32_t iSrcWidth, int32_t iSrcHeight,
+		                        int32_t iShrinkWidth, int32_t iShrinkHeight, int32_t iTargetWidth, int32_t iTargetHeight );
+
+	void    VaaCalculation( SVAAFrameInfo *pVaaInfo, SPicture *pCurPicture, SPicture *pRefPicture, bool_t bCalculateSQDiff, bool_t bCalculateVar, bool_t bCalculateBGD );
+	void    BackgroundDetection( SVAAFrameInfo *pVaaInfo, SPicture *pCurPicture, SPicture *pRefPicture, bool_t bDetectFlag );
+	void    AdaptiveQuantCalculation( SVAAFrameInfo *pVaaInfo, SPicture *pCurPicture, SPicture *pRefPicture );
+	void    AnalyzePictureComplexity( void *pCtx, SPicture *pCurPicture, SPicture *pRefPicture, const int32_t kiDependencyId, const bool_t kbCalculateBGD );
+	void    Padding(uint8_t *pSrcY, uint8_t *pSrcU, uint8_t *pSrcV, int32_t iStrideY, int32_t iStrideUV,
+		            int32_t iActualWidth, int32_t iPaddingWidth, int32_t iActualHeight, int32_t iPaddingHeight);
+    void    SetRefMbType(void *pCtx, uint32_t **pRefMbTypeArray, int32_t iRefPicType);
+
+ 	int32_t ColorspaceConvert( SWelsSvcCodingParam *pSvcParam, SPicture *pDstPic, const SSourcePicture *kpSrc, const int32_t kiWidth, const int32_t kiHeight );
+	void WelsMoveMemoryWrapper(SWelsSvcCodingParam * pSvcParam, SPicture *pDstPic, const SSourcePicture *kpSrc, const int32_t kiWidth, const int32_t kiHeight );
+
+private:
+	Scaled_Picture  m_sScaledPicture;
+	SPicture		*m_pLastSpatialPicture[MAX_DEPENDENCY_LAYER][2];	// two picture pointers per dependency layer
+	IWelsVP         *m_pInterfaceVp;	
+	CWelsLib        *m_pEncLib;
+	void            *m_pEncCtx;
+	bool_t          m_bInitDone;
+	bool_t          m_bOfficialBranch;
+};
+
+}
+
+#endif
--- /dev/null
+++ b/codec/encoder/core/src/au_set.cpp
@@ -1,0 +1,514 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	au_set.cpp
+ *
+ * \brief	Interfaces introduced in Access Unit level based writer
+ *
+ * \date	05/18/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include <string.h>
+#include <assert.h>
+#include "au_set.h"
+#include "svc_enc_golomb.h"
+namespace WelsSVCEnc {
+// Twice the MaxDPB value of Annex A, Table A-1, stored as integers and indexed directly
+// by level_idc (0..51). Slots whose index is not one of the levels named in the row
+// comments hold 0.
+// NOTE(review): the doubling presumably keeps the table's fractional entries integral;
+// index 9 (the level_idc this file uses for level 1b) is left at 0.
+static const uint32_t g_kuiMaxDPBx2AtLevel[52] = // *2 on the basis of Annex A, Table A-1, for int32_t type
+{
+	0,		0,		0,		0,		0,	0,	0,	0,	0,	0, //0~9
+	297,	675,	1782,	1782,	0,	0,	0,	0,	0,	0, //10, 11, 12, 13
+	1782,	3564,	6075,	0,		0,	0,	0,	0,	0,	0, //20, 21, 22
+	6075,	13500,	15360,	0,		0,	0,	0,	0,	0,	0, //30, 31, 32
+	24576,	24576,	26112,	0,		0,	0,	0,	0,	0,	0, //40, 41, 42
+	82800,	138240											//50, 51
+};
+
+
+// Number of rows in g_ksLevelLimit
+#define LEVEL_NUMBER 16
+
+// One row of level limits; compared against the SPS in WelsCheckLevelLimitation().
+typedef struct TagLevelLimit
+{
+	uint8_t iLevelIdc;   // level_idc value returned by WelsGetLevelIdc() for this row
+	uint32_t uiMaxMbPS; // Max MBs processing speed (compared with MBs per frame * frame rate)
+	uint32_t uiMaxFS; // Max Frame size (compared with MBs per frame)
+	uint32_t uiMaxDPBMB; //Max DPB MB Size (compared with reference frames * MBs per frame)
+	uint32_t uiMaxBR; //Max Bitrate (multiplied by 1200 before comparison with the target bitrate)
+} SLevelLimit;
+
+// Level limits after H.264 Annex A, Table A-1, one row per level:
+//   { level_idc, MaxMBPS, MaxFS, MaxDpbMbs, MaxBR }
+// Rows are in the order WelsGetLevelIdc() probes them, so the first row that satisfies
+// all constraints is the level that gets signalled. Level 1b is represented by
+// level_idc 9 and is probed right after level 1.
+// The level 4.2 row uses MaxMBPS = 522240 and MaxFS = 8704 as in Table A-1
+// (MaxDpbMbs 34816 = 4 * 8704); with the smaller level 4/4.1 numbers, streams that fit
+// level 4.2 would needlessly be signalled as level 5.
+const SLevelLimit g_ksLevelLimit[LEVEL_NUMBER] =
+{
+  { 10,   1485,    99,	  396,     64 },                 //10
+  { 9,    1485,    99,	  396,    128 },                 //9 (1b)
+  { 11,   3000,   396,	  900,    192 },                 //11
+  { 12,   6000,   396,	 2376,    384 },                 //12
+  { 13,  11880,   396,	 2376,    768 },                 //13
+
+  { 20,  11880,   396,    2376,   2000 },                 //20
+  { 21,  19800,   792,    4752,   4000 },                 //21
+  { 22,  20250,  1620,    8100,   4000 },                 //22
+
+  { 30,  40500,  1620,    8100,  10000 },                 //30
+  { 31, 108000,  3600,   18000,  14000 },                 //31
+  { 32, 216000,  5120,   20480,  20000 },                 //32
+
+  { 40, 245760,  8192,   32768,  20000 },                 //40
+  { 41, 245760,  8192,   32768,  50000 },                 //41
+  { 42, 522240,  8704,   34816,  50000 },                 //42
+
+  { 50, 589824, 22080,  110400, 135000 },                 //50
+  { 51, 983040, 36864,  184320, 240000 }                  //51
+};
+
+/*!
+ * \brief	check whether a stream described by an SPS fits one row of level limits
+ *
+ * \param	kpSps			SPS supplying iMbWidth, iMbHeight and iNumRefFrames
+ * \param	kpLevelLimit	candidate level row
+ * \param	fFrameRate		frame rate used for the macroblock throughput check
+ * \param	iTargetBitRate	target bitrate; 0 skips the bitrate check
+ *
+ * \return	1 if every checked constraint holds, 0 otherwise
+ */
+static inline int32_t WelsCheckLevelLimitation( const SWelsSPS* kpSps, const SLevelLimit *kpLevelLimit, float fFrameRate, int32_t iTargetBitRate )
+{
+	const uint32_t kuiMbCols		= kpSps->iMbWidth;
+	const uint32_t kuiMbRows		= kpSps->iMbHeight;
+	const uint32_t kuiFrameMbs		= kuiMbCols * kuiMbRows;
+	const uint32_t kuiRefFrames		= kpSps->iNumRefFrames;
+
+	// Constraints are evaluated left to right and evaluation stops at the first failure.
+	const bool kbWithinLimits =
+		   kpLevelLimit->uiMaxMbPS >= ( uint32_t ) ( kuiFrameMbs * fFrameRate )		// MB throughput
+		&& kpLevelLimit->uiMaxFS >= kuiFrameMbs										// frame size
+		&& ( kpLevelLimit->uiMaxFS << 3 ) >= ( kuiMbCols * kuiMbCols )				// width^2  <= 8 * MaxFS
+		&& ( kpLevelLimit->uiMaxFS << 3 ) >= ( kuiMbRows * kuiMbRows )				// height^2 <= 8 * MaxFS
+		&& kpLevelLimit->uiMaxDPBMB >= kuiRefFrames * kuiFrameMbs					// DPB occupancy
+		&& ( 0 == iTargetBitRate														// bitrate, only with RC enabled
+			|| ( (int32_t) kpLevelLimit->uiMaxBR  * 1200 ) >= iTargetBitRate );
+	//add more checks here if needed in future
+
+	return kbWithinLimits ? 1 : 0;
+}
+
+/*!
+ * \brief	pick the level_idc of the first row in g_ksLevelLimit that the stream fits
+ *
+ * \param	kpSps			SPS describing the stream
+ * \param	fFrameRate		frame rate passed on to the per-row check
+ * \param	iTargetBitRate	target bitrate passed on to the per-row check
+ *
+ * \return	level_idc of the first matching row, or 51 if no row matches
+ */
+static inline int32_t WelsGetLevelIdc( const SWelsSPS* kpSps, float fFrameRate, int32_t iTargetBitRate )
+{
+	const SLevelLimit *pCandidate		= g_ksLevelLimit;
+	const SLevelLimit *const kpTableEnd	= g_ksLevelLimit + LEVEL_NUMBER;
+
+	while ( pCandidate != kpTableEnd )
+	{
+		if( WelsCheckLevelLimitation( kpSps, pCandidate, fFrameRate, iTargetBitRate ) )
+			return (int32_t) ( pCandidate->iLevelIdc );
+		++ pCandidate;
+	}
+
+	return 51; //final decision: select the biggest level
+}
+
+
+/*!
+ *************************************************************************************
+ * \brief	to write the syntax elements of a Sequence Parameter Set (SPS)
+ *
+ * \param 	pSps 			SPS to be written
+ * \param	pBitStringAux	bitstream writer auxiliary
+ * \param	pSpsIdDelta		table of id offsets; pSpsIdDelta[pSps->uiSpsId] is added to
+ *							uiSpsId to form the seq_parameter_set_id that is written
+ *
+ * \return	0 (no failure path exists in this function)
+ *
+ * \note	Call it in case EWelsNalUnitType is SPS. Only the SPS payload is written here:
+ *			RBSP trailing bits and the flush are not part of this function. Several
+ *			elements are written as constants instead of being read from pSps; they
+ *			are marked "fixed" below.
+ *************************************************************************************
+ */
+int32_t WelsWriteSpsSyntax( SWelsSPS *pSps, SBitStringAux *pBitStringAux, int32_t* pSpsIdDelta )
+{
+	assert( pSps != NULL && pBitStringAux != NULL );
+
+	// profile / constraint flags / level / id
+	BsWriteBits( pBitStringAux, 8, pSps->uiProfileIdc );			// profile_idc
+	BsWriteOneBit( pBitStringAux, pSps->bConstraintSet0Flag );		// constraint_set0_flag
+	BsWriteOneBit( pBitStringAux, pSps->bConstraintSet1Flag );		// constraint_set1_flag
+	BsWriteOneBit( pBitStringAux, pSps->bConstraintSet2Flag );		// constraint_set2_flag
+	BsWriteOneBit( pBitStringAux, 0 );								// constraint_set3_flag, fixed 0
+	BsWriteBits( pBitStringAux, 4, 0 );								// reserved_zero_4bits
+	BsWriteBits( pBitStringAux, 8, pSps->iLevelIdc );				// level_idc
+	BsWriteUE( pBitStringAux, pSps->uiSpsId + pSpsIdDelta[pSps->uiSpsId] );	// seq_parameter_set_id
+
+	// profiles for which the chroma format / bit depth group is present
+	const bool kbHasChromaFormatGroup =
+		   PRO_SCALABLE_BASELINE == pSps->uiProfileIdc
+		|| PRO_SCALABLE_HIGH == pSps->uiProfileIdc
+		|| PRO_HIGH == pSps->uiProfileIdc
+		|| PRO_HIGH10 == pSps->uiProfileIdc
+		|| PRO_HIGH422 == pSps->uiProfileIdc
+		|| PRO_HIGH444 == pSps->uiProfileIdc
+		|| PRO_CAVLC444 == pSps->uiProfileIdc
+		|| 44 == pSps->uiProfileIdc;
+	if ( kbHasChromaFormatGroup )
+	{
+		BsWriteUE( pBitStringAux, 1 );		// chroma_format_idc, fixed 1
+		BsWriteUE( pBitStringAux, 0 );		// bit_depth_luma_minus8, fixed 0
+		BsWriteUE( pBitStringAux, 0 );		// bit_depth_chroma_minus8, fixed 0
+		BsWriteOneBit( pBitStringAux, 0 );	// qpprime_y_zero_transform_bypass_flag, fixed 0
+		BsWriteOneBit( pBitStringAux, 0 );	// seq_scaling_matrix_present_flag, fixed 0
+	}
+
+	// frame numbering and picture order count
+	BsWriteUE( pBitStringAux, pSps->uiLog2MaxFrameNum - 4 );		// log2_max_frame_num_minus4
+	BsWriteUE( pBitStringAux, 0 );									// pic_order_cnt_type, fixed 0
+	BsWriteUE( pBitStringAux, pSps->iLog2MaxPocLsb - 4 );			// log2_max_pic_order_cnt_lsb_minus4
+
+	// references and picture geometry
+	BsWriteUE( pBitStringAux, pSps->iNumRefFrames );				// max_num_ref_frames
+	BsWriteOneBit( pBitStringAux, true );							// gaps_in_frame_num_value_allowed_flag, fixed 1
+	BsWriteUE( pBitStringAux, pSps->iMbWidth - 1 );					// pic_width_in_mbs_minus1
+	BsWriteUE( pBitStringAux, pSps->iMbHeight - 1 );				// pic_height_in_map_units_minus1
+	BsWriteOneBit( pBitStringAux, true );							// frame_mbs_only_flag, fixed 1
+
+	BsWriteOneBit( pBitStringAux, 0 );								// direct_8x8_inference_flag, fixed 0
+
+	// cropping rectangle, only when enabled in the SPS
+	BsWriteOneBit( pBitStringAux, pSps->bFrameCroppingFlag );		// frame_cropping_flag
+	if ( pSps->bFrameCroppingFlag )
+	{
+		BsWriteUE( pBitStringAux, pSps->sFrameCrop.iCropLeft );		// frame_crop_left_offset
+		BsWriteUE( pBitStringAux, pSps->sFrameCrop.iCropRight );	// frame_crop_right_offset
+		BsWriteUE( pBitStringAux, pSps->sFrameCrop.iCropTop );		// frame_crop_top_offset
+		BsWriteUE( pBitStringAux, pSps->sFrameCrop.iCropBottom );	// frame_crop_bottom_offset
+	}
+
+	BsWriteOneBit( pBitStringAux, 0 );								// vui_parameters_present_flag, fixed 0
+
+	return 0;
+}
+
+
+/*!
+ * \brief	to write a complete SPS: the SPS syntax, then RBSP trailing bits, then a flush
+ *
+ * \param 	pSps 			SPS to be written
+ * \param	pBitStringAux	bitstream writer auxiliary
+ * \param	pSpsIdDelta		id offset table handed on to WelsWriteSpsSyntax()
+ *
+ * \return	0
+ */
+int32_t WelsWriteSpsNal( SWelsSPS *pSps, SBitStringAux *pBitStringAux, int32_t* pSpsIdDelta)
+{
+	// the syntax writer's result is not inspected
+	(void) WelsWriteSpsSyntax( pSps, pBitStringAux, pSpsIdDelta );
+
+	BsRbspTrailingBits( pBitStringAux );
+	BsFlush( pBitStringAux );
+
+	return 0;
+}
+
+/*!
+ *************************************************************************************
+ * \brief	to write SubSet Sequence Parameter Set
+ *
+ * \param 	pSubsetSps		subset SPS to be written (embedded SPS plus SVC extension)
+ * \param	pBitStringAux	bitstream writer auxiliary
+ * \param	pSpsIdDelta		id offset table handed on to WelsWriteSpsSyntax()
+ *
+ * \return	0 (no failure path exists in this function)
+ *
+ * \note	Call it in case EWelsNalUnitType is SubSet SPS. Unlike WelsWriteSpsSyntax(),
+ *			this function also appends the RBSP trailing bits and flushes the writer.
+ *************************************************************************************
+ */
+
+int32_t WelsWriteSubsetSpsSyntax( SSubsetSps *pSubsetSps, SBitStringAux *pBitStringAux , int32_t* pSpsIdDelta )
+{
+	SWelsSPS *pBaseSps = &pSubsetSps->pSps;
+
+	// the embedded SPS comes first
+	WelsWriteSpsSyntax( pBaseSps, pBitStringAux, pSpsIdDelta );
+
+	const bool kbScalableProfile = ( PRO_SCALABLE_BASELINE == pBaseSps->uiProfileIdc )
+								|| ( PRO_SCALABLE_HIGH == pBaseSps->uiProfileIdc );
+	if ( kbScalableProfile )
+	{
+		// SVC extension; elements marked "fixed" are constants, not read from the extension
+		SSpsSvcExt *pSvcExt = &pSubsetSps->sSpsSvcExt;
+
+		BsWriteOneBit( pBitStringAux, true );									// inter-layer deblocking filter control present, fixed 1
+		BsWriteBits( pBitStringAux, 2, pSvcExt->iExtendedSpatialScalability );	// extended spatial scalability
+		BsWriteOneBit( pBitStringAux, 0 );										// chroma phase x plus1 flag, fixed 0
+		BsWriteBits( pBitStringAux, 2, 1 );										// chroma phase y plus1, fixed 1
+
+		if ( 1 == pSvcExt->iExtendedSpatialScalability )
+		{
+			BsWriteOneBit( pBitStringAux, 0 );			// seq ref layer chroma phase x plus1 flag, fixed 0
+			BsWriteBits( pBitStringAux, 2, 1 );			// seq ref layer chroma phase y plus1, fixed 1
+			BsWriteSE( pBitStringAux, 0 );				// seq scaled ref layer left offset, fixed 0
+			BsWriteSE( pBitStringAux, 0 );				// seq scaled ref layer top offset, fixed 0
+			BsWriteSE( pBitStringAux, 0 );				// seq scaled ref layer right offset, fixed 0
+			BsWriteSE( pBitStringAux, 0 );				// seq scaled ref layer bottom offset, fixed 0
+		}
+
+		BsWriteOneBit( pBitStringAux, pSvcExt->bSeqTcoeffLevelPredFlag );
+		if ( pSvcExt->bSeqTcoeffLevelPredFlag )
+		{
+			BsWriteOneBit( pBitStringAux, pSvcExt->bAdaptiveTcoeffLevelPredFlag );
+		}
+		BsWriteOneBit( pBitStringAux, pSvcExt->bSliceHeaderRestrictionFlag );
+
+		BsWriteOneBit( pBitStringAux, false );			// SVC VUI parameters present, fixed 0
+	}
+
+	// written for every profile
+	BsWriteOneBit( pBitStringAux, false );				// additional extension2 flag, fixed 0
+
+	BsRbspTrailingBits( pBitStringAux );
+	BsFlush( pBitStringAux );
+
+	return 0;
+}
+
+/*! 
+ *************************************************************************************
+ * \brief	to write Picture Parameter Set (PPS)
+ *
+ * \param 	pPps     		PPS to be written
+ * \param	pBitStringAux	bitstream writer auxiliary 
+ * \param	sPSOVector		parameter set offset bookkeeping: supplies the id deltas added
+ *							to iPpsId / iSpsId and tells whether this PPS refers to a
+ *							subset SPS or an AVC SPS
+ *
+ * \return	0 (no failure path exists in this function)
+ *
+ * \note	Call it in case EWelsNalUnitType is PPS. The RBSP trailing bits are appended
+ *			and the writer is flushed before returning. Elements marked "fixed" are
+ *			written as constants rather than read from pPps.
+ *************************************************************************************
+ */
+int32_t WelsWritePpsSyntax( SWelsPPS *pPps, SBitStringAux *pBitStringAux, SParaSetOffset* sPSOVector )
+{
+	SBitStringAux * pLocalBitStringAux = pBitStringAux;
+
+	// the SPS id delta comes from the subset SPS table or the AVC SPS table, depending on
+	// which kind of SPS this PPS is mapped to
+	bool_t bUsedSubset    =  sPSOVector->bPpsIdMappingIntoSubsetsps[pPps->iPpsId];
+	int32_t iParameterSetType = ( bUsedSubset ? PARA_SET_TYPE_SUBSETSPS : PARA_SET_TYPE_AVCSPS );
+
+	BsWriteUE( pLocalBitStringAux, pPps->iPpsId + sPSOVector->sParaSetOffsetVariable[PARA_SET_TYPE_PPS].iParaSetIdDelta[pPps->iPpsId] );	// pic_parameter_set_id
+	BsWriteUE( pLocalBitStringAux, pPps->iSpsId + sPSOVector->sParaSetOffsetVariable[iParameterSetType].iParaSetIdDelta[pPps->iSpsId] );	// seq_parameter_set_id
+	
+#if _DEBUG 
+	//SParaSetOffset use, 110421
+	// debug-only sanity checks on the ids that were just written
+	if ( sPSOVector->bEnableSpsPpsIdAddition )
+	{
+		const int32_t kiTmpSpsIdInBs = pPps->iSpsId + sPSOVector->sParaSetOffsetVariable[iParameterSetType].iParaSetIdDelta[pPps->iSpsId];
+		const int32_t tmp_pps_id_in_bs = pPps->iPpsId + sPSOVector->sParaSetOffsetVariable[PARA_SET_TYPE_PPS].iParaSetIdDelta[pPps->iPpsId];
+		assert ( MAX_SPS_COUNT > kiTmpSpsIdInBs );
+		assert ( MAX_PPS_COUNT > tmp_pps_id_in_bs );
+		assert( sPSOVector->sParaSetOffsetVariable[iParameterSetType].bUsedParaSetIdInBs[kiTmpSpsIdInBs] );
+	}
+#endif
+
+	BsWriteOneBit( pLocalBitStringAux, false/*pPps->entropy_coding_mode_flag*/ );	// fixed 0
+	BsWriteOneBit( pLocalBitStringAux, false/*pPps->bPicOrderPresentFlag*/ );		// fixed 0
+	
+#ifdef DISABLE_FMO_FEATURE
+	BsWriteUE( pLocalBitStringAux, 0/*pPps->uiNumSliceGroups - 1*/ );	// num_slice_groups_minus1, fixed 0
+#else
+	BsWriteUE( pLocalBitStringAux, pPps->uiNumSliceGroups - 1 );		// num_slice_groups_minus1
+	if ( pPps->uiNumSliceGroups > 1 )
+	{
+		uint32_t i, uiNumBits;
+
+		BsWriteUE( pLocalBitStringAux, pPps->uiSliceGroupMapType );
+		
+		switch ( pPps->uiSliceGroupMapType )
+		{
+		case 0:
+			// run_length_minus1 is present for every slice group
+			for ( i = 0; i < pPps->uiNumSliceGroups; i ++ )
+			{
+				 BsWriteUE( pLocalBitStringAux, pPps->uiRunLength[i] - 1 );
+			}
+			break;
+		case 2:
+			// top_left / bottom_right are present for iGroup < num_slice_groups_minus1 only:
+			// the last slice group has no rectangle in the PPS syntax
+			for ( i = 0; i + 1 < pPps->uiNumSliceGroups; i ++ )
+			{
+				BsWriteUE( pLocalBitStringAux, pPps->uiTopLeft[i] );
+				BsWriteUE( pLocalBitStringAux, pPps->uiBottomRight[i] );
+			}
+			break;
+		case 3:
+		case 4:
+		case 5:
+			BsWriteOneBit( pLocalBitStringAux, pPps->bSliceGroupChangeDirectionFlag );
+			BsWriteUE( pLocalBitStringAux, pPps->uiSliceGroupChangeRate - 1 );
+			break;
+		case 6:
+			BsWriteUE( pLocalBitStringAux, pPps->uiPicSizeInMapUnits - 1 );
+			// slice_group_id[i] is coded with Ceil( Log2( num_slice_groups_minus1 + 1 ) ) bits;
+			// uiNumSliceGroups > 1 here, so at least one bit is used
+			uiNumBits = 0;
+			while ( uiNumBits < 31 && ( 1u << uiNumBits ) < (uint32_t) pPps->uiNumSliceGroups )
+			{
+				++ uiNumBits;
+			}
+			for ( i = 0; i < pPps->uiPicSizeInMapUnits; i ++ ) 
+			{
+				BsWriteBits( pLocalBitStringAux, uiNumBits, pPps->uiSliceGroupId[i] );
+			}
+			break;
+		default:
+			// remaining map types carry no additional syntax here
+			break;
+		}
+	}
+#endif//!DISABLE_FMO_FEATURE
+	
+	BsWriteUE( pLocalBitStringAux, 0/*pPps->uiNumRefIdxL0Active - 1*/ );	// fixed 0
+	BsWriteUE( pLocalBitStringAux, 0/*pPps->uiNumRefIdxL1Active - 1*/ );	// fixed 0
+	
+	
+	BsWriteOneBit( pLocalBitStringAux, false/*pPps->bWeightedPredFlag*/ );	// fixed 0
+	BsWriteBits (pLocalBitStringAux, 2, 0/*pPps->uiWeightedBiPredIdc*/ );	// fixed 0
+	
+	// initial QP / QS are coded relative to 26
+	BsWriteSE( pLocalBitStringAux, pPps->iPicInitQp - 26 );
+	BsWriteSE( pLocalBitStringAux, pPps->iPicInitQs - 26 );
+	
+	BsWriteSE( pLocalBitStringAux, pPps->uiChromaQpIndexOffset );
+	BsWriteOneBit( pLocalBitStringAux, pPps->bDeblockingFilterControlPresentFlag );
+	BsWriteOneBit( pLocalBitStringAux, false/*pPps->bConstainedIntraPredFlag*/ );		// fixed 0
+	BsWriteOneBit( pLocalBitStringAux, false/*pPps->bRedundantPicCntPresentFlag*/ );	// fixed 0
+	
+    BsRbspTrailingBits( pLocalBitStringAux );
+
+	BsFlush( pLocalBitStringAux );
+
+	return 0;
+}
+
+/* Fills pOffset with the right/bottom crop amounts, (coded size - even-rounded actual size) / 2; left/top are 0.
+ * Returns false, leaving pOffset untouched, when the coded size is smaller than the actual size in either dimension;
+ * otherwise returns whether the coded size exceeds the even-rounded actual size in either dimension. */
+static inline bool_t WelsGetPaddingOffset(int32_t iActualWidth, int32_t iActualHeight,  int32_t iWidth, int32_t iHeight, SCropOffset &pOffset)
+{
+	if( (iActualWidth > iWidth) || (iActualHeight > iHeight) )
+		return false;
+
+	const int32_t kiEvenWidth  = iActualWidth & ~1;	// clear bit 0: round down to an even value
+	const int32_t kiEvenHeight = iActualHeight & ~1;
+
+	pOffset.iCropLeft   = pOffset.iCropTop = 0;
+	pOffset.iCropRight  = (iWidth - kiEvenWidth)/2;
+	pOffset.iCropBottom = (iHeight - kiEvenHeight)/2;
+	return (kiEvenWidth < iWidth) || (kiEvenHeight < iHeight);
+}
+
+/* Zeroes *pSps and fills it from the layer parameters: SPS id, size in macroblocks, frame_num / POC LSB
+ * bit widths, reference frame count, frame cropping, profile and level. Always returns 0. */
+int32_t WelsInitSps( SWelsSPS *pSps, SDLayerParam *pLayerParam, const uint32_t kuiIntraPeriod, const int32_t kiNumRefFrame,
+					  const uint32_t kuiSpsId, const bool_t kbEnableFrameCropping, bool_t bEnableRc )
+{
+	memset(pSps, 0, sizeof(SWelsSPS));
+
+	pSps->uiSpsId		= kuiSpsId;
+	pSps->iMbWidth	= (pLayerParam->iFrameWidth+15) >> 4;	// frame size rounded up to whole 16-pixel units
+	pSps->iMbHeight	= (pLayerParam->iFrameHeight+15) >> 4;
+
+	if ( 0 == kuiIntraPeriod )
+	{
+		//max value of both iFrameNum and POC are 2^16-1, in our encoder, iPOC=2*iFrameNum, so max of iFrameNum should be 2^15-1.--
+		pSps->uiLog2MaxFrameNum = 15;//16; 
+	}
+	else
+	{
+		// Smallest width >= 4 with 2^width > kuiIntraPeriod. The shift is unsigned and the width is capped at 31:
+		// a signed 1 << 31 overflows and a shift by 32 is undefined, so a period >= 2^31 could keep the loop from ending.
+		// NOTE(review): widths above 16 exceed what H.264 allows for log2_max_frame_num and are not clamped here - confirm callers.
+		pSps->uiLog2MaxFrameNum	= 4;
+		while ( (pSps->uiLog2MaxFrameNum < 31) && (((uint32_t)1 << pSps->uiLog2MaxFrameNum) <= kuiIntraPeriod) ) {
+			++ pSps->uiLog2MaxFrameNum;
+		}
+	}
+	pSps->iLog2MaxPocLsb	= 1 + pSps->uiLog2MaxFrameNum;
+	pSps->iNumRefFrames	= kiNumRefFrame;	/* min pRef size when fifo pRef operation*/
+
+	// TODO: get frame_crop_left_offset, frame_crop_right_offset, frame_crop_top_offset, frame_crop_bottom_offset
+	// The helper's return value becomes the cropping flag; it receives pSps->sFrameCrop to hold the offsets.
+	if ( kbEnableFrameCropping )
+		pSps->bFrameCroppingFlag = WelsGetPaddingOffset( pLayerParam->iActualWidth, pLayerParam->iActualHeight, pLayerParam->iFrameWidth, pLayerParam->iFrameHeight, pSps->sFrameCrop );
+	else
+		pSps->bFrameCroppingFlag	= false;
+
+	pSps->uiProfileIdc	= pLayerParam->uiProfileIdc ? pLayerParam->uiProfileIdc : PRO_BASELINE;	// 0 selects PRO_BASELINE
+
+	if( bEnableRc ) //fixed QP condition
+		pSps->iLevelIdc	= WelsGetLevelIdc(pSps, pLayerParam->fOutputFrameRate, pLayerParam->iSpatialBitrate);
+	else
+		pSps->iLevelIdc  = WelsGetLevelIdc(pSps, pLayerParam->fOutputFrameRate, 0); // Set tar_br = 0 to remove the bitrate constraint; a better way is to set actual tar_br as 0
+	return 0;
+}
+
+
+/* Clears *pSubsetSps, initialises the embedded SPS through WelsInitSps(), raises the profile to at least
+ * PRO_SCALABLE_BASELINE and sets the SVC extension defaults. Always returns 0. */
+int32_t WelsInitSubsetSps( SSubsetSps *pSubsetSps, SDLayerParam *pLayerParam, const uint32_t kuiIntraPeriod, const int32_t kiNumRefFrame, 
+							 const uint32_t kuiSpsId, const bool_t kbEnableFrameCropping, bool_t bEnableRc )
+{
+	memset(pSubsetSps, 0, sizeof(SSubsetSps));
+	WelsInitSps( &pSubsetSps->pSps, pLayerParam, kuiIntraPeriod, kiNumRefFrame, kuiSpsId, kbEnableFrameCropping, bEnableRc );
+
+	// A requested profile below scalable baseline is replaced by scalable baseline.
+	pSubsetSps->pSps.uiProfileIdc = (pLayerParam->uiProfileIdc < PRO_SCALABLE_BASELINE) ? PRO_SCALABLE_BASELINE : pLayerParam->uiProfileIdc;
+
+	pSubsetSps->sSpsSvcExt.iExtendedSpatialScalability = 0;	/* ESS is 0 in default */
+	pSubsetSps->sSpsSvcExt.bAdaptiveTcoeffLevelPredFlag = false;
+	pSubsetSps->sSpsSvcExt.bSeqTcoeffLevelPredFlag = false;
+	pSubsetSps->sSpsSvcExt.bSliceHeaderRestrictionFlag = true;
+
+	return 0;
+}
+
+/* Fills *pPps: its own id, the id of the SPS it refers to (pSps, or the SPS embedded in pSubsetSps when
+ * kbUsingSubsetSps is set), initial QP and QS of 26, chroma QP index offset 0 and the deblocking-control flag.
+ * Returns 1 if pPps or the selected parameter set pointer is NULL, 0 otherwise. */
+int32_t WelsInitPps(	SWelsPPS *pPps,
+						SWelsSPS *pSps,
+						SSubsetSps *pSubsetSps,
+						const uint32_t kuiPpsId,
+						const bool_t kbDeblockingFilterPresentFlag,
+						const bool_t kbUsingSubsetSps )
+{
+	SWelsSPS *pRefSps = NULL;
+	if ( NULL == pPps || (NULL == pSps && NULL == pSubsetSps) )
+		return 1;
+
+	if ( kbUsingSubsetSps ){
+		assert(pSubsetSps != NULL);
+		if ( NULL == pSubsetSps )
+			return 1;
+		pRefSps = &pSubsetSps->pSps;
+	}
+	else{
+		assert( pSps != NULL );
+		if ( NULL == pSps )
+			return 1;
+		pRefSps = pSps;
+	}
+
+	pPps->iPpsId = kuiPpsId;
+	pPps->iSpsId = pRefSps->uiSpsId;
+#if !defined(DISABLE_FMO_FEATURE)
+	// The group count is fixed to 1 here, so the per-map-type defaults below are not reached at present.
+	pPps->uiNumSliceGroups = 1;	//param->qos_param.sliceGroupCount;
+	if( pPps->uiNumSliceGroups > 1 )
+	{
+		pPps->uiSliceGroupMapType = 0;	//param->qos_param.sliceGroupType;
+		switch ( pPps->uiSliceGroupMapType )
+		{
+		case 0:
+			for ( uint32_t uiGroup = 0; uiGroup < pPps->uiNumSliceGroups; ++ uiGroup )
+				pPps->uiRunLength[uiGroup] = 25;
+			break;
+		case 2:
+			memset(&pPps->uiTopLeft[0], 0, MAX_SLICEGROUP_IDS*sizeof(pPps->uiTopLeft[0]));
+			memset(&pPps->uiBottomRight[0], 0, MAX_SLICEGROUP_IDS*sizeof(pPps->uiBottomRight[0]));
+			break;
+		case 3: case 4: case 5:
+			pPps->bSliceGroupChangeDirectionFlag = false;
+			pPps->uiSliceGroupChangeRate = 0;
+			break;
+		case 6:
+			pPps->uiPicSizeInMapUnits = 1;
+			memset(&pPps->uiSliceGroupId[0], 0, MAX_SLICEGROUP_IDS*sizeof(pPps->uiSliceGroupId[0]));
+			break;
+		default:
+			break;
+		}
+	}
+#endif//!DISABLE_FMO_FEATURE
+
+	pPps->iPicInitQp = 26;
+	pPps->iPicInitQs = 26;
+	pPps->uiChromaQpIndexOffset = 0;
+	pPps->bDeblockingFilterControlPresentFlag = kbDeblockingFilterPresentFlag;
+
+	return 0;
+}
+} // namespace WelsSVCEnc
--- /dev/null
+++ b/codec/encoder/core/src/cpu.cpp
@@ -1,0 +1,213 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	cpu.cpp
+ *
+ * \brief	CPU compatibility detection
+ *
+ * \date	04/29/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include <string.h>
+
+#include "cpu.h"
+#include "cpu_core.h"
+
+
+namespace WelsSVCEnc {
+#define    CPU_Vender_AMD    "AuthenticAMD"	/* vendor strings matched with strcmp against the name read through WelsCPUId leaf 0 */
+#define    CPU_Vender_INTEL  "GenuineIntel"
+#define    CPU_Vender_CYRIX  "CyrixInstead"
+
+
+#if defined(X86_ASM)
+/* Returns a WELS_CPU_* bitmask built from CPUID results, or 0 when CPUID is unusable or MMX is absent; *pNumberOfLogicProcessors (if non-NULL) is written only when detection gets past those early returns. */
+uint32_t WelsCPUFeatureDetect( int32_t *pNumberOfLogicProcessors )
+{
+    uint32_t uiCPU = 0;	
+    uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0;
+	int32_t  CacheLineSize = 0;
+	int8_t   chVenderName[16] = { 0 };	
+	
+    if( !WelsCPUIdVerify() )
+    {
+        /* cpuid is not supported in cpu */
+        return 0;
+    }
+	// Leaf 0: the three outputs after uiFeatureA land at byte offsets 0, 8 and 4 of chVenderName, giving a 12-byte vendor string that the zero initialiser terminates.
+	WelsCPUId( 0, &uiFeatureA, (uint32_t*)&chVenderName[0],(uint32_t*)&chVenderName[8],(uint32_t*)&chVenderName[4] );
+    if( uiFeatureA == 0 )
+    {
+		/* maximum input value for basic cpuid information */
+        return 0;
+    }
+	// Leaf 1: feature words. Bit 23 of uiFeatureD is the MMX flag; without it no flags are reported at all.
+	WelsCPUId( 1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD );
+    if( (uiFeatureD & 0x00800000) == 0 )
+    {
+        /* Basic MMX technology is not support in cpu, mean nothing for us so return here */
+        return 0;
+    }
+	
+    uiCPU = WELS_CPU_MMX;
+    if( uiFeatureD & 0x02000000 )
+    {
+        /* SSE technology is identical to AMD MMX extensions */
+        uiCPU |= WELS_CPU_MMXEXT|WELS_CPU_SSE;
+    }
+    if( uiFeatureD & 0x04000000 )
+    {
+        /* SSE2 support here */
+        uiCPU |= WELS_CPU_SSE2;
+    }
+	if ( uiFeatureD & 0x00000001 )
+	{
+		/* x87 FPU on-chip checking */
+		uiCPU |= WELS_CPU_FPU;
+	}
+	if ( uiFeatureD & 0x00008000 )
+	{
+		/* CMOV instruction checking */
+		uiCPU |= WELS_CPU_CMOV;
+	}
+	if ( !strcmp((const str_t*)chVenderName,CPU_Vender_INTEL) )	// confirmed_safe_unsafe_usage
+	{
+		if ( uiFeatureD & 0x10000000 )
+		{
+			/* Multi-Threading checking: contains of multiple logic processors */
+			uiCPU |= WELS_CPU_HTT;
+		}
+	}	
+	// uiFeatureC bits tested below: 0 SSE3, 9 SSSE3, 19 SSE4.1, 20 SSE4.2, 25 AES, 22 MOVBE.
+	if( uiFeatureC & 0x00000001 ){
+		/* SSE3 support here */
+		uiCPU |= WELS_CPU_SSE3;
+	}
+	if( uiFeatureC & 0x00000200 ){
+		/* SSSE3 support here */
+		uiCPU |= WELS_CPU_SSSE3;
+	}
+	if( uiFeatureC & 0x00080000 ){
+		/* SSE4.1 support here, 45nm Penryn processor */
+		uiCPU |= WELS_CPU_SSE41; 
+	}
+	if( uiFeatureC & 0x00100000 ){
+		/* SSE4.2 support here, next generation Nehalem processor */
+		uiCPU |= WELS_CPU_SSE42;
+	}
+	if ( WelsCPUSupportAVX( uiFeatureA, uiFeatureC ) )	// 
+	{
+		/* AVX supported */
+		uiCPU |= WELS_CPU_AVX;
+	}
+	if ( WelsCPUSupportFMA( uiFeatureA, uiFeatureC ) )	// 
+	{
+		/* AVX FMA supported */
+		uiCPU |= WELS_CPU_FMA;
+	}
+	if ( uiFeatureC & 0x02000000 )
+	{
+		/* AES checking */
+		uiCPU |= WELS_CPU_AES;
+	}
+	if ( uiFeatureC & 0x00400000 )
+	{
+		/* MOVBE checking */
+		uiCPU |= WELS_CPU_MOVBE;
+	}
+	// The count below is stored whether or not WELS_CPU_HTT was set above.
+	if ( pNumberOfLogicProcessors != NULL )
+	{
+		// HTT enabled on chip
+		*pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX		
+	}	
+	// Leaf 0x80000000: uiFeatureA now holds the value compared with 0x80000001 before the AMD-only extended query.
+    WelsCPUId( 0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD );
+
+	if( (!strcmp((const str_t*)chVenderName,CPU_Vender_AMD)) && (uiFeatureA>=0x80000001) ){	// confirmed_safe_unsafe_usage
+		WelsCPUId(0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD );
+		if( uiFeatureD&0x00400000 ){
+			uiCPU |= WELS_CPU_MMXEXT;
+		}
+		if( uiFeatureD&0x80000000 ){
+			uiCPU |= WELS_CPU_3DNOW;
+		}
+	}
+	// Intel only: re-query leaf 1 and clear the SSE2/SSE3 flags for family 6, models 9, 13 and 14.
+	if( !strcmp((const str_t*)chVenderName,CPU_Vender_INTEL) ){	// confirmed_safe_unsafe_usage
+		int32_t  family, model;
+
+		WelsCPUId(1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+		family = ((uiFeatureA>>8)&0xf) + ((uiFeatureA>>20)&0xff);	// bits 11-8 plus bits 27-20
+        model  = ((uiFeatureA>>4)&0xf) + ((uiFeatureA>>12)&0xf0);	// bits 7-4 plus bits 19-16 moved to the high nibble
+
+		if( (family==6) && (model==9 || model==13 || model==14) ){
+			uiCPU &= ~(WELS_CPU_SSE2|WELS_CPU_SSE3);
+		}
+	}
+
+	// get cache line size
+	if( (!strcmp((const str_t*)chVenderName,CPU_Vender_INTEL)) || !(strcmp((const str_t*)chVenderName,CPU_Vender_CYRIX)) ){	// confirmed_safe_unsafe_usage
+		WelsCPUId(1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+
+		CacheLineSize = (uiFeatureB&0xff00)>>5;	// ((clflush_line_size >> 8) << 3), CLFLUSH_line_size * 8 = CacheLineSize_in_byte
+		// Only sizes 128, 64, 32 and 16 map to a flag; any other value sets none.
+		if( CacheLineSize == 128 ){
+			uiCPU |= WELS_CPU_CACHELINE_128;
+		}
+		else if( CacheLineSize == 64 ){
+			uiCPU |= WELS_CPU_CACHELINE_64;
+		}
+		else if( CacheLineSize == 32 ){
+			uiCPU |= WELS_CPU_CACHELINE_32;
+		}
+		else if( CacheLineSize == 16 ){
+			uiCPU |= WELS_CPU_CACHELINE_16;
+		}
+	}
+	
+    return uiCPU;
+}
+
+
+/* Calls WelsEmms() when kuiCPU has any of the MMX, MMXEXT, 3DNOW or 3DNOWEXT flags set. */
+void WelsCPURestore( const uint32_t kuiCPU )
+{
+    const uint32_t kuiMmxClassMask = WELS_CPU_MMX | WELS_CPU_MMXEXT | WELS_CPU_3DNOW | WELS_CPU_3DNOWEXT;
+    if( 0 != (kuiCPU & kuiMmxClassMask) )
+        WelsEmms();
+}
+
+#endif
+	
+}
--- /dev/null
+++ b/codec/encoder/core/src/deblocking.cpp
@@ -1,0 +1,1091 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	deblocking.cpp
+ *
+ * \brief	Interfaces introduced in frame deblocking filtering
+ *
+ * \date	08/03/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "as264_common.h"
+#include "deblocking.h"
+#include "cpu_core.h"
+#include "array_stack_align.h"
+
+namespace WelsSVCEnc {
+
+#define g_kuiAlphaTable(x) g_kuiAlphaTable[(x)]	// call-style aliases for the lookup tables of the same names
+#define g_kiBetaTable(x)  g_kiBetaTable[(x)]
+#define g_kiTc0Table(x)   g_kiTc0Table[(x)]
+// MB_BS_MV: non-zero when the X or the Y components of the two selected motion vectors differ by 4 or more.
+#define MB_BS_MV(sCurMv, sNeighMv, uiBIdx, uiBnIdx) \
+	(\
+	( WELS_ABS( sCurMv[uiBIdx].iMvX - sNeighMv[uiBnIdx].iMvX ) >= 4 ) ||\
+	( WELS_ABS( sCurMv[uiBIdx].iMvY - sNeighMv[uiBnIdx].iMvY ) >= 4 )\
+	)
+// SMB_EDGE_MV: the same >= 4 test inside one MV array, done by masking off the low two bits of each absolute difference; uiRefIndex is never expanded.
+#define SMB_EDGE_MV(uiRefIndex, sMotionVector, uiBIdx, uiBnIdx) \
+	(\
+	!!((WELS_ABS(sMotionVector[uiBIdx].iMvX - sMotionVector[uiBnIdx].iMvX) &(~3)) | (WELS_ABS(sMotionVector[uiBIdx].iMvY - sMotionVector[uiBnIdx].iMvY) &(~3)))\
+	)
+// BS_EDGE: bsx1 non-zero gives (bsx1 | mvFlag) << 1, bsx1 zero gives mvFlag (0 or 1). bsx1 is expanded three times and is not parenthesised.
+#define BS_EDGE(bsx1, uiRefIndex, sMotionVector, uiBIdx, uiBnIdx) \
+	( (bsx1|SMB_EDGE_MV(uiRefIndex, sMotionVector, uiBIdx, uiBnIdx))<<(bsx1?1:0))
+// GET_ALPHA_BETA_FROM_QP: iIdexA = clip(QP + iAlphaOffset), iAlpha = alpha table[iIdexA], iBeta = beta table[clip(QP + iBetaOffset)]; expands to a bare { } block.
+#define GET_ALPHA_BETA_FROM_QP(QP, iAlphaOffset, iBetaOffset, iIdexA, iAlpha, iBeta) \
+{\
+	iIdexA = (QP + iAlphaOffset);\
+	iIdexA = CLIP3_QP_0_51(iIdexA);\
+	iAlpha = g_kuiAlphaTable(iIdexA);\
+	iBeta  = g_kiBetaTable((CLIP3_QP_0_51(QP + iBetaOffset)));\
+}
+
+static const uint8_t g_kuiAlphaTable[52+12] = { // alpha threshold per table index, from Table 8-16 in the H.264/AVC standard
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
+	7,  8,  9, 10, 12, 13, 15, 17, 20, 22,
+	25, 28, 32, 36, 40, 45, 50, 56, 63, 71,
+	80, 90,101,113,127,144,162,182,203,226,
+	255, 255
+	,255, 255,255, 255,255, 255,255, 255,255, 255,255, 255	// 12 extra entries repeating the last value
+};
+
+static const int8_t g_kiBetaTable[52+12] = { // beta threshold per table index, from Table 8-16 in the H.264/AVC standard
+	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
+	3,  3,  3,  4,  4,  4,  6,  6,  7,  7,
+	8,  8,  9,  9, 10, 10, 11, 11, 12, 12,
+	13, 13, 14, 14, 15, 15, 16, 16, 17, 17,
+	18, 18
+	,18, 18,18, 18,18, 18,18, 18,18, 18,18, 18	// 12 extra entries repeating the last value
+};
+
+static const int8_t g_kiTc0Table[52+12][4] = { // tc0 from Table 8-17 in the H.264/AVC standard, indexed [table index][bS]; column 0 is always -1
+	{ -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 },
+	{ -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 },
+	{ -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 0 }, { -1, 0, 0, 1 },
+	{ -1, 0, 0, 1 }, { -1, 0, 0, 1 }, { -1, 0, 0, 1 }, { -1, 0, 1, 1 }, { -1, 0, 1, 1 }, { -1, 1, 1, 1 },
+	{ -1, 1, 1, 1 }, { -1, 1, 1, 1 }, { -1, 1, 1, 1 }, { -1, 1, 1, 2 }, { -1, 1, 1, 2 }, { -1, 1, 1, 2 },
+	{ -1, 1, 1, 2 }, { -1, 1, 2, 3 }, { -1, 1, 2, 3 }, { -1, 2, 2, 3 }, { -1, 2, 2, 4 }, { -1, 2, 3, 4 },
+	{ -1, 2, 3, 4 }, { -1, 3, 3, 5 }, { -1, 3, 4, 6 }, { -1, 3, 4, 6 }, { -1, 4, 5, 7 }, { -1, 4, 5, 8 },
+	{ -1, 4, 6, 9 }, { -1, 5, 7,10 }, { -1, 6, 8,11 }, { -1, 6, 8,13 }, { -1, 7,10,14 }, { -1, 8,11,16 },
+	{ -1, 9,12,18 }, { -1, 10,13,20 }, {-1,11,15,23 }, { -1,13,17,25 }
+	,{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 }	// 12 extra rows repeating the last one
+	,{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 },{ -1,13,17,25 }
+};
+
+static const uint8_t g_kuiTableBIdx[2][8] =   
+{     
+	{0,  4,  8,  12, // row 0, entries 0..3: block indices used for the current MB
+	3,  7,  11, 15}, // row 0, entries 4..7: block indices used for the neighbouring MB
+
+	{0,  1,  2,  3 , // row 1, entries 0..3: block indices used for the current MB
+	12, 13, 14, 15}, // row 1, entries 4..7: block indices used for the neighbouring MB
+};
+
+static const ALIGNED_DECLARE(int32_t,g_kiTableBlock8x8Idx[2][4][4],16) =   // values 0..3; NOTE(review): presumably a 4x4-block to 8x8-block map in two orderings - confirm
+{ 
+	{0, 0, 2, 2,
+	 0, 0, 2, 2,
+	 1, 1, 3, 3,
+	 1, 1, 3, 3},
+	// Plane [1]: in the code visible here it is read as [1][iEdge][i] to index the current MB's uiRefIndex.
+	{0, 0, 1, 1,
+	 0, 0, 1, 1,
+	 2, 2, 3, 3,
+	 2, 2, 3, 3}
+};
+static const ALIGNED_DECLARE(int32_t,g_kiTableBlock8x8NIdx[2][4][4],16) = 	// neighbour-side counterpart of g_kiTableBlock8x8Idx
+{  
+	{1, 1, 3, 3,
+	 0, 0, 2, 2,
+	 0, 0, 2, 2,
+	 1, 1, 3, 3},
+	// Plane [1]: in the code visible here it is read as [1][iEdge][i] to index the neighbouring MB's uiRefIndex.
+	 {2, 2, 3, 3,
+	  0, 0, 1, 1,
+	  0, 0, 1, 1,
+	  2, 2, 3, 3}
+};
+// TC0_TBL_LOOKUP: iTc[k] = tc0 table[iIdexA][pBS[k]] + bchroma for k = 0..3; expands to a bare { } block and evaluates its arguments several times.
+#define TC0_TBL_LOOKUP(iTc, iIdexA, pBS, bchroma) \
+{\
+	iTc[0] = g_kiTc0Table(iIdexA)[pBS[0]] + bchroma;\
+	iTc[1] = g_kiTc0Table(iIdexA)[pBS[1]] + bchroma;\
+	iTc[2] = g_kiTc0Table(iIdexA)[pBS[2]] + bchroma;\
+	iTc[3] = g_kiTc0Table(iIdexA)[pBS[3]] + bchroma;\
+}
+/* Fills the interior-edge strengths uiBS[0][1..3][*] and uiBS[1][1..3][*] from the 16 bytes at pNnzTab alone (no motion vectors); uiBS[0][0] and uiBS[1][0] are not written. */
+void inline DeblockingBSInsideMBAvsbase( int8_t* pNnzTab, uint8_t uiBS[2][4][4], int32_t iLShiftFactor )
+{
+	uint32_t uiNnz32b0, uiNnz32b1, uiNnz32b2, uiNnz32b3;
+	ENFORCE_STACK_ALIGN_1D( uint8_t, uiBsx3, 4, 4 );
+	// Each 32-bit load takes four consecutive table bytes; reading uiBsx3[k] back as byte k | byte k+1 relies on little-endian byte order.
+	uiNnz32b0 = *(uint32_t *)(pNnzTab+0);
+	uiNnz32b1 = *(uint32_t *)(pNnzTab+4);
+	uiNnz32b2 = *(uint32_t *)(pNnzTab+8);
+	uiNnz32b3 = *(uint32_t *)(pNnzTab+12);
+	// x | (x >> 8) ORs each byte with the next one up; bytes 0..2 of the result go to uiBS[0][1..3] for this group of four.
+	*(uint32_t *)uiBsx3 = (uiNnz32b0|(uiNnz32b0>>8))<<iLShiftFactor;
+	uiBS[0][1][0] = uiBsx3[0];
+	uiBS[0][2][0] = uiBsx3[1];
+	uiBS[0][3][0] = uiBsx3[2];
+	// NOTE(review): the shift acts on the whole word, so bytes stay independent only while no bit crosses a byte boundary - confirm the value range of pNnzTab.
+	*(uint32_t *)uiBsx3 = (uiNnz32b1|(uiNnz32b1>>8))<<iLShiftFactor;
+	uiBS[0][1][1] = uiBsx3[0];
+	uiBS[0][2][1] = uiBsx3[1];
+	uiBS[0][3][1] = uiBsx3[2];
+	*(uint32_t *)uiBS[1][1] = (uiNnz32b0|uiNnz32b1)<<iLShiftFactor;	// groups 0 and 1 ORed byte-wise: all four entries of uiBS[1][1] at once
+
+	*(uint32_t *)uiBsx3 = (uiNnz32b2|(uiNnz32b2>>8))<<iLShiftFactor;
+	uiBS[0][1][2] = uiBsx3[0];
+	uiBS[0][2][2] = uiBsx3[1];
+	uiBS[0][3][2] = uiBsx3[2];
+	*(uint32_t *)uiBS[1][2] = (uiNnz32b1|uiNnz32b2)<<iLShiftFactor;
+
+	*(uint32_t *)uiBsx3 = (uiNnz32b3|(uiNnz32b3>>8))<<iLShiftFactor;
+	uiBS[0][1][3] = uiBsx3[0];
+	uiBS[0][2][3] = uiBsx3[1];
+	uiBS[0][3][3] = uiBsx3[2];	
+	*(uint32_t *)uiBS[1][3] = (uiNnz32b2|uiNnz32b3)<<iLShiftFactor;
+
+}
+/* Fills the interior-edge strengths uiBS[0][1..3][*] and uiBS[1][1..3][*] by combining the bytes at pNnzTab with the motion-vector test of BS_EDGE on pCurMb->sMv; uiBS[*][0] is not written. */
+void inline DeblockingBSInsideMBNormal( SMB* pCurMb, uint8_t uiBS[2][4][4], int8_t* pNnzTab )
+{
+	uint32_t uiNnz32b0, uiNnz32b1, uiNnz32b2, uiNnz32b3;
+	ENFORCE_STACK_ALIGN_1D( uint8_t, uiBsx4, 4, 4 );
+	// Four table bytes per 32-bit load; the per-byte reads of uiBsx4 below rely on little-endian byte order.
+	uiNnz32b0 = *(uint32_t *)(pNnzTab+0);
+	uiNnz32b1 = *(uint32_t *)(pNnzTab+4);
+	uiNnz32b2 = *(uint32_t *)(pNnzTab+8);
+	uiNnz32b3 = *(uint32_t *)(pNnzTab+12);
+	// iRefIdx is not declared in this function: BS_EDGE hands it to SMB_EDGE_MV, which never expands that parameter, so the token disappears in preprocessing.
+	*(uint32_t *)uiBsx4 = (uiNnz32b0|(uiNnz32b0>>8));
+	uiBS[0][1][0] = BS_EDGE(uiBsx4[0], iRefIdx, pCurMb->sMv, 1, 0);
+	uiBS[0][2][0] = BS_EDGE(uiBsx4[1], iRefIdx, pCurMb->sMv, 2, 1);
+	uiBS[0][3][0] = BS_EDGE(uiBsx4[2], iRefIdx, pCurMb->sMv, 3, 2); 
+	// The last two BS_EDGE arguments are the sMv indices of the two blocks on either side of the edge (k and k-1 here).
+	*(uint32_t *)uiBsx4 = (uiNnz32b1|(uiNnz32b1>>8));
+	uiBS[0][1][1] = BS_EDGE(uiBsx4[0], iRefIdx, pCurMb->sMv, 5, 4);
+	uiBS[0][2][1] = BS_EDGE(uiBsx4[1], iRefIdx, pCurMb->sMv, 6, 5);
+	uiBS[0][3][1] = BS_EDGE(uiBsx4[2], iRefIdx, pCurMb->sMv, 7, 6);
+
+	*(uint32_t *)uiBsx4 = (uiNnz32b2|(uiNnz32b2>>8));
+	uiBS[0][1][2] = BS_EDGE(uiBsx4[0], iRefIdx, pCurMb->sMv, 9, 8);
+	uiBS[0][2][2] = BS_EDGE(uiBsx4[1], iRefIdx, pCurMb->sMv, 10,9);
+	uiBS[0][3][2] = BS_EDGE(uiBsx4[2], iRefIdx, pCurMb->sMv, 11,10);
+
+	*(uint32_t *)uiBsx4 = (uiNnz32b3|(uiNnz32b3>>8));
+	uiBS[0][1][3] = BS_EDGE(uiBsx4[0], iRefIdx, pCurMb->sMv, 13,12);
+	uiBS[0][2][3] = BS_EDGE(uiBsx4[1], iRefIdx, pCurMb->sMv, 14,13);
+	uiBS[0][3][3] = BS_EDGE(uiBsx4[2], iRefIdx, pCurMb->sMv, 15,14);	
+
+	//horizontal
+	*(uint32_t *)uiBsx4 = (uiNnz32b0|uiNnz32b1);
+	uiBS[1][1][0] = BS_EDGE(uiBsx4[0], iRefIdx, pCurMb->sMv, 4, 0);
+	uiBS[1][1][1] = BS_EDGE(uiBsx4[1], iRefIdx, pCurMb->sMv, 5, 1);
+	uiBS[1][1][2] = BS_EDGE(uiBsx4[2], iRefIdx, pCurMb->sMv, 6, 2);
+	uiBS[1][1][3] = BS_EDGE(uiBsx4[3], iRefIdx, pCurMb->sMv, 7, 3); 
+	// Here the paired sMv indices differ by 4 (k and k-4), and all four bytes of uiBsx4 are used.
+	*(uint32_t *)uiBsx4 = (uiNnz32b1|uiNnz32b2);
+	uiBS[1][2][0] = BS_EDGE(uiBsx4[0], iRefIdx, pCurMb->sMv, 8, 4);
+	uiBS[1][2][1] = BS_EDGE(uiBsx4[1], iRefIdx, pCurMb->sMv, 9, 5);
+	uiBS[1][2][2] = BS_EDGE(uiBsx4[2], iRefIdx, pCurMb->sMv, 10, 6);
+	uiBS[1][2][3] = BS_EDGE(uiBsx4[3], iRefIdx, pCurMb->sMv, 11, 7);
+
+	*(uint32_t *)uiBsx4 = (uiNnz32b2|uiNnz32b3);
+	uiBS[1][3][0] = BS_EDGE(uiBsx4[0], iRefIdx, pCurMb->sMv, 12, 8);
+	uiBS[1][3][1] = BS_EDGE(uiBsx4[1], iRefIdx, pCurMb->sMv, 13, 9);
+	uiBS[1][3][2] = BS_EDGE(uiBsx4[2], iRefIdx, pCurMb->sMv, 14, 10);
+	uiBS[1][3][3] = BS_EDGE(uiBsx4[3], iRefIdx, pCurMb->sMv, 15, 11);
+}
+
+/* Computes the four boundary strengths along the edge shared by pCurMb and pNeighMb. iEdge selects the
+ * g_kuiTableBIdx row (0 pairs blocks 0/4/8/12 with 3/7/11/15, 1 pairs blocks 0..3 with 12..15).
+ * Returns the four strengths as the bytes of a uint32_t in memory order: 2 when either block has a non-zero count, else 0/1 from the MV (and ref index) test. */
+uint32_t DeblockingBSMarginalMBAvcbase( SMB* pCurMb, SMB* pNeighMb, int32_t iEdge)
+{
+	uint32_t uiBSx4;
+	uint8_t* pBS = (uint8_t*)(&uiBSx4);
+	for( int32_t i = 0; i < 4; i++ )
+	{
+		// Index the table per entry instead of loading four bytes through a uint32_t cast: the byte table has no
+		// 4-byte alignment guarantee, and the packed load only unpacks in entry order on little-endian hosts.
+		const uint32_t uiBIdx  = g_kuiTableBIdx[iEdge][i];
+		const uint32_t uiBnIdx = g_kuiTableBIdx[iEdge][4 + i];
+
+		if (pCurMb->pNonZeroCount[uiBIdx] | pNeighMb->pNonZeroCount[uiBnIdx])
+		{
+			pBS[i] = 2;
+			continue;
+		}
+		pBS[i] =
+#ifndef SINGLE_REF_FRAME
+			(pCurMb->uiRefIndex[g_kiTableBlock8x8Idx[1][iEdge][i]] - pNeighMb->uiRefIndex[g_kiTableBlock8x8NIdx[1][iEdge][i]]) ||
+#endif
+			MB_BS_MV(pCurMb->sMv, pNeighMb->sMv, uiBIdx, uiBnIdx);
+	}
+	return uiBSx4;
+}
+
+/* Luma edge filter through the LT4Ver kernel: thresholds come from pFilter->uiLumaQP plus the slice
+ * offsets, one tc0 value is looked up for each of the four pBS entries. Does nothing when both thresholds are 0. */
+void FilteringEdgeLumaH( DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride, uint8_t* pBS )
+{
+	int32_t iTblIdx, iAlphaThr, iBetaThr;
+	ENFORCE_STACK_ALIGN_1D( int8_t, iTc, 4, 16 );
+
+	GET_ALPHA_BETA_FROM_QP(pFilter->uiLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iTblIdx, iAlphaThr, iBetaThr);
+
+	// alpha and beta both zero: leave the edge untouched
+	if( 0 == (iAlphaThr | iBetaThr) )
+		return;
+
+	TC0_TBL_LOOKUP(iTc, iTblIdx, pBS, 0);
+	pfDeblocking->pfLumaDeblockingLT4Ver(pPix, iStride, iAlphaThr, iBetaThr, iTc);
+}
+/* Luma edge filter through the LT4Hor kernel: thresholds come from pFilter->uiLumaQP plus the slice
+ * offsets, one tc0 value is looked up for each of the four pBS entries. Does nothing when both thresholds are 0. */
+void FilteringEdgeLumaV( DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride, uint8_t* pBS )
+{
+	int32_t iTblIdx, iAlphaThr, iBetaThr;
+	ENFORCE_STACK_ALIGN_1D( int8_t, iTc, 4, 16 );
+
+	GET_ALPHA_BETA_FROM_QP(pFilter->uiLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iTblIdx, iAlphaThr, iBetaThr);
+
+	// alpha and beta both zero: leave the edge untouched
+	if( 0 == (iAlphaThr | iBetaThr) )
+		return;
+
+	TC0_TBL_LOOKUP(iTc, iTblIdx, pBS, 0);
+	pfDeblocking->pfLumaDeblockingLT4Hor(pPix, iStride, iAlphaThr, iBetaThr, iTc);
+}
+
+/* Luma edge filter through the EQ4Ver kernel, with thresholds from pFilter->uiLumaQP plus the slice
+ * offsets. pBS is not read. Does nothing when both thresholds are 0. */
+void FilteringEdgeLumaIntraH( DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride,uint8_t* pBS )
+{
+	int32_t iTblIdx, iAlphaThr, iBetaThr;
+
+	GET_ALPHA_BETA_FROM_QP(pFilter->uiLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iTblIdx, iAlphaThr, iBetaThr);
+
+	// alpha and beta both zero: leave the edge untouched
+	if( 0 == (iAlphaThr | iBetaThr) )
+		return;
+
+	pfDeblocking->pfLumaDeblockingEQ4Ver(pPix, iStride, iAlphaThr, iBetaThr);
+}
+
+/* Luma edge filter through the EQ4Hor kernel, with thresholds from pFilter->uiLumaQP plus the slice
+ * offsets. pBS is not read. Does nothing when both thresholds are 0. */
+void FilteringEdgeLumaIntraV( DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPix, int32_t iStride,uint8_t* pBS)
+{
+	int32_t iTblIdx, iAlphaThr, iBetaThr;
+
+	GET_ALPHA_BETA_FROM_QP(pFilter->uiLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iTblIdx, iAlphaThr, iBetaThr);
+
+	// alpha and beta both zero: leave the edge untouched
+	if( 0 == (iAlphaThr | iBetaThr) )
+		return;
+
+	pfDeblocking->pfLumaDeblockingEQ4Hor(pPix, iStride, iAlphaThr, iBetaThr);
+}
+/* Chroma (Cb and Cr) edge filter through the LT4Ver kernel: thresholds come from pFilter->uiChromaQP plus
+ * the slice offsets, each tc value is the table entry for the pBS entry plus 1. Does nothing when both thresholds are 0. */
+void FilteringEdgeChromaH( DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, uint8_t* pBS )
+{
+	int32_t iTblIdx, iAlphaThr, iBetaThr;
+	ENFORCE_STACK_ALIGN_1D( int8_t, iTc, 4, 16 );
+
+	GET_ALPHA_BETA_FROM_QP(pFilter->uiChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iTblIdx, iAlphaThr, iBetaThr);
+
+	// alpha and beta both zero: leave the edge untouched
+	if( 0 == (iAlphaThr | iBetaThr) )
+		return;
+
+	TC0_TBL_LOOKUP(iTc, iTblIdx, pBS, 1);
+	pfDeblocking->pfChromaDeblockingLT4Ver(pPixCb, pPixCr, iStride,iAlphaThr, iBetaThr, iTc);
+}
+/* Chroma (Cb and Cr) edge filter through the LT4Hor kernel: thresholds come from pFilter->uiChromaQP plus
+ * the slice offsets, each tc value is the table entry for the pBS entry plus 1. Does nothing when both thresholds are 0. */
+void FilteringEdgeChromaV( DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, uint8_t* pBS )
+{
+	int32_t iTblIdx, iAlphaThr, iBetaThr;
+	ENFORCE_STACK_ALIGN_1D( int8_t, iTc, 4, 16 );
+
+	GET_ALPHA_BETA_FROM_QP(pFilter->uiChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iTblIdx, iAlphaThr, iBetaThr);
+
+	// alpha and beta both zero: leave the edge untouched
+	if( 0 == (iAlphaThr | iBetaThr) )
+		return;
+
+	TC0_TBL_LOOKUP(iTc, iTblIdx, pBS, 1);
+	pfDeblocking->pfChromaDeblockingLT4Hor(pPixCb, pPixCr, iStride, iAlphaThr, iBetaThr, iTc);
+}
+
+/* Chroma (Cb and Cr) edge filter through the EQ4Ver kernel, with thresholds from pFilter->uiChromaQP plus
+ * the slice offsets. pBS is not read. Does nothing when both thresholds are 0. */
+void FilteringEdgeChromaIntraH( DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, uint8_t* pBS )
+{
+	int32_t iTblIdx, iAlphaThr, iBetaThr;
+
+	GET_ALPHA_BETA_FROM_QP(pFilter->uiChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iTblIdx, iAlphaThr, iBetaThr);
+
+	// alpha and beta both zero: leave the edge untouched
+	if( 0 == (iAlphaThr | iBetaThr) )
+		return;
+
+	pfDeblocking->pfChromaDeblockingEQ4Ver(pPixCb, pPixCr, iStride, iAlphaThr, iBetaThr);
+}
+
+/* Chroma (Cb and Cr) edge filter through the EQ4Hor kernel (member pfChromaDeblockinEQ4Hor), with thresholds
+ * from pFilter->uiChromaQP plus the slice offsets. pBS is not read. Does nothing when both thresholds are 0. */
+void FilteringEdgeChromaIntraV( DeblockingFunc* pfDeblocking, SDeblockingFilter* pFilter, uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, uint8_t* pBS )
+{
+	int32_t iTblIdx, iAlphaThr, iBetaThr;
+
+	GET_ALPHA_BETA_FROM_QP(pFilter->uiChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iTblIdx, iAlphaThr, iBetaThr);
+
+	// alpha and beta both zero: leave the edge untouched
+	if( 0 == (iAlphaThr | iBetaThr) )
+		return;
+
+	pfDeblocking->pfChromaDeblockinEQ4Hor(pPixCb, pPixCr, iStride, iAlphaThr, iBetaThr);
+}
+/* Filters all luma and chroma edges of one macroblock using the precomputed strengths in uiBS: uiBS[0][e] drives the FilteringEdge*V calls and uiBS[1][e] the FilteringEdge*H calls, with e = 0 being the left/top MB boundary. */
+void DeblockingInterMb( DeblockingFunc* pfDeblocking, SMB* pCurMb, SDeblockingFilter* pFilter, uint8_t uiBS[2][4][4] )
+{
+	int8_t iCurLumaQp   = pCurMb->uiLumaQp;
+	int8_t iCurChromaQp = pCurMb->uiChromaQp;
+	int32_t iLineSize     = pFilter->iCsStride[0];
+	int32_t iLineSizeUV   = pFilter->iCsStride[1];
+	int32_t iMbStride    = pFilter->iMbStride;
+
+	int32_t iMbX = pCurMb->iMbX;
+	int32_t iMbY = pCurMb->iMbY;
+	// Entry 0: the neighbour only has to exist; entry 1: it must also have the same uiSliceIdc. NOTE(review): indexing by uiFilterIdc assumes it is 0 or 1 - confirm.
+	BOOL_T bLeftBsValid[2] = {(iMbX > 0), ((iMbX > 0) && (pCurMb->uiSliceIdc == (pCurMb - 1)->uiSliceIdc))};
+	BOOL_T bTopBsValid[2]  = {(iMbY > 0), ((iMbY > 0) && (pCurMb->uiSliceIdc == (pCurMb - iMbStride)->uiSliceIdc))};
+
+	int32_t iLeftFlag = bLeftBsValid[pFilter->uiFilterIdc]; 
+	int32_t iTopFlag  = bTopBsValid[pFilter->uiFilterIdc];
+
+	uint8_t *pDestY, *pDestCb, *pDestCr;
+	pDestY  = pFilter->pCsData[0];			
+	pDestCb = pFilter->pCsData[1];				
+	pDestCr = pFilter->pCsData[2]; 
+	// Left MB boundary: the QPs are the rounded average of this MB and its left neighbour (pCurMb - 1).
+	if (iLeftFlag)	
+	{
+		pFilter->uiLumaQP   = (iCurLumaQp + (pCurMb-1)->uiLumaQp + 1) >> 1;
+		pFilter->uiChromaQP = (iCurChromaQp + (pCurMb-1)->uiChromaQp+ 1) >> 1;
+
+		if( uiBS[0][0][0] == 0x04 )	// only the first entry is tested to select the intra (EQ4) path
+		{
+			FilteringEdgeLumaIntraV( pfDeblocking, pFilter, pDestY, iLineSize ,NULL);
+			FilteringEdgeChromaIntraV( pfDeblocking, pFilter, pDestCb, pDestCr, iLineSizeUV, NULL );
+		} 
+		else
+		{
+			if(*(uint32_t *)uiBS[0][0] != 0)	// skip the edge when all four strengths are 0
+			{
+				FilteringEdgeLumaV( pfDeblocking, pFilter, pDestY, iLineSize, uiBS[0][0] );
+				FilteringEdgeChromaV( pfDeblocking, pFilter, pDestCb, pDestCr, iLineSizeUV, uiBS[0][0] );
+			}
+		}
+	}
+	// Interior edges at luma column offsets 4, 8 and 12 use this MB's own QP; chroma is filtered only with edge 2 (chroma offset 4).
+	pFilter->uiLumaQP = iCurLumaQp;
+	pFilter->uiChromaQP = iCurChromaQp;
+
+	if(*(uint32_t *)uiBS[0][1] != 0)
+	{
+		FilteringEdgeLumaV( pfDeblocking, pFilter, &pDestY[1<<2], iLineSize, uiBS[0][1]);
+	}
+
+	if(*(uint32_t *)uiBS[0][2] != 0)
+	{
+		FilteringEdgeLumaV( pfDeblocking, pFilter, &pDestY[2<<2], iLineSize, uiBS[0][2]);
+		FilteringEdgeChromaV( pfDeblocking, pFilter, &pDestCb[2<<1], &pDestCr[2<<1], iLineSizeUV, uiBS[0][2] );
+	}
+
+	if(*(uint32_t *)uiBS[0][3] != 0)
+	{
+		FilteringEdgeLumaV( pfDeblocking, pFilter, &pDestY[3<<2], iLineSize, uiBS[0][3] );
+	}
+	// Top MB boundary: same scheme, averaging with the MB located iMbStride entries earlier (pCurMb - iMbStride).
+	if (iTopFlag)	
+	{	
+		pFilter->uiLumaQP = (iCurLumaQp + (pCurMb-iMbStride)->uiLumaQp + 1) >> 1;
+		pFilter->uiChromaQP = (iCurChromaQp + (pCurMb-iMbStride)->uiChromaQp + 1) >> 1;
+
+		if(uiBS[1][0][0] == 0x04)
+		{
+			FilteringEdgeLumaIntraH( pfDeblocking, pFilter, pDestY, iLineSize ,NULL);
+			FilteringEdgeChromaIntraH( pfDeblocking, pFilter, pDestCb, pDestCr, iLineSizeUV, NULL );
+		} 
+		else 
+		{
+			if(*(uint32_t *)uiBS[1][0] != 0)
+			{
+				FilteringEdgeLumaH( pfDeblocking, pFilter, pDestY, iLineSize, uiBS[1][0] );
+				FilteringEdgeChromaH( pfDeblocking, pFilter, pDestCb, pDestCr, iLineSizeUV, uiBS[1][0] );
+			}
+		}  
+	}
+	// Interior edges at luma row offsets 4, 8 and 12, again with this MB's own QP; chroma only with edge 2 (chroma row 4).
+	pFilter->uiLumaQP = iCurLumaQp;
+	pFilter->uiChromaQP = iCurChromaQp;
+
+	if(*(uint32_t *)uiBS[1][1] != 0)
+	{
+		FilteringEdgeLumaH( pfDeblocking, pFilter, &pDestY[(1<<2)*iLineSize], iLineSize, uiBS[1][1] );
+	}
+
+	if(*(uint32_t *)uiBS[1][2] != 0)
+	{
+		FilteringEdgeLumaH( pfDeblocking, pFilter, &pDestY[(2<<2)*iLineSize], iLineSize, uiBS[1][2] );
+		FilteringEdgeChromaH( pfDeblocking, pFilter, &pDestCb[(2<<1)*iLineSizeUV], &pDestCr[(2<<1)*iLineSizeUV], iLineSizeUV, uiBS[1][2] );
+	}
+
+	if(*(uint32_t *)uiBS[1][3] != 0)
+	{
+		FilteringEdgeLumaH( pfDeblocking, pFilter, &pDestY[(3<<2)*iLineSize], iLineSize, uiBS[1][3] );
+	}
+}
+
+/* Luma deblocking of one macroblock with a fixed strength of 3 on its interior edges. The left and top MB
+ * boundaries (when enabled through pFilter->uiFilterIdc) go through the intra edge filters with the rounded
+ * average of both MBs' luma QP; the three interior edges per direction use the LT4 kernels with this MB's QP. */
+void FilteringEdgeLumaHV( DeblockingFunc* pfDeblocking, SMB* pCurMb, SDeblockingFilter* pFilter )
+{
+	int32_t iStrideY   = pFilter->iCsStride[0];
+	int32_t iMbPerRow  = pFilter->iMbStride;
+	int32_t iMbX = pCurMb->iMbX;
+	int32_t iMbY = pCurMb->iMbY;
+	int32_t iTblIdx, iAlphaThr, iBetaThr, iEdge;
+
+	// Entry 0: the neighbour only has to exist; entry 1: it must also carry the same uiSliceIdc.
+	BOOL_T bLeftBsValid[2] = {(iMbX > 0), ((iMbX > 0) && (pCurMb->uiSliceIdc == (pCurMb - 1)->uiSliceIdc))};
+	BOOL_T bTopBsValid[2]  = {(iMbY > 0), ((iMbY > 0) && (pCurMb->uiSliceIdc == (pCurMb - iMbPerRow)->uiSliceIdc))};
+	int32_t iLeftFlag = bLeftBsValid[pFilter->uiFilterIdc];
+	int32_t iTopFlag  = bTopBsValid[pFilter->uiFilterIdc];
+
+	ENFORCE_STACK_ALIGN_1D(int8_t,  iTc,   4, 16 );
+	ENFORCE_STACK_ALIGN_1D(uint8_t, uiBSx4, 4, 4  );
+
+	uint8_t* pLuma  = pFilter->pCsData[0];
+	int8_t   iMbQp  = pCurMb->uiLumaQp;
+
+	*(uint32_t*)uiBSx4 = 0x03030303;	// strength 3 in all four entries
+
+	// left MB boundary
+	if (iLeftFlag)
+	{
+		pFilter->uiLumaQP = ( iMbQp + (pCurMb-1)->uiLumaQp + 1 ) >> 1;
+		FilteringEdgeLumaIntraV( pfDeblocking, pFilter, pLuma, iStrideY,NULL );
+	}
+
+	// interior edges at column offsets 4, 8 and 12
+	pFilter->uiLumaQP = iMbQp;
+	GET_ALPHA_BETA_FROM_QP(pFilter->uiLumaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iTblIdx, iAlphaThr, iBetaThr);
+	if( iAlphaThr | iBetaThr )
+	{
+		TC0_TBL_LOOKUP(iTc, iTblIdx, uiBSx4, 0);
+		for ( iEdge = 1; iEdge < 4; ++ iEdge )
+		{
+			pfDeblocking->pfLumaDeblockingLT4Hor( &pLuma[iEdge << 2], iStrideY, iAlphaThr, iBetaThr, iTc );
+		}
+	}
+
+	// top MB boundary
+	if (iTopFlag)
+	{
+		pFilter->uiLumaQP = ( iMbQp + (pCurMb-iMbPerRow)->uiLumaQp + 1 ) >> 1;
+		FilteringEdgeLumaIntraH( pfDeblocking, pFilter, pLuma, iStrideY,NULL );
+	}
+
+	// interior edges at row offsets 4, 8 and 12; the thresholds and iTc computed above are reused
+	pFilter->uiLumaQP = iMbQp;
+	if( iAlphaThr | iBetaThr )
+	{
+		for ( iEdge = 1; iEdge < 4; ++ iEdge )
+			pfDeblocking->pfLumaDeblockingLT4Ver( &pLuma[(iEdge << 2)*iStrideY], iStrideY, iAlphaThr, iBetaThr, iTc );
+	}
+}
+/*
+ * Chroma counterpart of FilteringEdgeLumaHV(): filters Cb and Cr of one
+ * macroblock together.  The left/top macroblock edges go through
+ * FilteringEdgeChromaIntraV/H with the QP averaged against the neighbour; the
+ * single inner edge in each direction (offset 4) is filtered with bS = 3.
+ *
+ * pfDeblocking  table of pixel-level filter kernels
+ * pCurMb        current macroblock; neighbours are read only when available
+ * pFilter       filter state; uiChromaQP is overwritten by this routine
+ */
+void FilteringEdgeChromaHV( DeblockingFunc* pfDeblocking, SMB* pCurMb, SDeblockingFilter* pFilter )
+{
+	const int32_t kiStride    = pFilter->iCsStride[1];
+	const int32_t kiMbStride  = pFilter->iMbStride;
+	const int32_t kiMbX       = pCurMb->iMbX;
+	const int32_t kiMbY       = pCurMb->iMbY;
+	const int32_t kiInnerEdge = 2 << 1;	// sample offset of the inner chroma edge
+	const int8_t  kiCurQp     = pCurMb->uiChromaQp;
+	uint8_t *pCb              = pFilter->pCsData[1];
+	uint8_t *pCr              = pFilter->pCsData[2];
+	int32_t iIdexA, iAlpha, iBeta;
+
+	// Entry 0: the neighbour only has to exist; entry 1: it must also share the slice.
+	BOOL_T bLeftBsValid[2] = { (kiMbX > 0), ((kiMbX > 0) && (pCurMb->uiSliceIdc == (pCurMb - 1)->uiSliceIdc)) };
+	BOOL_T bTopBsValid[2]  = { (kiMbY > 0), ((kiMbY > 0) && (pCurMb->uiSliceIdc == (pCurMb - kiMbStride)->uiSliceIdc)) };
+	const int32_t kiFilterLeft = bLeftBsValid[pFilter->uiFilterIdc];
+	const int32_t kiFilterTop  = bTopBsValid[pFilter->uiFilterIdc];
+
+	ENFORCE_STACK_ALIGN_1D( int8_t,  iTc,    4, 16 );
+	ENFORCE_STACK_ALIGN_1D( uint8_t, uiBSx4, 4, 4  );
+
+	*(uint32_t*)uiBSx4 = 0x03030303;	// bS = 3 for each of the four segments
+
+	// Left macroblock edge.
+	if ( kiFilterLeft )
+	{
+		pFilter->uiChromaQP = ( kiCurQp + (pCurMb - 1)->uiChromaQp + 1 ) >> 1;
+		FilteringEdgeChromaIntraV( pfDeblocking, pFilter, pCb, pCr, kiStride, NULL );
+	}
+
+	// Inner edge: thresholds come from the macroblock's own chroma QP and are
+	// reused unchanged for the second direction.
+	pFilter->uiChromaQP = kiCurQp;
+	GET_ALPHA_BETA_FROM_QP(pFilter->uiChromaQP, pFilter->iSliceAlphaC0Offset, pFilter->iSliceBetaOffset, iIdexA, iAlpha, iBeta);
+	const int32_t kiInnerEnabled = iAlpha | iBeta;
+	if ( kiInnerEnabled )
+	{
+		TC0_TBL_LOOKUP(iTc, iIdexA, uiBSx4, 1);
+		pfDeblocking->pfChromaDeblockingLT4Hor( &pCb[kiInnerEdge], &pCr[kiInnerEdge], kiStride, iAlpha, iBeta, iTc );
+	}
+
+	// Top macroblock edge.
+	if ( kiFilterTop )
+	{
+		pFilter->uiChromaQP = ( kiCurQp + (pCurMb - kiMbStride)->uiChromaQp + 1 ) >> 1;
+		FilteringEdgeChromaIntraH( pfDeblocking, pFilter, pCb, pCr, kiStride, NULL );
+	}
+
+	pFilter->uiChromaQP = kiCurQp;
+	if ( kiInnerEnabled )
+	{
+		pfDeblocking->pfChromaDeblockingLT4Ver( &pCb[kiInnerEdge * kiStride], &pCr[kiInnerEdge * kiStride], kiStride, iAlpha, iBeta, iTc );
+	}
+}
+
+// Deblocks an intra macroblock: luma first, then chroma.  Each helper looks up
+// its alpha/beta/tc values once and shares them between the two filtering
+// directions, which saves a second table lookup.
+void DeblockingIntraMb( DeblockingFunc* pfDeblocking, SMB* pCurMb, SDeblockingFilter* pFilter )
+{
+	FilteringEdgeLumaHV( pfDeblocking, pCurMb, pFilter );
+	FilteringEdgeChromaHV( pfDeblocking, pCurMb, pFilter );
+}
+
+/*
+ * Deblocks one macroblock.
+ *
+ * Intra macroblocks (4x4, 16x16, PCM) are passed straight to
+ * DeblockingIntraMb().  For every other type the boundary strengths are
+ * collected in uiBS and the macroblock is filtered by DeblockingInterMb().
+ *
+ * pFunc    function table (deblocking kernels and pfSetNZCZero)
+ * pCurMb   current macroblock; note that pfSetNZCZero rewrites its
+ *          pNonZeroCount table for non-skip inter macroblocks
+ * pFilter  filter state for the current picture position
+ */
+void DeblockingMbAvcbase( SWelsFuncPtrList* pFunc, SMB* pCurMb, SDeblockingFilter * pFilter )
+{
+	// uiBS[0][*] holds the edges fed from the left neighbour, uiBS[1][*] those
+	// fed from the top neighbour; edge 0 is the macroblock boundary.
+	uint8_t uiBS[2][4][4] = { 0 };
+
+	const Mb_Type kuiMbType  = pCurMb->uiMbType;
+	const int32_t kiMbStride = pFilter->iMbStride;
+	const int32_t kiMbX      = pCurMb->iMbX;
+	const int32_t kiMbY      = pCurMb->iMbY;
+
+	// Entry 0: the neighbour only has to exist; entry 1: it must also share the slice.
+	BOOL_T bLeftBsValid[2] = { (kiMbX > 0), ((kiMbX > 0) && (pCurMb->uiSliceIdc == (pCurMb - 1)->uiSliceIdc)) };
+	BOOL_T bTopBsValid[2]  = { (kiMbY > 0), ((kiMbY > 0) && (pCurMb->uiSliceIdc == (pCurMb - kiMbStride)->uiSliceIdc)) };
+	const int32_t kiFilterLeft = bLeftBsValid[pFilter->uiFilterIdc];
+	const int32_t kiFilterTop  = bTopBsValid[pFilter->uiFilterIdc];
+
+	if ( kuiMbType == MB_TYPE_INTRA4x4 || kuiMbType == MB_TYPE_INTRA16x16 || kuiMbType == MB_TYPE_INTRA_PCM )
+	{
+		DeblockingIntraMb( &pFunc->pfDeblocking, pCurMb, pFilter );
+		return;
+	}
+
+	// Left macroblock edge: bS 4 against an intra neighbour, otherwise computed.
+	if ( !kiFilterLeft )
+	{
+		*(uint32_t*)uiBS[0][0] = 0;
+	}
+	else if ( IS_INTRA((pCurMb-1)->uiMbType) )
+	{
+		*(uint32_t*)uiBS[0][0] = 0x04040404;
+	}
+	else
+	{
+		*(uint32_t*)uiBS[0][0] = DeblockingBSMarginalMBAvcbase( pCurMb, pCurMb-1, 0 );
+	}
+
+	// Top macroblock edge, same rule.
+	if ( !kiFilterTop )
+	{
+		*(uint32_t*)uiBS[1][0] = 0;
+	}
+	else if ( IS_INTRA((pCurMb-kiMbStride)->uiMbType) )
+	{
+		*(uint32_t*)uiBS[1][0] = 0x04040404;
+	}
+	else
+	{
+		*(uint32_t*)uiBS[1][0] = DeblockingBSMarginalMBAvcbase( pCurMb, (pCurMb-kiMbStride), 1 );
+	}
+
+	// Inner edges.
+	if ( kuiMbType == MB_TYPE_SKIP )
+	{
+		// A skipped macroblock has no inner edge to filter.
+		for ( int32_t iEdge = 1; iEdge < 4; ++iEdge )
+		{
+			*(uint32_t*)uiBS[0][iEdge] = 0;
+			*(uint32_t*)uiBS[1][iEdge] = 0;
+		}
+	}
+	else
+	{
+		// Normalise the non-zero counts before they are used to derive bS.
+		pFunc->pfSetNZCZero(pCurMb->pNonZeroCount);
+
+		if ( kuiMbType == MB_TYPE_16x16 )
+		{
+			DeblockingBSInsideMBAvsbase( pCurMb->pNonZeroCount, uiBS, 1 );
+		}
+		else
+		{
+			DeblockingBSInsideMBNormal(pCurMb, uiBS, pCurMb->pNonZeroCount);
+		}
+	}
+
+	DeblockingInterMb( &pFunc->pfDeblocking, pCurMb, pFilter, uiBS );
+}
+
+// Portable C implementations of the pixel-level deblocking kernels (installed as the defaults by DeblockingInit)
+
+/*
+ * Clipped luma edge filter over 16 lines.
+ *
+ * pPix      first q0 sample; the edge lies between pPix[-iStrideX] and pPix[0]
+ * iStrideX  distance between neighbouring samples across the edge
+ * iStrideY  distance from one filtered line to the next
+ * iAlpha    threshold for |p0 - q0|
+ * iBeta     threshold for |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
+ * pTc       four clipping values, one per group of four lines; a negative
+ *           entry leaves that group untouched
+ */
+void DeblockLumaLt4_c( uint8_t *pPix, int32_t iStrideX,int32_t iStrideY, int32_t iAlpha, int32_t iBeta, int8_t *pTc )
+{
+	for ( int32_t iLine = 0; iLine < 16; ++iLine, pPix += iStrideY )
+	{
+		const int32_t kiTc0 = pTc[iLine >> 2];
+		if ( kiTc0 < 0 )
+		{
+			continue;
+		}
+
+		const int32_t p0 = pPix[-iStrideX];
+		const int32_t p1 = pPix[-2*iStrideX];
+		const int32_t p2 = pPix[-3*iStrideX];
+		const int32_t q0 = pPix[0];
+		const int32_t q1 = pPix[iStrideX];
+		const int32_t q2 = pPix[2*iStrideX];
+
+		// The line is filtered only when all three activity checks pass.
+		if ( !( WELS_ABS( p0 - q0 ) < iAlpha ) || !( WELS_ABS( p1 - p0 ) < iBeta ) || !( WELS_ABS( q1 - q0 ) < iBeta ) )
+		{
+			continue;
+		}
+
+		const int32_t kiAvgP0Q0 = ( p0 + q0 + 1 ) >> 1;
+		int32_t iTc = kiTc0;	// widened by one for every side whose p1/q1 is also adjusted
+
+		if ( WELS_ABS( p2 - p0 ) < iBeta )
+		{
+			pPix[-2*iStrideX] = p1 + WELS_CLIP3( ( p2 + kiAvgP0Q0 - ( p1 << 1 ) ) >> 1, -kiTc0, kiTc0 );	/* p1' */
+			++iTc;
+		}
+		if ( WELS_ABS( q2 - q0 ) < iBeta )
+		{
+			pPix[iStrideX] = q1 + WELS_CLIP3( ( q2 + kiAvgP0Q0 - ( q1 << 1 ) ) >> 1, -kiTc0, kiTc0 );	/* q1' */
+			++iTc;
+		}
+
+		const int32_t kiDelta = WELS_CLIP3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -iTc, iTc );
+		pPix[-iStrideX] = WELS_CLIP1( p0 + kiDelta );	/* p0' */
+		pPix[0]         = WELS_CLIP1( q0 - kiDelta );	/* q0' */
+	}
+}
+
+
+/*
+ * Unclipped luma edge filter over 16 lines.
+ *
+ * A line is touched only if |p0 - q0| < iAlpha, |p1 - p0| < iBeta and
+ * |q1 - q0| < iBeta.  Each side then gets either the three-sample update
+ * (p0..p2 or q0..q2) or the one-sample update (p0 or q0 only); the longer one
+ * needs |p0 - q0| < (iAlpha >> 2) + 2 and |p2 - p0| (resp. |q2 - q0|) < iBeta.
+ *
+ * pPix      first q0 sample; the edge lies between pPix[-iStrideX] and pPix[0]
+ * iStrideX  distance between neighbouring samples across the edge
+ * iStrideY  distance from one filtered line to the next
+ */
+void DeblockLumaEq4_c( uint8_t *pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta )
+{
+	for ( int32_t iLine = 0; iLine < 16; ++iLine, pPix += iStrideY )
+	{
+		const int32_t p0 = pPix[-iStrideX];
+		const int32_t p1 = pPix[-2*iStrideX];
+		const int32_t p2 = pPix[-3*iStrideX];
+		const int32_t q0 = pPix[0];
+		const int32_t q1 = pPix[iStrideX];
+		const int32_t q2 = pPix[2*iStrideX];
+		const int32_t kiGap = WELS_ABS( p0 - q0 );
+
+		if ( kiGap >= iAlpha || !( WELS_ABS( p1 - p0 ) < iBeta ) || !( WELS_ABS( q1 - q0 ) < iBeta ) )
+		{
+			continue;
+		}
+
+		const bool_t bSmallGap = kiGap < ( ( iAlpha >> 2 ) + 2 );
+		const bool_t bLongP    = bSmallGap && ( WELS_ABS( p2 - p0 ) < iBeta );
+		const bool_t bLongQ    = bSmallGap && ( WELS_ABS( q2 - q0 ) < iBeta );
+
+		// p side; p3 is fetched only when the three-sample update needs it
+		if ( bLongP )
+		{
+			const int32_t p3 = pPix[-4*iStrideX];
+			pPix[-iStrideX]   = ( p2 + (p1 << 1) + (p0 << 1) + (q0 << 1) + q1 + 4 ) >> 3;	//p0
+			pPix[-2*iStrideX] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;	//p1
+			pPix[-3*iStrideX] = ( (p3 << 1) + p2 + (p2 << 1) + p1 + p0 + q0 + 4 ) >> 3;	//p2
+		}
+		else
+		{
+			pPix[-iStrideX] = ( (p1 << 1) + p0 + q1 + 2 ) >> 2;	//p0
+		}
+
+		// q side, mirrored
+		if ( bLongQ )
+		{
+			const int32_t q3 = pPix[3*iStrideX];
+			pPix[0]          = ( p1 + (p0 << 1) + (q0 << 1) + (q1 << 1) + q2 + 4 ) >> 3;	//q0
+			pPix[iStrideX]   = ( p0 + q0 + q1 + q2 + 2 ) >> 2;	//q1
+			pPix[2*iStrideX] = ( (q3 << 1) + q2 + (q2 << 1) + q1 + q0 + p0 + 4 ) >> 3;	//q2
+		}
+		else
+		{
+			pPix[0] = ( (q1 << 1) + q0 + p1 + 2 ) >> 2;	//q0
+		}
+	}
+}
+// Clipped luma filter with samples taken iStride apart across the edge and
+// successive lines one sample apart.
+void DeblockLumaLt4V_c( uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *iTc )
+{
+	const int32_t kiAcrossEdge = iStride;
+	const int32_t kiAlongEdge  = 1;
+
+	DeblockLumaLt4_c( pPix, kiAcrossEdge, kiAlongEdge, iAlpha, iBeta, iTc );
+}
+// Clipped luma filter with samples taken one apart across the edge and
+// successive lines iStride apart.
+void DeblockLumaLt4H_c( uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *iTc )
+{
+	const int32_t kiAcrossEdge = 1;
+	const int32_t kiAlongEdge  = iStride;
+
+	DeblockLumaLt4_c( pPix, kiAcrossEdge, kiAlongEdge, iAlpha, iBeta, iTc );
+}
+// Unclipped luma filter with samples taken iStride apart across the edge and
+// successive lines one sample apart.
+void DeblockLumaEq4V_c( uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta )
+{
+	const int32_t kiAcrossEdge = iStride;
+	const int32_t kiAlongEdge  = 1;
+
+	DeblockLumaEq4_c( pPix, kiAcrossEdge, kiAlongEdge, iAlpha, iBeta );
+}
+// Unclipped luma filter with samples taken one apart across the edge and
+// successive lines iStride apart.
+void DeblockLumaEq4H_c( uint8_t *pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta )
+{
+	const int32_t kiAcrossEdge = 1;
+	const int32_t kiAlongEdge  = iStride;
+
+	DeblockLumaEq4_c( pPix, kiAcrossEdge, kiAlongEdge, iAlpha, iBeta );
+}
+/*
+ * Clipped chroma edge filter over 8 lines, applied to Cb and then Cr on every
+ * line.  Only p0 and q0 are modified.
+ *
+ * pPixCb/pPixCr  first q0 sample of each plane
+ * iStrideX       distance between neighbouring samples across the edge
+ * iStrideY       distance from one filtered line to the next
+ * iAlpha, iBeta  activity thresholds (|p0-q0| < iAlpha, |p1-p0| and |q1-q0| < iBeta)
+ * pTc            four clipping values, one per pair of lines; an entry <= 0
+ *                leaves both planes untouched for that pair
+ */
+void DeblockChromaLt4_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta, int8_t *pTc )
+{
+	for ( int32_t iLine = 0; iLine < 8; ++iLine )
+	{
+		const int32_t kiTc = pTc[iLine >> 1];
+
+		if ( kiTc > 0 )
+		{
+			uint8_t *pPlane[2] = { pPixCb, pPixCr };
+
+			for ( int32_t iPlane = 0; iPlane < 2; ++iPlane )
+			{
+				uint8_t *pPix = pPlane[iPlane];
+				const int32_t p0 = pPix[-iStrideX];
+				const int32_t p1 = pPix[-2*iStrideX];
+				const int32_t q0 = pPix[0];
+				const int32_t q1 = pPix[iStrideX];
+
+				if ( ( WELS_ABS( p0 - q0 ) < iAlpha ) && ( WELS_ABS( p1 - p0 ) < iBeta ) && ( WELS_ABS( q1 - q0 ) < iBeta ) )
+				{
+					const int32_t kiDelta = WELS_CLIP3( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -kiTc, kiTc );
+					pPix[-iStrideX] = WELS_CLIP1( p0 + kiDelta );	/* p0' */
+					pPix[0]         = WELS_CLIP1( q0 - kiDelta );	/* q0' */
+				}
+			}
+		}
+
+		pPixCb += iStrideY;
+		pPixCr += iStrideY;
+	}
+}
+/*
+ * Unclipped chroma edge filter over 8 lines, applied to Cb and then Cr on
+ * every line.  When |p0 - q0| < iAlpha, |p1 - p0| < iBeta and
+ * |q1 - q0| < iBeta, p0 and q0 are replaced by
+ *     p0' = (2*p1 + p0 + q1 + 2) >> 2,   q0' = (2*q1 + q0 + p1 + 2) >> 2.
+ *
+ * pPixCb/pPixCr  first q0 sample of each plane
+ * iStrideX       distance between neighbouring samples across the edge
+ * iStrideY       distance from one filtered line to the next
+ *
+ * The previous version also declared function-scope locals `i` and `d` that
+ * were never used (`i` was shadowed by the loop variable); they are removed.
+ */
+void DeblockChromaEq4_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta )
+{
+	for( int32_t i = 0; i < 8; i++ )
+	{
+		uint8_t *pPlane[2] = { pPixCb, pPixCr };	// Cb first, then Cr
+
+		for( int32_t iPlane = 0; iPlane < 2; iPlane++ )
+		{
+			uint8_t *pPix = pPlane[iPlane];
+			const int32_t p0 = pPix[-iStrideX];
+			const int32_t p1 = pPix[-2*iStrideX];
+			const int32_t q0 = pPix[0];
+			const int32_t q1 = pPix[iStrideX];
+			const bool_t bDetaP0Q0 = WELS_ABS( p0 - q0 ) < iAlpha;
+			const bool_t bDetaP1P0 = WELS_ABS( p1 - p0 ) < iBeta;
+			const bool_t bDetaQ1Q0 = WELS_ABS( q1 - q0 ) < iBeta;
+
+			if( bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0 )
+			{
+				pPix[-iStrideX] = ( (p1 << 1) + p0 + q1 + 2 ) >> 2;   /* p0' */
+				pPix[0]         = ( (q1 << 1) + q0 + p1 + 2 ) >> 2;   /* q0' */
+			}
+		}
+
+		pPixCb += iStrideY;
+		pPixCr += iStrideY;
+	}
+}
+// Clipped chroma filter with samples taken iStride apart across the edge and
+// successive lines one sample apart.
+void DeblockChromaLt4V_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *iTc )
+{
+	const int32_t kiAcrossEdge = iStride;
+	const int32_t kiAlongEdge  = 1;
+
+	DeblockChromaLt4_c( pPixCb, pPixCr, kiAcrossEdge, kiAlongEdge, iAlpha, iBeta, iTc );
+}
+// Clipped chroma filter with samples taken one apart across the edge and
+// successive lines iStride apart.
+void DeblockChromaLt4H_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *iTc )
+{
+	const int32_t kiAcrossEdge = 1;
+	const int32_t kiAlongEdge  = iStride;
+
+	DeblockChromaLt4_c( pPixCb, pPixCr, kiAcrossEdge, kiAlongEdge, iAlpha, iBeta, iTc );
+}
+// Unclipped chroma filter with samples taken iStride apart across the edge and
+// successive lines one sample apart.
+void DeblockChromaEq4V_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta )
+{
+	const int32_t kiAcrossEdge = iStride;
+	const int32_t kiAlongEdge  = 1;
+
+	DeblockChromaEq4_c( pPixCb, pPixCr, kiAcrossEdge, kiAlongEdge, iAlpha, iBeta );
+}
+// Unclipped chroma filter with samples taken one apart across the edge and
+// successive lines iStride apart.
+void DeblockChromaEq4H_c( uint8_t *pPixCb, uint8_t *pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta )
+{
+	const int32_t kiAcrossEdge = 1;
+	const int32_t kiAlongEdge  = iStride;
+
+	DeblockChromaEq4_c( pPixCb, pPixCr, kiAcrossEdge, kiAlongEdge, iAlpha, iBeta );
+}
+
+
+/*
+ * Deblocks every macroblock of a layer in raster order, taking the filter
+ * parameters from the header of slice 0.  Returns without filtering when that
+ * header has uiDisableDeblockingFilterIdc == 1.
+ *
+ * pCurDq  layer whose reconstructed picture (pDecPic) is filtered in place
+ * pFunc   function table handed on to DeblockingMbAvcbase()
+ */
+void  DeblockingFilterFrameAvcbase( SDqLayer *pCurDq, SWelsFuncPtrList *pFunc )
+{
+	const int32_t kiMbWidth  = pCurDq->iMbWidth;
+	const int32_t kiMbHeight = pCurDq->iMbHeight;
+	SSliceHeaderExt *pSliceHeaderExt = &pCurDq->sLayerInfo.pSliceInLayer[0].sSliceHeaderExt;
+	SMB *pMb = pCurDq->sMbDataP;
+	SDeblockingFilter sFilter;
+
+	if ( sSliceHeaderExtDisabled: ; 0 ) {}
+}
+
+void DeblockingFilterSliceAvcbase( SDqLayer *pCurDq, SWelsFuncPtrList *pFunc, const int32_t kiSliceIdx )
+{	
+	SSliceCtx * pSliceCtx			= pCurDq->pSliceEncCtx;
+	SMB *pMbList							= pCurDq->sMbDataP;
+	SSliceHeaderExt *sSliceHeaderExt	= &pCurDq->sLayerInfo.pSliceInLayer[kiSliceIdx].sSliceHeaderExt;	
+	SMB * pCurrentMbBlock;
+
+	const int32_t kiMbWidth				= pCurDq->iMbWidth;
+	const int32_t kiMbHeight				= pCurDq->iMbHeight;
+	const int32_t kiTotalNumMb			= kiMbWidth * kiMbHeight;
+	int32_t iCurMbIdx = 0, iNextMbIdx = 0, iNumMbFiltered = 0;	
+
+	/* Step1: parameters set */	
+	if ( sSliceHeaderExt->sSliceHeader.uiDisableDeblockingFilterIdc == 1 )
+		return;
+
+	SDeblockingFilter pFilter;
+
+	pFilter.uiFilterIdc = (sSliceHeaderExt->sSliceHeader.uiDisableDeblockingFilterIdc != 0);
+	pFilter.iCsStride[0] = pCurDq->pDecPic->iLineSize[0];
+	pFilter.iCsStride[1] = pCurDq->pDecPic->iLineSize[1];
+	pFilter.iCsStride[2] = pCurDq->pDecPic->iLineSize[2];
+	pFilter.iSliceAlphaC0Offset = sSliceHeaderExt->sSliceHeader.iSliceAlphaC0Offset;
+	pFilter.iSliceBetaOffset     = sSliceHeaderExt->sSliceHeader.iSliceBetaOffset;
+	pFilter.iMbStride             = kiMbWidth;
+	
+	iNextMbIdx  = sSliceHeaderExt->sSliceHeader.iFirstMbInSlice;
+
+	for ( ; ; )
+	{
+		iCurMbIdx	= iNextMbIdx;
+		pCurrentMbBlock = &pMbList[ iCurMbIdx ];	
+
+		pFilter.pCsData[0] = pCurDq->pDecPic->pData[0] + ((pCurrentMbBlock->iMbX + pCurrentMbBlock->iMbY * pFilter.iCsStride[0]) << 4);
+		pFilter.pCsData[1] = pCurDq->pDecPic->pData[1] + ((pCurrentMbBlock->iMbX + pCurrentMbBlock->iMbY * pFilter.iCsStride[1]) << 3);
+		pFilter.pCsData[2] = pCurDq->pDecPic->pData[2] + ((pCurrentMbBlock->iMbX + pCurrentMbBlock->iMbY * pFilter.iCsStride[2]) << 3);
+
+		DeblockingMbAvcbase( pFunc, pCurrentMbBlock, &pFilter);
+
+		++iNumMbFiltered;
+		iNextMbIdx = WelsGetNextMbOfSlice( pSliceCtx, iCurMbIdx );
+		//whether all of MB in current slice filtered or not
+		if ( iNextMbIdx == -1 || iNextMbIdx >= kiTotalNumMb || iNumMbFiltered >= kiTotalNumMb )
+		{
+			break;
+		}				
+	}
+}
+
+void PerformDeblockingFilter( sWelsEncCtx *pEnc )
+{	
+	const int32_t kiCurDid				= pEnc->uiDependencyId;
+	SWelsSvcCodingParam *pSvcParam	= pEnc->pSvcParam;
+	SDLayerParam *pDlp					= &pSvcParam->sDependencyLayers[kiCurDid];
+	SDqLayer *pCurLayer					= pEnc->pCurDqLayer;
+
+	if ( pCurLayer->iLoopFilterDisableIdc == 0 )
+	{
+		DeblockingFilterFrameAvcbase( pCurLayer, pEnc->pFuncList );
+	}
+	else if ( pCurLayer->iLoopFilterDisableIdc == 2 )
+	{		
+		int32_t iSliceCount			= 0;
+		int32_t iSliceIdx			= 0;
+
+		if ( SM_DYN_SLICE != pDlp->sMso.uiSliceMode )
+		{
+			iSliceCount	= GetCurrentSliceNum( pCurLayer->pSliceEncCtx );
+			do {
+				DeblockingFilterSliceAvcbase( pCurLayer, pEnc->pFuncList, iSliceIdx );
+				++ iSliceIdx;
+			} while(iSliceIdx < iSliceCount);
+		}
+		else	// for dynamic slicing mode
+		{
+			const int32_t kiNumPicPartition	= pEnc->iActiveThreadsNum; //pSvcParam->iCountThreadsNum;
+			int32_t iPartitionIdx			= 0;
+
+			while ( iPartitionIdx < kiNumPicPartition )
+			{
+				iSliceCount	= pCurLayer->pNumSliceCodedOfPartition[iPartitionIdx];
+				iSliceIdx	= iPartitionIdx;
+				do {
+					DeblockingFilterSliceAvcbase( pCurLayer, pEnc->pFuncList, iSliceIdx );
+					iSliceIdx += kiNumPicPartition;
+				} while(iSliceIdx < iSliceCount);
+				++ iPartitionIdx;
+			}
+		}
+	}
+}
+
+void WelsNonZeroCount_c(int8_t* pNonZeroCount)
+{
+	int32_t i;
+	int32_t iIndex;
+
+	for( i=0;i<24;i++ ){
+		iIndex = g_kuiMbCountScan4Idx[i];
+		pNonZeroCount[iIndex] = !!pNonZeroCount[iIndex];
+	}
+}
+void WelsBlockFuncInit( PSetNoneZeroCountZeroFunc *pfSetNZCZero,  int32_t iCpu )
+{
+	*pfSetNZCZero = WelsNonZeroCount_c;
+}
+
+
+#ifdef X86_ASM
+
+extern "C" {
+void DeblockLumaLt4H_sse2(uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t *pTc)
+{
+    ENFORCE_STACK_ALIGN_1D(uint8_t,  uiBuf,   16*8, 16);
+    
+    DeblockLumaTransposeH2V_sse2(pPixY - 4, iStride, &uiBuf[0]);
+	DeblockLumaLt4V_sse2(&uiBuf[4*16], 16, iAlpha, iBeta, pTc);
+	DeblockLumaTransposeV2H_sse2(pPixY - 4, iStride, &uiBuf[0]);
+}
+
+void DeblockLumaEq4H_sse2(uint8_t *pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta)
+{
+	ENFORCE_STACK_ALIGN_1D(uint8_t,  uiBuf,   16*8, 16);
+    
+    DeblockLumaTransposeH2V_sse2(pPixY - 4, iStride, &uiBuf[0]);
+	DeblockLumaEq4V_sse2(&uiBuf[4*16], 16, iAlpha, iBeta);
+	DeblockLumaTransposeV2H_sse2(pPixY - 4, iStride, &uiBuf[0]);
+}
+
+}
+
+#endif
+
+
+void  DeblockingInit( DeblockingFunc  * pFunc,  int32_t iCpu )
+{
+	pFunc->pfLumaDeblockingLT4Ver		= DeblockLumaLt4V_c;
+	pFunc->pfLumaDeblockingEQ4Ver		= DeblockLumaEq4V_c;
+	pFunc->pfLumaDeblockingLT4Hor		= DeblockLumaLt4H_c;
+	pFunc->pfLumaDeblockingEQ4Hor		= DeblockLumaEq4H_c;
+
+	pFunc->pfChromaDeblockingLT4Ver	= DeblockChromaLt4V_c;
+	pFunc->pfChromaDeblockingEQ4Ver	= DeblockChromaEq4V_c;
+	pFunc->pfChromaDeblockingLT4Hor	= DeblockChromaLt4H_c;
+	pFunc->pfChromaDeblockinEQ4Hor	= DeblockChromaEq4H_c;
+
+
+#ifdef X86_ASM
+	if( iCpu & WELS_CPU_SSE2 ){
+	    pFunc->pfLumaDeblockingLT4Ver	= DeblockLumaLt4V_sse2;
+	    pFunc->pfLumaDeblockingEQ4Ver	= DeblockLumaEq4V_sse2;
+		pFunc->pfLumaDeblockingLT4Hor   = DeblockLumaLt4H_sse2;
+		pFunc->pfLumaDeblockingEQ4Hor   = DeblockLumaEq4H_sse2;
+	    pFunc->pfChromaDeblockingLT4Ver	= DeblockChromaLt4V_sse2;
+	    pFunc->pfChromaDeblockingEQ4Ver	= DeblockChromaEq4V_sse2;
+	    pFunc->pfChromaDeblockingLT4Hor	= DeblockChromaLt4H_sse2;
+	    pFunc->pfChromaDeblockinEQ4Hor	= DeblockChromaEq4H_sse2;
+	}
+#endif		
+}
+
+
+} // namespace WelsSVCEnc
+
--- /dev/null
+++ b/codec/encoder/core/src/decode_mb_aux.cpp
@@ -1,0 +1,301 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <string.h>
+#include "decode_mb_aux.h"
+#include "wels_common_basis.h"
+#include "cpu_core.h"
+
+namespace WelsSVCEnc {
+/****************************************************************************
+ * Dequant and Ihdm functions
+ ****************************************************************************/
+void WelsIHadamard4x4Dc(int16_t *pRes) //pBuffer size : 4x4
+{
+	int16_t iTemp[4];
+	int32_t i	= 4;
+
+	while( --i >= 0 )
+	{
+		const int32_t kiIdx	= i<<2;
+		const int32_t kiIdx1	= 1 + kiIdx;
+		const int32_t kiIdx2	= 1 + kiIdx1;
+		const int32_t kiIdx3	= 1 + kiIdx2;
+
+		iTemp[0] = pRes[kiIdx ] + pRes[kiIdx2];
+		iTemp[1] = pRes[kiIdx ] - pRes[kiIdx2];
+		iTemp[2] = pRes[kiIdx1] - pRes[kiIdx3];
+		iTemp[3] = pRes[kiIdx1] + pRes[kiIdx3];
+
+		pRes[kiIdx ] = iTemp[0] + iTemp[3];
+		pRes[kiIdx1] = iTemp[1] + iTemp[2];
+		pRes[kiIdx2] = iTemp[1] - iTemp[2];
+		pRes[kiIdx3] = iTemp[0] - iTemp[3];		
+	}
+
+	i = 4;
+	while( --i >= 0 )
+	{
+		const int32_t kiI4	= 4 + i;
+		const int32_t kiI8	= 4 + kiI4;
+		const int32_t kiI12	= 4 + kiI8;
+
+		iTemp[0] = pRes[i  ] + pRes[kiI8 ];
+		iTemp[1] = pRes[i  ] - pRes[kiI8 ];
+		iTemp[2] = pRes[kiI4 ] - pRes[kiI12];
+		iTemp[3] = pRes[kiI4 ] + pRes[kiI12];
+
+		pRes[i  ] = iTemp[0] + iTemp[3];
+		pRes[kiI4 ] = iTemp[1] + iTemp[2];
+		pRes[kiI8 ] = iTemp[1] - iTemp[2];
+		pRes[kiI12] = iTemp[0] - iTemp[3];
+	}
+}
+
+/* for qp < 12 */
+void WelsDequantLumaDc4x4(int16_t *pRes, const int32_t kiQp)
+{
+	int32_t i	= 15;
+	const uint16_t kuiDequantValue	= g_kuiDequantCoeff[kiQp%6][0];
+	const int16_t kiQF0		= kiQp / 6; 
+	const int16_t kiQF1		= 2 - kiQF0;
+	const int16_t kiQF0S	= 1 << (1 - kiQF0);
+	
+	while ( i >= 0 )
+	{
+		pRes[i  ] = ( pRes[i  ] * kuiDequantValue + kiQF0S ) >> kiQF1; 
+		pRes[i-1] = ( pRes[i-1] * kuiDequantValue + kiQF0S ) >> kiQF1; 
+		pRes[i-2] = ( pRes[i-2] * kuiDequantValue + kiQF0S ) >> kiQF1; 
+		pRes[i-3] = ( pRes[i-3] * kuiDequantValue + kiQF0S ) >> kiQF1; 
+
+		i -= 4;
+	}
+}
+
+/* for qp >= 12 */
+void WelsDequantIHadamard4x4_c(int16_t *pRes, const uint16_t kuiMF)
+{
+	int16_t iTemp[4];
+	int32_t i;
+
+	for(i = 0; i < 16; i += 4)
+	{
+		iTemp[0] = pRes[i  ] + pRes[i+2];
+		iTemp[1] = pRes[i  ] - pRes[i+2];
+		iTemp[2] = pRes[i+1] - pRes[i+3];
+		iTemp[3] = pRes[i+1] + pRes[i+3];
+
+		pRes[i  ] = iTemp[0] + iTemp[3];
+		pRes[i+1] = iTemp[1] + iTemp[2];
+		pRes[i+2] = iTemp[1] - iTemp[2];
+		pRes[i+3] = iTemp[0] - iTemp[3];		
+	}
+
+	for(i = 0; i < 4; i++)
+	{
+		iTemp[0] = pRes[i   ] + pRes[i+8 ];
+		iTemp[1] = pRes[i   ] - pRes[i+8 ];
+		iTemp[2] = pRes[i+4 ] - pRes[i+12];
+		iTemp[3] = pRes[i+4 ] + pRes[i+12];
+
+		pRes[i  ]  = (iTemp[0] + iTemp[3]) * kuiMF;
+		pRes[i+4 ] = (iTemp[1] + iTemp[2]) * kuiMF;
+		pRes[i+8 ] = (iTemp[1] - iTemp[2]) * kuiMF;
+		pRes[i+12] = (iTemp[0] - iTemp[3]) * kuiMF;
+	}	
+}
+
+void WelsDequantIHadamard2x2Dc( int16_t* pDct, const uint16_t kuiMF)
+{
+	const int16_t kiSumU = pDct[0] + pDct[2];
+	const int16_t kiDelU =   pDct[0] -  pDct[2];
+	const int16_t kiSumD = pDct[1] + pDct[3];
+	const int16_t kiDelD =   pDct[1] -  pDct[3];
+	
+	pDct[0] = (kiSumU + kiSumD) * kuiMF;
+    pDct[1] = (kiSumU  -  kiSumD) * kuiMF;
+    pDct[2] = (kiDelU   + kiDelD)   * kuiMF;
+    pDct[3] = (kiDelU   - kiDelD)   * kuiMF;
+}
+
+void WelsDequant4x4_c(int16_t *pRes, const uint16_t* kpMF)
+{
+	int32_t i;
+	for(i = 0; i < 8; i++)
+	{
+		pRes[i]	*=	kpMF[i];
+		pRes[i+8]	*=kpMF[i];
+	}
+}
+
+void WelsDequantFour4x4_c(int16_t *pRes, const uint16_t* kpMF)
+{
+	int32_t i;
+	for(i = 0; i < 8; i++)
+	{
+		pRes[i]	*=	kpMF[i];
+		pRes[i+8]	*=	kpMF[i];
+		pRes[i+16]*=	kpMF[i];
+		pRes[i+24]*=	kpMF[i];
+		pRes[i+32]*=	kpMF[i];
+		pRes[i+40]*=	kpMF[i];
+		pRes[i+48]*=	kpMF[i];
+		pRes[i+56]*=	kpMF[i];
+	}	
+}
+
+/****************************************************************************
+ * IDCT functions, final output = prediction(CS) + IDCT(scaled_coeff)
+ ****************************************************************************/
+void WelsIDctT4Rec_c( uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct )
+{
+	int32_t i;
+	int16_t iTemp[16];	
+
+	int32_t iDstStridex2 = iStride << 1;
+	int32_t iDstStridex3 = iStride + iDstStridex2;
+	int32_t iPredStridex2 = iPredStride << 1;
+	int32_t iPredStridex3 = iPredStride + iPredStridex2;
+
+	for (i = 0; i < 4; i ++) //horizon
+	{		
+		int32_t iIdx = i << 2;
+		const int32_t kiHorSumU = pDct[iIdx] + pDct[iIdx+2];	// add 0-2
+		const int32_t kiHorDelU = pDct[iIdx] - pDct[iIdx+2];	// sub 0-2
+		const int32_t kiHorSumD = pDct[iIdx+1] + (pDct[iIdx+3] >> 1);
+		const int32_t kiHorDelD = (pDct[iIdx+1] >> 1) - pDct[iIdx+3];
+
+		iTemp[iIdx  ]   = kiHorSumU  + kiHorSumD;		
+		iTemp[iIdx+1] = kiHorDelU   + kiHorDelD;
+		iTemp[iIdx+2] = kiHorDelU   -  kiHorDelD;
+		iTemp[iIdx+3] = kiHorSumU  -  kiHorSumD;
+	}
+
+	for (i = 0; i < 4; i ++) //vertical
+	{
+		const int32_t kiVerSumL = iTemp[i]                 + iTemp[8+i];
+		const int32_t kiVerDelL   = iTemp[i]                 - iTemp[8+i];
+		const int32_t kiVerDelR   = (iTemp[4+i] >> 1) - iTemp[12+i];
+		const int32_t kiVerSumR = iTemp[4+i]             + (iTemp[12+i] >> 1);
+
+		pRec[i				]         = WELS_CLIP1( pPred[i              ]         + ((kiVerSumL + kiVerSumR + 32) >> 6) );
+		pRec[iStride+i		]     = WELS_CLIP1( pPred[iPredStride+i  ]  + ((kiVerDelL + kiVerDelR + 32) >> 6) );
+		pRec[iDstStridex2 + i] = WELS_CLIP1( pPred[iPredStridex2+i] + ((kiVerDelL - kiVerDelR + 32) >> 6) );
+		pRec[iDstStridex3 + i] = WELS_CLIP1( pPred[iPredStridex3+i] + ((kiVerSumL - kiVerSumR + 32) >> 6) );
+	}	
+}
+
+void WelsIDctFourT4Rec_c( uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct )
+{
+	int32_t iDstStridex4  = iStride << 2;
+	int32_t iPredStridex4 = iPredStride << 2;
+	WelsIDctT4Rec_c( pRec,                  iStride, pPred,						iPredStride, pDct	);
+	WelsIDctT4Rec_c( &pRec[4],              iStride, &pPred[4],					iPredStride, pDct+16 );
+	WelsIDctT4Rec_c( &pRec[iDstStridex4  ], iStride, &pPred[iPredStridex4  ],	iPredStride, pDct+32 );	
+	WelsIDctT4Rec_c( &pRec[iDstStridex4+4], iStride, &pPred[iPredStridex4+4],	iPredStride, pDct+48 );
+
+}
+
+void WelsIDctT4RecOnMb(uint8_t* pDst, int32_t iDstStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct, PIDctFunc pfIDctFourT4)
+{
+	int32_t iDstStridex8  = iDstStride << 3;
+	int32_t iPredStridex8 = iPredStride << 3;
+	
+	pfIDctFourT4(&pDst[0], iDstStride, &pPred[0], iPredStride, pDct);
+	pfIDctFourT4(&pDst[8], iDstStride, &pPred[8], iPredStride, pDct+64);
+	pfIDctFourT4(&pDst[iDstStridex8], iDstStride, &pPred[iPredStridex8], iPredStride, pDct+128);
+	pfIDctFourT4(&pDst[iDstStridex8+8], iDstStride, &pPred[iPredStridex8+8], iPredStride, pDct+192);
+}
+
+/* 
+ * pfIDctI16x16Dc: do luma idct of an MB for I16x16 mode, when only dc value are non-zero
+ */
+void WelsIDctRecI16x16Dc_c(uint8_t *pRec, int32_t iStride, uint8_t *pPred, int32_t iPredStride, int16_t *pDctDc)
+{
+	int32_t i, j;
+
+	for (i = 0; i < 16; i ++) 
+	{
+		for(j = 0; j < 16; j++)
+		{
+			pRec[j] = WELS_CLIP1( pPred[j] + ((pDctDc[(i&0x0C) + (j>>2)] + 32) >> 6) );
+		}
+		pRec += iStride;
+		pPred += iPredStride;
+	}
+}
+
+void WelsGetEncBlockStrideOffset(int32_t *pBlock, const int32_t kiStrideY, const int32_t kiStrideUV)
+{
+	int32_t i, j, k, r;	
+	for(j = 0; j < 4; j++)
+	{
+		i = j << 2;
+		k = (j&0x01) << 1;
+		r = j&0x02;
+		pBlock[i]		= (0 + k + (0 + r) * kiStrideY) << 2;
+		pBlock[i+1]	= (1 + k + (0 + r) * kiStrideY) << 2;
+		pBlock[i+2]	= (0 + k + (1 + r) * kiStrideY) << 2;
+		pBlock[i+3]	= (1 + k + (1 + r) * kiStrideY) << 2;
+
+		pBlock[16+j]	=
+		pBlock[20+j]	= ((j&0x01) + r * kiStrideUV) << 2;
+	}
+}
+
+void WelsInitReconstructionFuncs( SWelsFuncPtrList *pFuncList, uint32_t  uiCpuFlag )
+{
+	pFuncList->pfDequantization4x4			= WelsDequant4x4_c;
+	pFuncList->pfDequantizationFour4x4		= WelsDequantFour4x4_c;
+	pFuncList->pfDequantizationIHadamard4x4	= WelsDequantIHadamard4x4_c;
+
+	pFuncList->pfIDctT4		= WelsIDctT4Rec_c;
+	pFuncList->pfIDctFourT4		= WelsIDctFourT4Rec_c;
+	pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_c;
+
+#if defined(X86_ASM)
+	if ( uiCpuFlag & WELS_CPU_MMXEXT )
+	{
+    	pFuncList->pfIDctT4		= WelsIDctT4Rec_mmx;
+	}
+	if ( uiCpuFlag & WELS_CPU_SSE2 )
+	{
+		pFuncList->pfDequantization4x4			= WelsDequant4x4_sse2;
+		pFuncList->pfDequantizationFour4x4		= WelsDequantFour4x4_sse2;
+		pFuncList->pfDequantizationIHadamard4x4	= WelsDequantIHadamard4x4_sse2;
+
+		pFuncList->pfIDctFourT4		= WelsIDctFourT4Rec_sse2;
+		pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_sse2;
+	}
+#endif//X86_ASM
+}
+}
--- /dev/null
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -1,0 +1,608 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <string.h>
+
+#include "macros.h"
+#include "ls_defines.h"
+#include "encode_mb_aux.h"
+#include "cpu_core.h"
+#include "as264_common.h"
+#include "svc_encode_mb.h"
+namespace WelsSVCEnc {
+
+__align16( int16_t, g_kiQuantInterFF[58][8] )=	// 58 rows of 8 values each; presumably the pFF / iFF offsets of the quant routines below - TODO confirm against callers
+{
+/* 0*/ {   0,   1,   0,   1,   1,   1,   1,   1 }, 
+/* 1*/ {   0,   1,   0,   1,   1,   1,   1,   1 }, 
+/* 2*/ {   1,   1,   1,   1,   1,   1,   1,   1 }, 
+/* 3*/ {   1,   1,   1,   1,   1,   1,   1,   1 }, 
+/* 4*/ {   1,   1,   1,   1,   1,   2,   1,   2 }, 
+/* 5*/ {   1,   1,   1,   1,   1,   2,   1,   2 }, 
+/* 6*/ {   1,   1,   1,   1,   1,   2,   1,   2 }, 
+/* 7*/ {   1,   1,   1,   1,   1,   2,   1,   2 }, 
+/* 8*/ {   1,   2,   1,   2,   2,   3,   2,   3 }, 
+/* 9*/ {   1,   2,   1,   2,   2,   3,   2,   3 }, 
+/*10*/ {   1,   2,   1,   2,   2,   3,   2,   3 }, 
+/*11*/ {   1,   2,   1,   2,   2,   4,   2,   4 }, 
+/*12*/ {   2,   3,   2,   3,   3,   4,   3,   4 }, 
+/*13*/ {   2,   3,   2,   3,   3,   5,   3,   5 }, 
+/*14*/ {   2,   3,   2,   3,   3,   5,   3,   5 }, 
+/*15*/ {   2,   4,   2,   4,   4,   6,   4,   6 }, 
+/*16*/ {   3,   4,   3,   4,   4,   7,   4,   7 }, 
+/*17*/ {   3,   5,   3,   5,   5,   8,   5,   8 }, 
+/*18*/ {   3,   5,   3,   5,   5,   8,   5,   8 }, 
+/*19*/ {   4,   6,   4,   6,   6,   9,   6,   9 }, 
+/*20*/ {   4,   7,   4,   7,   7,  10,   7,  10 }, 
+/*21*/ {   5,   8,   5,   8,   8,  12,   8,  12 }, 
+/*22*/ {   5,   8,   5,   8,   8,  13,   8,  13 }, 
+/*23*/ {   6,  10,   6,  10,  10,  15,  10,  15 }, 
+/*24*/ {   7,  11,   7,  11,  11,  17,  11,  17 }, 
+/*25*/ {   7,  12,   7,  12,  12,  19,  12,  19 }, 
+/*26*/ {   9,  13,   9,  13,  13,  21,  13,  21 }, 
+/*27*/ {   9,  15,   9,  15,  15,  24,  15,  24 }, 
+/*28*/ {  11,  17,  11,  17,  17,  26,  17,  26 }, 
+/*29*/ {  12,  19,  12,  19,  19,  30,  19,  30 }, 
+/*30*/ {  13,  22,  13,  22,  22,  33,  22,  33 }, 
+/*31*/ {  15,  23,  15,  23,  23,  38,  23,  38 }, 
+/*32*/ {  17,  27,  17,  27,  27,  42,  27,  42 }, 
+/*33*/ {  19,  30,  19,  30,  30,  48,  30,  48 }, 
+/*34*/ {  21,  33,  21,  33,  33,  52,  33,  52 }, 
+/*35*/ {  24,  38,  24,  38,  38,  60,  38,  60 }, 
+/*36*/ {  27,  43,  27,  43,  43,  67,  43,  67 }, 
+/*37*/ {  29,  47,  29,  47,  47,  75,  47,  75 }, 
+/*38*/ {  35,  53,  35,  53,  53,  83,  53,  83 }, 
+/*39*/ {  37,  60,  37,  60,  60,  96,  60,  96 }, 
+/*40*/ {  43,  67,  43,  67,  67, 104,  67, 104 },
+/*41*/ {  48,  77,  48,  77,  77, 121,  77, 121 },
+/*42*/ {  53,  87,  53,  87,  87, 133,  87, 133 },
+/*43*/ {  59,  93,  59,  93,  93, 150,  93, 150 },
+/*44*/ {  69, 107,  69, 107, 107, 167, 107, 167 },
+/*45*/ {  75, 120,  75, 120, 120, 192, 120, 192 },
+/*46*/ {  85, 133,  85, 133, 133, 208, 133, 208 },
+/*47*/ {  96, 153,  96, 153, 153, 242, 153, 242 },
+/*48*/ { 107, 173, 107, 173, 173, 267, 173, 267 },
+/*49*/ { 117, 187, 117, 187, 187, 300, 187, 300 },
+/*50*/ { 139, 213, 139, 213, 213, 333, 213, 333 },
+/*51*/ { 149, 240, 149, 240, 240, 383, 240, 383 },
+/* the rows from here down (array indices 52..57) are only for intra. NOTE(review): their labels restart at 46 and do not match the array index - presumably the value these rows serve for intra; confirm */  
+/*46*/ { 171, 267, 171, 267, 267, 417, 267, 417 },
+/*47*/ { 192, 307, 192, 307, 307, 483, 307, 483 },
+/*48*/ { 213, 347, 213, 347, 347, 533, 347, 533 },
+/*49*/ { 235, 373, 235, 373, 373, 600, 373, 600 },
+/*50*/ { 277, 427, 277, 427, 427, 667, 427, 667 },
+/*51*/ { 299, 480, 299, 480, 480, 767, 480, 767 },  
+};
+
+
+
+__align16( int16_t, g_kiQuantMF[52][8]) = {	// 52 rows (labelled 0..51) of 8 values each; presumably the pMF / iMF multipliers of the quant routines below - TODO confirm against callers
+/* 0*/	{26214, 16132, 26214, 16132, 16132, 10486, 16132, 10486 }, 
+/* 1*/	{23832, 14980, 23832, 14980, 14980,  9320, 14980,  9320 }, 
+/* 2*/	{20164, 13108, 20164, 13108, 13108,  8388, 13108,  8388 }, 
+/* 3*/	{18724, 11650, 18724, 11650, 11650,  7294, 11650,  7294 }, 
+/* 4*/	{16384, 10486, 16384, 10486, 10486,  6710, 10486,  6710 }, 
+/* 5*/	{14564,  9118, 14564,  9118,  9118,  5786,  9118,  5786 }, 
+/* 6*/	{13107,  8066, 13107,  8066,  8066,  5243,  8066,  5243 }, 
+/* 7*/	{11916,  7490, 11916,  7490,  7490,  4660,  7490,  4660 }, 
+/* 8*/	{10082,  6554, 10082,  6554,  6554,  4194,  6554,  4194 }, 
+/* 9*/	{ 9362,  5825,  9362,  5825,  5825,  3647,  5825,  3647 }, 
+/*10*/	{ 8192,  5243,  8192,  5243,  5243,  3355,  5243,  3355 }, 
+/*11*/	{ 7282,  4559,  7282,  4559,  4559,  2893,  4559,  2893 }, 
+/*12*/	{ 6554,  4033,  6554,  4033,  4033,  2622,  4033,  2622 }, 
+/*13*/	{ 5958,  3745,  5958,  3745,  3745,  2330,  3745,  2330 }, 
+/*14*/	{ 5041,  3277,  5041,  3277,  3277,  2097,  3277,  2097 }, 
+/*15*/	{ 4681,  2913,  4681,  2913,  2913,  1824,  2913,  1824 }, 
+/*16*/	{ 4096,  2622,  4096,  2622,  2622,  1678,  2622,  1678 }, 
+/*17*/	{ 3641,  2280,  3641,  2280,  2280,  1447,  2280,  1447 }, 
+/*18*/	{ 3277,  2017,  3277,  2017,  2017,  1311,  2017,  1311 }, 
+/*19*/	{ 2979,  1873,  2979,  1873,  1873,  1165,  1873,  1165 }, 
+/*20*/	{ 2521,  1639,  2521,  1639,  1639,  1049,  1639,  1049 }, 
+/*21*/	{ 2341,  1456,  2341,  1456,  1456,   912,  1456,   912 }, 
+/*22*/	{ 2048,  1311,  2048,  1311,  1311,   839,  1311,   839 }, 
+/*23*/	{ 1821,  1140,  1821,  1140,  1140,   723,  1140,   723 }, 
+/*24*/	{ 1638,  1008,  1638,  1008,  1008,   655,  1008,   655 }, 
+/*25*/	{ 1490,   936,  1490,   936,   936,   583,   936,   583 }, 
+/*26*/	{ 1260,   819,  1260,   819,   819,   524,   819,   524 }, 
+/*27*/	{ 1170,   728,  1170,   728,   728,   456,   728,   456 }, 
+/*28*/	{ 1024,   655,  1024,   655,   655,   419,   655,   419 }, 
+/*29*/	{  910,   570,   910,   570,   570,   362,   570,   362 }, 
+/*30*/	{  819,   504,   819,   504,   504,   328,   504,   328 }, 
+/*31*/	{  745,   468,   745,   468,   468,   291,   468,   291 }, 
+/*32*/	{  630,   410,   630,   410,   410,   262,   410,   262 }, 
+/*33*/	{  585,   364,   585,   364,   364,   228,   364,   228 }, 
+/*34*/	{  512,   328,   512,   328,   328,   210,   328,   210 }, 
+/*35*/	{  455,   285,   455,   285,   285,   181,   285,   181 }, 
+/*36*/	{  410,   252,   410,   252,   252,   164,   252,   164 }, 
+/*37*/	{  372,   234,   372,   234,   234,   146,   234,   146 }, 
+/*38*/	{  315,   205,   315,   205,   205,   131,   205,   131 }, 
+/*39*/	{  293,   182,   293,   182,   182,   114,   182,   114 }, 
+/*40*/	{  256,   164,   256,   164,   164,   105,   164,   105 }, 
+/*41*/	{  228,   142,   228,   142,   142,    90,   142,    90 }, 
+/*42*/	{  205,   126,   205,   126,   126,    82,   126,    82 }, 
+/*43*/	{  186,   117,   186,   117,   117,    73,   117,    73 }, 
+/*44*/	{  158,   102,   158,   102,   102,    66,   102,    66 }, 
+/*45*/	{  146,    91,   146,    91,    91,    57,    91,    57 }, 
+/*46*/	{  128,    82,   128,    82,    82,    52,    82,    52 }, 
+/*47*/	{  114,    71,   114,    71,    71,    45,    71,    45 }, 
+/*48*/	{  102,    63,   102,    63,    63,    41,    63,    41 }, 
+/*49*/	{   93,    59,    93,    59,    59,    36,    59,    36 }, 
+/*50*/	{   79,    51,    79,    51,    51,    33,    51,    33 }, 
+/*51*/	{   73,    46,    73,    46,    46,    28,    46,    28 }  
+};
+
+/****************************************************************************
+ * HDM and Quant functions. The macros below read a local `iSign` by name; callers set it from WELS_SIGN(coefficient).
+ ****************************************************************************/
+#define WELS_ABS_LC(a) ((iSign ^ (int32_t)(a)) - iSign)	// yields (a) when iSign == 0 and -(a) when iSign == -1
+#define NEW_QUANT(pDct, iFF, iMF) ((((iFF)+ WELS_ABS_LC(pDct))*(iMF)) >>16)	// whole expansion parenthesized so that it can sit inside a larger expression
+#define WELS_NEW_QUANT(pDct,iFF,iMF)	WELS_ABS_LC(NEW_QUANT(pDct, iFF, iMF))	// NEW_QUANT result passed through the iSign step once more
+// WelsQuant4x4_c: quantizes the 16 coefficients of one 4x4 block in place.
+// Coefficient n uses pFF[n & 7] and pMF[n & 7], so each table is read at indices 0..7 only.
+// The arithmetic itself is in WELS_NEW_QUANT.
+void WelsQuant4x4_c(int16_t *pDct, int16_t* pFF,  int16_t *pMF)
+{
+	int32_t iPos;
+	int32_t iSign;	// not passed explicitly: WELS_NEW_QUANT reads it by name
+
+	for( iPos = 0; iPos < 16; ++iPos )
+	{
+		const int32_t kiTab = iPos & 0x07;
+
+		iSign		= WELS_SIGN(pDct[iPos]);
+		pDct[iPos]	= WELS_NEW_QUANT(pDct[iPos], pFF[kiTab], pMF[kiTab]);
+	}
+}
+
+// WelsQuant4x4Dc_c: quantizes 16 coefficients in place; unlike WelsQuant4x4_c every
+// coefficient shares the single iFF / iMF pair passed by value.
+// The arithmetic itself is in WELS_NEW_QUANT.
+void WelsQuant4x4Dc_c(int16_t *pDct, int16_t iFF,  int16_t iMF)
+{
+	int32_t iLeft = 16;
+	int32_t iSign;	// read by name inside WELS_NEW_QUANT
+
+	while( iLeft-- > 0 )
+	{
+		iSign	= WELS_SIGN(pDct[0]);
+		pDct[0]	= WELS_NEW_QUANT(pDct[0], iFF, iMF);
+		++pDct;
+	}
+}
+
+// WelsQuantFour4x4_c: quantizes four consecutive 4x4 blocks (64 coefficients) in place.
+// Coefficient n uses pFF[n & 7] and pMF[n & 7], exactly as in the single-block routine.
+void WelsQuantFour4x4_c(int16_t *pDct, int16_t* pFF,  int16_t *pMF)
+{
+	int32_t iBlk, iPos;
+	int32_t iSign;	// read by name inside WELS_NEW_QUANT
+
+	for( iBlk = 0; iBlk < 4; ++iBlk, pDct += 16 )
+	{
+		for( iPos = 0; iPos < 16; ++iPos )
+		{
+			const int32_t kiTab = iPos & 0x07;	// same as (16 * iBlk + iPos) & 7
+			iSign		= WELS_SIGN(pDct[iPos]);
+			pDct[iPos]	= WELS_NEW_QUANT(pDct[iPos], pFF[kiTab], pMF[kiTab]);
+		}
+	}
+}
+
+// WelsQuantFour4x4Max_c: quantizes four consecutive 4x4 blocks (64 coefficients) in place and
+// stores in pMax[k] the largest NEW_QUANT result seen in block k (0 when none is above 0).
+void WelsQuantFour4x4Max_c(int16_t *pDct, int16_t* pFF,  int16_t *pMF, int16_t *pMax)
+{
+	int32_t iBlk, iPos, iSign;
+	for( iBlk = 0; iBlk < 4; ++iBlk, pDct += 16 )
+	{
+		int16_t iPeak = 0;
+		for( iPos = 0; iPos < 16; ++iPos )
+		{
+			const int32_t kiTab = iPos & 0x07;
+			iSign		= WELS_SIGN(pDct[iPos]);
+			pDct[iPos]	= NEW_QUANT(pDct[iPos], pFF[kiTab], pMF[kiTab]);	// value before the iSign step ...
+			if( iPeak < pDct[iPos] ) iPeak = pDct[iPos];
+			pDct[iPos]	= WELS_ABS_LC(pDct[iPos]);							// ... which is applied only after the peak update
+		}
+		pMax[iBlk] = iPeak;
+	}
+}
+
+// WelsHadamardQuant2x2Skip_c: 2x2 Hadamard of pRs[0], pRs[16], pRs[32] and pRs[48] (pRs is only read).
+// Returns 1 when WELS_ABS of any of the four outputs exceeds the threshold 65535 / iMF - iFF
+// (held in an int16_t), otherwise 0.
+int32_t WelsHadamardQuant2x2Skip_c(int16_t *pRs, int16_t iFF,  int16_t iMF)
+{
+	const int16_t kiLimit	= ((1<<16)-1)/iMF - iFF;
+	const int16_t kiSum02	= pRs[0]  + pRs[32];
+	const int16_t kiDif02	= pRs[0]  - pRs[32];
+	const int16_t kiSum13	= pRs[16] + pRs[48];
+	const int16_t kiDif13	= pRs[16] - pRs[48];
+	const int16_t kiOut0	= kiSum02 + kiSum13;
+	const int16_t kiOut1	= kiSum02 - kiSum13;
+	const int16_t kiOut2	= kiDif02 + kiDif13;
+	const int16_t kiOut3	= kiDif02 - kiDif13;
+
+	return ((WELS_ABS(kiOut0) > kiLimit) || (WELS_ABS(kiOut1) > kiLimit) || (WELS_ABS(kiOut2) > kiLimit) || (WELS_ABS(kiOut3) > kiLimit));
+}
+
+// WelsHadamardQuant2x2_c: 2x2 Hadamard transform followed by quantization of four values.
+//   pRs    : elements 0, 16, 32 and 48 are read as input and then set to 0
+//   iFF    : offset added to each magnitude before the multiplication
+//   iMF    : multiplier; the product is shifted right by 16 (see NEW_QUANT)
+//   pDct   : receives the four transformed values, which are then quantized in place
+//   pBlock : written with ST64( pBlock, LD64(pDct) ) - presumably a 64-bit copy of pDct[0..3]
+// Returns how many of pBlock[0..3] are non-zero.
+int32_t WelsHadamardQuant2x2_c(int16_t *pRs, const int16_t iFF, int16_t iMF, int16_t * pDct, int16_t * pBlock)
+{
+	int32_t iSign, iIdx, iNonZero = 0;
+	// first butterfly stage; results are narrowed to int16_t before the second stage
+	const int16_t kiSum02 = pRs[0]  + pRs[32];
+	const int16_t kiDif02 = pRs[0]  - pRs[32];
+	const int16_t kiSum13 = pRs[16] + pRs[48];
+	const int16_t kiDif13 = pRs[16] - pRs[48];
+
+	pRs[0] = pRs[16] = pRs[32] = pRs[48] = 0;
+
+	pDct[0] = kiSum02 + kiSum13;
+	pDct[1] = kiSum02 - kiSum13;
+	pDct[2] = kiDif02 + kiDif13;
+	pDct[3] = kiDif02 - kiDif13;
+
+	for( iIdx = 0; iIdx < 4; ++iIdx )
+	{
+		iSign		= WELS_SIGN(pDct[iIdx]);
+		pDct[iIdx]	= WELS_NEW_QUANT(pDct[iIdx], iFF, iMF);
+	}
+	ST64( pBlock, LD64(pDct) );
+	for( iIdx = 0; iIdx < 4; ++iIdx )
+	{
+		iNonZero += (pBlock[iIdx] != 0);
+	}
+	return iNonZero;
+}
+
+/* dc value pick up and hdm_4x4: reads pDct[0], pDct[16], ..., pDct[240] (every 16th value) and writes their
+ * two-pass butterfly result, halved with rounding and passed through WELS_CLIP3(-32768, 32767), to pLumaDc[0..15] */
+void WelsHadamardT4Dc_c( int16_t *pLumaDc, int16_t *pDct)
+{
+	int32_t iTmp[16];
+	int32_t iRow, iCol;
+	for( iRow = 0; iRow < 4; ++iRow )
+	{
+		// the four passes start at pDct[0], pDct[32], pDct[128] and pDct[160]
+		const int16_t *kpSrc	= pDct + ((iRow & 0x02) << 6) + ((iRow & 0x01) << 5);
+		int32_t *pOut			= iTmp + (iRow << 2);
+		const int32_t kiSum0	= kpSrc[0]  + kpSrc[80];
+		const int32_t kiDif0	= kpSrc[0]  - kpSrc[80];
+		const int32_t kiSum1	= kpSrc[16] + kpSrc[64];
+		const int32_t kiDif1	= kpSrc[16] - kpSrc[64];
+		pOut[0] = kiSum0 + kiSum1;
+		pOut[1] = kiDif0 + kiDif1;
+		pOut[2] = kiSum0 - kiSum1;
+		pOut[3] = kiDif0 - kiDif1;
+	}
+
+	for( iCol = 0; iCol < 4; ++iCol )
+	{
+		const int32_t kiSum0 = iTmp[iCol]     + iTmp[iCol + 12];
+		const int32_t kiDif0 = iTmp[iCol]     - iTmp[iCol + 12];
+		const int32_t kiSum1 = iTmp[iCol + 4] + iTmp[iCol + 8];
+		const int32_t kiDif1 = iTmp[iCol + 4] - iTmp[iCol + 8];
+		pLumaDc[iCol     ] = WELS_CLIP3((kiSum0 + kiSum1 + 1) >> 1, -32768, 32767);
+		pLumaDc[iCol + 4 ] = WELS_CLIP3((kiDif0 + kiDif1 + 1) >> 1, -32768, 32767);
+		pLumaDc[iCol + 8 ] = WELS_CLIP3((kiSum0 - kiSum1 + 1) >> 1, -32768, 32767);
+		pLumaDc[iCol + 12] = WELS_CLIP3((kiDif0 - kiDif1 + 1) >> 1, -32768, 32767);
+	}
+}
+
+/****************************************************************************
+ * DCT functions
+ ****************************************************************************/
+/*
+ * WelsDctT4_c: forward 4x4 integer transform of the residual pPixel1 - pPixel2.
+ *   pDct              receives the 16 coefficients (4 rows of 4)
+ *   pPixel1, iStride1 first 4x4 pixel block and the step between its rows
+ *   pPixel2, iStride2 second 4x4 pixel block (the one subtracted) and its row step
+ * Doubling is written as "* 2" rather than "<< 1": the operands may be negative and
+ * left-shifting a negative signed value is undefined behaviour.
+ */
+void WelsDctT4_c( int16_t *pDct, uint8_t *pPixel1, int32_t iStride1, uint8_t *pPixel2, int32_t iStride2 )
+{
+	int16_t i, s[4];
+	/* horizontal transform: one residual row per pass */
+	for(i = 0 ; i < 16 ; i +=4)
+	{
+		const int16_t kiD0 = pPixel1[0] - pPixel2[0];
+		const int16_t kiD1 = pPixel1[1] - pPixel2[1];
+		const int16_t kiD2 = pPixel1[2] - pPixel2[2];
+		const int16_t kiD3 = pPixel1[3] - pPixel2[3];
+
+		pPixel1 += iStride1;
+		pPixel2 += iStride2;
+
+		s[0] = kiD0 + kiD3;
+		s[3] = kiD0 - kiD3;
+		s[1] = kiD1 + kiD2;
+		s[2] = kiD1 - kiD2;
+
+		pDct[i    ] = s[0] + s[1];
+		pDct[i + 2] = s[0] - s[1];
+		pDct[i + 1] = s[3] * 2 + s[2];
+		pDct[i + 3] = s[3] - s[2] * 2;
+	}
+
+	/* vertical transform: one column per pass */
+	for(i = 0 ; i < 4 ; i ++)
+	{
+		s[0] = pDct[i    ] + pDct[i + 12];
+		s[3] = pDct[i    ] - pDct[i + 12];
+		s[1] = pDct[i + 4] + pDct[i + 8 ];
+		s[2] = pDct[i + 4] - pDct[i + 8 ];
+
+		pDct[i     ] = s[0] + s[1];
+		pDct[i + 8 ] = s[0] - s[1];
+		pDct[i + 4 ] = s[3] * 2 + s[2];
+		pDct[i + 12] = s[3] - s[2] * 2;
+	}
+}
+
+void WelsDctFourT4_c(int16_t *pDct, uint8_t *pPixel1, int32_t iStride1, uint8_t *pPixel2, int32_t iStride2 )
+{
+	// Four 4x4 transforms tile an 8x8 area in raster order; tile n writes its 16 coefficients at pDct + 16 * n.
+	int32_t iTile;
+	for( iTile = 0; iTile < 4; ++iTile )
+	{
+		const int32_t kiCol = (iTile & 0x01) << 2, kiRow = (iTile >> 1) << 2;
+		WelsDctT4_c( pDct + (iTile << 4), pPixel1 + (kiRow * iStride1 + kiCol), iStride1, pPixel2 + (kiRow * iStride2 + kiCol), iStride2 );
+	}
+}
+
+/****************************************************************************
+ * Scan and Score functions
+ ****************************************************************************/
+// WelsScan4x4DcAc_c: reorders all 16 values of a 4x4 block, pLevel[n] = pDct[kuiScan[n]].
+// pDct is only read; pLevel and pDct must not overlap.
+// The copy is done one int16_t at a time: pLevel + 5 and pLevel + 9 cannot be 4-byte
+// aligned when pLevel is, so moving those pairs with a single 32-bit store would be
+// a misaligned access through a pointer of another type.
+void WelsScan4x4DcAc_c( int16_t* pLevel, int16_t *pDct )
+{
+	static const uint8_t kuiScan[16] = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };
+	int32_t iPos;
+
+	for( iPos = 0; iPos < 16; ++iPos )
+	{
+		pLevel[iPos] = pDct[kuiScan[iPos]];
+	}
+}
+
+// WelsScan4x4Ac_c: reorders the 15 values pDct[1..15] of a 4x4 block (pDct[0] is not read):
+// pLevel[n] = pDct[kuiScan[n]] for n = 0..14, and pLevel[15] is set to 0.
+// pLevel and pDct must not overlap.
+// Values are copied one int16_t at a time: pLevel + 13 cannot be 4-byte aligned when pLevel
+// is, so a 32-bit store of that pair would be a misaligned access.
+void WelsScan4x4Ac_c( int16_t* pLevel, int16_t* pDct )
+{
+	static const uint8_t kuiScan[15] = { 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 };
+	int32_t iPos;
+
+	for( iPos = 0; iPos < 15; ++iPos )
+	{
+		pLevel[iPos] = pDct[kuiScan[iPos]];
+	}
+	pLevel[15] = 0;
+}
+
+/*
+ * WelsScan4x4Dc: reorders the 16 values of pDct into pLevel:
+ *
+ *   pLevel index :  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+ *   pDct index   :  0  1  4  8  5  2  3  6  9 12 13 10  7 11 14 15
+ *
+ * This is exactly what WelsScan4x4DcAc_c (defined above) does, so the call is
+ * forwarded there and the scan order lives in one place only.
+ *
+ * pDct is only read; pLevel and pDct must not overlap.
+ */
+void WelsScan4x4Dc( int16_t* pLevel, int16_t* pDct )
+{
+	WelsScan4x4DcAc_c( pLevel, pDct );
+}
+
+//refer to JVT-O079
+// Walks pDct[15..0]; every non-zero value adds kiTRunTable[z], z being the number of zeros directly
+// below it (down to the next non-zero value at a lower index, or to index 0). Returns the sum.
+int32_t WelsCalculateSingleCtr4x4_c( int16_t *pDct)
+{
+    static const int32_t kiTRunTable[16] = { 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    int32_t iScore	= 0;
+    int32_t iPos	= 15;
+
+    // skip the zeros above the highest non-zero value
+    while( iPos >= 0 && 0 == pDct[iPos] )
+        --iPos;
+    while( iPos >= 0 )	// pDct[iPos] is non-zero here
+    {
+        int32_t iZeros = 0;
+        for( --iPos; iPos >= 0 && 0 == pDct[iPos]; --iPos )
+            ++iZeros;
+        iScore += kiTRunTable[iZeros];
+    }
+    return iScore;
+}
+
+/****************************************************************************
+ * Copy functions 
+ ****************************************************************************/
+void WelsCopy4x4( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS )
+{
+	// Four rows, each moved with a single ST32 / LD32 pair (presumably 4 bytes per row);
+	// both pointers then step to the next row by their own stride.
+	int32_t iRow;
+	for( iRow = 0; iRow < 4; ++iRow )
+	{
+		ST32( pDst, LD32(pSrc) );
+		pDst += iStrideD;
+		pSrc += iStrideS;
+	}
+}
+void WelsCopy8x8_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS )
+{
+	int32_t iRow;
+	for( iRow = 0; iRow < 8; ++iRow )	// one row per pass: two ST32 / LD32 transfers, at offsets 0 and 4
+	{
+		ST32( pDst,     LD32(pSrc    ) );
+		ST32( pDst + 4, LD32(pSrc + 4) );
+		pDst += iStrideD;
+		pSrc += iStrideS;
+	}
+}
+void WelsCopy8x16_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS )
+{
+	int32_t iRowsLeft = 16;	// each pass moves one row: two ST32 / LD32 transfers, at offsets 0 and 4
+	while( iRowsLeft-- > 0 )
+	{
+		ST32( pDst,     LD32(pSrc    ) );
+		ST32( pDst + 4, LD32(pSrc + 4) );
+		pDst += iStrideD;
+		pSrc += iStrideS;
+	}
+}
+void WelsCopy16x8_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS )
+{
+	int32_t iRow, iOff;	// 8 rows; each row is moved as four ST32 / LD32 transfers at offsets 0, 4, 8, 12
+	for( iRow = 0; iRow < 8; ++iRow, pDst += iStrideD, pSrc += iStrideS )
+	{
+		for( iOff = 0; iOff < 16; iOff += 4 )
+		{
+			ST32( pDst + iOff, LD32(pSrc + iOff) );
+		}
+	}
+}
+void WelsCopy16x16_c( uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS )
+{
+	int32_t iRow, iOff;	// 16 rows; each row is moved as four ST32 / LD32 transfers at offsets 0, 4, 8, 12
+	for( iRow = 0; iRow < 16; ++iRow, pDst += iStrideD, pSrc += iStrideS )
+	{
+		for( iOff = 0; iOff < 16; iOff += 4 )
+		{
+			ST32( pDst + iOff, LD32(pSrc + iOff) );
+		}
+	}
+}
+
+// WelsGetNoneZeroCount_c: number of non-zero entries among pLevel[0..15].
+int32_t WelsGetNoneZeroCount_c(int16_t * pLevel)
+{
+	int32_t iNonZero = 0;
+	int32_t iPos;
+
+	for( iPos = 0; iPos < 16; ++iPos )
+	{
+		if( pLevel[iPos] != 0 )
+		{
+			++iNonZero;
+		}
+	}
+	return iNonZero;
+}
+
+void WelsInitEncodingFuncs( SWelsFuncPtrList *pFuncList, uint32_t  uiCpuFlag )
+{
+	// ---- C defaults: every pointer handled by this function gets a value here ----
+
+	// transform
+	pFuncList->pfDctT4						= WelsDctT4_c;
+	pFuncList->pfDctFourT4					= WelsDctFourT4_c;
+	pFuncList->pfTransformHadamard4x4Dc		= WelsHadamardT4Dc_c;
+
+	// quantization
+	pFuncList->pfQuantization4x4			= WelsQuant4x4_c;
+	pFuncList->pfQuantizationDc4x4			= WelsQuant4x4Dc_c;
+	pFuncList->pfQuantizationFour4x4		= WelsQuantFour4x4_c;
+	pFuncList->pfQuantizationFour4x4Max		= WelsQuantFour4x4Max_c;
+	pFuncList->pfQuantizationHadamard2x2	= WelsHadamardQuant2x2_c;
+	pFuncList->pfQuantizationHadamard2x2Skip	= WelsHadamardQuant2x2Skip_c;
+
+	// scan and score
+	pFuncList->pfScan4x4					= WelsScan4x4DcAc_c;
+	pFuncList->pfScan4x4Ac					= WelsScan4x4Ac_c;
+	pFuncList->pfCalculateSingleCtr4x4		= WelsCalculateSingleCtr4x4_c;
+	pFuncList->pfGetNoneZeroCount			= WelsGetNoneZeroCount_c;
+
+	// block copy
+	pFuncList->pfCopy8x8Aligned				= WelsCopy8x8_c;
+	pFuncList->pfCopy8x16Aligned			= WelsCopy8x16_c;
+	pFuncList->pfCopy16x8NotAligned			= WelsCopy16x8_c;
+	pFuncList->pfCopy16x16Aligned			=
+	pFuncList->pfCopy16x16NotAligned		= WelsCopy16x16_c;
+
+#if defined(X86_ASM)
+	// ---- x86 overrides; the MMXEXT and SSE2 blocks assign disjoint sets of pointers ----
+	if ( 0 != (uiCpuFlag & WELS_CPU_MMXEXT) )
+	{
+		pFuncList->pfDctT4						= WelsDctT4_mmx;
+		pFuncList->pfQuantizationHadamard2x2	= WelsHadamardQuant2x2_mmx;
+		pFuncList->pfQuantizationHadamard2x2Skip	= WelsHadamardQuant2x2Skip_mmx;
+		pFuncList->pfCopy8x8Aligned				= WelsCopy8x8_mmx;
+		pFuncList->pfCopy8x16Aligned			= WelsCopy8x16_mmx;
+	}
+
+	if ( 0 != (uiCpuFlag & WELS_CPU_SSE2) )
+	{
+		pFuncList->pfDctFourT4					= WelsDctFourT4_sse2;
+		pFuncList->pfTransformHadamard4x4Dc		= WelsHadamardT4Dc_sse2;
+
+		pFuncList->pfQuantization4x4			= WelsQuant4x4_sse2;
+		pFuncList->pfQuantizationDc4x4			= WelsQuant4x4Dc_sse2;
+		pFuncList->pfQuantizationFour4x4		= WelsQuantFour4x4_sse2;
+		pFuncList->pfQuantizationFour4x4Max		= WelsQuantFour4x4Max_sse2;
+
+		pFuncList->pfScan4x4					= WelsScan4x4DcAc_sse2;
+		pFuncList->pfScan4x4Ac					= WelsScan4x4Ac_sse2;
+		pFuncList->pfCalculateSingleCtr4x4		= WelsCalculateSingleCtr4x4_sse2;
+		pFuncList->pfGetNoneZeroCount			= WelsGetNoneZeroCount_sse2;
+
+		pFuncList->pfCopy16x8NotAligned			= WelsCopy16x8NotAligned_sse2;
+		pFuncList->pfCopy16x16Aligned			= WelsCopy16x16_sse2;
+		pFuncList->pfCopy16x16NotAligned		= WelsCopy16x16NotAligned_sse2;
+	}
+
+	// Must stay after the SSE2 block: it replaces the pfScan4x4 chosen there.
+	if ( 0 != (uiCpuFlag & WELS_CPU_SSSE3) )
+	{
+		pFuncList->pfScan4x4 = WelsScan4x4DcAc_ssse3;
+	}
+#endif//X86_ASM
+}
+}
--- /dev/null
+++ b/codec/encoder/core/src/encoder.cpp
@@ -1,0 +1,554 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	encoder.cpp
+ *
+ * \brief	core encoder
+ *
+ * \date	5/14/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "encoder.h"
+#include "extern.h"
+#include "cpu.h"
+#include "cpu_core.h"
+#include "utils.h"
+
+#include "encode_mb_aux.h"
+#include "decode_mb_aux.h"
+#include "get_intra_predictor.h"
+#include "svc_encode_mb.h"
+
+#include "deblocking.h"
+#include "expand_pic.h"
+
+#include "mc.h"
+#include "sample.h"
+
+#include "svc_encode_slice.h"
+#include "svc_base_layer_md.h"
+#include "svc_mode_decision.h"
+#include "set_mb_syn_cavlc.h"
+#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross_platforms
+#include "codec_def.h"
+#ifdef MT_ENABLED
+#include "slice_multi_threading.h"
+#endif//MT_ENABLED
+
+//  global   function  pointers  definition
+namespace WelsSVCEnc {
+/* Motion compensation */
+
+
+/*!
+ * \brief	initialize source picture body: format, size and strides are set, all plane pointers become NULL
+ *
+ * For the RGB/BGR and BGRA/RGBA/ARGB/ABGR formats the stored iColorFormat has the
+ * videoFormatVFlip flag inverted with respect to kiColorspace.
+ * \param	kpSrc			SSourcePicture* (received as const void*) to initialize
+ * \param	kiColorspace	csp format, optionally combined with videoFormatVFlip
+ * \param	kiWidth			width of picture in pixels
+ * \param	kiHeight		height of picture in pixels
+ * \return	successful - 0; 1 - NULL picture or zero width/height; 2 - csp format not handled
+ */
+int32_t InitPic( const void *kpSrc, const int32_t kiColorspace, const int32_t kiWidth, const int32_t kiHeight )
+{
+	SSourcePicture *pSrcPic	= (SSourcePicture *)kpSrc;
+	int32_t iStride0		= 0;		// stride of plane 0
+	int32_t iStride12		= 0;		// stride of planes 1 and 2
+	bool_t bInvertVFlip		= false;	// store the format with the opposite videoFormatVFlip state
+
+	if ( NULL == pSrcPic || kiWidth == 0 || kiHeight == 0 )
+		return 1;
+
+	pSrcPic->iColorFormat	= kiColorspace;
+	pSrcPic->iPicWidth		= kiWidth;
+	pSrcPic->iPicHeight		= kiHeight;
+
+	// step 1: work out the strides (plane pointers and strides are only written once the format is known to be handled)
+	switch( kiColorspace & (~videoFormatVFlip) ) {
+	case videoFormatI420:
+	case videoFormatYV12:
+		iStride0	= kiWidth;
+		iStride12	= kiWidth >> 1;
+		break;
+	case videoFormatYUY2:
+	case videoFormatYVYU:
+	case videoFormatUYVY:
+		iStride0	= CALC_BI_STRIDE(kiWidth,  16);
+		break;
+	case videoFormatRGB:
+	case videoFormatBGR:
+		iStride0		= CALC_BI_STRIDE(kiWidth, 24);
+		bInvertVFlip	= true;
+		break;
+	case videoFormatBGRA:
+	case videoFormatRGBA:
+	case videoFormatARGB:
+	case videoFormatABGR:
+		iStride0		= kiWidth << 2;
+		bInvertVFlip	= true;
+		break;
+	default:
+		return 2;	// any else?
+	}
+
+	// step 2: commit the plane pointers and strides
+	pSrcPic->pData[0]	= NULL;
+	pSrcPic->pData[1]	= NULL;
+	pSrcPic->pData[2]	= NULL;
+	pSrcPic->pData[3]	= NULL;
+	pSrcPic->iStride[0]	= iStride0;
+	pSrcPic->iStride[1]	= iStride12;
+	pSrcPic->iStride[2]	= iStride12;
+	pSrcPic->iStride[3]	= 0;
+
+	// step 3: invert the flip flag where the format asks for it
+	if ( bInvertVFlip )
+	{
+		if( kiColorspace & videoFormatVFlip )
+			pSrcPic->iColorFormat = kiColorspace & (~videoFormatVFlip);
+		else
+			pSrcPic->iColorFormat = kiColorspace | videoFormatVFlip;
+	}
+
+	return 0;
+}
+
+
+// Selects the two background-detection hooks: WelsMdInterJudgeBGDPskip / WelsMdInterUpdateBGDInfo when
+// kbEnableBackgroundDetection is set, the ...PskipFalse / ...InfoNULL variants otherwise.
+void WelsInitBGDFunc( SWelsFuncPtrList *pFuncList, const bool_t kbEnableBackgroundDetection )
+{
+	if ( !kbEnableBackgroundDetection )
+	{
+		pFuncList->pfInterMdBackgroundDecision		= (PInterMdBackgroundDecisionFunc)WelsMdInterJudgeBGDPskipFalse;
+		pFuncList->pfInterMdBackgroundInfoUpdate	= (PInterMdBackgroundInfoUpdateFunc)WelsMdInterUpdateBGDInfoNULL;
+		return;
+	}
+	pFuncList->pfInterMdBackgroundDecision		= (PInterMdBackgroundDecisionFunc)WelsMdInterJudgeBGDPskip;
+	pFuncList->pfInterMdBackgroundInfoUpdate	= (PInterMdBackgroundInfoUpdateFunc)WelsMdInterUpdateBGDInfo;
+}
+
+/*!
+ * \brief	initialize function pointers that potentially used in Wels encoding
+ * \param	pFuncList	function pointer list to fill in
+ * \param	pParam		coding parameters; only bEnableBackgroundDetection is read here
+ * \param	uiCpuFlag	CPU feature flags (WELS_CPU_*), forwarded to the individual initializers
+ * \return	successful - 0; no failure path exists in the current implementation
+ */
+int32_t InitFunctionPointers( SWelsFuncPtrList *pFuncList, SWelsSvcCodingParam *pParam, uint32_t uiCpuFlag )
+{
+	/* Functionality utilization of CPU instructions dependency: memory zeroing, C versions by default */
+	pFuncList->pfSetMemZeroSize8			= WelsSetMemZero_c;	// confirmed_safe_unsafe_usage
+	pFuncList->pfSetMemZeroSize64			= WelsSetMemZero_c;	// confirmed_safe_unsafe_usage
+	pFuncList->pfSetMemZeroSize64Aligned16	= WelsSetMemZero_c;	// confirmed_safe_unsafe_usage
+#if defined(X86_ASM)
+	if ( 0 != (uiCpuFlag & WELS_CPU_MMXEXT) )
+	{
+		pFuncList->pfSetMemZeroSize8			= WelsSetMemZeroSize8_mmx;	// confirmed_safe_unsafe_usage
+		pFuncList->pfSetMemZeroSize64			= WelsSetMemZeroSize64_mmx;	// confirmed_safe_unsafe_usage
+		pFuncList->pfSetMemZeroSize64Aligned16	= WelsSetMemZeroSize64_mmx;	// confirmed_safe_unsafe_usage
+	}
+	if ( 0 != (uiCpuFlag & WELS_CPU_SSE2) )
+	{
+		// checked after MMXEXT on purpose: this assignment replaces the one made in the block above
+		pFuncList->pfSetMemZeroSize64Aligned16	= WelsSetMemZeroAligned64_sse2;	// confirmed_safe_unsafe_usage
+	}
+#endif//X86_ASM
+
+	// The initializers below keep their order; whether any of them depends on another is not visible from here.
+	/* picture expansion */
+	InitExpandPictureFunc( pFuncList, uiCpuFlag );
+
+	/* Intra_Prediction_fn */
+	WelsInitFillingPredFuncs( uiCpuFlag );
+	WelsInitIntraPredFuncs( pFuncList, uiCpuFlag );
+
+	/* sad, satd, average */
+	WelsInitSampleSadFunc( pFuncList, uiCpuFlag );
+
+	/* background detection hooks */
+	WelsInitBGDFunc( pFuncList, pParam->bEnableBackgroundDetection );
+	// for pfGetVarianceFromIntraVaa function ptr adaptive by CPU features, 6/7/2010
+	InitIntraAnalysisVaaInfo( pFuncList, uiCpuFlag );
+
+	/* Motion compensation: pixel average, one column or row pixel when refinement */
+	WelsInitMcFuncs( pFuncList, uiCpuFlag );
+	InitCoeffFunc( uiCpuFlag );
+
+	/* transform, quantization, scan and copy, then dequantization and inverse transform */
+	WelsInitEncodingFuncs( pFuncList, uiCpuFlag );
+	WelsInitReconstructionFuncs( pFuncList, uiCpuFlag );
+
+	DeblockingInit( &pFuncList->pfDeblocking, uiCpuFlag );
+	WelsBlockFuncInit( &pFuncList->pfSetNZCZero, uiCpuFlag );
+	InitFillNeighborCacheInterFunc ( pFuncList, pParam->bEnableBackgroundDetection );
+	return 0;
+}
+
+/*!
+ * \brief	initialize frame coding
+ *
+ * Resets the bitstream writing state of pEncCtx, then updates its frame counters
+ * (iFrameIndex, uiFrameIdxRc, iPOC, iFrameNum, iCodingIndex) and its NAL type,
+ * slice type and NAL priority according to keFrameType.
+ * \param	pEncCtx		encoder context to update
+ * \param	keFrameType	WELS_FRAME_TYPE_IDR, WELS_FRAME_TYPE_P or WELS_FRAME_TYPE_I;
+ *						any other value hits assert( 0 )
+ */
+void InitFrameCoding( sWelsEncCtx *pEncCtx, const EFrameType keFrameType )
+{
+	const bool_t kbIdrFrame	= ( keFrameType == WELS_FRAME_TYPE_IDR );
+	const bool_t kbPFrame	= ( keFrameType == WELS_FRAME_TYPE_P );
+	const bool_t kbIFrame	= ( keFrameType == WELS_FRAME_TYPE_I );
+
+	// for bitstream writing
+	pEncCtx->iPosBsBuffer		= 0;	// reset bs pBuffer position
+	pEncCtx->pOut->iNalIndex	= 0;	// reset NAL index
+	InitBits( &pEncCtx->pOut->sBsWrite, pEncCtx->pOut->pBsBuffer, pEncCtx->pOut->uiSize );
+
+	if ( kbIdrFrame )
+	{
+		// an IDR restarts the counters
+		pEncCtx->iFrameNum				= 0;
+		pEncCtx->iPOC					= 0;
+		pEncCtx->iCodingIndex			= 0;
+		pEncCtx->uiFrameIdxRc			= 0;
+		pEncCtx->bEncCurFrmAsIdrFlag	= false;
+		if ( pEncCtx->pSvcParam->uiIntraPeriod )
+		{
+			pEncCtx->iFrameIndex = 0;
+		}
+		pEncCtx->eNalType		= NAL_UNIT_CODED_SLICE_IDR;
+		pEncCtx->eSliceType		= I_SLICE;
+		pEncCtx->eNalPriority	= NRI_PRI_HIGHEST;
+
+		// reset_ref_list, rc_init_gop
+	}
+	else if ( kbPFrame || kbIFrame )
+	{
+		if ( kbPFrame )
+		{
+			// only P frames advance the frame index counters
+			if ( pEncCtx->pSvcParam->uiIntraPeriod )
+			{
+				++pEncCtx->iFrameIndex;
+			}
+			++pEncCtx->uiFrameIdxRc;
+		}
+		// POC goes up by 2 and restarts at 0 once it has reached (1 << iLog2MaxPocLsb) - 2
+		if ( pEncCtx->iPOC < ( 1 << pEncCtx->pSps->iLog2MaxPocLsb ) - 2 ) // if iPOC type is no 0, this need be modification
+			pEncCtx->iPOC			+= 2;	// for POC type 0
+		else
+			pEncCtx->iPOC = 0;
+
+		// iFrameNum advances only while eLastNalPriority is non-zero
+		if ( pEncCtx->eLastNalPriority != 0 )
+		{
+			if ( pEncCtx->iFrameNum < (1 << pEncCtx->pSps->uiLog2MaxFrameNum) - 1  )
+				++ pEncCtx->iFrameNum;
+			else
+				pEncCtx->iFrameNum	= 0;	// if iFrameNum overflow
+		}
+
+		pEncCtx->eNalType = NAL_UNIT_CODED_SLICE;
+		if ( kbPFrame )
+		{
+			pEncCtx->eSliceType		= P_SLICE;
+			pEncCtx->eNalPriority	= NRI_PRI_HIGH;
+		}
+		else
+		{
+			pEncCtx->eSliceType		= I_SLICE;
+			pEncCtx->eNalPriority	= NRI_PRI_HIGHEST;
+			// rc_init_gop
+		}
+	}
+	else	// B pictures are not supported now, any else?
+	{
+		assert( 0 );
+	}
+
+#if defined(STAT_OUTPUT)
+	memset( &pEncCtx->sPerInfo, 0, sizeof(SStatSliceInfo) );
+#endif//STAT_OUTPUT
+
+#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
+	if ( pEncCtx->pSvcParam->iMultipleThreadIdc > 1 )
+		reset_env_mt( pEncCtx );
+#endif
+}
+
+/*!
+ * \brief	Decide the coding type of the frame about to be encoded
+ *
+ * \param	pEncCtx			encoder context; iSkipFrameFlag is decremented when a skip is chosen and
+ *							iCodingIndex is reset to 0 when an IDR is chosen
+ * \param	kiSpatialNum	compared against pSvcParam->iNumDependencyLayer; scene change detection is
+ *							only honoured when it is not smaller than that value
+ *
+ * \return	WELS_FRAME_TYPE_IDR, WELS_FRAME_TYPE_SKIP or WELS_FRAME_TYPE_P
+ */
+EFrameType DecideFrameType( sWelsEncCtx *pEncCtx, const int8_t kiSpatialNum )
+{
+	SWelsSvcCodingParam *pParam	= pEncCtx->pSvcParam;
+	EFrameType eType			= WELS_FRAME_TYPE_AUTO;
+	bool_t bUseSceneChange		= false;
+
+	// The scene change result of the analysis stage is taken into account only when detection is
+	// enabled, no periodic IDR is pending, all dependency layers are present and at least
+	// (VGOP_SIZE << 1) frames were counted by uiFrameIdxRc (avoid too frequent I frame coding, rc control)
+	if ( pParam->bEnableSceneChangeDetect
+		&& !pEncCtx->pVaa->bIdrPeriodFlag
+		&& kiSpatialNum >= pParam->iNumDependencyLayer
+		&& pEncCtx->uiFrameIdxRc >= (VGOP_SIZE << 1) )
+	{
+		bUseSceneChange = pEncCtx->pVaa->bSceneChangeFlag;
+	}
+
+	// bIdrPeriodFlag: periodic IDR
+	// bUseSceneChange: scene change accepted above
+	// bEncCurFrmAsIdrFlag: 1. first frame should be IDR; 2. idr pause; 3. idr request
+	if ( pEncCtx->pVaa->bIdrPeriodFlag || bUseSceneChange || pEncCtx->bEncCurFrmAsIdrFlag )
+	{
+		eType = WELS_FRAME_TYPE_IDR;
+		pEncCtx->iCodingIndex = 0;
+	}
+	else if ( pEncCtx->iSkipFrameFlag > 0 )	// for frame skip, 1/5/2010
+	{
+		-- pEncCtx->iSkipFrameFlag;
+		eType = WELS_FRAME_TYPE_SKIP;
+	}
+	else
+	{
+		eType = WELS_FRAME_TYPE_P;
+	}
+
+	return eType;
+}
+
+/*!
+ * \brief	Dump reconstruction for dependency layer
+ *
+ * Writes every I420 plane of pCurPicture to a file, one fwrite per row: iWidthInPixel bytes
+ * for each of the iHeightInPixel rows of plane 0 and half of both values for the other planes;
+ * rows are iLineSize[plane] apart in memory. For each layer id the first dump opens the file
+ * with "wb", every later dump opens it with "ab".
+ *
+ * \param	pCurPicture	picture to dump; nothing is done when NULL
+ * \param	kpFileName	output file name, "rec<kiDid>.yuv" is used when it is empty;
+ *						nothing is done when NULL
+ * \param	kiDid		dependency layer id; nothing is done unless it is within
+ *						[0, MAX_DEPENDENCY_LAYER)
+ */
+
+extern "C" void DumpDependencyRec( SPicture *pCurPicture, const str_t *kpFileName, const int8_t kiDid )
+{
+	static bool_t bDependencyRecFlag[MAX_DEPENDENCY_LAYER]	= {0};	// true once a dump was attempted for the layer
+	FILE *pDumpRecFile					= NULL;
+	str_t sDependencyRecFileName[16]	= {0};
+	const str_t *kpName					= kpFileName;
+	const str_t *kpMode					= NULL;
+	bool_t bAppend						= false;
+	int32_t iWrittenSize				= 0;
+	int32_t iPlane						= 0;
+	int32_t iRow						= 0;
+
+	// kiDid is signed: a negative id would index in front of bDependencyRecFlag
+	if ( NULL == pCurPicture || NULL == kpFileName || kiDid < 0 || kiDid >= MAX_DEPENDENCY_LAYER )
+		return;
+
+	bAppend	= bDependencyRecFlag[kiDid];
+	kpMode	= bAppend ? "ab" : "wb";
+
+	if ( !( STRNLEN(kpFileName, MAX_FNAME_LEN) > 0 ) )	// confirmed_safe_unsafe_usage
+	{
+		// no name supplied, fall back to a per layer default name
+#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
+		SNPRINTF( sDependencyRecFileName, sizeof(sDependencyRecFileName), sizeof(sDependencyRecFileName), "rec%d.yuv", kiDid );	// confirmed_safe_unsafe_usage
+#else
+		SNPRINTF( sDependencyRecFileName, sizeof(sDependencyRecFileName), "rec%d.yuv", kiDid );	// confirmed_safe_unsafe_usage
+#endif//WIN32..
+		kpName = sDependencyRecFileName;
+	}
+
+	// same #if/#else split for every open, so that a statement exists on all compilers
+#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
+	FOPEN( &pDumpRecFile, kpName, kpMode );
+#else
+	pDumpRecFile	= FOPEN( kpName, kpMode );
+#endif//WIN32..
+
+	if ( bAppend )
+	{
+		if ( NULL != pDumpRecFile )
+			fseek( pDumpRecFile, 0, SEEK_END );
+	}
+	else
+	{
+		// marked even when the open failed, as before
+		bDependencyRecFlag[kiDid]	= true;
+	}
+
+	if ( NULL == pDumpRecFile )
+		return;
+
+	for ( iPlane = 0; iPlane < I420_PLANES; ++ iPlane )
+	{
+		// plane 0 is luma, the remaining planes are subsampled by 2 in both directions
+		const int32_t kiStride	= pCurPicture->iLineSize[iPlane];
+		const int32_t kiWidth	= ( 0 == iPlane ) ? pCurPicture->iWidthInPixel : ( pCurPicture->iWidthInPixel >> 1 );
+		const int32_t kiHeight	= ( 0 == iPlane ) ? pCurPicture->iHeightInPixel : ( pCurPicture->iHeightInPixel >> 1 );
+
+		for ( iRow = 0; iRow < kiHeight; ++ iRow )
+		{
+			iWrittenSize = (int32_t)fwrite( &pCurPicture->pData[iPlane][iRow*kiStride], 1, kiWidth, pDumpRecFile );
+			assert( iWrittenSize == kiWidth );
+			if ( iWrittenSize < kiWidth )
+			{
+				// make no sense for us if writing failed
+				fclose( pDumpRecFile );
+				return;
+			}
+		}
+	}
+
+	fclose( pDumpRecFile );
+}
+
+/*!
+ * \brief	Dump the reconstruction pictures
+ *
+ * Writes plane 0 of pCurPicture (iWidthInPixel bytes for each of iHeightInPixel rows) followed
+ * by the remaining I420 planes (half width, half height), one fwrite per row. The first call
+ * that passes the argument check opens the file with "wb", all later calls open it with "ab".
+ *
+ * \param	pCurPicture	picture to dump; nothing is done when NULL
+ * \param	kpFileName	output file name, "rec.yuv" is used when it is empty;
+ *						nothing is done when NULL
+ */
+
+void DumpRecFrame( SPicture *pCurPicture, const str_t *kpFileName )
+{
+	static bool_t bRecFlag	= false;	// true once a dump was attempted
+	FILE *pRecFile			= NULL;
+	int32_t iWritten		= 0;
+
+	if ( NULL == pCurPicture || NULL == kpFileName )
+		return;
+
+	const bool_t kbAppend	= bRecFlag;
+	const str_t *kpMode		= kbAppend ? "ab" : "wb";
+	const str_t *kpName		= "rec.yuv";
+
+	if ( STRNLEN(kpFileName, MAX_FNAME_LEN) > 0 )	// confirmed_safe_unsafe_usage
+		kpName = kpFileName;
+
+#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER>=1500)	// vs2008
+	FOPEN(&pRecFile, kpName, kpMode);
+#else
+	pRecFile	= FOPEN( kpName, kpMode );
+#endif//WIN32
+
+	if ( !kbAppend )
+		bRecFlag	= true;
+	else if ( NULL != pRecFile )
+		fseek( pRecFile, 0, SEEK_END );
+
+	if ( NULL == pRecFile )
+		return;
+
+	const int32_t kiLumaStride	= pCurPicture->iLineSize[0];
+	const int32_t kiLumaW		= pCurPicture->iWidthInPixel;
+	const int32_t kiLumaH		= pCurPicture->iHeightInPixel;
+	const int32_t kiChromaW		= kiLumaW >> 1;
+	const int32_t kiChromaH		= kiLumaH >> 1;
+	int32_t iPlane				= 0;
+	int32_t iRow				= 0;
+
+	// luma rows
+	for ( iRow = 0; iRow < kiLumaH; ++ iRow )
+	{
+		iWritten = fwrite( pCurPicture->pData[0] + iRow * kiLumaStride, 1, kiLumaW, pRecFile );
+		assert( iWritten == kiLumaW );
+		if ( iWritten < kiLumaW )
+		{
+			assert( 0 );	// make no sense for us if writing failed
+			fclose( pRecFile );
+			return;
+		}
+	}
+
+	// chroma rows, one plane after the other
+	for ( iPlane = 1; iPlane < I420_PLANES; ++ iPlane )
+	{
+		const int32_t kiChromaStride = pCurPicture->iLineSize[iPlane];
+		for ( iRow = 0; iRow < kiChromaH; ++ iRow )
+		{
+			iWritten = fwrite( pCurPicture->pData[iPlane] + iRow * kiChromaStride, 1, kiChromaW, pRecFile );
+			assert( iWritten == kiChromaW );
+			if ( iWritten < kiChromaW )
+			{
+				assert( 0 );	// make no sense for us if writing failed
+				fclose( pRecFile );
+				return;
+			}
+		}
+	}
+
+	fclose( pRecFile );
+	pRecFile = NULL;
+}
+
+
+
+/***********************************************************************************/
+/*!
+ * \brief	Plain C routine clearing a buffer to zero
+ *
+ * \param	pDst	start of the buffer; nothing is done when NULL
+ * \param	iSize	number of bytes to clear; nothing is done when it is not positive
+ */
+void WelsSetMemZero_c(void *pDst, int32_t iSize)	// confirmed_safe_unsafe_usage
+{
+	// memset takes a size_t: a negative iSize would turn into a huge length
+	if ( NULL == pDst || iSize <= 0 )
+		return;
+
+	memset(pDst, 0, (size_t)iSize);
+}
+}
--- /dev/null
+++ b/codec/encoder/core/src/encoder_data_tables.cpp
@@ -1,0 +1,475 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+// encoder_data_tables.cpp
+// exported data shared across various modules
+#include "typedefs.h"
+#include "wels_common_basis.h"
+#include "mb_cache.h"
+#include "utils.h"
+#include "md.h"
+#include "sample.h"
+#include "svc_enc_golomb.h"
+#include "vlc_encoder.h"
+namespace WelsSVCEnc {
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// extern at mb_cache.h
+// Offsets of the 16 4x4 blocks inside a 256-entry block: every entry has the form
+// 16*row + column with row and column taken from {0, 4, 8, 12}, i.e. a 16-entry row pitch.
+// The blocks are listed 8x8 quadrant by 8x8 quadrant (top-left, top-right, bottom-left,
+// bottom-right), four 4x4 blocks per quadrant.
+const uint8_t g_kuiSmb4AddrIn256[16] = 
+{
+	0,		4,		16*4,		16*4+4,
+	8,		12,		16*4+8,		16*4+12,
+	16*8,	16*8+4,	16*12,		16*12+4,
+	16*8+8,  16*8+12,  16*12+8, 16*12+12
+};                       
+
+//////pNonZeroCount[16+8] mapping scan index
+// 24 entries: 16 followed by 8 (luma then chroma according to the diagram below).
+// Entry k gives, for the k-th 4x4 block counted 8x8 block by 8x8 block (left part of the
+// diagram), its position in the 4-column raster layout of pNonZeroCount (right part).
+const uint8_t g_kuiMbCountScan4Idx[24] =
+{                     //  0   1 | 4  5      luma 8*8 block           pNonZeroCount[16+8] 
+	0,  1,  4,  5,   //  2   3 | 6  7        0  |  1                  0   1   2   3 
+	2,  3,  6,  7,   //---------------      ---------                 4   5   6   7 
+	8,  9, 12, 13,   //  8   9 | 12 13       2  |  3                  8   9  10  11 
+	10, 11, 14, 15,   // 10  11 | 14 15-----------------------------> 12  13  14  15 
+	16, 17, 20, 21,   //----------------    chroma 8*8 block          16  17  18  19  
+	18, 19, 22, 23   // 16  17 | 20 21        0    1                 20  21  22  23 
+};
+
+// Positions of the 24 blocks (16 + 2*4) inside a cache; vertically adjacent blocks are 8
+// entries apart, so the cache presumably holds 6 rows of 8 = 48 entries as the name suggests.
+// The first 16 entries are grouped four per 8x8 block, the last 8 come in pairs.
+const uint8_t g_kuiCache48CountScan4Idx[24] =
+{	// [16 + 2*4]
+	9, 10, 17, 18,	
+	11, 12, 19, 20,	
+	25, 26, 33, 34,	
+	27, 28, 35, 36,	
+	14, 15,			
+	22, 23,			
+	38, 39,			
+	46, 47			
+};	
+
+
+//cache element equal to 30
+// Positions of the 16 4x4 blocks inside that cache; vertically adjacent blocks are 6 entries
+// apart (presumably 5 rows of 6). Entries are grouped four per 8x8 block.
+const uint8_t g_kuiCache30ScanIdx[16] = //mv or uiRefIndex cache scan index, 4*4 block as basic unit
+{
+	7,  8, 13, 14,
+	9, 10, 15, 16,
+	19, 20, 25, 26,
+	21, 22, 27, 28
+};
+
+// Positions of 4 blocks inside a cache; vertically adjacent entries are 4 apart (presumably
+// 3 rows of 4 = 12 entries as the name suggests).
+// NOTE(review): the name points at 8x8 units while the trailing comment says 4*4 - confirm.
+const uint8_t g_kuiCache12_8x8RefIdx[4] = //mv or uiRefIndex cache scan index, 4*4 block as basic unit
+{
+	5,6,
+	9, 10
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// extern at mb_cache.h
+
+// Short text tags for log output, in the order ERR, WARN, INFO, DBUG, RESV.
+// NOTE(review): presumably indexed by log level - confirm against the logging code.
+const str_t *g_sWelsLogTags[] = {
+	"ERR",
+	"WARN",
+	"INFO",
+	"DBUG",
+	"RESV"
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// extern at wels_common_basis.h
+// 52-entry lookup: identity for indices 0..29, then growing more slowly up to 39 at index 51.
+// NOTE(review): presumably maps a luma QP index to the chroma QP - confirm against the H.264 spec.
+const uint8_t g_kuiChromaQpTable[52]={
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,
+	12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,
+	28,29,29,30,31,32,32,33,34,34,35,35,36,36,37,37,
+	37,38,38,38,39,39,39,39
+};
+
+/*
+ *	vcl type map for given NAL unit type and corresponding H264 type (0: AVC; 1: SVC).
+ *	First index: NAL unit type 0..31 (named in the comment of each row).
+ *	Second index: 0 for AVC, 1 for SVC.
+ *	Value: VCL, NON_VCL, or NOT_APP.
+ */
+const EVclType g_keTypeMap[32][2] =
+{
+	{ NON_VCL,	NON_VCL },	// 0: NAL_UNIT_UNSPEC_0
+	{ VCL,		VCL,	},	// 1: NAL_UNIT_CODED_SLICE
+	{ VCL,		NOT_APP },	// 2: NAL_UNIT_CODED_SLICE_DPA
+	{ VCL,		NOT_APP },	// 3: NAL_UNIT_CODED_SLICE_DPB
+	{ VCL,		NOT_APP },	// 4: NAL_UNIT_CODED_SLICE_DPC
+	{ VCL,		VCL		},	// 5: NAL_UNIT_CODED_SLICE_IDR
+	{ NON_VCL,	NON_VCL },	// 6: NAL_UNIT_SEI
+	{ NON_VCL,	NON_VCL },	// 7: NAL_UNIT_SPS
+	{ NON_VCL,	NON_VCL },	// 8: NAL_UNIT_PPS
+	{ NON_VCL,	NON_VCL },	// 9: NAL_UNIT_AU_DELIMITER
+	{ NON_VCL,	NON_VCL },	// 10: NAL_UNIT_END_OF_SEQ
+	{ NON_VCL,	NON_VCL },	// 11: NAL_UNIT_END_OF_STR
+	{ NON_VCL,	NON_VCL	},	// 12: NAL_UNIT_FILLER_DATA
+	{ NON_VCL,	NON_VCL },	// 13: NAL_UNIT_SPS_EXT
+	{ NON_VCL,	NON_VCL },	// 14: NAL_UNIT_PREFIX, NEED associate succeeded NAL to make a VCL
+	{ NON_VCL,	NON_VCL },	// 15: NAL_UNIT_SUBSET_SPS
+	{ NON_VCL,	NON_VCL },	// 16: NAL_UNIT_RESV_16
+	{ NON_VCL,	NON_VCL },	// 17: NAL_UNIT_RESV_17
+	{ NON_VCL,	NON_VCL },	// 18: NAL_UNIT_RESV_18
+	{ NON_VCL,	NON_VCL },	// 19: NAL_UNIT_AUX_CODED_SLICE
+	{ NON_VCL,	VCL		},	// 20: NAL_UNIT_CODED_SLICE_EXT
+	{ NON_VCL,	NON_VCL },	// 21: NAL_UNIT_RESV_21
+	{ NON_VCL,	NON_VCL },	// 22: NAL_UNIT_RESV_22
+	{ NON_VCL,	NON_VCL },	// 23: NAL_UNIT_RESV_23
+	{ NON_VCL,	NON_VCL },	// 24: NAL_UNIT_UNSPEC_24
+	{ NON_VCL,	NON_VCL },	// 25: NAL_UNIT_UNSPEC_25
+	{ NON_VCL,	NON_VCL },	// 26: NAL_UNIT_UNSPEC_26
+	{ NON_VCL,	NON_VCL	},	// 27: NAL_UNIT_UNSPEC_27
+	{ NON_VCL,	NON_VCL },	// 28: NAL_UNIT_UNSPEC_28
+	{ NON_VCL,	NON_VCL },	// 29: NAL_UNIT_UNSPEC_29
+	{ NON_VCL,	NON_VCL },	// 30: NAL_UNIT_UNSPEC_30
+	{ NON_VCL,	NON_VCL }	// 31: NAL_UNIT_UNSPEC_31
+};
+
+// One row of 8 coefficients per index 0..51 (the index is given in the comment in front of
+// each row; presumably it is the QP). Every row follows the pattern a,b,a,b,b,c,b,c and
+// row n+6 is exactly twice row n. Declared through the __align16 macro.
+__align16( const uint16_t, g_kuiDequantCoeff[52][8]) = {
+/* 0*/{   10,   13,   10,   13,   13,   16,   13,   16 },	/* 1*/{   11,   14,   11,   14,   14,   18,   14,   18 },
+/* 2*/{   13,   16,   13,   16,   16,   20,   16,   20 },	/* 3*/{   14,   18,   14,   18,   18,   23,   18,   23 },
+/* 4*/{   16,   20,   16,   20,   20,   25,   20,   25 },	/* 5*/{   18,   23,   18,   23,   23,   29,   23,   29 },
+/* 6*/{   20,   26,   20,   26,   26,   32,   26,   32 },	/* 7*/{   22,   28,   22,   28,   28,   36,   28,   36 },
+/* 8*/{   26,   32,   26,   32,   32,   40,   32,   40 },	/* 9*/{   28,   36,   28,   36,   36,   46,   36,   46 },
+/*10*/{   32,   40,   32,   40,   40,   50,   40,   50 },	/*11*/{   36,   46,   36,   46,   46,   58,   46,   58 },
+/*12*/{   40,   52,   40,   52,   52,   64,   52,   64 },	/*13*/{   44,   56,   44,   56,   56,   72,   56,   72 },
+/*14*/{   52,   64,   52,   64,   64,   80,   64,   80 },	/*15*/{   56,   72,   56,   72,   72,   92,   72,   92 },
+/*16*/{   64,   80,   64,   80,   80,  100,   80,  100 },	/*17*/{   72,   92,   72,   92,   92,  116,   92,  116 },
+/*18*/{   80,  104,   80,  104,  104,  128,  104,  128 },	/*19*/{   88,  112,   88,  112,  112,  144,  112,  144 },
+/*20*/{  104,  128,  104,  128,  128,  160,  128,  160 },	/*21*/{  112,  144,  112,  144,  144,  184,  144,  184 },
+/*22*/{  128,  160,  128,  160,  160,  200,  160,  200 },	/*23*/{  144,  184,  144,  184,  184,  232,  184,  232 },
+/*24*/{  160,  208,  160,  208,  208,  256,  208,  256 },	/*25*/{  176,  224,  176,  224,  224,  288,  224,  288 },
+/*26*/{  208,  256,  208,  256,  256,  320,  256,  320 },	/*27*/{  224,  288,  224,  288,  288,  368,  288,  368 },
+/*28*/{  256,  320,  256,  320,  320,  400,  320,  400 },	/*29*/{  288,  368,  288,  368,  368,  464,  368,  464 },
+/*30*/{  320,  416,  320,  416,  416,  512,  416,  512 },	/*31*/{  352,  448,  352,  448,  448,  576,  448,  576 },
+/*32*/{  416,  512,  416,  512,  512,  640,  512,  640 },	/*33*/{  448,  576,  448,  576,  576,  736,  576,  736 },
+/*34*/{  512,  640,  512,  640,  640,  800,  640,  800 },	/*35*/{  576,  736,  576,  736,  736,  928,  736,  928 },
+/*36*/{  640,  832,  640,  832,  832, 1024,  832, 1024 },	/*37*/{  704,  896,  704,  896,  896, 1152,  896, 1152 },
+/*38*/{  832, 1024,  832, 1024, 1024, 1280, 1024, 1280 },	/*39*/{  896, 1152,  896, 1152, 1152, 1472, 1152, 1472 },
+/*40*/{ 1024, 1280, 1024, 1280, 1280, 1600, 1280, 1600 },	/*41*/{ 1152, 1472, 1152, 1472, 1472, 1856, 1472, 1856 },
+/*42*/{ 1280, 1664, 1280, 1664, 1664, 2048, 1664, 2048 },	/*43*/{ 1408, 1792, 1408, 1792, 1792, 2304, 1792, 2304 },
+/*44*/{ 1664, 2048, 1664, 2048, 2048, 2560, 2048, 2560 },	/*45*/{ 1792, 2304, 1792, 2304, 2304, 2944, 2304, 2944 },
+/*46*/{ 2048, 2560, 2048, 2560, 2560, 3200, 2560, 3200 },	/*47*/{ 2304, 2944, 2304, 2944, 2944, 3712, 2944, 3712 },
+/*48*/{ 2560, 3328, 2560, 3328, 3328, 4096, 3328, 4096 },	/*49*/{ 2816, 3584, 2816, 3584, 3584, 4608, 3584, 4608 },
+/*50*/{ 3328, 4096, 3328, 4096, 4096, 5120, 4096, 5120 },	/*51*/{ 3584, 4608, 3584, 4608, 4608, 5888, 4608, 5888 },
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// extern at md.h
+// Non-decreasing cost factor for each of the 52 indices (index ranges are given per row);
+// 1 up to index 15, 91 at index 51.
+// NOTE(review): presumably indexed by QP and used as a weight in mode decision - confirm in md.h users.
+const int32_t g_kiQpCostTable[52] = 
+{
+	1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
+	1, 1, 1, 1,              /*  8-11 */
+	1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
+	3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
+	6, 7, 8, 9,10,11,13,14,  /* 28-35 */
+	16,18,20,23,25,29,32,36,  /* 36-43 */
+	40,45,51,57,64,72,81,91   /* 44-51 */
+};
+// Maps 7 intra 16x16 mode indices onto 4 values: indices 0..3 map to themselves, indices 4..6
+// all map to 2 (the mode names for both sides are listed in the comment after the table).
+const int8_t g_kiMapModeI16x16[7] = 
+{
+	0, 1, 2, 3, 2, 2, 2
+};//{I16_PRED_V, I16_PRED_H, I16_PRED_DC, I16_PRED_P, I16_PRED_DC, I16_PRED_DC, I16_PRED_DC};
+
+// Maps 7 intra chroma mode indices onto 4 values: indices 0..3 map to themselves, indices 4..6
+// all map to 0 (the mode names of the 7 inputs are listed in the comment after the table).
+const int8_t g_kiMapModeIntraChroma[7] = 
+{
+	0, 1, 2, 3, 0, 0, 0
+};//{C_PRED_DC, C_PRED_H, C_PRED_V, C_PRED_P, C_PRED_DC_L, C_PRED_DC_T, C_PRED_DC_128};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// extern at svc_enc_golomb.h
+
+// Entry i holds 2*floor(log2(i+1))+1 for i = 0..255, i.e. the number of bits of the unsigned
+// Exp-Golomb code of i: 1 bit for 0, 3 bits for 1..2, 5 bits for 3..6, ... 17 bits for 255.
+// The trailing numbers in the first rows are the index of the last entry of that row.
+const uint32_t g_uiGolombUELength[256] =
+{
+	1,  3,  3,  5,  5,  5,  5,  7,  7,  7,  7,  7,  7,  7,  7,    //14
+	9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, //30
+	11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, //46
+	11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, //62
+	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, //
+	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	17
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// extern at vlc_encoder.h
+
+//g_kuiVlcCoeffToken[nc][total-coeff][trailing-ones][0--value, 1--bit count]
+// First index: one of 5 code tables selected from nc (the range is given at the top of each table).
+// Second index: total-coeff 0..16 (given in the comment after each row).
+// Third index: trailing-ones 0..3.
+// Leaf: { code value, code length in bits }. A bit count of 0 marks a combination without a
+// code: trailing-ones larger than total-coeff, or total-coeff above 4 in the nc == -1 table.
+const uint8_t g_kuiVlcCoeffToken[5][17][4][2] = 
+{
+	{//0<=nc<2
+		{	{ 1,  1}, { 0,  0}, { 0,  0}, { 0,  0} }, //0
+		{	{ 5,  6}, { 1,  2}, { 0,  0}, { 0,  0} },//1
+		{	{ 7,  8}, { 4,  6}, { 1,  3}, { 0,  0} },//2
+		{	{ 7,  9}, { 6,  8}, { 5,  7}, { 3,  5} },//3
+		{	{ 7, 10}, { 6,  9}, { 5,  8}, { 3,  6} },//4
+		{	{ 7, 11}, { 6, 10}, { 5,  9}, { 4,  7} },//5
+		{	{15, 13}, { 6, 11}, { 5, 10}, { 4,  8} },//6
+		{	{11, 13}, {14, 13}, { 5, 11}, { 4,  9} },//7
+		{	{ 8, 13}, {10, 13}, {13, 13}, { 4, 10} },//8
+		{	{15, 14}, {14, 14}, { 9, 13}, { 4, 11} },//9
+		{	{11, 14}, {10, 14}, {13, 14}, {12, 13} },//10
+		{	{15, 15}, {14, 15}, { 9, 14}, {12, 14} },//11
+		{	{11, 15}, {10, 15}, {13, 15}, { 8, 14} },//12
+		{	{15, 16}, { 1, 15}, { 9, 15}, {12, 15} },//13
+		{	{11, 16}, {14, 16}, {13, 16}, { 8, 15} },//14
+		{	{ 7, 16}, {10, 16}, { 9, 16}, {12, 16} },//15
+		{	{ 4, 16}, { 6, 16}, { 5, 16}, { 8, 16} }//16
+	},
+
+	{//2<=nc<4
+		{	{ 3,  2}, { 0,  0}, { 0,  0}, { 0,  0} },//0
+		{	{11,  6}, { 2,  2}, { 0,  0}, { 0,  0} },//1
+		{	{ 7,  6}, { 7,  5}, { 3,  3}, { 0,  0} },//2
+		{	{ 7,  7}, {10,  6}, { 9,  6}, { 5,  4} },//3
+		{	{ 7,  8}, { 6,  6}, { 5,  6}, { 4,  4} },//4
+		{	{ 4,  8}, { 6,  7}, { 5,  7}, { 6,  5} },//5
+		{	{ 7,  9}, { 6,  8}, { 5,  8}, { 8,  6} },//6
+		{	{15, 11}, { 6,  9}, { 5,  9}, { 4,  6} },//7
+		{	{11, 11}, {14, 11}, {13, 11}, { 4,  7} },//8
+		{	{15, 12}, {10, 11}, { 9, 11}, { 4,  9} },//9
+		{	{11, 12}, {14, 12}, {13, 12}, {12, 11} },//10
+		{	{ 8, 12}, {10, 12}, { 9, 12}, { 8, 11} },//11
+		{	{15, 13}, {14, 13}, {13, 13}, {12, 12} },//12
+		{	{11, 13}, {10, 13}, { 9, 13}, {12, 13} },//13
+		{	{ 7, 13}, {11, 14}, { 6, 13}, { 8, 13} },//14
+		{	{ 9, 14}, { 8, 14}, {10, 14}, { 1, 13} },//15
+		{	{ 7, 14}, { 6, 14}, { 5, 14}, { 4, 14} }//16
+	},
+
+	{//4<=nc<8
+		{	{15,  4}, { 0,  0}, { 0,  0}, { 0,  0} },//0
+		{	{15,  6}, {14,  4}, { 0,  0}, { 0,  0} },//1
+		{	{11,  6}, {15,  5}, {13,  4}, { 0,  0} },//2
+		{	{ 8,  6}, {12,  5}, {14,  5}, {12,  4} },//3
+		{	{15,  7}, {10,  5}, {11,  5}, {11,  4} },//4
+		{	{11,  7}, { 8,  5}, { 9,  5}, {10,  4} },//5
+		{	{ 9,  7}, {14,  6}, {13,  6}, { 9,  4} },//6
+		{	{ 8,  7}, {10,  6}, { 9,  6}, { 8,  4} },//7 
+		{	{15,  8}, {14,  7}, {13,  7}, {13,  5} },//8
+		{	{11,  8}, {14,  8}, {10,  7}, {12,  6} },//9
+		{	{15,  9}, {10,  8}, {13,  8}, {12,  7} },//10
+		{	{11,  9}, {14,  9}, { 9,  8}, {12,  8} },//11
+		{	{ 8,  9}, {10,  9}, {13,  9}, { 8,  8} },//12
+		{	{13, 10}, { 7,  9}, { 9,  9}, {12,  9} },//13
+		{	{ 9, 10}, {12, 10}, {11, 10}, {10, 10} },//14
+		{	{ 5, 10}, { 8, 10}, { 7, 10}, { 6, 10} },//15
+		{	{ 1, 10}, { 4, 10}, { 3, 10}, { 2, 10} }//16
+	},
+
+	{//8<=nc
+		{	{ 3,  6}, { 0,  0}, { 0,  0}, { 0,  0} },//0
+		{	{ 0,  6}, { 1,  6}, { 0,  0}, { 0,  0} },//1
+		{	{ 4,  6}, { 5,  6}, { 6,  6}, { 0,  0} },//2
+		{	{ 8,  6}, { 9,  6}, {10,  6}, {11,  6} },//3
+		{	{12,  6}, {13,  6}, {14,  6}, {15,  6} },//4
+		{	{16,  6}, {17,  6}, {18,  6}, {19,  6} },//5
+		{	{20,  6}, {21,  6}, {22,  6}, {23,  6} },//6
+		{	{24,  6}, {25,  6}, {26,  6}, {27,  6} },//7
+		{	{28,  6}, {29,  6}, {30,  6}, {31,  6} },//8
+		{	{32,  6}, {33,  6}, {34,  6}, {35,  6} },//9
+		{	{36,  6}, {37,  6}, {38,  6}, {39,  6} },//10
+		{	{40,  6}, {41,  6}, {42,  6}, {43,  6} },//11
+		{	{44,  6}, {45,  6}, {46,  6}, {47,  6} },//12
+		{	{48,  6}, {49,  6}, {50,  6}, {51,  6} },//13
+		{	{52,  6}, {53,  6}, {54,  6}, {55,  6} },//14
+		{	{56,  6}, {57,  6}, {58,  6}, {59,  6} },//15
+		{	{60,  6}, {61,  6}, {62,  6}, {63,  6} }//16
+	},
+
+	{//nc == -1
+		{	{ 1,  2}, { 0,  0}, { 0,  0}, { 0,  0} },//0
+		{	{ 7,  6}, { 1,  1}, { 0,  0}, { 0,  0} },//1
+		{	{ 4,  6}, { 6,  6}, { 1,  3}, { 0,  0} },//2
+		{	{ 3,  6}, { 3,  7}, { 2,  7}, { 5,  6} },//3
+		{	{ 2,  6}, { 3,  8}, { 2,  8}, { 0,  7} },//4
+		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//5
+		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//6
+		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//7
+		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//8
+		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//9
+		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//10
+		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//11
+		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//12
+		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//13
+		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//14
+		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} },//15
+		{	{ 0,  0}, { 0,  0}, { 0,  0}, { 0,  0} }//16
+	}
+};
+
+//const uint8_t g_kuiVlcLevelPrefix[15][2] =
+//{
+//	{1, 1}, {1, 2}
+//}; 
+
+//g_kuiVlcTotalZeros[tzVlcIndex][total_zeros][0--value, 1--bit count]
+// First index: tzVlcIndex 1..15 (row 0 is a placeholder marked "not available").
+// Second index: total_zeros 0..15.
+// Leaf: { code value, code length in bits }; a bit count of 0 marks an entry without a code.
+// Row k has codes for total_zeros 0..16-k only.
+const uint8_t g_kuiVlcTotalZeros[16][16][2] = 
+{
+	{//0 not available
+		{0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0} 	
+	},
+	{//1
+		{1, 1}, {3, 3}, {2, 3}, {3, 4}, {2, 4}, {3, 5}, {2, 5}, {3, 6}, {2, 6}, {3, 7}, {2, 7}, {3, 8}, {2, 8}, {3, 9}, {2, 9}, {1, 9}
+	},
+	{//2
+		{7, 3}, {6, 3}, {5, 3}, {4, 3}, {3, 3}, {5, 4}, {4, 4}, {3, 4}, {2, 4}, {3, 5}, {2, 5}, {3, 6}, {2, 6}, {1, 6}, {0, 6}, {0, 0}
+	},
+	{//3
+		{5, 4}, {7, 3}, {6, 3}, {5, 3}, {4, 4}, {3, 4}, {4, 3}, {3, 3}, {2, 4}, {3, 5}, {2, 5}, {1, 6}, {1, 5}, {0, 6}, {0, 0}, {0, 0}
+	},
+	{//4
+		{3, 5}, {7, 3}, {5, 4}, {4, 4}, {6, 3}, {5, 3}, {4, 3}, {3, 4}, {3, 3}, {2, 4}, {2, 5}, {1, 5}, {0, 5}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{//5
+		{5, 4}, {4, 4}, {3, 4}, {7, 3}, {6, 3}, {5, 3}, {4, 3}, {3, 3}, {2, 4}, {1, 5}, {1, 4}, {0, 5}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{//6
+		{1, 6}, {1, 5}, {7, 3}, {6, 3}, {5, 3}, {4, 3}, {3, 3}, {2, 3}, {1, 4}, {1, 3}, {0, 6}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{//7
+		{1, 6}, {1, 5}, {5, 3}, {4, 3}, {3, 3}, {3, 2}, {2, 3}, {1, 4}, {1, 3}, {0, 6}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{//8
+		{1, 6}, {1, 4}, {1, 5}, {3, 3}, {3, 2}, {2, 2}, {2, 3}, {1, 3}, {0, 6}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{//9
+		{1, 6}, {0, 6}, {1, 4}, {3, 2}, {2, 2}, {1, 3}, {1, 2}, {1, 5}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{//10
+		{1, 5}, {0, 5}, {1, 3}, {3, 2}, {2, 2}, {1, 2}, {1, 4}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{//11
+		{0, 4}, {1, 4}, {1, 3}, {2, 3}, {1, 1}, {3, 3}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{//12
+		{0, 4}, {1, 4}, {1, 2}, {1, 1}, {1, 3}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{//13
+		{0, 3}, {1, 3}, {1, 1}, {1, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{//14
+		{0, 2}, {1, 2}, {1, 1}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{//15
+		{0, 1}, {1, 1}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+	}
+};
+
+// Code table with leaves { code value, code length in bits }; a bit count of 0 marks an
+// entry without a code. Row 0 is all zero, rows 1..3 hold 4, 3 and 2 codes respectively.
+// NOTE(review): presumably indexed [tzVlcIndex][total_zeros] like g_kuiVlcTotalZeros, for chroma DC - confirm.
+const uint8_t g_kuiVlcTotalZerosChromaDc[4][4][2] =
+{
+	{
+		{0, 0}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{
+		{1, 1}, {1, 2}, {1, 3}, {0, 3}
+	},
+	{
+		{1, 1}, {1, 2}, {0, 2}, {0, 0} 
+	},
+	{
+		{1, 1}, {0, 1}, {0, 0}, {0, 0}
+	}
+};
+//
+
+//g_kuiVlcRunBefore[zeros-left][run-before][0--value, 1--bit count]
+// First index: zeros-left 1..6, with row 7 shared by every value above 6 (row 0 is a
+// placeholder marked "not available").
+// Second index: run-before 0..14.
+// Leaf: { code value, code length in bits }; a bit count of 0 marks an entry without a code.
+const uint8_t g_kuiVlcRunBefore[8][15][2] = 
+{
+	{//0 not available
+		{0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0} 	
+	},
+	{//1
+		{1, 1}, {0, 1}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{//2
+		{1, 1}, {1, 2}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{//3
+		{3, 2}, {2, 2}, {1, 2}, {0, 2}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{//4
+		{3, 2}, {2, 2}, {1, 2}, {1, 3}, {0, 3}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{//5
+		{3, 2}, {2, 2}, {3, 3}, {2, 3}, {1, 3}, {0, 3}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{//6
+		{3, 2}, {0, 3}, {1, 3}, {3, 3}, {2, 3}, {5, 3}, {4, 3}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}, {0, 0}
+	},
+	{//>6
+		{7, 3}, {6, 3}, {5, 3}, {4, 3}, {3, 3}, {2, 3}, {1, 3}, {1, 4}, {1, 5}, {1, 6}, {1, 7}, {1, 8}, {1, 9}, {1, 10}, {1, 11}
+	}
+};
+
+// Maps an index 0..17 to a value 0..4: 0..1 -> 0, 2..3 -> 1, 4..7 -> 2, 8..16 -> 3, 17 -> 4.
+// The first four ranges coincide with the nc ranges labelling the tables of g_kuiVlcCoeffToken.
+// NOTE(review): presumably index 17 is how callers select the "nc == -1" table - confirm.
+const ALIGNED_DECLARE(uint8_t, g_kuiEncNcMapTable[18], 16) =
+{
+	0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4
+};
+
+
+
+// One row per GOP size 1, 2, 4, 8, 16 (see the comment closing each row), MAX_GOP_SIZE + 1
+// entries per row. Within a row only the first "GOP size" entries are non-trivial; they hold
+// 0 at position 0 and values up to log2(GOP size) at the other positions, the rest is 0.
+// NOTE(review): presumably the temporal id for each frame position inside the GOP - confirm with the users.
+const uint8_t   g_kuiTemporalIdListTable[MAX_TEMPORAL_LEVEL][MAX_GOP_SIZE + 1] = 
+{
+	{  0, 0, 0, 0, 0, 0, 0, 0,
+	   0, 0, 0, 0, 0, 0, 0, 0,
+	   0  },  // gop size = 1
+	{  0, 1, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0  },  // uiGopSize = 2
+	{  0, 2, 1, 2, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0  },  // uiGopSize = 4
+	{  0, 3, 2, 3, 1, 3, 2, 3,
+       0, 0, 0, 0, 0, 0, 0, 0,
+       0  },  // uiGopSize = 8
+	{  0, 4, 3, 4, 2, 4, 3, 4,
+       1, 4, 3, 4, 2, 4, 3, 4,
+       0  }  //  uiGopSize = 16
+};
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// extern at svc_encode_slice.h
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+}
--- /dev/null
+++ b/codec/encoder/core/src/encoder_ext.cpp
@@ -1,0 +1,4250 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	encoder_ext.cpp
+ *
+ * \brief	core encoder for SVC
+ *
+ * \date	7/24/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "encoder.h"
+#include "extern.h"
+#include "encoder_context.h"
+#include "typedefs.h"
+#include "wels_const.h"
+#include "wels_common_basis.h"
+#include "codec_def.h"
+#include "param_svc.h"
+#include "cpu_core.h"
+#include "cpu.h"
+#include "utils.h"
+#include "svc_enc_frame.h"
+#include "svc_enc_golomb.h"
+#include "svc_enc_slice_segment.h"
+#include "au_set.h"
+#include "picture_handle.h"
+#include "codec_app_def.h"
+#include "svc_base_layer_md.h"
+#include "svc_encode_slice.h"
+#include "decode_mb_aux.h"
+#include "deblocking.h"
+#include "rc.h"
+#include "ref_list_mgr_svc.h"
+#include "md.h"
+#include "ls_defines.h"
+#include "set_mb_syn_cavlc.h"
+#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross platforms
+#include "array_stack_align.h"
+// for MT, 4/22/2010
+#include "slice_multi_threading.h"
+#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
+#include "measure_time.h"
+#endif//DYNAMIC_SLICE_ASSIGN
+
+namespace WelsSVCEnc {
+
+
+// Forward declaration only; the definition is not located before this point of the file.
+// The macroblock range of a partition is half-open: [iFirstMbInPartition, iEndMbInPartition).
+int32_t WelsCodeOnePicPartition(	sWelsEncCtx *pCtx,
+									SLayerBSInfo *pLbi,
+									int32_t *pNalIdxInLayer,									
+									int32_t* pLayerSize,
+									int32_t iFirstMbInPartition,	// first mb inclusive in partition
+									int32_t iEndMbInPartition,	// end mb exclusive in partition
+									int32_t iStartSliceIdx
+								  );
+
+
+/*!
+ * \brief	validate the frame rate settings of every dependency layer in parameter configuration
+ * \pParam	pCfg		SWelsSvcCodingParam*; pCfg->fMaxFrameRate is overwritten with the largest
+ *						input frame rate of all layers when the two differ by more than epsilon
+ * \return	successful - 0; otherwise none 0 for failed
+ */
+int32_t ParamValidation( SWelsSvcCodingParam *pCfg )
+{
+	const float fEpsn = 0.000001f;
+	float fMaxFrameRate = 0.0f;
+	int32_t i = 0;
+
+	assert( pCfg != NULL );
+
+	for (i = 0; i < pCfg->iNumDependencyLayer; ++ i)
+	{
+		SDLayerParam *fDlp = &pCfg->sDependencyLayers[i];
+
+		// output rate must not exceed input rate, and neither rate may be (nearly) zero
+		if ( fDlp->fOutputFrameRate > fDlp->fInputFrameRate || (fDlp->fInputFrameRate >= -fEpsn && fDlp->fInputFrameRate <= fEpsn)
+			|| (fDlp->fOutputFrameRate >= -fEpsn && fDlp->fOutputFrameRate <= fEpsn) )
+		{
+#if defined (_DEBUG)
+			fprintf(stderr, "Invalid settings in input frame rate(%.6f) or output frame rate(%.6f) of layer #%d config file..\n",
+				fDlp->fInputFrameRate, fDlp->fOutputFrameRate, i);
+#endif
+			return 1;
+		}
+		// GetLogFactor() reports UINT_MAX when the ratio of the two rates is not accepted
+		if ( UINT_MAX == GetLogFactor(fDlp->fOutputFrameRate, fDlp->fInputFrameRate) )
+		{
+#if defined (_DEBUG)
+			fprintf(stderr, "Invalid settings in input frame rate(%.6f) and output frame rate(%.6f) of layer #%d config file: iResult of output frame rate divided by input frame rate should be power of 2(i.e,in/pOut=2^n)..\n",
+				fDlp->fInputFrameRate, fDlp->fOutputFrameRate, i);
+#endif
+			return 1;
+		}
+
+		// track the largest input frame rate; only consumed once every layer has passed the checks
+		if ( fDlp->fInputFrameRate > fMaxFrameRate )
+			fMaxFrameRate	= fDlp->fInputFrameRate;
+	}
+
+	// synchronize the configured maximum with the layers when they disagree
+	if ( fMaxFrameRate > fEpsn && (fMaxFrameRate - pCfg->fMaxFrameRate > fEpsn || fMaxFrameRate - pCfg->fMaxFrameRate < -fEpsn) )
+	{
+		pCfg->fMaxFrameRate	= fMaxFrameRate;
+	}
+
+	return 0;
+}
+
+/*!
+ * \brief	validate the extended coding parameters (layer counts, GOP size, intra period, per-layer
+ *			picture size and slice settings), normalize the slice settings of each dependency layer,
+ *			and finally run ParamValidation() for the frame rate checks
+ * \pParam	pParam		pointer to a SWelsSvcCodingParam passed as void*; several fields are modified
+ *						(bDeblockingParallelFlag and the per-layer sMso settings)
+ * \return	0 - successful; otherwise 1 (or the result of ParamValidation())
+ */
+int32_t ParamValidationExt( void *pParam )
+{
+	SWelsSvcCodingParam *pCodingParam = (SWelsSvcCodingParam *)pParam;
+	int8_t i = 0;
+	int32_t iIdx = 0;
+
+	assert ( pCodingParam != NULL );
+	if ( NULL == pCodingParam )
+		return 1;
+
+	if ( pCodingParam->iNumDependencyLayer < 1 || pCodingParam->iNumDependencyLayer > MAX_DEPENDENCY_LAYER ){
+#if defined (_DEBUG)
+		fprintf(stderr, "ParamValidationExt(), monitor invalid pCodingParam->iNumDependencyLayer: %d!\n", pCodingParam->iNumDependencyLayer);
+#endif//#if _DEBUG
+
+		return 1;
+	}
+	
+	if ( pCodingParam->iNumTemporalLayer < 1 || pCodingParam->iNumTemporalLayer > MAX_TEMPORAL_LEVEL ){
+#if defined (_DEBUG)
+		fprintf(stderr, "ParamValidationExt(), monitor invalid pCodingParam->iNumTemporalLayer: %d!\n", pCodingParam->iNumTemporalLayer);
+#endif//#if _DEBUG
+		return 1;
+	}
+	
+	if ( pCodingParam->uiGopSize < 1 || pCodingParam->uiGopSize > MAX_GOP_SIZE ){
+#if defined (_DEBUG)
+		fprintf(stderr, "ParamValidationExt(), monitor invalid pCodingParam->uiGopSize: %d!\n", pCodingParam->uiGopSize);
+#endif//#if _DEBUG
+		return 1;
+	}
+	
+
+	// a zero intra period skips both intra period checks
+	if ( pCodingParam->uiIntraPeriod && pCodingParam->uiIntraPeriod < pCodingParam->uiGopSize )
+	{
+#if defined (_DEBUG)
+		fprintf(stderr, "ParamValidationExt(), uiIntraPeriod(%d) should be not less than that of uiGopSize(%d) or -1 specified!\n",
+			pCodingParam->uiIntraPeriod, pCodingParam->uiGopSize);
+#endif//#if _DEBUG
+		return 1;
+	}
+	
+	// NOTE(review): the mask test equals "multiple of uiGopSize" only when uiGopSize is a power of 2;
+	// whether uiGopSize is restricted to powers of 2 elsewhere is not visible here -- confirm.
+	if ( pCodingParam->uiIntraPeriod && (pCodingParam->uiIntraPeriod & (pCodingParam->uiGopSize-1)) != 0 )
+	{
+#if defined (_DEBUG)
+		fprintf(stderr, "ParamValidationExt(), uiIntraPeriod(%d) should be multiple of uiGopSize(%d) or -1 specified!\n",
+			pCodingParam->uiIntraPeriod, pCodingParam->uiGopSize);
+#endif//#if _DEBUG
+		return 1;
+	}
+	
+
+#ifdef MT_ENABLED
+	//about iMultipleThreadIdc, bDeblockingParallelFlag, iLoopFilterDisableIdc, & uiSliceMode
+	// (1) Single Thread
+	//	if (THREAD==1)//single thread
+	//		no parallel_deblocking: bDeblockingParallelFlag = 0;
+	// (2) Multi Thread: see uiSliceMode decision
+	if ( pCodingParam->iMultipleThreadIdc == 1 )
+	{
+		//now is single thread. no parallel deblocking, set flag=0
+		pCodingParam->bDeblockingParallelFlag = false;
+	}
+	else
+	{
+		pCodingParam->bDeblockingParallelFlag = true;
+	}
+#else
+	pCodingParam->bDeblockingParallelFlag	= false;
+#endif//MT_ENABLED
+	
+	// per dependency layer: picture size checks followed by slice mode specific checks
+	for ( i = 0; i < pCodingParam->iNumDependencyLayer; ++ i ){
+		SDLayerParam *fDlp = &pCodingParam->sDependencyLayers[i];
+		const int32_t kiPicWidth = fDlp->iFrameWidth;
+		const int32_t kiPicHeight= fDlp->iFrameHeight;
+		int32_t iMbWidth		= 0;
+		int32_t iMbHeight		= 0;
+		int32_t iMbNumInFrame		= 0;
+		int32_t iMaxSliceNum		= MAX_SLICES_NUM;
+		if ( kiPicWidth <= 0 || kiPicHeight <= 0 ){
+#if defined (_DEBUG)
+			fprintf(stderr, "ParamValidationExt(), invalid %d x %d in dependency layer settings!\n", kiPicWidth, kiPicHeight);
+#endif//#if _DEBUG
+			return 1;
+		}
+		if ( (kiPicWidth & 0x0F) != 0 || (kiPicHeight & 0x0F) != 0 ){
+#if defined (_DEBUG)
+			fprintf(stderr, "ParamValidationExt(), in layer #%d iWidth x iHeight(%d x %d) both should be multiple of 16, can not support with arbitrary size currently!\n", i, kiPicWidth, kiPicHeight);
+#endif//#if _DEBUG
+			return 1;
+		}	
+
+		if ( fDlp->sMso.uiSliceMode >= SM_RESERVED ){
+#if defined (_DEBUG)
+			fprintf(stderr, "ParamValidationExt(), invalid uiSliceMode (%d) settings!\n", fDlp->sMso.uiSliceMode );
+#endif//#if _DEBUG
+			return 1;
+		}
+
+		//check pSlice settings under multi-pSlice
+		if ( kiPicWidth<=16 && kiPicHeight<=16 ){
+			//only have one MB, set to single_slice
+			fDlp->sMso.uiSliceMode = SM_SINGLE_SLICE;
+		}
+		switch ( fDlp->sMso.uiSliceMode )
+		{
+			case SM_SINGLE_SLICE:
+				// NOTE(review): iSliceNum is set to 1 and then overwritten with 0 two lines below, while
+				// the other branches that fall back to SM_SINGLE_SLICE leave iSliceNum at 1. Kept as is
+				// because the consumers of iSliceNum are not visible here -- confirm the intended value.
+				fDlp->sMso.sSliceArgument.iSliceNum = 1;
+				fDlp->sMso.sSliceArgument.uiSliceSizeConstraint = 0;
+				fDlp->sMso.sSliceArgument.iSliceNum = 0;
+				for (iIdx=0; iIdx<MAX_SLICES_NUM;iIdx++)
+				{
+					fDlp->sMso.sSliceArgument.uiSliceMbNum[iIdx] = 0;
+				}
+				break;
+			case SM_FIXEDSLCNUM_SLICE:
+				{
+					fDlp->sMso.sSliceArgument.uiSliceSizeConstraint = 0;
+
+					iMbWidth	= (kiPicWidth+15)>>4;
+					iMbHeight	= (kiPicHeight+15)>>4;
+					iMbNumInFrame = iMbWidth * iMbHeight;
+					iMaxSliceNum = MAX_SLICES_NUM;
+					if ( fDlp->sMso.sSliceArgument.iSliceNum <= 0 
+						|| fDlp->sMso.sSliceArgument.iSliceNum > iMaxSliceNum )
+					{
+#if defined (_DEBUG)
+						fprintf(stderr, "ParamValidationExt(), invalid uiSliceNum (%d) settings!\n", fDlp->sMso.sSliceArgument.iSliceNum );
+#endif//#if _DEBUG
+						return 1;
+					}
+					if (fDlp->sMso.sSliceArgument.iSliceNum == 1)
+					{
+#if defined (_DEBUG)
+						fprintf(stderr, "ParamValidationExt(), uiSliceNum(%d) you set for SM_FIXEDSLCNUM_SLICE, now turn to SM_SINGLE_SLICE type!\n", fDlp->sMso.sSliceArgument.iSliceNum );
+#endif//#if _DEBUG
+						fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
+						break;
+					}
+					if (pCodingParam->bEnableRc)	// multiple slices verify with gom
+					{		
+						//check uiSliceNum
+						GomValidCheckSliceNum( iMbWidth, iMbHeight, (int32_t*)&fDlp->sMso.sSliceArgument.iSliceNum );
+						assert(fDlp->sMso.sSliceArgument.iSliceNum > 1);
+						//set uiSliceMbNum with current uiSliceNum
+						GomValidCheckSliceMbNum( iMbWidth, iMbHeight, &fDlp->sMso.sSliceArgument );
+					}
+					else if ( !CheckFixedSliceNumMultiSliceSetting( iMbNumInFrame, &fDlp->sMso.sSliceArgument ) )	// verify interleave mode settings
+					{//check uiSliceMbNum with current uiSliceNum
+#if defined (_DEBUG)
+						fprintf(stderr, "ParamValidationExt(), invalid uiSliceMbNum (%d) settings!\n",  fDlp->sMso.sSliceArgument.uiSliceMbNum[0] );
+#endif//#if _DEBUG
+						return 1;
+					}
+					// considering the coding efficient and performance, iCountMbNum constraint by MIN_NUM_MB_PER_SLICE condition of multi-pSlice mode settting
+					if ( iMbNumInFrame <= MIN_NUM_MB_PER_SLICE )
+					{
+						fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
+						fDlp->sMso.sSliceArgument.iSliceNum	= 1;
+						break;
+					}
+				}
+				break;
+			case SM_RASTER_SLICE:
+				{
+					fDlp->sMso.sSliceArgument.uiSliceSizeConstraint = 0;
+
+					iMbWidth	= (kiPicWidth+15)>>4;
+					iMbHeight	= (kiPicHeight+15)>>4;
+					iMbNumInFrame = iMbWidth * iMbHeight;
+					iMaxSliceNum = MAX_SLICES_NUM;
+					if ( fDlp->sMso.sSliceArgument.uiSliceMbNum[0] <= 0 )
+					{
+#if defined (_DEBUG)
+						fprintf(stderr, "ParamValidationExt(), invalid uiSliceMbNum (%d) settings!\n",  fDlp->sMso.sSliceArgument.uiSliceMbNum[0] );
+#endif//#if _DEBUG
+						return 1;
+					}
+
+					if ( !CheckRasterMultiSliceSetting( iMbNumInFrame, &fDlp->sMso.sSliceArgument ) )	// verify interleave mode settings
+					{
+#if defined (_DEBUG)
+						fprintf(stderr, "ParamValidationExt(), invalid uiSliceMbNum (%d) settings!\n",  fDlp->sMso.sSliceArgument.uiSliceMbNum[0] );
+#endif//#if _DEBUG
+						return 1;
+					}
+					if ( fDlp->sMso.sSliceArgument.iSliceNum <= 0 || fDlp->sMso.sSliceArgument.iSliceNum > iMaxSliceNum )	// verify interleave mode settings
+					{
+#if defined (_DEBUG)
+						fprintf(stderr, "ParamValidationExt(), invalid uiSliceNum (%d) in SM_RASTER_SLICE settings!\n",  fDlp->sMso.sSliceArgument.iSliceNum );
+#endif//#if _DEBUG
+						return 1;
+					}
+					if (fDlp->sMso.sSliceArgument.iSliceNum == 1)
+					{
+#if defined (_DEBUG)
+						fprintf(stderr, "ParamValidationExt(), pSlice setting for SM_RASTER_SLICE now turn to SM_SINGLE_SLICE!\n" );
+#endif//#if _DEBUG
+						fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
+						break;
+					}
+#ifdef MT_ENABLED
+					// diagnostic only: the configuration is still accepted
+					if (pCodingParam->bEnableRc && fDlp->sMso.sSliceArgument.iSliceNum > 1)
+					{
+#if defined (_DEBUG)
+						fprintf(stderr, "ParamValidationExt(), WARNING: GOM based RC do not support SM_RASTER_SLICE!\n" );
+#endif//#if _DEBUG
+					}
+#endif
+					// considering the coding efficient and performance, iCountMbNum constraint by MIN_NUM_MB_PER_SLICE condition of multi-pSlice mode settting
+					if ( iMbNumInFrame <= MIN_NUM_MB_PER_SLICE )
+					{
+						fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
+						fDlp->sMso.sSliceArgument.iSliceNum	= 1;
+						break;
+					}
+				}
+				break;		
+			case SM_ROWMB_SLICE:
+				{
+					fDlp->sMso.sSliceArgument.uiSliceSizeConstraint = 0;
+
+					iMbWidth	= (kiPicWidth+15)>>4;
+					iMbHeight	= (kiPicHeight+15)>>4;
+					iMaxSliceNum = MAX_SLICES_NUM;
+					if ( iMbHeight > iMaxSliceNum )
+					{
+#if defined (_DEBUG)
+						fprintf(stderr, "ParamValidationExt(), invalid uiSliceNum (%d) settings more than MAX!\n", iMbHeight );
+#endif//#if _DEBUG
+						return 1;
+					}
+					// one slice per macroblock row
+					fDlp->sMso.sSliceArgument.iSliceNum	= iMbHeight;
+
+					if ( fDlp->sMso.sSliceArgument.iSliceNum <= 0 )
+					{
+#if defined (_DEBUG)
+						fprintf(stderr, "ParamValidationExt(), invalid uiSliceNum (%d) settings!\n", fDlp->sMso.sSliceArgument.iSliceNum );
+#endif//#if _DEBUG
+						return 1;
+					}	
+					if ( !CheckRowMbMultiSliceSetting( iMbWidth, &fDlp->sMso.sSliceArgument ) )	// verify interleave mode settings
+					{
+#if defined (_DEBUG)
+						fprintf(stderr, "ParamValidationExt(), invalid uiSliceMbNum (%d) settings!\n",  fDlp->sMso.sSliceArgument.uiSliceMbNum[0] );
+#endif//#if _DEBUG
+						return 1;
+					}
+				}
+				break;
+			case SM_DYN_SLICE:
+				{
+					iMbWidth	= (kiPicWidth+15)>>4;
+					iMbHeight	= (kiPicHeight+15)>>4;
+					if ( fDlp->sMso.sSliceArgument.uiSliceSizeConstraint <= 0 )
+					{
+#if defined (_DEBUG)
+						fprintf(stderr, "ParamValidationExt(), invalid iSliceSize (%d) settings!\n",   fDlp->sMso.sSliceArgument.uiSliceSizeConstraint );
+#endif//#if _DEBUG
+						return 1;
+					}
+					// considering the coding efficient and performance, iCountMbNum constraint by MIN_NUM_MB_PER_SLICE condition of multi-pSlice mode settting
+					if ( iMbWidth * iMbHeight <= MIN_NUM_MB_PER_SLICE )
+					{
+						fDlp->sMso.uiSliceMode	= SM_SINGLE_SLICE;
+						fDlp->sMso.sSliceArgument.iSliceNum	= 1;
+						break;
+					}
+				}
+				break;
+			default:
+				{
+
+#if defined (_DEBUG)
+					// report the slice mode of the layer being checked (not always layer 0)
+					fprintf(stderr, "ParamValidationExt(), invalid uiSliceMode (%d) settings!\n", fDlp->sMso.uiSliceMode );
+#endif//#if _DEBUG
+					return 1;
+
+				}
+				break;
+		}
+	}
+	
+	return ParamValidation(pCodingParam);
+}
+
+/*!
+ * \brief	acquire count number of layers and NALs based on configurable paramters dependency
+ * \pParam	ppCtx			sWelsEncCtx**, used for logging; neither ppCtx nor *ppCtx may be NULL
+ * \pParam	pParam			SWelsSvcCodingParam*
+ * \pParam	pCountLayers	pointer of count number of layers indeed (optional, may be NULL)
+ * \pParam	pCountNals		pointer of count number of nals indeed (optional, may be NULL)
+ * \return	0 - successful; otherwise failed
+ */
+static inline int32_t AcquireLayersNals( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pParam, int32_t *pCountLayers, int32_t *pCountNals )
+{	
+	int32_t iCountNumLayers		= 0;
+	int32_t iCountNumNals			= 0;
+	int32_t iNumDependencyLayers	= 0;
+	int32_t iDIndex 				= 0;
+#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
+	int32_t iNumLayersPack = 0;
+#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
+
+	if ( NULL == pParam || NULL == ppCtx || NULL == *ppCtx )
+		return 1;
+	
+	iNumDependencyLayers	= pParam->iNumDependencyLayer;
+
+	// one iteration per dependency layer (the body runs at least once)
+	do {
+		SDLayerParam *pDLayer = &pParam->sDependencyLayers[iDIndex];		
+//		pDLayer->ptr_cfg = pParam;
+		int32_t iOrgNumNals = iCountNumNals;	// NAL count before this layer, to measure the per-layer growth
+
+		//Note: Sep. 2010
+		//Review this part and suggest no change, since the memory over-use 
+		//(1) counts little to the overall performance
+		//(2) should not be critial even under mobile case
+		if ( SM_DYN_SLICE == pDLayer->sMso.uiSliceMode )
+		{
+			// slice count is not known in advance for dynamic slicing: reserve the maximum
+			iCountNumNals += MAX_SLICES_NUM;
+			// plus prefix NALs
+			if ( iDIndex == 0 )
+				iCountNumNals += MAX_SLICES_NUM;
+			// MAX_SLICES_NUM < MAX_LAYER_NUM_OF_FRAME ensured at svc_enc_slice_segment.h
+#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
+			assert(MAX_SLICES_NUM < MAX_LAYER_NUM_OF_FRAME);
+			// iNumLayersPack += MAX_SLICES_NUM; // do not count it for dynamic slicing mode
+#else//!MT_ENABLED || !PACKING_ONE_SLICE_PER_LAYER
+			assert(iCountNumNals - iOrgNumNals <= MAX_NAL_UNITS_IN_LAYER );
+#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
+		}
+		else /*if ( SM_SINGLE_SLICE != pDLayer->sMso.uiSliceMode )*/
+		{
+			const int32_t kiNumOfSlice = GetInitialSliceNum(	(pDLayer->iFrameWidth+0x0f)>>4,
+												(pDLayer->iFrameHeight+0x0f)>>4,
+												&pDLayer->sMso );
+
+			// NEED check iCountNals value in case multiple slices is used
+			iCountNumNals += kiNumOfSlice; // for pSlice VCL NALs
+			// plus prefix NALs
+			if ( iDIndex == 0 )
+				iCountNumNals += kiNumOfSlice;
+#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
+			// uses kiNumOfSlice: the previously referenced name num_of_slice is not declared anywhere in this function
+			assert(kiNumOfSlice <= MAX_SLICES_NUM && MAX_SLICES_NUM < MAX_LAYER_NUM_OF_FRAME);
+			iNumLayersPack += kiNumOfSlice;
+#else//!MT_ENABLED || !PACKING_ONE_SLICE_PER_LAYER
+			assert(iCountNumNals - iOrgNumNals <= MAX_NAL_UNITS_IN_LAYER );				
+#endif//MT_ENALBED && PACKING_ONE_SLICE_PER_LAYER
+			if ( kiNumOfSlice > MAX_SLICES_NUM )
+			{
+				WelsLog( *ppCtx, WELS_LOG_ERROR, "AcquireLayersNals(), num_of_slice(%d) > MAX_SLICES_NUM(%d) per (iDid= %d, qid= %d) settings!\n",
+					kiNumOfSlice, MAX_SLICES_NUM, iDIndex, 0 );
+				return 1;
+			}
+		}
+#if !defined(MT_ENABLED) || !defined(PACKING_ONE_SLICE_PER_LAYER)
+		if ( iCountNumNals - iOrgNumNals > MAX_NAL_UNITS_IN_LAYER )
+		{
+			WelsLog( *ppCtx, WELS_LOG_ERROR, "AcquireLayersNals(), num_of_nals(%d) > MAX_NAL_UNITS_IN_LAYER(%d) per (iDid= %d, qid= %d) settings!\n",
+				(iCountNumNals - iOrgNumNals), MAX_NAL_UNITS_IN_LAYER, iDIndex, 0 );
+			return 1;
+		}
+#endif//!MT_ENABLED) || !PACKING_ONE_SLICE_PER_LAYER
+
+		iCountNumLayers ++;
+		
+		++ iDIndex;
+	} while(iDIndex < iNumDependencyLayers);
+
+	iCountNumNals += 1 + iNumDependencyLayers + (iCountNumLayers<<1) + iCountNumLayers;	// plus iCountNumLayers for reserved application
+#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
+	iNumLayersPack += 1 + iNumDependencyLayers + (iCountNumLayers<<1);
+#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
+
+	// to check number of layers / nals / slices dependencies, 12/8/2010
+#if !defined(MT_ENABLED)
+	if ( iCountNumLayers > MAX_LAYER_NUM_OF_FRAME )
+	{
+		WelsLog( *ppCtx, WELS_LOG_ERROR, "AcquireLayersNals(), iCountNumLayers(%d) > MAX_LAYER_NUM_OF_FRAME(%d)!", iCountNumLayers, MAX_LAYER_NUM_OF_FRAME );
+		return 1;
+	}
+#else//MT_ENABLED
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+	if ( iNumLayersPack > MAX_LAYER_NUM_OF_FRAME )
+	{
+		WelsLog( *ppCtx, WELS_LOG_ERROR, "AcquireLayersNals(), num_layers_pack_overall(%d) > MAX_LAYER_NUM_OF_FRAME(%d)!", iNumLayersPack, MAX_LAYER_NUM_OF_FRAME );
+		return 1;
+	}
+#else//!PACKING_ONE_SLICE_PER_LAYER
+	if ( iCountNumLayers > MAX_LAYER_NUM_OF_FRAME )
+	{
+		WelsLog( *ppCtx, WELS_LOG_ERROR, "AcquireLayersNals(), iCountNumLayers(%d) > MAX_LAYER_NUM_OF_FRAME(%d)!", iCountNumLayers, MAX_LAYER_NUM_OF_FRAME );
+		return 1;
+	}
+#endif//PACKING_ONE_SLICE_PER_LAYER
+#endif//!MT_ENABLED
+
+	// both outputs are optional
+	if ( NULL != pCountLayers )
+		*pCountLayers	= iCountNumLayers;
+	if ( NULL != pCountNals )
+		*pCountNals 	= iCountNumNals;
+	return 0;
+}
+
+/*!
+ * \brief	allocate the spatial layer pictures (I420 based source pictures) of every dependency layer
+ * \pParam	ppCtx		sWelsEncCtx**; on allocation failure FreeMemorySvc(ppCtx) is run and *ppCtx is set to NULL
+ * \pParam	pParam		SWelsSvcCodingParam* providing layer sizes, iHighestTemporalId and iLTRRefNum
+ * \return	0 - successful; 1 - a picture could not be allocated
+ */
+int32_t AllocSpatialPictures( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pParam )
+{
+	CMemoryAlign *pMemAlign		= (*ppCtx)->pMemAlign;
+	const int32_t kiLayerNum	= pParam->iNumDependencyLayer;
+	int32_t iLayer				= 0;
+
+	// walk all dependency layers (body runs at least once)
+	do {
+		SDLayerParam *pLayerParam	= &pParam->sDependencyLayers[iLayer];
+		const int32_t kiWidth		= pLayerParam->iFrameWidth;
+		const int32_t kiHeight		= pLayerParam->iFrameHeight;
+		// 2 + max(highest temporal id, 1) pictures, plus one per long term reference
+		const uint8_t kuiTemporalPicNum	= 2 + WELS_MAX(pLayerParam->iHighestTemporalId, 1);
+		const uint8_t kuiTotalPicNum		= kuiTemporalPicNum + pParam->iLTRRefNum;
+		uint8_t uiPicIdx = 0;
+
+		do {
+			SPicture *pNewPic = AllocPicture( pMemAlign, kiWidth, kiHeight, false );
+			WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pNewPic), FreeMemorySvc(ppCtx); *ppCtx = NULL )
+			(*ppCtx)->pSpatialPic[iLayer][uiPicIdx] = pNewPic;
+			++ uiPicIdx;
+		} while( uiPicIdx < kuiTotalPicNum );
+
+		// remember both counts for this layer
+		(*ppCtx)->uiSpatialLayersInTemporal[iLayer]	= kuiTemporalPicNum;
+		(*ppCtx)->uiSpatialPicNum[iLayer]			= kuiTotalPicNum;
+
+		++ iLayer;
+	} while( iLayer < kiLayerNum );
+
+	return 0;
+}
+
+/*!
+ * \brief	release the spatial layer pictures of every dependency layer
+ * \pParam	pCtx		sWelsEncCtx* owning pSpatialPic; uiSpatialLayersInTemporal[] is reset to 0 per layer
+ */
+void FreeSpatialPictures( sWelsEncCtx *pCtx )
+{
+	CMemoryAlign *pMemAlign = pCtx->pMemAlign;
+
+	for ( int32_t iLayer = 0; iLayer < pCtx->pSvcParam->iNumDependencyLayer; ++ iLayer )
+	{
+		// number of picture slots recorded for this layer
+		const uint8_t kuiPicNum = pCtx->uiSpatialPicNum[iLayer];
+
+		for ( uint8_t uiPic = 0; uiPic < kuiPicNum; ++ uiPic )
+		{
+			// empty slots are skipped
+			if ( pCtx->pSpatialPic[iLayer][uiPic] == NULL )
+				continue;
+
+			FreePicture( pMemAlign, &pCtx->pSpatialPic[iLayer][uiPic] );
+		}
+
+		pCtx->uiSpatialLayersInTemporal[iLayer] = 0;
+	}
+}
+
+/*!
+ * \brief	initialize the macroblock entries of one dependency layer: position, slice idc,
+ *			neighbor availability and the pointers into the per-MB buffers owned by pEnc
+ * \pParam	pEnc			sWelsEncCtx* providing the stride tables and the per-MB buffers
+ * \pParam	pList			first SMB of the layer; iMbWidth * iMbHeight entries are written
+ * \pParam	pLayer			SDqLayer* providing iMbWidth, iMbHeight and pSliceEncCtx
+ * \pParam	kiDlayerId		dependency layer id
+ * \pParam	kiMaxMbNum		MB count used as offset into the mv / ref index buffers for odd layer ids
+ */
+static  void  InitMbInfo(sWelsEncCtx * pEnc, SMB  * pList, SDqLayer * pLayer, const int32_t kiDlayerId, const int32_t kiMaxMbNum )
+{
+	const int32_t kiMbWidth	= pLayer->iMbWidth;
+	const int32_t kiMbNum	= kiMbWidth * pLayer->iMbHeight;
+	SSliceCtx *pSliceCtx	= pLayer->pSliceEncCtx;
+	// even layer ids start at MB 0 of pMvUnitBlock4x4 / pRefIndexBlock4x4, odd ids start kiMaxMbNum MBs in
+	const int32_t kiOffset	= (kiDlayerId & 0x01) * kiMaxMbNum;
+	SMVUnitXY (*pLayerMvUnitBlock4x4)[MB_BLOCK4x4_NUM]	= (SMVUnitXY(*)[MB_BLOCK4x4_NUM])(&pEnc->pMvUnitBlock4x4[MB_BLOCK4x4_NUM*kiOffset]);
+	int8_t (*pLayerRefIndexBlock8x8)[MB_BLOCK8x8_NUM]		= (int8_t(*)[MB_BLOCK8x8_NUM])(&pEnc->pRefIndexBlock4x4[MB_BLOCK8x8_NUM*kiOffset]);
+	int32_t  iIdx;
+
+	for( iIdx = 0; iIdx<kiMbNum; iIdx++ ){
+		BOOL_T     bLeft;
+		BOOL_T     bTop;
+		BOOL_T     bLeftTop;
+		BOOL_T     bRightTop;
+		int32_t  iLeftXY, iTopXY, iLeftTopXY, iRightTopXY;
+		uint8_t  uiSliceIdc;
+		uint32_t uiNeighborAvail = 0;
+
+		pList[iIdx].iMbX = pEnc->pStrideTab->pMbIndexX[kiDlayerId][iIdx];
+		pList[iIdx].iMbY = pEnc->pStrideTab->pMbIndexY[kiDlayerId][iIdx];
+		pList[iIdx].iMbXY = iIdx;
+
+		uiSliceIdc = WelsMbToSliceIdc(pSliceCtx, iIdx);
+		iLeftXY = iIdx - 1;
+		iTopXY = iIdx - kiMbWidth;
+		iLeftTopXY = iTopXY - 1;
+		iRightTopXY = iTopXY + 1;
+
+		// a neighbor is available only if it lies inside the picture AND belongs to the same slice;
+		// the position test comes first so WelsMbToSliceIdc() is never asked about an MB outside the picture
+		bLeft = (pList[iIdx].iMbX > 0) && (uiSliceIdc == WelsMbToSliceIdc(pSliceCtx, iLeftXY));
+		bTop = (pList[iIdx].iMbY > 0) && (uiSliceIdc == WelsMbToSliceIdc(pSliceCtx, iTopXY));
+		bLeftTop = (pList[iIdx].iMbX > 0) && (pList[iIdx].iMbY > 0) && (uiSliceIdc == 
+			WelsMbToSliceIdc(pSliceCtx, iLeftTopXY));
+		bRightTop = (pList[iIdx].iMbX < (kiMbWidth-1)) && (pList[iIdx].iMbY > 0) && (uiSliceIdc == 
+			WelsMbToSliceIdc(pSliceCtx, iRightTopXY));
+
+		if( bLeft ){
+			uiNeighborAvail |= LEFT_MB_POS;
+		}
+		if( bTop ){
+			uiNeighborAvail |= TOP_MB_POS;
+		}
+		if( bLeftTop ){
+			uiNeighborAvail |= TOPLEFT_MB_POS;
+		}
+		if( bRightTop ){
+			uiNeighborAvail |= TOPRIGHT_MB_POS;
+		}
+		pList[iIdx].uiSliceIdc		= uiSliceIdc;	// merge from svc_hd_opt_b for multiple slices coding
+		pList[iIdx].uiNeighborAvail	= uiNeighborAvail;
+
+		// mv and ref index use the per-layer offset; the remaining buffers are indexed by iIdx only
+		pList[iIdx].sMv					= pLayerMvUnitBlock4x4[iIdx];
+		pList[iIdx].pRefIndex			= pLayerRefIndexBlock8x8[iIdx];
+		pList[iIdx].pSadCost				= &pEnc->pSadCostMb[iIdx];
+		pList[iIdx].pIntra4x4PredMode	= &pEnc->pIntra4x4PredModeBlocks[iIdx*INTRA_4x4_MODE_NUM];
+		pList[iIdx].pNonZeroCount		= &pEnc->pNonZeroCountBlocks[iIdx*MB_LUMA_CHROMA_BLOCK4x4_NUM];
+	}
+}
+
+
+/*!
+ * \brief	allocate one contiguous SMB array covering all dependency layers, split it per layer
+ *			(ppMbListD[i] / ppDqLayerList[i]->sMbDataP) and initialize every entry via InitMbInfo()
+ * \pParam	ppCtx		sWelsEncCtx**; FreeMemorySvc(ppCtx) is run when an allocation fails
+ * \return	0 - successful; 1 - invalid layer count or allocation failure
+ */
+int32_t   InitMbListD( sWelsEncCtx ** ppCtx)
+{
+	int32_t		iNumDlayer = (*ppCtx)->pSvcParam->iNumDependencyLayer;
+	int32_t		iMbSize[MAX_DEPENDENCY_LAYER] = { 0 };
+	int32_t		iOverallMbNum = 0;
+	int32_t		iMbWidth = 0;
+	int32_t		iMbHeight= 0;
+	int32_t		i;
+
+	// at least one layer is required: iMbSize[iNumDlayer-1] and ppMbListD[0] are used below
+	if ( iNumDlayer < 1 || iNumDlayer > MAX_DEPENDENCY_LAYER )
+		return 1;
+
+	// MB count per layer and overall
+	for( i=0;i<iNumDlayer;i++ ){
+		iMbWidth = ((*ppCtx)->pSvcParam->sDependencyLayers[i].iFrameWidth + 15)>>4;
+		iMbHeight = ((*ppCtx)->pSvcParam->sDependencyLayers[i].iFrameHeight + 15)>>4;
+		iMbSize[i] = iMbWidth  * iMbHeight;
+		iOverallMbNum += iMbSize[i];
+	}
+
+	(*ppCtx)->ppMbListD = static_cast<SMB **>((*ppCtx)->pMemAlign->WelsMalloc(iNumDlayer * sizeof(SMB *), "ppMbListD"));
+	// verify the pointer array BEFORE touching its first element
+	WELS_VERIFY_RETURN_PROC_IF(1, (*ppCtx)->ppMbListD==NULL, FreeMemorySvc(ppCtx));
+	(*ppCtx)->ppMbListD[0] = NULL;
+	(*ppCtx)->ppMbListD[0] = static_cast<SMB*>((*ppCtx)->pMemAlign->WelsMallocz(iOverallMbNum * sizeof(SMB), "ppMbListD[0]"));
+	WELS_VERIFY_RETURN_PROC_IF(1, (*ppCtx)->ppMbListD[0]==NULL, FreeMemorySvc(ppCtx));
+	(*ppCtx)->ppDqLayerList[0]->sMbDataP = (*ppCtx)->ppMbListD[0];
+	// the MB count of the last layer is handed to InitMbInfo() as kiMaxMbNum for every layer
+	InitMbInfo(*ppCtx, (*ppCtx)->ppMbListD[0], (*ppCtx)->ppDqLayerList[0], 0, iMbSize[iNumDlayer-1]);
+	for( i=1;i<iNumDlayer;i++ ){
+		// each layer's list starts right after the previous layer's MBs inside the single allocation
+		(*ppCtx)->ppMbListD[i] = (*ppCtx)->ppMbListD[i-1] + iMbSize[i-1];
+		(*ppCtx)->ppDqLayerList[i]->sMbDataP = (*ppCtx)->ppMbListD[i];
+		InitMbInfo(*ppCtx, (*ppCtx)->ppMbListD[i], (*ppCtx)->ppDqLayerList[i], i, iMbSize[iNumDlayer-1]);
+	}
+
+	return 0;
+}
+
+/*!
+ * \brief	allocate the working buffers referenced by a SMbCache through the given allocator
+ * \pParam	pMbCache	SMbCache* whose buffer pointers are filled in
+ * \pParam	pMa			CMemoryAlign* allocator
+ * \return	0 - successful; 1 - an allocation failed (buffers allocated before the failure stay assigned)
+ */
+int32_t AllocMbCacheAligned( SMbCache *pMbCache, CMemoryAlign *pMa )
+{
+	// coefficient levels
+	pMbCache->pCoeffLevel = static_cast<int16_t *>( pMa->WelsMalloc( MB_COEFF_LIST_SIZE * sizeof(int16_t), "pMbCache->pCoeffLevel" ) );
+	WELS_VERIFY_RETURN_IF( 1, (NULL == pMbCache->pCoeffLevel) );
+
+	// prediction buffer: 2 * 256 bytes
+	pMbCache->pMemPredMb = static_cast<uint8_t *>( pMa->WelsMalloc( 2 * 256 * sizeof(uint8_t), "pMbCache->pMemPredMb" ) );
+	WELS_VERIFY_RETURN_IF( 1, (NULL == pMbCache->pMemPredMb) );
+
+	// skip MB buffer: 384 bytes
+	pMbCache->pSkipMb = static_cast<uint8_t *>( pMa->WelsMalloc( 384 * sizeof(uint8_t), "pMbCache->pSkipMb" ) );
+	WELS_VERIFY_RETURN_IF( 1, (NULL == pMbCache->pSkipMb) );
+
+	// 4x4 block prediction buffer: 2 * 16 bytes
+	pMbCache->pMemPredBlk4 = static_cast<uint8_t *>( pMa->WelsMalloc( 2 * 16 * sizeof(uint8_t), "pMbCache->pMemPredBlk4" ) );
+	WELS_VERIFY_RETURN_IF( 1, (NULL == pMbCache->pMemPredBlk4) );
+
+	// inter prediction / ME buffer: 4 * 640 bytes
+	pMbCache->pBufferInterPredMe = static_cast<uint8_t *>( pMa->WelsMalloc( 4 * 640 * sizeof(uint8_t), "pMbCache->pBufferInterPredMe" ) );
+	WELS_VERIFY_RETURN_IF( 1, (NULL == pMbCache->pBufferInterPredMe) );
+
+	// 16 intra 4x4 flags, one bool_t each
+	pMbCache->pPrevIntra4x4PredModeFlag = static_cast<bool_t *>( pMa->WelsMalloc( 16 * sizeof(bool_t), "pMbCache->pPrevIntra4x4PredModeFlag" ) );
+	WELS_VERIFY_RETURN_IF( 1, (NULL == pMbCache->pPrevIntra4x4PredModeFlag) );
+
+	// 16 intra 4x4 entries, one int8_t each
+	pMbCache->pRemIntra4x4PredModeFlag = static_cast<int8_t *>( pMa->WelsMalloc( 16 * sizeof(int8_t), "pMbCache->pRemIntra4x4PredModeFlag" ) );
+	WELS_VERIFY_RETURN_IF( 1, (NULL == pMbCache->pRemIntra4x4PredModeFlag) );
+
+	// one SDCTCoeff
+	pMbCache->pDct = static_cast<SDCTCoeff *>( pMa->WelsMalloc( sizeof(SDCTCoeff), "pMbCache->pDct" ) );
+	WELS_VERIFY_RETURN_IF( 1, (NULL == pMbCache->pDct) );
+
+	return 0;
+}
+
+void FreeMbCache( SMbCache *pMbCache, CMemoryAlign *pMa )
+{
+	if ( NULL != pMbCache->pCoeffLevel )
+	{
+		pMa->WelsFree( pMbCache->pCoeffLevel, "pMbCache->pCoeffLevel" );
+		pMbCache->pCoeffLevel = NULL;
+	}		
+	if ( NULL != pMbCache->pMemPredMb )
+	{
+		pMa->WelsFree( pMbCache->pMemPredMb, "pMbCache->pMemPredMb" );
+		pMbCache->pMemPredMb = NULL;
+	}	
+	if ( NULL != pMbCache->pSkipMb )
+	{
+		pMa->WelsFree( pMbCache->pSkipMb, "pMbCache->pSkipMb" );
+		pMbCache->pSkipMb = NULL;
+	}	
+	if ( NULL != pMbCache->pMemPredBlk4 )
+	{
+		pMa->WelsFree( pMbCache->pMemPredBlk4, "pMbCache->pMemPredBlk4" );
+		pMbCache->pMemPredBlk4 = NULL;
+	}	
+	if ( NULL != pMbCache->pBufferInterPredMe )
+	{
+		pMa->WelsFree( pMbCache->pBufferInterPredMe, "pMbCache->pBufferInterPredMe" );
+		pMbCache->pBufferInterPredMe = NULL;
+	}	
+	if ( NULL != pMbCache->pPrevIntra4x4PredModeFlag )
+	{
+		pMa->WelsFree( pMbCache->pPrevIntra4x4PredModeFlag, "pMbCache->pPrevIntra4x4PredModeFlag" );
+		pMbCache->pPrevIntra4x4PredModeFlag = NULL;
+	}	
+	if ( NULL != pMbCache->pRemIntra4x4PredModeFlag )
+	{
+		pMa->WelsFree( pMbCache->pRemIntra4x4PredModeFlag, "pMbCache->pRemIntra4x4PredModeFlag" );
+		pMbCache->pRemIntra4x4PredModeFlag = NULL;
+	}	
+	if ( NULL != pMbCache->pDct )
+	{
+		pMa->WelsFree( pMbCache->pDct, "pMbCache->pDct" );
+		pMbCache->pDct = NULL;
+	}
+}
+
+
+/*!
+ * \brief	initialize ppDqLayerList and slicepEncCtx_list due to count number of layers available
+ * \pParam	pCtx			sWelsEncCtx*
+ * \return	0 - successful; otherwise failed
+ */
+static inline int32_t InitDqLayers( sWelsEncCtx **ppCtx )
+{
+	SWelsSvcCodingParam *pParam	= NULL;
+	SWelsSPS *pSps						= NULL;
+	SSubsetSps *pSubsetSps			= NULL;
+	SWelsPPS *pPps						= NULL;
+	CMemoryAlign *pMa				= NULL;
+	SStrideTables *pStrideTab		= NULL;	
+	int32_t iDlayerCount					= 0;	
+	int32_t iDlayerIndex					= 0;
+	uint32_t iSpsId					= 0;
+	uint32_t iPpsId					= 0;
+	uint32_t iNumRef				= 0;
+	int32_t iResult					= 0;	
+	
+	if ( NULL == ppCtx || NULL == *ppCtx )
+		return 1;
+
+	pMa		= (*ppCtx)->pMemAlign;
+	pParam	= (*ppCtx)->pSvcParam;	
+	iDlayerCount	= pParam->iNumDependencyLayer;
+	iNumRef	= pParam->iNumRefFrame;
+//	highest_layers_in_temporal = 1 + WELS_MAX(pParam->iDecompStages, 1);
+	pStrideTab	= (*ppCtx)->pStrideTab;	
+
+	iDlayerIndex			= 0;	
+	while (iDlayerIndex < iDlayerCount)
+	{
+		SRefList *pRefList			= NULL;
+		uint32_t i					= 0;
+		const int32_t kiWidth			= pParam->sDependencyLayers[iDlayerIndex].iFrameWidth;
+		const int32_t kiHeight		= pParam->sDependencyLayers[iDlayerIndex].iFrameHeight;
+        int32_t iPicWidth			= WELS_ALIGN(kiWidth, MB_WIDTH_LUMA) + (PADDING_LENGTH<<1);	// with iWidth of horizon
+        int32_t iPicChromaWidth	= iPicWidth >> 1;
+
+        iPicWidth	= WELS_ALIGN( iPicWidth, 32 );	// 32(or 16 for chroma below) to match original imp. here instead of iCacheLineSize
+		iPicChromaWidth	= WELS_ALIGN( iPicChromaWidth, 16 );
+
+		WelsGetEncBlockStrideOffset( (*ppCtx)->pStrideTab->pStrideEncBlockOffset[iDlayerIndex], iPicWidth, iPicChromaWidth);		
+		
+		// pRef list
+		pRefList		= (SRefList *)pMa->WelsMallocz( sizeof(SRefList), "pRefList" );
+		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pRefList), FreeMemorySvc(ppCtx) )			
+		
+		do {
+			pRefList->pRef[i]	= AllocPicture( pMa, kiWidth, kiHeight, true );	// to use actual size of current layer
+			WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pRefList->pRef[i]), FreeMemorySvc(ppCtx) )				
+			++ i;
+		} while(i < 1 + iNumRef);
+		
+		pRefList->pNextBuffer = pRefList->pRef[0];			
+		(*ppCtx)->ppRefPicListExt[iDlayerIndex]	= pRefList;		
+		++ iDlayerIndex;
+	}
+
+	// for I420 based source spatial pictures
+	if ( AllocSpatialPictures( ppCtx, pParam ) )
+	{
+		FreeMemorySvc( ppCtx );		
+		return 1;
+	}
+
+	iDlayerIndex	= 0;
+	while (iDlayerIndex < iDlayerCount) {
+		SDqLayer *pDqLayer		= NULL;
+		SDLayerParam *pDlayer	= &pParam->sDependencyLayers[iDlayerIndex];		
+		const int32_t kiMbW		= (pDlayer->iFrameWidth + 0x0f) >> 4;
+		const int32_t kiMbH		= (pDlayer->iFrameHeight + 0x0f) >> 4;
+		int32_t iMaxSliceNum	= 1;
+		const int32_t kiSliceNum = GetInitialSliceNum( kiMbW, kiMbH, &pDlayer->sMso );
+		if ( iMaxSliceNum < kiSliceNum )
+			iMaxSliceNum = kiSliceNum;
+
+		// pDq layers list
+		pDqLayer = (SDqLayer *)pMa->WelsMallocz( sizeof(SDqLayer), "pDqLayer" );
+		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pDqLayer), FreeMemorySvc(ppCtx) )
+
+		// for dynamic slicing mode
+		if ( SM_DYN_SLICE == pDlayer->sMso.uiSliceMode )
+		{			
+			const int32_t iSize			= pParam->iCountThreadsNum * sizeof(int32_t);
+
+			pDqLayer->pNumSliceCodedOfPartition		= (int32_t *)pMa->WelsMallocz( iSize, "pNumSliceCodedOfPartition" );			
+			pDqLayer->pLastCodedMbIdxOfPartition	= (int32_t *)pMa->WelsMallocz( iSize, "pLastCodedMbIdxOfPartition" );
+			pDqLayer->pLastMbIdxOfPartition			= (int32_t *)pMa->WelsMallocz( iSize, "pLastMbIdxOfPartition" );
+
+			WELS_VERIFY_RETURN_PROC_IF( 1,
+										(NULL == pDqLayer->pNumSliceCodedOfPartition ||
+										NULL == pDqLayer->pLastCodedMbIdxOfPartition ||
+										NULL == pDqLayer->pLastMbIdxOfPartition),
+										FreeMemorySvc(ppCtx) )
+		}
+
+		pDqLayer->iMbWidth					= kiMbW;
+		pDqLayer->iMbHeight					= kiMbH;
+#ifndef MT_ENABLED
+		if ( SM_DYN_SLICE == pDlayer->sMso.uiSliceMode )//wmalloc pSliceInLayer
+		{	
+			SSlice *pSlice			= NULL;
+			int32_t iSliceIdx		= 0;
+			//wmalloc AVERSLICENUM_CONSTANT of pDqLayer->sLayerInfo.pSliceInLayer, 
+			//wmalloc AVERSLICENUM_CONSTANT num of pSlice as initialization			
+			//only set value for the first pSlice
+			pDqLayer->sLayerInfo.pSliceInLayer	= (SSlice *)pMa->WelsMallocz( sizeof(SSlice) * iMaxSliceNum, "pSliceInLayer" );
+
+			WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pDqLayer->sLayerInfo.pSliceInLayer), FreeMemorySvc(ppCtx) )
+			{
+				pSlice = &pDqLayer->sLayerInfo.pSliceInLayer[0];
+				pSlice->uiSliceIdx = 0;
+				pSlice->pSliceBsa = &(*ppCtx)->pOut->sBsWrite;
+			}		
+
+			while(iSliceIdx < iMaxSliceNum)
+			{
+				pSlice = &pDqLayer->sLayerInfo.pSliceInLayer[iSliceIdx];
+				if ( AllocMbCacheAligned(&pSlice->sMbCacheInfo, pMa) )
+				{
+					FreeMemorySvc(ppCtx);
+					return 1;
+				}
+				++ iSliceIdx;
+			}
+		}
+		else
+#endif//!MT_ENABLED
+		{			
+			int32_t iSliceIdx		= 0;
+			pDqLayer->sLayerInfo.pSliceInLayer	= (SSlice *)pMa->WelsMallocz( sizeof(SSlice) * iMaxSliceNum, "pSliceInLayer" );
+
+			WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pDqLayer->sLayerInfo.pSliceInLayer), FreeMemorySvc(ppCtx) )
+			if ( iMaxSliceNum > 1 )
+			{
+				while (iSliceIdx < iMaxSliceNum) {
+					SSlice *pSlice = &pDqLayer->sLayerInfo.pSliceInLayer[iSliceIdx];
+					pSlice->uiSliceIdx = iSliceIdx;
+#ifdef MT_ENABLED
+					if ( pParam->iMultipleThreadIdc > 1 )
+						pSlice->pSliceBsa = &(*ppCtx)->pSliceBs[iSliceIdx].sBsWrite;
+					else
+						pSlice->pSliceBsa = &(*ppCtx)->pOut->sBsWrite;
+#else
+					pSlice->pSliceBsa = &(*ppCtx)->pOut->sBsWrite;
+#endif//MT_ENABLED
+					if ( AllocMbCacheAligned(&pSlice->sMbCacheInfo, pMa) )
+					{
+						FreeMemorySvc(ppCtx);
+						return 1;
+					}
+					++ iSliceIdx;
+				}
+			}
+			// fix issue in case single pSlice coding might be inclusive exist in variant spatial layer setting, also introducing multi-pSlice modes
+			else	// only one pSlice
+			{
+				SSlice *pSlice = &pDqLayer->sLayerInfo.pSliceInLayer[0];
+				pSlice->uiSliceIdx	= 0;
+				pSlice->pSliceBsa	= &(*ppCtx)->pOut->sBsWrite;
+				if ( AllocMbCacheAligned(&pSlice->sMbCacheInfo, pMa) )
+				{
+					FreeMemorySvc(ppCtx);
+					return 1;
+				}
+			}
+		}
+
+		//deblocking parameters initialization
+		//target-layer deblocking
+		pDqLayer->iLoopFilterDisableIdc	                = pParam->iLoopFilterDisableIdc;
+		pDqLayer->iLoopFilterAlphaC0Offset				= (pParam->iLoopFilterAlphaC0Offset)<<1;
+		pDqLayer->iLoopFilterBetaOffset					= (pParam->iLoopFilterBetaOffset)<<1;
+		//inter-layer deblocking
+		pDqLayer->uiDisableInterLayerDeblockingFilterIdc	= pParam->iInterLayerLoopFilterDisableIdc;
+		pDqLayer->iInterLayerSliceAlphaC0Offset				= (pParam->iInterLayerLoopFilterAlphaC0Offset)<<1;
+		pDqLayer->iInterLayerSliceBetaOffset				= (pParam->iInterLayerLoopFilterBetaOffset)<<1;
+		//parallel deblocking
+		pDqLayer->bDeblockingParallelFlag                  = pParam->bDeblockingParallelFlag;
+
+		//deblocking parameter adjustment
+		if ( SM_SINGLE_SLICE == pDlayer->sMso.uiSliceMode )
+		{
+			//iLoopFilterDisableIdc: will be 0 or 1 under single_slice
+			if ( 2 == pParam->iLoopFilterDisableIdc )
+			{
+				pDqLayer->iLoopFilterDisableIdc	= 0;
+			}
+			//bDeblockingParallelFlag
+			pDqLayer->bDeblockingParallelFlag = false;
+		}
+		else
+		{//multi-pSlice
+#ifdef MT_ENABLED
+			if ( 0 == pDqLayer->iLoopFilterDisableIdc )
+			{
+				pDqLayer->bDeblockingParallelFlag	= false;
+			}
+#endif
+		}
+
+		(*ppCtx)->ppDqLayerList[iDlayerIndex]	= pDqLayer;
+		
+		++ iDlayerIndex;
+	}
+
+	// for dynamically malloc for parameter sets memory instead of maximal items for standard to reduce size, 3/18/2010
+	if ( &(*ppCtx)->pSvcParam->bMgsT0OnlyStrategy )
+	{
+	    (*ppCtx)->pPPSArray	= (SWelsPPS *)pMa->WelsMalloc( (1+iDlayerCount) * sizeof(SWelsPPS), "pPPSArray" );
+	}
+	else
+	{
+	    (*ppCtx)->pPPSArray	= (SWelsPPS *)pMa->WelsMalloc( iDlayerCount * sizeof(SWelsPPS), "pPPSArray" );
+	}
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pPPSArray), FreeMemorySvc(ppCtx) )
+
+	(*ppCtx)->pSpsArray	= (SWelsSPS *)pMa->WelsMalloc( sizeof(SWelsSPS), "pSpsArray" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pSpsArray), FreeMemorySvc(ppCtx) )
+	if ( iDlayerCount > 1 )
+	{
+		(*ppCtx)->pSubsetArray	= (SSubsetSps *)pMa->WelsMalloc( (iDlayerCount-1) * sizeof(SSubsetSps), "pSubsetArray" );
+		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pSubsetArray), FreeMemorySvc(ppCtx) )
+	}
+	
+	(*ppCtx)->pDqIdcMap	= (SDqIdc *)pMa->WelsMallocz( iDlayerCount * sizeof(SDqIdc), "pDqIdcMap" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pDqIdcMap), FreeMemorySvc(ppCtx) )	
+
+	iDlayerIndex	= 0;
+	while (iDlayerIndex < iDlayerCount) {		
+		SDqIdc *pDqIdc		= &(*ppCtx)->pDqIdcMap[iDlayerIndex];		
+		const bool_t bUseSubsetSps			= (iDlayerIndex > BASE_DEPENDENCY_ID);
+		SDLayerParam *pDlayerParam	= &pParam->sDependencyLayers[iDlayerIndex];
+
+		pDqIdc->uiSpatialId	= iDlayerIndex;
+		pPps	= &(*ppCtx)->pPPSArray[iPpsId];
+		if ( !bUseSubsetSps )
+		{
+			pSps	= &(*ppCtx)->pSpsArray[iSpsId];
+		}
+		else
+		{
+			pSubsetSps	= &(*ppCtx)->pSubsetArray[iSpsId];
+			pSps			= &pSubsetSps->pSps;
+		}		
+
+		// Need port pSps/pPps initialization due to spatial scalability changed
+		if ( !bUseSubsetSps )
+		{	
+			WelsInitSps( pSps, pDlayerParam, pParam->uiIntraPeriod, pParam->iNumRefFrame, iSpsId, 
+						  pParam->bEnableFrameCroppingFlag, pParam->bEnableRc );
+
+			if( iDlayerCount > 1 )
+			{
+				pSps->bConstraintSet0Flag = true;
+				pSps->bConstraintSet1Flag = true;
+				pSps->bConstraintSet2Flag = true;
+			}
+		}
+		else
+		{
+			WelsInitSubsetSps( pSubsetSps, pDlayerParam, pParam->uiIntraPeriod, pParam->iNumRefFrame, iSpsId,
+								 pParam->bEnableFrameCroppingFlag, pParam->bEnableRc );
+		}
+
+		// initialize pPps
+		WelsInitPps( pPps, pSps, pSubsetSps, iPpsId, true, bUseSubsetSps );
+
+		// Not using FMO in SVC coding so far, come back if need FMO
+		{
+			iResult = InitSlicePEncCtx(	&(*ppCtx)->pSliceCtxList[iDlayerIndex],
+											(*ppCtx)->pMemAlign,
+											false,
+											pSps->iMbWidth,
+											pSps->iMbHeight,
+											&(pDlayerParam->sMso),
+											pPps	);
+			if ( iResult )
+			{
+				WelsLog( *ppCtx, WELS_LOG_WARNING, "InitDqLayers(), InitSlicePEncCtx failed(%d)!", iResult );
+				FreeMemorySvc( ppCtx );			
+				return 1;
+			}
+			(*ppCtx)->ppDqLayerList[iDlayerIndex]->pSliceEncCtx	= &(*ppCtx)->pSliceCtxList[iDlayerIndex];			
+		}
+		pDqIdc->iSpsId	= iSpsId;
+		pDqIdc->iPpsId	= iPpsId;
+
+		(*ppCtx)->sPSOVector.bPpsIdMappingIntoSubsetsps[iPpsId] = bUseSubsetSps;
+
+		if ( bUseSubsetSps )
+			++ iSpsId;
+		++ iPpsId;
+		++ (*ppCtx)->iSpsNum;
+		++ (*ppCtx)->iPpsNum;
+
+		++ iDlayerIndex;
+	}	
+	return 0;
+}
+
+/*!
+ * \brief	allocate the stride table container of an encoder context and fill its
+ *			block offset tables and macroblock X/Y index tables for every spatial layer
+ * \param	ppCtx				encoder context; its pMemAlign and pSvcParam are dereferenced
+ * \param	kiNumSpatialLayers	number of spatial layers, must be within [1, MAX_DEPENDENCY_LAYER]
+ * \return	0 - successful; 1 - invalid layer count or memory allocation failure
+ * \note	Nothing is released here on failure. The container stays attached to
+ *			(*ppCtx)->pStrideTab and FreeMemorySvc() releases the shared "pBase" buffer
+ *			through pStrideDecBlockOffset[0][1], so the container must never hold
+ *			indeterminate pointers (hence the zeroing allocation below).
+ */
+int32_t AllocStrideTables( sWelsEncCtx **ppCtx, const int32_t kiNumSpatialLayers )
+{
+	CMemoryAlign *pMa				= (*ppCtx)->pMemAlign;
+	SWelsSvcCodingParam *pParam	= (*ppCtx)->pSvcParam;
+	SStrideTables *pPtr				= NULL;
+	int16_t *pTmpRow	= NULL, *pRowX = NULL, *pRowY = NULL, *p = NULL;
+	uint8_t *pBase		= NULL;
+	uint8_t *pBaseDec = NULL, *pBaseEnc = NULL, *pBaseMbX = NULL, *pBaseMbY = NULL;
+	struct {
+		int32_t iMbWidth;
+		int32_t iCountMbNum;				// count number of MB in each spatial layer
+		int32_t iSizeAllMbAlignCache;	// bytes of one int16_t index table of this spatial layer
+	} sMbSizeMap[MAX_DEPENDENCY_LAYER] = {0};
+	int32_t iLineSizeY[MAX_DEPENDENCY_LAYER][2] = {0};
+	int32_t iLineSizeUV[MAX_DEPENDENCY_LAYER][2]= {0};
+	int32_t iMapSpatialIdx[MAX_DEPENDENCY_LAYER][2] = {0};
+	int32_t iSizeDec		= 0;
+	int32_t iSizeEnc		= 0;
+	int32_t iCountLayersNeedCs[2]	= {0};
+	const int32_t kiUnit1Size = 24 * sizeof(int32_t);	// bytes of one block offset table
+	int32_t iUnit2Size		= 0;						// bytes of the MB index tables of all layers (one direction)
+	int32_t iNeedAllocSize	= 0;
+	int32_t iRowSize		= 0;
+	int16_t iMaxMbWidth	= 0;
+	int16_t iMaxMbHeight	= 0;
+	int32_t i				= 0;
+	int32_t iSpatialIdx		= 0;
+	int32_t iTemporalIdx	= 0;
+	int32_t iCntTid			= 0;
+
+	if ( kiNumSpatialLayers <= 0 || kiNumSpatialLayers > MAX_DEPENDENCY_LAYER)
+		return 1;
+
+	// Zero-filled on purpose: the container is published to the context right away, and
+	// FreeMemorySvc() inspects pStrideDecBlockOffset[0][1] even when a later allocation
+	// in this function fails. With a single temporal layer the [..][0] entries are never
+	// written below, so they must start out as NULL as well.
+	pPtr = (SStrideTables *)pMa->WelsMallocz(sizeof(SStrideTables), "SStrideTables");
+	if (NULL == pPtr)
+		return 1;
+	(*ppCtx)->pStrideTab = pPtr;
+
+	// one pass for the base temporal layer, a second one only if there are more temporal layers
+	iCntTid	= pParam->iNumTemporalLayer > 1 ? 2 : 1;
+
+	// collect MB geometry per spatial layer and accumulate the index table size
+	iSpatialIdx = 0;
+	while (iSpatialIdx < kiNumSpatialLayers) {
+		const int32_t kiTmpWidth = (pParam->sDependencyLayers[iSpatialIdx].iFrameWidth + 15) >> 4;
+		const int32_t kiTmpHeight= (pParam->sDependencyLayers[iSpatialIdx].iFrameHeight + 15) >> 4;
+		int32_t iNumMb = kiTmpWidth * kiTmpHeight;
+
+		sMbSizeMap[iSpatialIdx].iMbWidth		= kiTmpWidth;
+		sMbSizeMap[iSpatialIdx].iCountMbNum	= iNumMb;
+
+		iNumMb *= sizeof(int16_t);
+		sMbSizeMap[iSpatialIdx].iSizeAllMbAlignCache = iNumMb;
+		iUnit2Size += iNumMb;
+
+		++ iSpatialIdx;
+	}
+
+	// Adaptive size_cs, size_fdec by implementation dependency.
+	// kbBaseTemporalFlag is used as the second array index: 1 for iTemporalIdx == 0, 0 otherwise.
+	iTemporalIdx= 0;
+	while ( iTemporalIdx < iCntTid )
+	{
+		const bool_t kbBaseTemporalFlag	= (iTemporalIdx == 0);
+
+		iSpatialIdx = 0;
+		while ( iSpatialIdx < kiNumSpatialLayers )
+		{
+			SDLayerParam *fDlp					= &pParam->sDependencyLayers[iSpatialIdx];
+
+			const int32_t kiWidthPad = WELS_ALIGN( fDlp->iFrameWidth, 16 ) + (PADDING_LENGTH<<1);
+			iLineSizeY[iSpatialIdx][kbBaseTemporalFlag]	= WELS_ALIGN( kiWidthPad, 32 );
+			iLineSizeUV[iSpatialIdx][kbBaseTemporalFlag]= WELS_ALIGN( (kiWidthPad>>1), 16 );
+
+			iMapSpatialIdx[iCountLayersNeedCs[kbBaseTemporalFlag]][kbBaseTemporalFlag] = iSpatialIdx;
+			++ iCountLayersNeedCs[kbBaseTemporalFlag];
+			++ iSpatialIdx;
+		}
+		++ iTemporalIdx;
+	}
+	iSizeDec= kiUnit1Size * (iCountLayersNeedCs[0] + iCountLayersNeedCs[1]);
+	iSizeEnc= kiUnit1Size * kiNumSpatialLayers;
+
+	// single buffer layout: [dec offset tables][enc offset tables][MB X indices][MB Y indices]
+	iNeedAllocSize = iSizeDec + iSizeEnc + (iUnit2Size << 1);
+
+	pBase = (uint8_t *)pMa->WelsMalloc( iNeedAllocSize, "pBase" );
+	if ( NULL == pBase )
+	{
+		return 1;
+	}
+
+	pBaseDec= pBase;		// iCountLayersNeedCs
+	pBaseEnc= pBaseDec + iSizeDec;		// iNumSpatialLayers
+	pBaseMbX = pBaseEnc + iSizeEnc;	// iNumSpatialLayers
+	pBaseMbY = pBaseMbX + iUnit2Size;	// iNumSpatialLayers
+
+	// Fill the decoding block offset tables. The very first table handed out is
+	// pStrideDecBlockOffset[0][1] == pBase, which is the pointer FreeMemorySvc() frees.
+	iTemporalIdx= 0;
+	while ( iTemporalIdx < iCntTid )
+	{
+		const bool_t kbBaseTemporalFlag	= (iTemporalIdx == 0);
+
+		iSpatialIdx = 0;
+		while ( iSpatialIdx < iCountLayersNeedCs[kbBaseTemporalFlag] )
+		{
+			const int32_t kiActualSpatialIdx = iMapSpatialIdx[iSpatialIdx][kbBaseTemporalFlag];
+			const int32_t kiLumaWidth	= iLineSizeY[kiActualSpatialIdx][kbBaseTemporalFlag];
+			const int32_t kiChromaWidth	= iLineSizeUV[kiActualSpatialIdx][kbBaseTemporalFlag];
+
+			WelsGetEncBlockStrideOffset( (int32_t *)pBaseDec, kiLumaWidth, kiChromaWidth );
+
+			pPtr->pStrideDecBlockOffset[kiActualSpatialIdx][kbBaseTemporalFlag]	= (int32_t *)pBaseDec;
+			pBaseDec+= kiUnit1Size;
+
+			++ iSpatialIdx;
+		}
+		++ iTemporalIdx;
+	}
+
+	// spatial layers without a table of their own share the first mapped one
+	iTemporalIdx= 0;
+	while ( iTemporalIdx < iCntTid )
+	{
+		const bool_t kbBaseTemporalFlag	= (iTemporalIdx == 0);
+
+		iSpatialIdx = 0;
+		while (iSpatialIdx < kiNumSpatialLayers)
+		{
+			int32_t iMatchIndex = 0;
+			bool_t bInMap = false;
+			bool_t bMatchFlag = false;
+
+			i = 0;
+			while ( i < iCountLayersNeedCs[kbBaseTemporalFlag] )
+			{
+				const int32_t kiActualIdx = iMapSpatialIdx[i][kbBaseTemporalFlag];
+				if ( kiActualIdx == iSpatialIdx )
+				{
+					bInMap	= true;
+					break;
+				}
+				if ( !bMatchFlag )
+				{
+					iMatchIndex	= kiActualIdx;
+					bMatchFlag	= true;
+				}
+				++ i;
+			}
+
+			if ( bInMap )
+			{
+				++ iSpatialIdx;
+				continue;
+			}
+
+			// not in spatial map and assign match one to it
+			pPtr->pStrideDecBlockOffset[iSpatialIdx][kbBaseTemporalFlag]	= pPtr->pStrideDecBlockOffset[iMatchIndex][kbBaseTemporalFlag];
+
+			++ iSpatialIdx;
+		}
+		++ iTemporalIdx;
+	}
+
+	// hand out the encoding offset table and the MB index tables of every active layer
+	iSpatialIdx = 0;
+	while ( iSpatialIdx < kiNumSpatialLayers )
+	{
+		const int32_t kiAllocMbSize = sMbSizeMap[iSpatialIdx].iSizeAllMbAlignCache;
+
+		pPtr->pStrideEncBlockOffset[iSpatialIdx]	= (int32_t *)pBaseEnc;
+
+		pPtr->pMbIndexX[iSpatialIdx]				= (int16_t *)pBaseMbX;
+		pPtr->pMbIndexY[iSpatialIdx]				= (int16_t *)pBaseMbY;
+
+		pBaseEnc += kiUnit1Size;
+		pBaseMbX += kiAllocMbSize;
+		pBaseMbY += kiAllocMbSize;
+
+		++ iSpatialIdx;
+	}
+
+	// unused layer slots are explicitly cleared
+	while ( iSpatialIdx < MAX_DEPENDENCY_LAYER )
+	{
+		pPtr->pStrideDecBlockOffset[iSpatialIdx][0]	= NULL;
+		pPtr->pStrideDecBlockOffset[iSpatialIdx][1]	= NULL;
+		pPtr->pStrideEncBlockOffset[iSpatialIdx]		= NULL;
+		pPtr->pMbIndexX[iSpatialIdx]					= NULL;
+		pPtr->pMbIndexY[iSpatialIdx]					= NULL;
+
+		++ iSpatialIdx;
+	}
+
+	// initialize pMbIndexX and pMbIndexY tables as below
+
+	// NOTE(review): the scratch row is sized from the last spatial layer, i.e. this assumes
+	// the last layer is the widest/tallest one -- confirm against the parameter validation.
+	iMaxMbWidth	= sMbSizeMap[kiNumSpatialLayers-1].iMbWidth;
+	iMaxMbWidth	= WELS_ALIGN(iMaxMbWidth, 4);	// 4 loops for int16_t required introduced as below
+	iRowSize		= iMaxMbWidth * sizeof(int16_t);
+
+	pTmpRow = (int16_t*)pMa->WelsMalloc( iRowSize, "pTmpRow" );
+	if ( NULL == pTmpRow )
+	{
+		// tables already attached to the context are left for FreeMemorySvc() to release
+		return 1;
+	}
+	pRowX = pTmpRow;
+	pRowY = pRowX;	// the same scratch row is reused for the Y pass once the X pass is done
+	// initialize pRowX with 0, 1, 2, ... (four entries per iteration)
+	i = 0;
+	p = pRowX;
+	while ( i < iMaxMbWidth )
+	{
+		*p		= i;
+		*(p+1)	= 1+i;
+		*(p+2)	= 2+i;
+		*(p+3)	= 3+i;
+
+		p += 4;
+		i += 4;
+	}
+
+	// every MB row of every layer gets a copy of the column index row
+	iSpatialIdx = kiNumSpatialLayers;
+	while ( --iSpatialIdx >= 0 )
+	{
+		int16_t *pMbIndexX = pPtr->pMbIndexX[iSpatialIdx];
+		const int32_t kiMbWidth	= sMbSizeMap[iSpatialIdx].iMbWidth;
+		const int32_t kiMbHeight	= sMbSizeMap[iSpatialIdx].iCountMbNum / kiMbWidth;
+		const int32_t kiLineSize	= kiMbWidth * sizeof(int16_t);
+
+		i = 0;
+		while ( i < kiMbHeight )
+		{
+			memcpy( pMbIndexX, pRowX, kiLineSize );	// confirmed_safe_unsafe_usage
+
+			pMbIndexX += kiMbWidth;
+			++ i;
+		}
+	}
+
+	// Y pass: pRowY holds the current row number i in every entry; row 0 is all zeros
+	memset(pRowY, 0, iRowSize);
+	iMaxMbHeight	= sMbSizeMap[kiNumSpatialLayers-1].iCountMbNum / sMbSizeMap[kiNumSpatialLayers-1].iMbWidth;
+	i = 0;
+	for (;;)
+	{
+		ENFORCE_STACK_ALIGN_1D(int16_t, t, 4, 16)
+
+		int32_t t32 = 0;
+		int16_t j = 0;
+
+		// write row i into every layer that is at least i+1 MB rows tall
+		for ( iSpatialIdx = kiNumSpatialLayers-1; iSpatialIdx >= 0; -- iSpatialIdx )
+		{
+			const int32_t kiMbWidth	= sMbSizeMap[iSpatialIdx].iMbWidth;
+			const int32_t kiMbHeight = sMbSizeMap[iSpatialIdx].iCountMbNum / kiMbWidth;
+			const int32_t kiLineSize	= kiMbWidth * sizeof(int16_t);
+			int16_t *pMbIndexY = pPtr->pMbIndexY[iSpatialIdx] + i * kiMbWidth;
+
+			if ( i < kiMbHeight )
+			{
+				memcpy( pMbIndexY, pRowY, kiLineSize );	// confirmed_safe_unsafe_usage
+			}
+		}
+		++ i;
+		if (i >= iMaxMbHeight)
+			break;
+
+		// replicate the new row number into four int16_t lanes, then across the scratch row
+		t32 = i | (i << 16);
+		ST32( t  , t32 );
+		ST32( t+2, t32 );
+
+		p = pRowY;
+		while ( j < iMaxMbWidth )
+		{
+			ST64( p, LD64(t) );
+
+			p += 4;
+			j += 4;
+		}
+	}
+
+	pMa->WelsFree( pTmpRow, "pTmpRow" );
+	pTmpRow = NULL;
+
+	return 0;
+}
+
+/*!
+ * \brief	request specific memory for SVC
+ *			Validates the layer/GOP settings, sizes the bitstream buffers from the layer
+ *			resolutions and allocates the per-encoder working buffers, then initializes
+ *			the dependency layers and the MB list.
+ * \param	ppCtx		sWelsEncCtx**, encoder context whose pSvcParam and pMemAlign are used
+ * \return	successful - 0; otherwise none 0 for failed
+ * \note	every failure path calls FreeMemorySvc(), which inspects the pointers stored in
+ *			the context; containers whose members it walks are therefore allocated zero-filled.
+ */
+int32_t RequestMemorySvc( sWelsEncCtx **ppCtx )
+{
+	SWelsSvcCodingParam *pParam	= (*ppCtx)->pSvcParam;
+	CMemoryAlign *pMa				= (*ppCtx)->pMemAlign;
+	SDLayerParam *pFinalSpatial	= NULL;
+	int32_t iCountBsLen			= 0;
+	int32_t iCountNals				= 0;
+	int32_t iMaxPicWidth			= 0;
+	int32_t iMaxPicHeight			= 0;
+	int32_t iCountMaxMbNum		= 0;
+	int32_t iIndex					= 0;
+	int32_t iCountLayers			= 0;
+	int32_t iResult					= 0;
+	float	fCompressRatioThr		= .5f;
+	const int32_t kiNumDependencyLayers	= pParam->iNumDependencyLayer;
+	const uint32_t kuiMvdInterTableSize	=  ( kiNumDependencyLayers == 1 ? (1 + (648 << 1)) : (1 + (972 << 1)) );
+	const uint32_t kuiMvdCacheAlginedSize	= kuiMvdInterTableSize * sizeof(uint16_t);
+	int32_t iVclLayersBsSizeCount		= 0;
+	int32_t iNonVclLayersBsSizeCount	= 0;
+#if defined(MT_ENABLED)
+	int32_t iTargetSpatialBsSize			= 0;
+#endif//MT_ENABLED
+
+	if ( kiNumDependencyLayers < 1 || kiNumDependencyLayers > MAX_DEPENDENCY_LAYER )
+	{
+		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc() failed due to invalid iNumDependencyLayers(%d)!\n", kiNumDependencyLayers);
+		FreeMemorySvc( ppCtx );
+		return 1;
+	}
+
+	// the intra period, when set, has to be a multiple of the (non-zero) GOP size
+	if ( pParam->uiGopSize == 0 || ( pParam->uiIntraPeriod && ((pParam->uiIntraPeriod % pParam->uiGopSize) != 0)) )
+	{
+		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc() failed due to invalid uiIntraPeriod(%d) (=multipler of uiGopSize(%d)!",
+			pParam->uiIntraPeriod, pParam->uiGopSize);
+		FreeMemorySvc( ppCtx );
+		return 1;
+	}
+
+	// per-MB buffers are sized from the last dependency layer
+	pFinalSpatial	= &pParam->sDependencyLayers[kiNumDependencyLayers - 1];
+	iMaxPicWidth	= pFinalSpatial->iFrameWidth;
+	iMaxPicHeight	= pFinalSpatial->iFrameHeight;
+	iCountMaxMbNum= ((15+iMaxPicWidth)>>4) * ((15+iMaxPicHeight)>>4);
+
+	iResult = AcquireLayersNals( ppCtx, pParam, &iCountLayers, &iCountNals );
+	if ( iResult )
+	{
+		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), AcquireLayersNals failed(%d)!", iResult);
+		FreeMemorySvc( ppCtx );
+		return 1;
+	}
+
+	iNonVclLayersBsSizeCount = SSEI_BUFFER_SIZE + pParam->iNumDependencyLayer * SPS_BUFFER_SIZE + (1+pParam->iNumDependencyLayer) * PPS_BUFFER_SIZE;
+
+	// VCL budget: I420 frame size of each layer scaled by a resolution dependent ratio
+	int32_t iLayerBsSize = 0;
+	iIndex = 0;
+	while(iIndex < pParam->iNumDependencyLayer)
+	{
+		SDLayerParam *fDlp = &pParam->sDependencyLayers[iIndex];
+
+		fCompressRatioThr	= COMPRESS_RATIO_DECIDED_BY_RESOLUTION(fDlp->iFrameWidth, fDlp->iFrameHeight);
+
+		iLayerBsSize = WELS_ROUND( ( (3 * fDlp->iFrameWidth * fDlp->iFrameHeight)>>1) * fCompressRatioThr);
+		iLayerBsSize	= WELS_ALIGN(iLayerBsSize, 4);			// 4 bytes aligned
+		iVclLayersBsSizeCount += iLayerBsSize;
+		++ iIndex;
+	}
+#if defined(MT_ENABLED)
+	// size of the last (target) spatial layer as left by the loop above
+	iTargetSpatialBsSize = iLayerBsSize;
+#endif//MT_ENABLED
+	iCountBsLen = iNonVclLayersBsSizeCount + iVclLayersBsSizeCount;
+
+	pParam->iNumRefFrame	= WELS_CLIP3(pParam->iNumRefFrame, MIN_REF_PIC_COUNT, MAX_REFERENCE_PICTURE_COUNT_NUM);
+
+	// Output
+	// zero-filled: FreeMemorySvc() tests pOut->pBsBuffer and pOut->sNalList before freeing them
+	(*ppCtx)->pOut = (SWelsEncoderOutput *)pMa->WelsMallocz( sizeof(SWelsEncoderOutput), "SWelsEncoderOutput" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pOut), FreeMemorySvc(ppCtx) )
+	(*ppCtx)->pOut->pBsBuffer		= (uint8_t *)pMa->WelsMalloc( iCountBsLen, "pOut->pBsBuffer" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pOut->pBsBuffer), FreeMemorySvc(ppCtx) )
+	(*ppCtx)->pOut->uiSize			= iCountBsLen;
+	(*ppCtx)->pOut->sNalList		= (SWelsNalRaw *)pMa->WelsMalloc( iCountNals * sizeof(SWelsNalRaw), "pOut->sNalList" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pOut->sNalList), FreeMemorySvc(ppCtx) )
+	(*ppCtx)->pOut->iCountNals		= iCountNals;
+	(*ppCtx)->pOut->iNalIndex		= 0;
+
+#ifdef MT_ENABLED
+	if ( pParam->iMultipleThreadIdc > 1 )
+	{
+		// the recorded size must equal what is really allocated, otherwise users of
+		// iFrameBsSize would be allowed to write past the end of pFrameBs
+		const int32_t kiFrameBsLen	= iCountBsLen + (iTargetSpatialBsSize * ((*ppCtx)->iMaxSliceCount-1));
+
+		(*ppCtx)->pFrameBs			= (uint8_t *)pMa->WelsMalloc( kiFrameBsLen, "pFrameBs" );
+		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pFrameBs), FreeMemorySvc(ppCtx) )
+		(*ppCtx)->iFrameBsSize		= kiFrameBsLen;
+	}
+	else
+#endif//MT_ENABLED
+	{
+		(*ppCtx)->pFrameBs			= (uint8_t *)pMa->WelsMalloc( iCountBsLen, "pFrameBs" );
+		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pFrameBs), FreeMemorySvc(ppCtx) )
+		(*ppCtx)->iFrameBsSize		= iCountBsLen;
+	}
+	(*ppCtx)->iPosBsBuffer		= 0;
+
+#ifdef MT_ENABLED
+	// for pSlice bs buffers
+	if ( pParam->iMultipleThreadIdc > 1 && RequestMtResource( ppCtx, pParam, iCountBsLen, iTargetSpatialBsSize ) )
+	{
+		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), RequestMtResource failed!");
+		FreeMemorySvc( ppCtx );
+		return 1;
+	}
+#endif
+
+	// per-MB working buffers, all sized by iCountMaxMbNum
+	(*ppCtx)->pIntra4x4PredModeBlocks = static_cast<int8_t*>
+		(pMa->WelsMallocz( iCountMaxMbNum * INTRA_4x4_MODE_NUM, "pIntra4x4PredModeBlocks" ));
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pIntra4x4PredModeBlocks), FreeMemorySvc(ppCtx) )
+
+	(*ppCtx)->pNonZeroCountBlocks = static_cast<int8_t*>
+		(pMa->WelsMallocz( iCountMaxMbNum * MB_LUMA_CHROMA_BLOCK4x4_NUM, "pNonZeroCountBlocks" ));
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pNonZeroCountBlocks), FreeMemorySvc(ppCtx) )
+
+	(*ppCtx)->pMvUnitBlock4x4 = static_cast<SMVUnitXY*>
+		(pMa->WelsMallocz( iCountMaxMbNum * 2 * MB_BLOCK4x4_NUM * sizeof(SMVUnitXY), "pMvUnitBlock4x4" ));
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pMvUnitBlock4x4), FreeMemorySvc(ppCtx) )
+
+	(*ppCtx)->pRefIndexBlock4x4 = static_cast<int8_t*>
+		(pMa->WelsMallocz( iCountMaxMbNum * 2 * MB_BLOCK8x8_NUM * sizeof(int8_t), "pRefIndexBlock4x4" ));
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pRefIndexBlock4x4), FreeMemorySvc(ppCtx) )
+
+	(*ppCtx)->pSadCostMb	= static_cast<int32_t*>
+		  (pMa->WelsMallocz( iCountMaxMbNum * sizeof(int32_t), "pSadCostMb" ));
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pSadCostMb), FreeMemorySvc(ppCtx))
+
+	(*ppCtx)->bEncCurFrmAsIdrFlag = true;  // make sure first frame is IDR
+	(*ppCtx)->iGlobalQp				= 26;	// global qp in default
+
+	(*ppCtx)->pLtr = (SLTRState *)pMa->WelsMalloc( kiNumDependencyLayers*sizeof(SLTRState), "SLTRState" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pLtr), FreeMemorySvc(ppCtx) )
+	int32_t i = 0;
+	for( i = 0; i < kiNumDependencyLayers; i++ )
+	{
+		ResetLtrState( &(*ppCtx)->pLtr[i] );
+	}
+
+	// zero-filled: FreeMemorySvc() walks every entry and frees the non-NULL ones, and the
+	// entries are only populated later by InitDqLayers()
+	(*ppCtx)->ppRefPicListExt	= (SRefList**)pMa->WelsMallocz( kiNumDependencyLayers * sizeof(SRefList *), "ppRefPicListExt" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->ppRefPicListExt), FreeMemorySvc(ppCtx) )
+
+	// pSlice context list
+	(*ppCtx)->pSliceCtxList	= (SSliceCtx *)pMa->WelsMallocz( kiNumDependencyLayers * sizeof(SSliceCtx), "pSliceCtxList" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pSliceCtxList), FreeMemorySvc(ppCtx) )
+
+	// zero-filled for the same reason as ppRefPicListExt
+	(*ppCtx)->ppDqLayerList	= (SDqLayer **)pMa->WelsMallocz( kiNumDependencyLayers * sizeof(SDqLayer *), "ppDqLayerList" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->ppDqLayerList), FreeMemorySvc(ppCtx) )
+
+	// stride tables
+	if ( AllocStrideTables( ppCtx, kiNumDependencyLayers ) )
+	{
+		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), AllocStrideTables failed!");
+		FreeMemorySvc( ppCtx );
+		return 1;
+	}
+
+	//Rate control module memory allocation
+	// only malloc once for RC pData, 12/14/2009
+	(*ppCtx)->pWelsSvcRc = (SWelsSvcRc *)pMa->WelsMallocz( kiNumDependencyLayers * sizeof(SWelsSvcRc), "pWelsSvcRc" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pWelsSvcRc), FreeMemorySvc(ppCtx) )
+	//End of Rate control module memory allocation
+
+	//pVaa memory allocation
+	(*ppCtx)->pVaa	= (SVAAFrameInfo *)pMa->WelsMallocz( sizeof(SVAAFrameInfo), "pVaa" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa), FreeMemorySvc(ppCtx) )
+
+	if((*ppCtx)->pSvcParam->bEnableAdaptiveQuant)//malloc mem
+	{
+		(*ppCtx)->pVaa->sAdaptiveQuantParam.pMotionTextureUnit   = static_cast<SMotionTextureUnit*>
+			(pMa->WelsMallocz( iCountMaxMbNum * sizeof(SMotionTextureUnit), "pVaa->sAdaptiveQuantParam.pMotionTextureUnit" ));
+		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sAdaptiveQuantParam.pMotionTextureUnit), FreeMemorySvc(ppCtx) )
+		(*ppCtx)->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp   = static_cast<int8_t*>
+			(pMa->WelsMallocz( iCountMaxMbNum * sizeof(int8_t), "pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp" ));
+		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp), FreeMemorySvc(ppCtx) )
+	}
+
+	(*ppCtx)->pVaa->pVaaBackgroundMbFlag = (int8_t *)pMa->WelsMallocz( iCountMaxMbNum * sizeof(int8_t), "pVaa->vaa_skip_mb_flag" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->pVaaBackgroundMbFlag), FreeMemorySvc(ppCtx) )
+
+	(*ppCtx)->pVaa->sVaaCalcInfo.pSad8x8 = static_cast<int32_t(*)[4]>
+	    (pMa->WelsMallocz( iCountMaxMbNum * 4 * sizeof(int32_t), "pVaa->sVaaCalcInfo.sad8x8" ));
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSad8x8), FreeMemorySvc(ppCtx) )
+	(*ppCtx)->pVaa->sVaaCalcInfo.pSsd16x16 = static_cast<int32_t*>
+		(pMa->WelsMallocz( iCountMaxMbNum * sizeof(int32_t), "pVaa->sVaaCalcInfo.pSsd16x16" ));
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSsd16x16), FreeMemorySvc(ppCtx) )
+	(*ppCtx)->pVaa->sVaaCalcInfo.pSum16x16 = static_cast<int32_t*>
+		(pMa->WelsMallocz( iCountMaxMbNum * sizeof(int32_t), "pVaa->sVaaCalcInfo.pSum16x16" ));
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSum16x16), FreeMemorySvc(ppCtx) )
+	(*ppCtx)->pVaa->sVaaCalcInfo.pSumOfSquare16x16 = static_cast<int32_t*>
+		(pMa->WelsMallocz( iCountMaxMbNum * sizeof(int32_t), "pVaa->sVaaCalcInfo.pSumOfSquare16x16" ));
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSumOfSquare16x16), FreeMemorySvc(ppCtx) )
+
+	if ((*ppCtx)->pSvcParam->bEnableBackgroundDetection)  //BGD control
+	{
+		(*ppCtx)->pVaa->sVaaCalcInfo.pSumOfDiff8x8 = static_cast<int32_t(*)[4]>
+			(pMa->WelsMallocz( iCountMaxMbNum * 4 * sizeof(int32_t), "pVaa->sVaaCalcInfo.sd_16x16" ));
+		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pSumOfDiff8x8), FreeMemorySvc(ppCtx) )
+		(*ppCtx)->pVaa->sVaaCalcInfo.pMad8x8 = static_cast<uint8_t(*)[4]>
+			(pMa->WelsMallocz( iCountMaxMbNum * 4 * sizeof(uint8_t), "pVaa->sVaaCalcInfo.mad_16x16" ));
+		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pVaa->sVaaCalcInfo.pMad8x8), FreeMemorySvc(ppCtx) )
+	}
+
+	//End of pVaa memory allocation
+
+	iResult = InitDqLayers( ppCtx );
+	if ( iResult )
+	{
+		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), InitDqLayers failed(%d)!", iResult );
+		FreeMemorySvc( ppCtx );
+		return iResult;
+	}
+
+	if( InitMbListD( ppCtx ) )
+	{
+		WelsLog( *ppCtx, WELS_LOG_WARNING, "RequestMemorySvc(), InitMbListD failed!" );
+		FreeMemorySvc( ppCtx );
+		return 1;
+	}
+
+	(*ppCtx)->pMvdCostTableInter = (uint16_t *)pMa->WelsMallocz( 52 * kuiMvdCacheAlginedSize, "pMvdCostTableInter" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pMvdCostTableInter), FreeMemorySvc(ppCtx) )
+	MvdCostInit( (*ppCtx)->pMvdCostTableInter, kuiMvdInterTableSize ); //should put to a better place?
+
+	// the first reference picture of the base layer serves as the initial reconstruction target
+	if ( (*ppCtx)->ppRefPicListExt[0] != NULL && (*ppCtx)->ppRefPicListExt[0]->pRef[0] != NULL )
+		(*ppCtx)->pDecPic				= (*ppCtx)->ppRefPicListExt[0]->pRef[0];
+	else
+		(*ppCtx)->pDecPic				= NULL;	// error here
+
+	(*ppCtx)->pSps				= &(*ppCtx)->pSpsArray[0];
+	(*ppCtx)->pPps				= &(*ppCtx)->pPPSArray[0];
+
+	return 0;
+}
+
+
+/*!
+ * \brief	free memory in SVC core encoder
+ * \param	ppCtx		sWelsEncCtx**
+ * \return	none
+ */
+void FreeMemorySvc( sWelsEncCtx **ppCtx )
+{
+	if ( NULL != *ppCtx )
+	{
+		sWelsEncCtx *pCtx	= *ppCtx;
+		CMemoryAlign *pMa			= pCtx->pMemAlign;
+		SWelsSvcCodingParam *pParam= pCtx->pSvcParam;
+		int32_t ilayer				= 0;        
+		
+		// SStrideTables
+		if ( NULL != pCtx->pStrideTab )
+		{
+			if ( NULL != pCtx->pStrideTab->pStrideDecBlockOffset[0][1] )
+			{
+				pMa->WelsFree( pCtx->pStrideTab->pStrideDecBlockOffset[0][1], "pBase" );
+				pCtx->pStrideTab->pStrideDecBlockOffset[0][1] = NULL;
+			}
+			pMa->WelsFree(pCtx->pStrideTab, "SStrideTables");
+			pCtx->pStrideTab = NULL;
+		}
+		// pDq idc map
+		if ( NULL != pCtx->pDqIdcMap )
+		{
+			pMa->WelsFree( pCtx->pDqIdcMap, "pDqIdcMap" );
+			pCtx->pDqIdcMap = NULL;
+		}
+
+		if ( NULL != pCtx->pOut )
+		{		
+			// bs pBuffer
+			if ( NULL != pCtx->pOut->pBsBuffer )
+			{
+				pMa->WelsFree( pCtx->pOut->pBsBuffer, "pOut->pBsBuffer" );
+				pCtx->pOut->pBsBuffer = NULL;
+			}
+			// NALs list
+			if ( NULL != pCtx->pOut->sNalList )
+			{
+				pMa->WelsFree( pCtx->pOut->sNalList, "pOut->sNalList" );
+				pCtx->pOut->sNalList = NULL;
+			}
+			pMa->WelsFree( pCtx->pOut, "SWelsEncoderOutput" );
+			pCtx->pOut = NULL;
+		}
+
+#ifdef MT_ENABLED
+		if ( pParam != NULL && pParam->iMultipleThreadIdc > 1 )
+			ReleaseMtResource( ppCtx );
+#endif//MT_ENABLED
+
+		// frame bitstream pBuffer
+		if ( NULL != pCtx->pFrameBs )
+		{
+			pMa->WelsFree( pCtx->pFrameBs, "pFrameBs" );
+			pCtx->pFrameBs = NULL;
+		}
+
+		// pSpsArray
+		if ( NULL != pCtx->pSpsArray )
+		{
+			pMa->WelsFree( pCtx->pSpsArray, "pSpsArray" );
+			pCtx->pSpsArray = NULL;
+		}
+		// pPPSArray
+		if ( NULL != pCtx->pPPSArray )
+		{
+			pMa->WelsFree( pCtx->pPPSArray, "pPPSArray" );
+			pCtx->pPPSArray = NULL;
+		}
+		// subset_sps_array
+		if ( NULL != pCtx->pSubsetArray )
+		{
+			pMa->WelsFree( pCtx->pSubsetArray, "pSubsetArray" );
+			pCtx->pSubsetArray = NULL;
+		}
+
+		if( NULL != pCtx->pIntra4x4PredModeBlocks ){
+			pMa->WelsFree( pCtx->pIntra4x4PredModeBlocks, "pIntra4x4PredModeBlocks" );
+			pCtx->pIntra4x4PredModeBlocks = NULL;
+		}
+
+		if( NULL != pCtx->pNonZeroCountBlocks ){
+			pMa->WelsFree( pCtx->pNonZeroCountBlocks, "pNonZeroCountBlocks" );
+			pCtx->pNonZeroCountBlocks = NULL;
+		}
+
+		if ( NULL != pCtx->pMvUnitBlock4x4)
+		{
+			pMa->WelsFree( pCtx->pMvUnitBlock4x4, "pMvUnitBlock4x4" );
+			pCtx->pMvUnitBlock4x4	= NULL;
+		}
+
+		if ( NULL != pCtx->pRefIndexBlock4x4)
+		{
+			pMa->WelsFree( pCtx->pRefIndexBlock4x4, "pRefIndexBlock4x4" );
+			pCtx->pRefIndexBlock4x4	= NULL;
+		}
+			
+		if ( NULL != pCtx->ppMbListD )
+		{		
+			if( NULL != pCtx->ppMbListD[0] ){
+				pMa->WelsFree( pCtx->ppMbListD[0], "ppMbListD[0]" );
+				(*ppCtx)->ppMbListD[0] = NULL;
+			}
+			pMa->WelsFree( pCtx->ppMbListD, "ppMbListD" );
+			pCtx->ppMbListD = NULL;
+		}
+
+		if ( NULL != pCtx->pSadCostMb)
+		{
+			pMa->WelsFree( pCtx->pSadCostMb, "pSadCostMb" );
+			pCtx->pSadCostMb = NULL;
+		}
+
+		// SLTRState
+		if ( NULL != pCtx->pLtr )
+		{
+			pMa->WelsFree( pCtx->pLtr, "SLTRState" );
+			pCtx->pLtr = NULL;
+		}
+
+		// pDq layers list
+		ilayer = 0;
+		if ( NULL != pCtx->ppDqLayerList && pParam != NULL )
+		{			
+			while (ilayer < pParam->iNumDependencyLayer) {
+				SDqLayer *pDq	= pCtx->ppDqLayerList[ilayer];
+				SDLayerParam *pDlp = &pCtx->pSvcParam->sDependencyLayers[ilayer];
+				const BOOL_T kbIsDynamicSlicing = (SM_DYN_SLICE == pDlp->sMso.uiSliceMode);
+				
+				// pDq layers
+				if ( NULL != pDq )
+				{
+					if ( NULL != pDq->sLayerInfo.pSliceInLayer )
+					{
+						int32_t iSliceIdx = 0;
+						int32_t iSliceNum = GetInitialSliceNum( pDq->iMbWidth, pDq->iMbHeight, &pDlp->sMso );
+						if (iSliceNum < 1)
+							iSliceNum = 1;
+						while(iSliceIdx < iSliceNum)
+						{
+							SSlice *pSlice = &pDq->sLayerInfo.pSliceInLayer[iSliceIdx];
+							FreeMbCache(&pSlice->sMbCacheInfo, pMa);
+							++ iSliceIdx;
+						}
+						pMa->WelsFree( pDq->sLayerInfo.pSliceInLayer, "pSliceInLayer" );
+						pDq->sLayerInfo.pSliceInLayer = NULL;
+					}
+					if ( kbIsDynamicSlicing )
+					{
+						pMa->WelsFree( pDq->pNumSliceCodedOfPartition, "pNumSliceCodedOfPartition" );
+						pDq->pNumSliceCodedOfPartition	= NULL;
+						pMa->WelsFree( pDq->pLastCodedMbIdxOfPartition, "pLastCodedMbIdxOfPartition" );
+						pDq->pLastCodedMbIdxOfPartition	= NULL;
+						pMa->WelsFree( pDq->pLastMbIdxOfPartition, "pLastMbIdxOfPartition" );						
+						pDq->pLastMbIdxOfPartition = NULL;
+					}
+
+					pMa->WelsFree( pDq, "pDq" );
+					pDq = NULL;
+					pCtx->ppDqLayerList[ilayer] = NULL;
+				}				
+				++ ilayer;
+			}
+			pMa->WelsFree( pCtx->ppDqLayerList, "ppDqLayerList" );
+			pCtx->ppDqLayerList = NULL;
+		}
+		FreeSpatialPictures( pCtx );		
+
+		// reference picture list extension
+		if ( NULL != pCtx->ppRefPicListExt && pParam != NULL )
+		{
+			ilayer = 0;
+			while (ilayer < pParam->iNumDependencyLayer) {
+				SRefList *pRefList		= pCtx->ppRefPicListExt[ilayer];
+				if ( NULL != pRefList )
+				{
+					int32_t iRef = 0;
+					do {
+						if ( pRefList->pRef[iRef] != NULL )
+						{
+							FreePicture( pMa, &pRefList->pRef[iRef] );
+						}
+						++ iRef;
+					} while(iRef < 1 + pParam->iNumRefFrame);
+
+					pMa->WelsFree( pCtx->ppRefPicListExt[ilayer], "ppRefPicListExt[]" );
+					pCtx->ppRefPicListExt[ilayer] = NULL;
+				}				
+				++ ilayer;
+			}	
+
+			pMa->WelsFree( pCtx->ppRefPicListExt, "ppRefPicListExt" );
+			pCtx->ppRefPicListExt = NULL;
+		}
+		
+		// pSlice context list
+		if ( NULL != pCtx->pSliceCtxList && pParam != NULL )
+		{
+			ilayer = 0;
+			while (ilayer < pParam->iNumDependencyLayer) {
+				SSliceCtx *pSliceCtx	= &pCtx->pSliceCtxList[ilayer];
+				if ( NULL != pSliceCtx )
+					UninitSlicePEncCtx( pSliceCtx, pMa );				
+				++ ilayer;
+			}
+			pMa->WelsFree( pCtx->pSliceCtxList, "pSliceCtxList" );
+			pCtx->pSliceCtxList = NULL;
+		}		
+
+		// VAA
+		if ( NULL != pCtx->pVaa )
+		{
+			if(pCtx->pSvcParam->bEnableAdaptiveQuant)//free mem
+			{
+				pMa->WelsFree( pCtx->pVaa->sAdaptiveQuantParam.pMotionTextureUnit, "pVaa->sAdaptiveQuantParam.pMotionTextureUnit" );
+				pCtx->pVaa->sAdaptiveQuantParam.pMotionTextureUnit = NULL;
+				pMa->WelsFree( pCtx->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp, "pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp" );
+				pCtx->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp = NULL;
+			}
+
+			pMa->WelsFree( pCtx->pVaa->pVaaBackgroundMbFlag, "pVaa->pVaaBackgroundMbFlag");
+			pCtx->pVaa->pVaaBackgroundMbFlag	= NULL;
+			pMa->WelsFree( pCtx->pVaa->sVaaCalcInfo.pSad8x8, "pVaa->sVaaCalcInfo.sad8x8" );
+			pCtx->pVaa->sVaaCalcInfo.pSad8x8		= NULL;
+			pMa->WelsFree( pCtx->pVaa->sVaaCalcInfo.pSsd16x16, "pVaa->sVaaCalcInfo.pSsd16x16" );
+			pCtx->pVaa->sVaaCalcInfo.pSsd16x16	= NULL;
+			pMa->WelsFree( pCtx->pVaa->sVaaCalcInfo.pSum16x16, "pVaa->sVaaCalcInfo.pSum16x16" );
+			pCtx->pVaa->sVaaCalcInfo.pSum16x16	= NULL;
+			pMa->WelsFree( pCtx->pVaa->sVaaCalcInfo.pSumOfSquare16x16, "pVaa->sVaaCalcInfo.pSumOfSquare16x16" );
+			pCtx->pVaa->sVaaCalcInfo.pSumOfSquare16x16		= NULL;
+
+			if (pCtx->pSvcParam->bEnableBackgroundDetection) //BGD control
+			{
+				pMa->WelsFree( pCtx->pVaa->sVaaCalcInfo.pSumOfDiff8x8, "pVaa->sVaaCalcInfo.pSumOfDiff8x8" );
+				pCtx->pVaa->sVaaCalcInfo.pSumOfDiff8x8	= NULL;
+				pMa->WelsFree( pCtx->pVaa->sVaaCalcInfo.pMad8x8, "pVaa->sVaaCalcInfo.pMad8x8" );
+				pCtx->pVaa->sVaaCalcInfo.pMad8x8	= NULL;
+			}
+
+			pMa->WelsFree( pCtx->pVaa, "pVaa" );
+			pCtx->pVaa = NULL;
+		}
+
+		WelsRcFreeMemory(pCtx);
+		// rate control module memory free
+		if ( NULL != pCtx->pWelsSvcRc )
+		{
+			pMa->WelsFree( pCtx->pWelsSvcRc, "pWelsSvcRc" );
+			pCtx->pWelsSvcRc = NULL;
+		}
+
+		/* MVD cost tables for Inter */
+		if ( NULL != pCtx->pMvdCostTableInter )
+		{
+			pMa->WelsFree( pCtx->pMvdCostTableInter, "pMvdCostTableInter" );
+			pCtx->pMvdCostTableInter = NULL;
+		}
+
+#ifdef ENABLE_TRACE_FILE
+		if ( NULL != pCtx->pFileLog )
+		{
+			fclose( pCtx->pFileLog );
+			pCtx->pFileLog	= NULL;
+		}
+		pCtx->uiSizeLog	= 0;
+#endif//ENABLE_TRACE_FILE
+
+		FreeCodingParam( &pCtx->pSvcParam, pMa );
+		if ( NULL != pCtx->pFuncList )
+		{
+			pMa->WelsFree(pCtx->pFuncList, "SWelsFuncPtrList");
+			pCtx->pFuncList = NULL;
+		}
+
+#if defined(MEMORY_MONITOR)
+		assert(pMa->WelsGetMemoryUsage() == 0);	// ensure all memory free well
+#endif//MEMORY_MONITOR		
+
+		if ( (*ppCtx)->pMemAlign != NULL )
+		{
+			WelsLog( NULL, WELS_LOG_INFO, "FreeMemorySvc(), verify memory usage (%d bytes) after free..\n", (*ppCtx)->pMemAlign->WelsGetMemoryUsage() );
+			delete (*ppCtx)->pMemAlign;
+			(*ppCtx)->pMemAlign = NULL;
+		}
+
+		free(*ppCtx);
+		*ppCtx = NULL;
+	}
+}
+
+/*!
+ * \brief	settle the slice number of each dependency layer and the number of threads to use
+ * \param	pCodingParam	coding parameters; sSliceArgument.iSliceNum of every dependency layer,
+ *							iCountThreadsNum and iMultipleThreadIdc are rewritten here
+ * \param	kiCpuCores		number of cpu cores the caller wants to exploit
+ * \param	pMaxSliceCount	[out] maximal slice count derived over all dependency layers
+ * \return	0 always
+ */
+int32_t InitSliceSettings( SWelsSvcCodingParam *pCodingParam, const int32_t kiCpuCores, int16_t *pMaxSliceCount )
+{
+	int32_t iSpatialIdx = 0, iSpatialNum = pCodingParam->iNumDependencyLayer;
+	int16_t iMaxSliceCount = 0;
+
+	// NOTE: do-while, the first dependency layer is visited even if iNumDependencyLayer < 1
+	do {
+		SDLayerParam *pDlp				= &pCodingParam->sDependencyLayers[iSpatialIdx];
+		SMulSliceOption *pMso			= &pDlp->sMso;
+		SSliceArgument *pSlcArg			= &pMso->sSliceArgument;
+		const int32_t kiMbWidth			= (pDlp->iFrameWidth+15)>>4;	// frame width in 16x16 MB units, rounded up
+		const int32_t kiMbHeight		= (pDlp->iFrameHeight+15)>>4;	// frame height in 16x16 MB units, rounded up
+		const int32_t kiMbNumInFrame	= kiMbWidth * kiMbHeight;
+#if defined(MT_ENABLED)
+#if defined(DYNAMIC_SLICE_ASSIGN)
+		int32_t iSliceNum				= (SM_FIXEDSLCNUM_SLICE == pMso->uiSliceMode || SM_DYN_SLICE == pMso->uiSliceMode) ? kiCpuCores : pSlcArg->iSliceNum; // iSliceNum per input has been validated at ParamValidationExt()
+#else//!DYNAMIC_SLICE_ASSIGN
+		// same member as used by all other configurations (was spelled uiSliceNum here)
+		int32_t iSliceNum				= (SM_DYN_SLICE == pMso->uiSliceMode) ? kiCpuCores : pSlcArg->iSliceNum; // iSliceNum per input has been validated at ParamValidationExt()
+#endif//DYNAMIC_SLICE_ASSIGN
+#else//!MT_ENABLED
+		int16_t iSliceNum				= pSlcArg->iSliceNum; // iSliceNum per input has been validated at ParamValidationExt()
+#endif//MT_ENABLED
+
+		// NOTE: Per design, in case MT/DYNAMIC_SLICE_ASSIGN enabled, for SM_FIXEDSLCNUM_SLICE mode,
+		// iSliceNum of current spatial layer settings equals to the cpu cores number; SM_DYN_SLICE mode,
+		// iSliceNum initials as the cpu cores number also, stay tuned dynamically slicing in future
+		pSlcArg->iSliceNum	= iSliceNum;	// used fixed one
+
+		switch(pMso->uiSliceMode)
+		{
+		case SM_DYN_SLICE:
+			iMaxSliceCount	= AVERSLICENUM_CONSTRAINT;
+			break;	// deliberately no fall through into SM_FIXEDSLCNUM_SLICE
+		case SM_FIXEDSLCNUM_SLICE:
+			if ( iSliceNum > iMaxSliceCount )
+			{
+				iMaxSliceCount = iSliceNum;
+			}
+			// need perform check due iSliceNum might change, although has been initialized somewhere outside
+			if (pCodingParam->bEnableRc)
+			{
+				GomValidCheckSliceMbNum( kiMbWidth, kiMbHeight, pSlcArg );
+			}
+			else
+			{
+				CheckFixedSliceNumMultiSliceSetting( kiMbNumInFrame, pSlcArg );
+			}
+			break;
+		case SM_SINGLE_SLICE:
+		case SM_RASTER_SLICE:
+		case SM_ROWMB_SLICE:
+			// these modes only contribute to the overall maximum
+			if ( iSliceNum > iMaxSliceCount )
+			{
+				iMaxSliceCount = iSliceNum;
+			}
+			break;
+		default:
+			break;
+		}
+
+		++ iSpatialIdx;
+	} while(iSpatialIdx < iSpatialNum);
+
+#ifdef MT_ENABLED
+	// never run more threads than there are slices to encode
+	pCodingParam->iCountThreadsNum		= WELS_MIN(kiCpuCores, iMaxSliceCount);
+	pCodingParam->iMultipleThreadIdc	= pCodingParam->iCountThreadsNum;
+#else
+	pCodingParam->iMultipleThreadIdc	= 1;
+	pCodingParam->iCountThreadsNum		= 1;
+#endif//MT_ENABLED
+
+#ifndef WELS_TESTBED	// for product release and non-SGE testing
+
+	if ( kiCpuCores < 2 )	// single CPU core, make no sense for MT parallelization
+	{
+		pCodingParam->iMultipleThreadIdc	= 1;
+		pCodingParam->iCountThreadsNum		= 1;
+	}
+#endif
+
+	*pMaxSliceCount					= iMaxSliceCount;
+
+	return 0;
+}
+
+/*!
+ * \brief	report the detected cpu features/capabilities via WelsLog,
+ *			and on stderr as well when built with _DEBUG
+ * \param	uiCpuFeatureFlags	bit mask tested against the WELS_CPU_* feature flags
+ * \param	uiCpuCores			number of logic processors to report
+ * \param	iCacheLineSize		cpu cache line size in bytes to report
+ */
+void OutputCpuFeaturesLog( uint32_t uiCpuFeatureFlags, uint32_t uiCpuCores, int32_t iCacheLineSize )
+{
+	// feature masks in exactly the order the report lists them
+	static const uint32_t kuiFeatureMasks[18] = {
+		WELS_CPU_HTT,		WELS_CPU_MMX,		WELS_CPU_MMXEXT,
+		WELS_CPU_SSE,		WELS_CPU_SSE2,		WELS_CPU_SSE3,
+		WELS_CPU_SSSE3,		WELS_CPU_SSE41,		WELS_CPU_SSE42,
+		WELS_CPU_AVX,		WELS_CPU_FMA,		WELS_CPU_FPU,
+		WELS_CPU_3DNOW,		WELS_CPU_3DNOWEXT,	WELS_CPU_ALTIVEC,
+		WELS_CPU_CMOV,		WELS_CPU_MOVBE,		WELS_CPU_AES
+	};
+	char cMark[18];	// 'Y' if the feature bit is set, 'N' otherwise
+	int32_t iFeature = 0;
+
+	for ( iFeature = 0; iFeature < 18; ++ iFeature )
+	{
+		cMark[iFeature] = (uiCpuFeatureFlags & kuiFeatureMasks[iFeature]) ? 'Y' : 'N';
+	}
+
+	// welstracer output
+	WelsLog(NULL, WELS_LOG_INFO, "WELS CPU features/capacities (0x%x) detected: \t"
+		"HTT:      %c, "
+		"MMX:      %c, "
+		"MMXEX:    %c, "
+		"SSE:      %c, "
+		"SSE2:     %c, "
+		"SSE3:     %c, "
+		"SSSE3:    %c, "
+		"SSE4.1:   %c, "
+		"SSE4.2:   %c, "
+		"AVX:      %c, "
+		"FMA:      %c, "
+		"X87-FPU:  %c, "
+		"3DNOW:    %c, "
+		"3DNOWEX:  %c, "
+		"ALTIVEC:  %c, "
+		"CMOV:     %c, "
+		"MOVBE:    %c, "
+		"AES:      %c, "
+		"NUMBER OF LOGIC PROCESSORS ON CHIP: %d, "
+		"CPU CACHE LINE SIZE (BYTES):        %d\n",
+		uiCpuFeatureFlags,
+		cMark[0], cMark[1], cMark[2], cMark[3], cMark[4], cMark[5],
+		cMark[6], cMark[7], cMark[8], cMark[9], cMark[10], cMark[11],
+		cMark[12], cMark[13], cMark[14], cMark[15], cMark[16], cMark[17],
+		uiCpuCores,
+		iCacheLineSize );
+
+#ifdef _DEBUG	// output at console & _debug
+	fprintf( stderr, "WELS CPU features/capacities (0x%x) detected: \n"
+		"HTT:      %c, "
+		"MMX:      %c, "
+		"MMXEX:    %c, "
+		"SSE:      %c, "
+		"SSE2:     %c, "
+		"SSE3:     %c, "
+		"SSSE3:    %c, "
+		"SSE4.1:   %c, "
+		"SSE4.2:   %c, "
+		"AVX:      %c, "
+		"FMA:      %c, "
+		"X87-FPU:  %c, "
+		"3DNOW:    %c, "
+		"3DNOWEX:  %c, "
+		"ALTIVEC:  %c, "
+		"CMOV:     %c, "
+		"MOVBE:    %c, "
+		"AES:      %c, "
+		"NUMBER OF LOGIC PROCESSORS ON CHIP: %d, "
+		"CPU CACHE LINE SIZE (BYTES):        %d\n",
+		uiCpuFeatureFlags,
+		cMark[0], cMark[1], cMark[2], cMark[3], cMark[4], cMark[5],
+		cMark[6], cMark[7], cMark[8], cMark[9], cMark[10], cMark[11],
+		cMark[12], cMark[13], cMark[14], cMark[15], cMark[16], cMark[17],
+		uiCpuCores,
+		iCacheLineSize );
+#endif//_DEBUG
+}
+
+/*!
+ * \brief	initialize Wels avc encoder core library
+ * \param	ppCtx			[out] receives the newly created encoder context on success;
+ *							set to NULL once parameter validation and slice settings have passed
+ * \param	pCodingParam	coding parameters; validated and partly rewritten here
+ *							(slice/thread settings, temporal settings), then copied into the context
+ * \return	successful - 0; otherwise none 0 for failed
+ */
+int32_t WelsInitEncoderExt( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pCodingParam )
+{
+	sWelsEncCtx *pCtx		= NULL;
+	int32_t	iRet					= 0;
+	uint32_t uiCpuFeatureFlags		= 0;	// CPU features
+	int32_t uiCpuCores				= 1;	// number of logic processors on physical processor package, one logic processor means HTT not supported
+	int32_t iCacheLineSize			= 16;	// on chip cache line size in byte
+	int16_t iSliceNum				= 1;	// number of slices used
+
+	if ( NULL == ppCtx || NULL == pCodingParam )
+	{
+		WelsLog(NULL, WELS_LOG_ERROR, "WelsInitEncoderExt(), NULL == ppCtx(0x%p) or NULL == pCodingParam(0x%p).\n", (void *)ppCtx, (void *)pCodingParam);
+		return 1;
+	}
+
+	iRet	=	ParamValidationExt( pCodingParam );
+	if ( iRet != 0 )
+	{
+		WelsLog(NULL, WELS_LOG_ERROR, "WelsInitEncoderExt(), ParamValidationExt failed return %d.\n", iRet);
+		return iRet;
+	}
+
+	// for cpu features detection, Only detect once??
+#ifdef X86_ASM
+	uiCpuFeatureFlags	= WelsCPUFeatureDetect( &uiCpuCores );	// detect cpu capacity features
+	// take the largest cache line size reported; 16 bytes stays as default
+	if ( uiCpuFeatureFlags & WELS_CPU_CACHELINE_128 )
+		iCacheLineSize = 128;
+	else if ( uiCpuFeatureFlags & WELS_CPU_CACHELINE_64 )
+		iCacheLineSize = 64;
+	else if ( uiCpuFeatureFlags & WELS_CPU_CACHELINE_32 )
+		iCacheLineSize	= 32;
+	else if ( uiCpuFeatureFlags & WELS_CPU_CACHELINE_16 )
+		iCacheLineSize	= 16;
+	OutputCpuFeaturesLog( uiCpuFeatureFlags, uiCpuCores, iCacheLineSize );
+#else
+	iCacheLineSize	= 16;	// 16 bytes aligned in default
+#endif//X86_ASM
+
+#ifndef WELS_TESTBED
+
+#if defined(MT_ENABLED) && defined(DYNAMIC_DETECT_CPU_CORES)
+	if ( pCodingParam->iMultipleThreadIdc > 0 )
+		uiCpuCores = pCodingParam->iMultipleThreadIdc;	// explicit request overrides detection
+	else
+	{
+		if ( uiCpuFeatureFlags == 0 )	// cpuid not supported, use high level system API as followed to detect number of pysical/logic processor
+			uiCpuCores = DynamicDetectCpuCores();
+		// So far so many cpu cores up to MAX_THREADS_NUM mean for server platforms,
+		// for client application here it is constrained by maximal to MAX_THREADS_NUM
+		if ( uiCpuCores > MAX_THREADS_NUM )	// MAX_THREADS_NUM
+			uiCpuCores	= MAX_THREADS_NUM;	// MAX_THREADS_NUM
+		else if ( uiCpuCores < 1 )	// just for safe
+			uiCpuCores	= 1;
+	}
+#endif//MT_ENABLED && DYNAMIC_DETECT_CPU_CORES
+
+#else//WELS_TESTBED
+
+	uiCpuCores	= pCodingParam->iMultipleThreadIdc;	// assigned uiCpuCores from iMultipleThreadIdc from SGE testing
+
+#endif//WELS_TESTBED
+
+	uiCpuCores	= WELS_CLIP3(uiCpuCores, 1, MAX_THREADS_NUM);
+
+	if ( InitSliceSettings(pCodingParam, uiCpuCores, &iSliceNum ) )
+	{
+		WelsLog(NULL, WELS_LOG_ERROR, "WelsInitEncoderExt(), InitSliceSettings failed.\n");
+		return 1;
+	}
+
+	*ppCtx	= NULL;
+
+	pCtx	= static_cast<sWelsEncCtx*>(malloc( sizeof(sWelsEncCtx) ));
+
+	WELS_VERIFY_RETURN_IF(1, (NULL == pCtx))
+	memset( pCtx, 0, sizeof(sWelsEncCtx) );	// all members start as 0/NULL, the failure paths below free pCtx via FreeMemorySvc()
+
+	pCtx->pMemAlign = new CMemoryAlign( iCacheLineSize );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pCtx->pMemAlign), FreeMemorySvc(&pCtx) )
+
+	// for logs
+#ifdef ENABLE_TRACE_FILE
+	if (wlog == WelsLogDefault)
+	{
+		str_t fname[MAX_FNAME_LEN] = {0};
+
+#if defined (_MSC_VER)
+#if _MSC_VER>=1500
+		SNPRINTF(fname, MAX_FNAME_LEN, MAX_FNAME_LEN, "%swels_svc_encoder_trace.txt",  pCodingParam->sTracePath );		// confirmed_safe_unsafe_usage
+#else
+		SNPRINTF(fname, MAX_FNAME_LEN, "%swels_svc_encoder_trace.txt",  pCodingParam->sTracePath );		// confirmed_safe_unsafe_usage
+#endif//_MSC_VER>=1500
+#else
+		//GNUC/
+		SNPRINTF(fname,      MAX_FNAME_LEN,       "%swels_svc_encoder_trace.txt",  pCodingParam->sTracePath );		// confirmed_safe_unsafe_usage
+#endif//_MSC_VER
+
+
+#if defined(__GNUC__)
+		pCtx->pFileLog	= FOPEN(fname, "wt+");
+#else//WIN32
+#if defined(WIN32) && defined(_MSC_VER)
+#if _MSC_VER >= 1500
+		FOPEN(&pCtx->pFileLog,fname, "wt+");
+#else
+		pCtx->pFileLog	= FOPEN(fname, "wt+");
+#endif//_MSC_VER>=1500
+#endif//WIN32 && _MSC_VER
+#endif//__GNUC__
+		pCtx->uiSizeLog	= 0;
+	}
+#endif//ENABLE_TRACE_FILE
+
+	pCodingParam->DetermineTemporalSettings();
+	iRet = AllocCodingParam( &pCtx->pSvcParam, pCtx->pMemAlign, pCodingParam->iNumDependencyLayer );
+	if ( iRet != 0 )
+	{
+		FreeMemorySvc( &pCtx );
+		return iRet;
+	}
+	memcpy( pCtx->pSvcParam, pCodingParam, sizeof(SWelsSvcCodingParam) );	// confirmed_safe_unsafe_usage
+
+	pCtx->pFuncList = (SWelsFuncPtrList *)pCtx->pMemAlign->WelsMalloc(sizeof(SWelsFuncPtrList), "SWelsFuncPtrList");
+	if ( NULL == pCtx->pFuncList )
+	{
+		FreeMemorySvc( &pCtx );
+		return 1;
+	}
+	InitFunctionPointers( pCtx->pFuncList, pCtx->pSvcParam, uiCpuFeatureFlags );
+
+	pCtx->iActiveThreadsNum	= pCodingParam->iCountThreadsNum;
+	pCtx->iMaxSliceCount	= iSliceNum;
+	iRet = RequestMemorySvc( &pCtx );
+	if ( iRet != 0 )
+	{
+		WelsLog(pCtx, WELS_LOG_ERROR, "WelsInitEncoderExt(), RequestMemorySvc failed return %d.\n", iRet);
+		FreeMemorySvc( &pCtx );
+		return iRet;
+	}
+
+#ifdef MT_ENABLED
+	// NOTE(review): the result of CreateSliceThreads() is stored but never checked,
+	// initialization carries on even if thread creation failed -- confirm whether intended
+	if ( pCodingParam->iMultipleThreadIdc > 1 )
+		iRet = CreateSliceThreads( pCtx);
+#endif
+
+	WelsRcInitModule( pCtx,  pCtx->pSvcParam->bEnableRc ? WELS_RC_GOM : WELS_RC_DISABLE);
+
+	pCtx->pVpp = new CWelsPreProcess((void *)pCtx);
+	if ( pCtx->pVpp == NULL )
+	{
+		WelsLog(pCtx, WELS_LOG_ERROR, "WelsInitEncoderExt(), pOut of memory in case new CWelsPreProcess().\n");
+		FreeMemorySvc( &pCtx );
+		// iRet usually holds 0 at this point, so returning it would report success
+		// although no context is handed back; report the failure explicitly instead
+		return 1;
+	}
+
+#if defined(MEMORY_MONITOR)
+	WelsLog(pCtx, WELS_LOG_INFO, "WelsInitEncoderExt() exit, overall memory usage: %lu bytes\n", sizeof(sWelsEncCtx) /* requested size from malloc() or new operator */
+                                                                                                 + pCtx->pMemAlign->WelsGetMemoryUsage()	/* requested size from CMemoryAlign::WelsMalloc() */
+             );
+#endif//MEMORY_MONITOR
+
+	*ppCtx	= pCtx;
+
+	WelsLog(pCtx, WELS_LOG_DEBUG, "WelsInitEncoderExt(), pCtx= 0x%p.\n", (void *)pCtx);
+
+	return 0;
+}
+/*
+ *
+ * status information output
+ *
+ * Only compiled when STAT_OUTPUT is defined. For every dependency layer of pCtx->pSvcParam
+ * (quality layer index fixed at 0) the statistics kept in pCtx->sStatData are printed:
+ *  - with MB_TYPES_CHECK defined, macroblock type percentages over I and P slices (to stderr);
+ *  - average Y/U/V PSNR per coded slice, a kb/s figure and the output frame rate (to stdout).
+ * Nothing but the two layer index lines is printed for a layer without any coded slice.
+ */
+#if defined(STAT_OUTPUT)
+void StatOverallEncodingExt(sWelsEncCtx *pCtx)
+{
+    int8_t i = 0;	// dependency layer index
+	int8_t j = 0;	// quality layer index, never advanced in this function
+	for (i = 0;i<pCtx->pSvcParam->iNumDependencyLayer;i++)
+	{
+			fprintf( stdout,"\nDependency layer : %d\n",i);
+			fprintf( stdout,"Quality layer : %d\n",j);
+			{	// iCount below: number of coded slices summed over I, P and B slice types
+				const int32_t iCount = pCtx->sStatData[i][j].sSliceData.iSliceCount[I_SLICE] +
+					                pCtx->sStatData[i][j].sSliceData.iSliceCount[P_SLICE] +
+									pCtx->sStatData[i][j].sSliceData.iSliceCount[B_SLICE];
+#if defined(MB_TYPES_CHECK) 
+				if (iCount > 0){	// MB counters addressed by number (7..11) are labelled by the format string below: 7 IBL, 10 SUBP8x8, 11 ILP, 8 BLSKIP, 9 RP
+					int32_t iCountNumIMb = pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][Intra4x4] + pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][Intra16x16]+ pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][7];
+					int32_t iCountNumPMb	=	pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Intra4x4] +
+						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Intra16x16] +
+						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][7] +
+						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x16] +
+						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x8] +
+						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x16] +
+						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x8] +
+						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][10] +
+						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][PSkip];	
+					int32_t count_p_mbL0 = 	pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x16] +
+						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x8] +
+						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x16] +
+						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x8] +
+						pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][10];
+					
+					int32_t iMbCount = iCountNumIMb + iCountNumPMb;	// all MBs counted in I and P slices
+					if ( iMbCount > 0 ){	// NOTE(review): only iMbCount is checked, count_p_mbL0 used as divisor below may still be 0
+						fprintf(	stderr,	// unlike the other statistics this report goes to stderr
+							"SVC: overall Slices	MBs: %d Avg\nI4x4: %.3f%% I16x16: %.3f%% IBL: %.3f%%\nP16x16: %.3f%% P16x8: %.3f%% P8x16: %.3f%% P8x8: %.3f%% SUBP8x8: %.3f%% PSKIP: %.3f%%\nILP(All): %.3f%% ILP(PL0): %.3f%% BLSKIP(PL0): %.3f%% RP(PL0): %.3f%%\n",
+							iMbCount,
+							(100.0f * (pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][Intra4x4] + pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Intra4x4]) / iMbCount),
+							(100.0f * (pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][Intra16x16] + pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Intra16x16]) / iMbCount),
+							(100.0f * (pCtx->sStatData[i][j].sSliceData.iMbCount[I_SLICE][7] + pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][7]) / iMbCount),
+							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x16] / iMbCount ),
+							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter16x8] / iMbCount ),
+							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x16] / iMbCount ),
+							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][Inter8x8] / iMbCount),
+							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][10] / iMbCount),
+							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][PSkip] / iMbCount),
+							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][11] / iMbCount),
+							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][11] / count_p_mbL0),
+							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][8] / count_p_mbL0),
+							(100.0f * pCtx->sStatData[i][j].sSliceData.iMbCount[P_SLICE][9] / count_p_mbL0) 
+							);
+					}					
+				}
+#endif //#if defined(MB_TYPES_CHECK)
+
+				if (iCount > 0){	// PSNR sums are averaged over iCount; the kb/s figure divides by iCount plus the layer's iSkipFrameNum
+					fprintf( stdout, "SVC: overall PSNR Y: %2.3f U: %2.3f V: %2.3f kb/s: %.1f fps: %.3f\n\n",
+						(pCtx->sStatData[i][j].sQualityStat.rYPsnr[I_SLICE]+pCtx->sStatData[i][j].sQualityStat.rYPsnr[P_SLICE]+pCtx->sStatData[i][j].sQualityStat.rYPsnr[B_SLICE]) / (float)(iCount),
+						(pCtx->sStatData[i][j].sQualityStat.rUPsnr[I_SLICE]+pCtx->sStatData[i][j].sQualityStat.rUPsnr[P_SLICE]+pCtx->sStatData[i][j].sQualityStat.rUPsnr[B_SLICE]) / (float)(iCount),
+						(pCtx->sStatData[i][j].sQualityStat.rVPsnr[I_SLICE]+pCtx->sStatData[i][j].sQualityStat.rVPsnr[P_SLICE]+pCtx->sStatData[i][j].sQualityStat.rVPsnr[B_SLICE]) / (float)(iCount),
+						1.0f * pCtx->pSvcParam->sDependencyLayers[i].fOutputFrameRate *(pCtx->sStatData[i][j].sSliceData.iSliceSize[I_SLICE] +pCtx->sStatData[i][j].sSliceData.iSliceSize[P_SLICE] +pCtx->sStatData[i][j].sSliceData.iSliceSize[B_SLICE] ) / (float)(iCount+pCtx->pWelsSvcRc[i].iSkipFrameNum)/1000,
+						1.0f * pCtx->pSvcParam->sDependencyLayers[i].fOutputFrameRate );
+
+				}
+
+			}
+		
+	}
+}
+#endif
+/*!
+ * \brief	uninitialize Wels encoder core library: stop the slice coding threads (MT build),
+ *			destroy the pre-processing object and release all memory of the context
+ * \param	ppCtx		address of the encoder context pointer; *ppCtx is NULL on return.
+ *						Nothing is done if ppCtx or *ppCtx is NULL.
+ * \return	none
+ */
+void WelsUninitEncoderExt( sWelsEncCtx **ppCtx )
+{
+	sWelsEncCtx *pEncCtx = NULL;
+
+	if ( NULL == ppCtx || NULL == *ppCtx )
+		return;
+
+	pEncCtx = *ppCtx;
+
+	WelsLog( pEncCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), pCtx= %p, iThreadCount= %d, iMultipleThreadIdc= %d.\n", (void *)pEncCtx, pEncCtx->pSvcParam->iCountThreadsNum, pEncCtx->pSvcParam->iMultipleThreadIdc );
+
+#if defined(STAT_OUTPUT)
+	StatOverallEncodingExt( pEncCtx );
+#endif
+
+#if defined(MT_ENABLED)
+	if ( pEncCtx->pSvcParam->iMultipleThreadIdc > 1 && pEncCtx->pSliceThreading != NULL )
+	{
+		const int32_t iThreadCount = pEncCtx->pSvcParam->iCountThreadsNum;
+		int32_t iThreadIdx = 0;
+
+#if defined(WIN32)
+		if ( pEncCtx->pSliceThreading->pExitEncodeEvent != NULL )
+		{
+			// ask every thread that was created to exit ...
+			do {
+				if ( pEncCtx->pSliceThreading->pThreadHandles[iThreadIdx] != NULL )	// iThreadIdx is already created successfully
+				{
+					WelsEventSignal( &pEncCtx->pSliceThreading->pExitEncodeEvent[iThreadIdx] );
+				}
+				++ iThreadIdx;
+			} while(iThreadIdx < iThreadCount);
+
+			// ... then block until all of the iThreadCount finish events are signalled
+			WelsMultipleEventsWaitAllBlocking( iThreadCount, &pEncCtx->pSliceThreading->pFinSliceCodingEvent[0] );
+
+		}
+#elif defined(__GNUC__)
+		// cancel and join each thread one after another
+		for ( ; iThreadIdx < iThreadCount; ++ iThreadIdx )
+		{
+			int res = 0;
+			if ( pEncCtx->pSliceThreading->pThreadHandles[iThreadIdx] )
+			{
+				res = WelsThreadCancel( pEncCtx->pSliceThreading->pThreadHandles[iThreadIdx] );
+				WelsLog( pEncCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), WelsThreadCancel(pThreadHandles%d) return %d..\n", iThreadIdx, res);
+				res = WelsThreadJoin( pEncCtx->pSliceThreading->pThreadHandles[iThreadIdx] );	// waiting thread exit
+				WelsLog( pEncCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), pthread_join(pThreadHandles%d) return %d..\n", iThreadIdx, res);
+				pEncCtx->pSliceThreading->pThreadHandles[iThreadIdx] = 0;
+			}
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+			if ( pEncCtx->pSliceThreading->pUpdateMbListThrdHandles[iThreadIdx] )
+			{
+				res = WelsThreadCancel( pEncCtx->pSliceThreading->pUpdateMbListThrdHandles[iThreadIdx] );
+				WelsLog( pEncCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), WelsThreadCancel(pUpdateMbListThrdHandles%d) return %d..\n", iThreadIdx, res);
+				res = WelsThreadJoin( pEncCtx->pSliceThreading->pUpdateMbListThrdHandles[iThreadIdx] );	// waiting thread exit
+				WelsLog( pEncCtx, WELS_LOG_INFO, "WelsUninitEncoderExt(), pthread_join(pUpdateMbListThrdHandles%d) return %d..\n", iThreadIdx, res);
+				pEncCtx->pSliceThreading->pUpdateMbListThrdHandles[iThreadIdx] = 0;
+			}
+#endif//DYNAMIC_SLICE_ASSIGN && TRY_SLICING_BALANCE
+		}
+#endif//WIN32
+	}
+#endif//MT_ENABLED
+
+	// the pre-processing object is released before the context memory it was created with
+	if ( pEncCtx->pVpp )
+	{
+		delete pEncCtx->pVpp;
+		pEncCtx->pVpp = NULL;
+	}
+	FreeMemorySvc( ppCtx );
+	*ppCtx = NULL;
+}
+
+/*!
+ * \brief	look up the temporal id configured for a frame of the given dependency layer
+ * \param	fDlp		dependency layer parameters holding the uiCodingIdx2TemporalId table
+ * \param	kiFrameNum	frame number
+ * \param	kiGopSize	gop size; the coding index is kiFrameNum masked with (kiGopSize-1)
+ * \return	table entry for the coding index of the frame
+ */
+static inline int32_t GetTemporalLevel( SDLayerParam *fDlp, const int32_t kiFrameNum, const int32_t kiGopSize )
+{
+	return fDlp->uiCodingIdx2TemporalId[ kiFrameNum & (kiGopSize - 1) ];
+}
+
+/*!
+ * \brief	refresh slice idc and neighbour availability of all macroblocks of a frame
+ * \param	pSliceCtx	slice context giving the frame width/size in MBs and the MB to slice mapping
+ * \param	pMbList		macroblock list; entries 0 .. iMbNumInFrame-1 are updated
+ *
+ * A neighbour (left, top, top-left, top-right) is flagged available only if it lies
+ * inside the picture and maps to the same slice as the current macroblock.
+ */
+void DynslcUpdateMbNeighbourInfoListForAllSlices( SSliceCtx *pSliceCtx, SMB *pMbList )
+{
+	const int32_t kiFrameMbWidth	= pSliceCtx->iMbWidth;
+	const int32_t kiLastMbIdx		= pSliceCtx->iMbNumInFrame - 1;
+	SMB *pCurMb						= pMbList;
+	int32_t iMbIdx					= 0;
+
+	// do-while: the first entry is processed unconditionally
+	do {
+		const int32_t kiXY			= pCurMb->iMbXY;
+		const int32_t kiX			= pCurMb->iMbX;
+		const int32_t kiY			= pCurMb->iMbY;
+		const int32_t kiTopXY		= kiXY - kiFrameMbWidth;
+		const bool kbHasLeftCol		= (kiX > 0);
+		const bool kbHasRightCol	= (kiX < (kiFrameMbWidth-1));
+		const bool kbHasTopRow		= (kiY > 0);
+		uint32_t uiAvail			= 0;
+		int32_t iCurSliceIdc;
+
+		iCurSliceIdc		= WelsMbToSliceIdc(pSliceCtx, kiXY);
+		pCurMb->uiSliceIdc	= iCurSliceIdc;
+
+		// the slice of a neighbour is queried only once its position is known to be valid
+		if ( kbHasLeftCol && (iCurSliceIdc == WelsMbToSliceIdc(pSliceCtx, kiXY - 1)) )
+			uiAvail |= LEFT_MB_POS;
+		if ( kbHasTopRow && (iCurSliceIdc == WelsMbToSliceIdc(pSliceCtx, kiTopXY)) )
+			uiAvail |= TOP_MB_POS;
+		if ( kbHasLeftCol && kbHasTopRow && (iCurSliceIdc == WelsMbToSliceIdc(pSliceCtx, kiTopXY - 1)) )
+			uiAvail |= TOPLEFT_MB_POS;
+		if ( kbHasRightCol && kbHasTopRow && (iCurSliceIdc == WelsMbToSliceIdc(pSliceCtx, kiTopXY + 1)) )
+			uiAvail |= TOPRIGHT_MB_POS;
+
+		pCurMb->uiNeighborAvail	= (uint8_t)uiAvail;
+
+		++ pCurMb;
+		++ iMbIdx;
+	} while(iMbIdx <= kiLastMbIdx);
+}
+
+/*
+ * decide the number of partitions the current picture is split into:
+ * without MT_ENABLED, or if iMultipleThreadIdc is not larger than 1, always 1;
+ * otherwise iCountThreadsNum, except that P slices get 1 unless FIXED_PARTITION_ASSIGN is defined
+ */
+int32_t PicPartitionNumDecision( sWelsEncCtx *pCtx )
+{
+#ifdef MT_ENABLED
+	if ( pCtx->pSvcParam->iMultipleThreadIdc <= 1 )
+		return 1;
+#if !defined(FIXED_PARTITION_ASSIGN)
+	if ( P_SLICE == pCtx->eSliceType )
+		return 1;
+#endif//!FIXED_PARTITION_ASSIGN
+	return pCtx->pSvcParam->iCountThreadsNum;
+#else
+	return 1;
+#endif//MT_ENABLED
+}
+
+#if defined(MT_ENABLED)
+/*!
+ * \brief	(MT_ENABLED build) refresh the MB neighbour information of the current dq layer
+ * \param	pCtx	encoder context, pCurDqLayer is the layer being worked on
+ */
+void WelsInitCurrentQBLayerMltslc( sWelsEncCtx *pCtx )
+{
+	SDqLayer *pLayer = pCtx->pCurDqLayer;
+
+	//mb_neighbor
+	DynslcUpdateMbNeighbourInfoListForAllSlices( pLayer->pSliceEncCtx, pLayer->sMbDataP );
+}
+
+/*!
+ * \brief	split the macroblocks of a frame into iPartitionNum consecutive partitions (slices)
+ * \param	pSliceCtx		slice context; iSliceNumInFrame, pCountMbNumInSlice, pFirstMbInSlice
+ *							and pOverallMbMap are rewritten
+ * \param	iPartitionNum	requested number of partitions, clipped into [1, AVERSLICENUM_CONSTRAINT]
+ *
+ * Every partition gets iMbNumInFrame / iPartitionNum macroblocks, the last one takes
+ * whatever is left over.
+ */
+void UpdateSlicepEncCtxWithPartition( SSliceCtx *pSliceCtx, int32_t iPartitionNum )
+{
+	const int32_t kiFrameMbNum	= pSliceCtx->iMbNumInFrame;
+	int32_t iMbPerPartition		= 0;
+	int32_t iMbLeft				= kiFrameMbNum;	// macroblocks not assigned to any partition yet
+	int32_t iStartMb			= 0;			// first macroblock of the partition in work
+	int32_t iPart				= 0;
+
+	if ( iPartitionNum <= 0 )
+		iPartitionNum	= 1;
+	else if ( iPartitionNum > AVERSLICENUM_CONSTRAINT )
+		iPartitionNum	= AVERSLICENUM_CONSTRAINT;	// AVERSLICENUM_CONSTRAINT might be variable, however not fixed by MACRO
+
+	iMbPerPartition				= kiFrameMbNum / iPartitionNum;
+	pSliceCtx->iSliceNumInFrame	= iPartitionNum;
+
+	for ( iPart = 0; iPart < iPartitionNum; ++ iPart )
+	{
+		int32_t iMbInPart = 0;
+
+		// last partition collects the remainder
+		pSliceCtx->pCountMbNumInSlice[iPart]	= ( iPart + 1 == iPartitionNum ) ? iMbLeft : iMbPerPartition;
+		pSliceCtx->pFirstMbInSlice[iPart]		= iStartMb;
+		iMbInPart								= pSliceCtx->pCountMbNumInSlice[iPart];
+
+		// tag all macroblocks of this partition with the partition index
+		memset( pSliceCtx->pOverallMbMap+iStartMb, (uint8_t)iPart, iMbInPart*sizeof(uint8_t) );
+
+		// for next partition(or pSlice)
+		iStartMb	+= iMbInPart;
+		iMbLeft		-= iMbInPart;
+	}
+}
+
+/*!
+ * \brief	(MT_ENABLED build) set up the slice partitions of the current dependency layer
+ * \param	pCtx			encoder context, pCurDqLayer is the layer being worked on
+ * \param	iPartitionNum	number of partitions, forwarded to UpdateSlicepEncCtxWithPartition()
+ *
+ * For I slices a rough frame size in bytes is estimated (from bitrate and frame rate under
+ * rate control, from MB count and QP otherwise); a warning is logged if the configured slice
+ * size constraint is smaller than that estimate divided by iMaxSliceNumConstraint.
+ * The check only logs, it does not change any setting.
+ */
+void WelsInitCurrentDlayerMltslc( sWelsEncCtx *pCtx, int32_t iPartitionNum )
+{
+	SDqLayer* pCurDq				= pCtx->pCurDqLayer;
+	SSliceCtx* pSliceCtx		= pCurDq->pSliceEncCtx;
+
+	UpdateSlicepEncCtxWithPartition( pSliceCtx, iPartitionNum );
+
+	if ( I_SLICE == pCtx->eSliceType )//check if uiSliceSizeConstraint too small
+	{
+#define byte_complexIMBat26 (60)
+		uint8_t		iCurDid = pCtx->uiDependencyId;
+		uint32_t	uiFrmByte = 0;
+
+		if ( pCtx->pSvcParam->bEnableRc )
+		{//RC case
+			const uint32_t kuiBitrate	= (uint32_t)(pCtx->pSvcParam->sDependencyLayers[iCurDid].iSpatialBitrate);
+			uint32_t uiFrameRate		= (uint32_t)(pCtx->pSvcParam->sDependencyLayers[iCurDid].fInputFrameRate);
+
+			// a frame rate below 1 fps truncates to 0, never divide by it
+			if ( uiFrameRate < 1 )
+				uiFrameRate = 1;
+			uiFrmByte = ( ( kuiBitrate / uiFrameRate ) >> 3 );	// bits per frame -> bytes per frame
+		}
+		else
+		{//fixed QP case
+			const int32_t iTtlMbNumInFrame = pSliceCtx->iMbNumInFrame;
+			int32_t iQDeltaTo26 = ( 26 - pCtx->pSvcParam->sDependencyLayers[iCurDid].iDLayerQp );
+
+			uiFrmByte = (iTtlMbNumInFrame * byte_complexIMBat26);
+			if ( iQDeltaTo26 > 0 )
+			{
+				//smaller QP than 26
+				uiFrmByte = (uint32_t)( uiFrmByte * ( (float)iQDeltaTo26 / 4 ) );
+			}
+			else if ( iQDeltaTo26 < 0 )
+			{
+				//larger QP than 26
+				iQDeltaTo26 = ( (-iQDeltaTo26) >> 2 ); //delta divided by 4
+				uiFrmByte = ( uiFrmByte >> (iQDeltaTo26) ); //if delta 4, byte /2
+			}
+		}
+
+		//MINPACKETSIZE_CONSTRAINT
+		// a non-positive slice number constraint would make the division below invalid, skip the check then
+		if ( pSliceCtx->iMaxSliceNumConstraint > 0
+			&&
+			 pSliceCtx->uiSliceSizeConstraint
+			<
+			 (uint32_t)( uiFrmByte
+			 / ( pSliceCtx->iMaxSliceNumConstraint ) )
+			)
+		{
+
+			WelsLog( pCtx,
+				WELS_LOG_WARNING,
+				"Set-SliceConstraint(%d) too small for current resolution (MB# %d) under QP/BR!\n",
+				pSliceCtx->uiSliceSizeConstraint,
+				pSliceCtx->iMbNumInFrame
+				);
+		}
+	}
+
+	WelsInitCurrentQBLayerMltslc( pCtx );
+}
+#else
+/*!
+ * \brief	(build without MT_ENABLED) reset the slice context of the current dq layer to one
+ *			slice covering the whole frame, refresh the MB neighbour information and
+ *			initialize the first slice of the layer
+ * \param	pCtx	encoder context, pCurDqLayer is the layer being worked on
+ */
+void WelsInitCurrentQBLayerMltslc( sWelsEncCtx *pCtx )
+{
+	SDqLayer *pLayer			= pCtx->pCurDqLayer;
+	SSliceCtx *pSlcCtx			= pLayer->pSliceEncCtx;
+	SSlice *pFirstSlice			= &pLayer->sLayerInfo.pSliceInLayer[0];
+	const int32_t kiFrameMbNum	= pSlcCtx->iMbNumInFrame;
+	const int32_t kiOldSliceNum	= pSlcCtx->iSliceNumInFrame;	// tables are cleared for the slice number used so far
+
+	//pSliceCtx
+	memset( pSlcCtx->pOverallMbMap,			0, kiFrameMbNum * sizeof(uint8_t) );
+	memset( pSlcCtx->pCountMbNumInSlice,	0, kiOldSliceNum * sizeof(int32_t) );
+	memset( pSlcCtx->pFirstMbInSlice,		0, kiOldSliceNum * sizeof(int16_t) );
+	pSlcCtx->iSliceNumInFrame		= 1;
+	pSlcCtx->pCountMbNumInSlice[0]	= kiFrameMbNum;	// the only slice owns all macroblocks
+
+	//mb_neighbor
+	DynslcUpdateMbNeighbourInfoListForAllSlices( pSlcCtx, pLayer->sMbDataP );
+
+	//pSlice init
+	pFirstSlice->uiSliceIdx							= 0;
+	pFirstSlice->pSliceBsa							= &pCtx->pOut->sBsWrite;
+	pFirstSlice->bDynamicSlicingSliceSizeCtrlFlag	= false;
+	if ( pCtx->eSliceType == P_SLICE )
+		pFirstSlice->uiAssumeLog2BytePerMb	= 0;
+	else
+		pFirstSlice->uiAssumeLog2BytePerMb	= 1;
+}
+
+/*!
+ * \brief	(build without MT_ENABLED) set up the slice context of the current dependency layer
+ * \param	pCtx			encoder context, pCurDqLayer is the layer being worked on
+ * \param	iPartitionNum	not used in this build
+ *
+ * The frame MB count (iMbWidth * iMbHeight of the layer) is stored as iMbNumInFrame and as the
+ * MB count of slice 0. For I slices a rough frame size in bytes is estimated (from bitrate and
+ * frame rate under rate control, from MB count and QP otherwise); a warning is logged if the
+ * configured slice size constraint is smaller than that estimate divided by
+ * iMaxSliceNumConstraint. The check only logs, it does not change any setting.
+ */
+void WelsInitCurrentDlayerMltslc( sWelsEncCtx *pCtx, int32_t iPartitionNum )
+{
+	SDqLayer* pCurDq = pCtx->pCurDqLayer;
+	SSliceCtx* pSliceCtx = ( pCurDq->pSliceEncCtx );
+	int32_t iTtlMbNumInFrame = pCurDq->iMbHeight*pCurDq->iMbWidth;
+
+	pSliceCtx->iMbNumInFrame
+		= pSliceCtx->pCountMbNumInSlice[0] = iTtlMbNumInFrame;
+
+	if ( I_SLICE == pCtx->eSliceType )//check if uiSliceSizeConstraint too small
+	{
+#define byte_complexIMBat26 (60)
+		uint8_t		iCurDid = pCtx->uiDependencyId;
+		uint32_t	uiFrmByte = 0;
+
+		if ( pCtx->pSvcParam->bEnableRc )
+		{//RC case
+			const uint32_t kuiBitrate	= (uint32_t)(pCtx->pSvcParam->sDependencyLayers[iCurDid].iSpatialBitrate);
+			uint32_t uiFrameRate		= (uint32_t)(pCtx->pSvcParam->sDependencyLayers[iCurDid].fInputFrameRate);
+
+			// a frame rate below 1 fps truncates to 0, never divide by it
+			if ( uiFrameRate < 1 )
+				uiFrameRate = 1;
+			uiFrmByte = ( ( kuiBitrate / uiFrameRate ) >> 3 );	// bits per frame -> bytes per frame
+		}
+		else
+		{//fixed QP case
+			int32_t iQDeltaTo26 = ( 26 - pCtx->pSvcParam->sDependencyLayers[iCurDid].iDLayerQp );
+
+			uiFrmByte = (iTtlMbNumInFrame * byte_complexIMBat26);
+			if ( iQDeltaTo26 > 0 )
+			{
+				//smaller QP than 26
+				uiFrmByte = (uint32_t)( uiFrmByte * ( (float)iQDeltaTo26 / 4 ) );
+			}
+			else if ( iQDeltaTo26 < 0 )
+			{
+				//larger QP than 26
+				iQDeltaTo26 = ( (-iQDeltaTo26) >> 2 ); //delta divided by 4
+				uiFrmByte = ( uiFrmByte >> (iQDeltaTo26) ); //if delta 4, byte /2
+			}
+		}
+
+		//MINPACKETSIZE_CONSTRAINT
+		// a non-positive slice number constraint would make the division below invalid, skip the check then
+		if ( pSliceCtx->iMaxSliceNumConstraint > 0
+			&&
+			 pSliceCtx->uiSliceSizeConstraint
+			<
+			 (uint32_t)( uiFrmByte
+			 / ( pSliceCtx->iMaxSliceNumConstraint ) )
+			)
+		{
+
+			WelsLog( pCtx,
+				WELS_LOG_WARNING,
+				"Set-SliceConstraint(%d) too small for current resolution (MB# %d) under QP/BR!\n",
+				pSliceCtx->uiSliceSizeConstraint,
+				pSliceCtx->iMbNumInFrame
+				);
+		}
+	}
+
+	WelsInitCurrentQBLayerMltslc( pCtx );
+}
+#endif
+
+/*!
+ * \brief	initialize current layer: bind pictures, parameter set ids/pointers and the NAL header
+ *			extension of pCtx->pCurDqLayer, and propagate the per-slice header fields to every slice
+ *
+ * \param	pCtx		encoder context; nothing is done if pCtx->pCurDqLayer is NULL
+ * \param	kiWidth		not referenced by this function
+ * \param	kiHeight	not referenced by this function
+ */
+void WelsInitCurrentLayer(	sWelsEncCtx *pCtx,
+								const int32_t kiWidth,
+								const int32_t kiHeight )
+{
+	SDqLayer *pCurDq				= pCtx->pCurDqLayer;
+
+	// has to be tested before any member of the layer is read by the declarations below
+	if ( NULL == pCurDq )
+		return;
+
+ 	SWelsSvcCodingParam *pParam	= pCtx->pSvcParam;
+	SPicture *pEncPic					= pCtx->pEncPic;
+	SPicture *pDecPic					= pCtx->pDecPic;
+	SSlice *pBaseSlice				= &pCurDq->sLayerInfo.pSliceInLayer[0];
+	SSlice *pSlice					= NULL;
+	const uint8_t kiCurDid			= pCtx->uiDependencyId;
+	const bool_t kbUseSubsetSpsFlag= (kiCurDid > BASE_DEPENDENCY_ID);
+	SDLayerParam *fDlp				= &pParam->sDependencyLayers[kiCurDid];
+	SNalUnitHeaderExt *pNalHdExt	= &pCurDq->sLayerInfo.sNalHeaderExt;
+	SNalUnitHeader *pNalHd			= &pNalHdExt->sNalHeader;	
+	SDqIdc *pDqIdc						= &pCtx->pDqIdcMap[kiCurDid];
+	int32_t iIdx						= 0;
+	int32_t iSliceCount				= 0;
+	
+	pCurDq->pDecPic	= pDecPic;
+	
+	if ( fDlp->sMso.uiSliceMode == SM_DYN_SLICE )	// need get extra slices for update
+		iSliceCount = GetInitialSliceNum( pCurDq->iMbWidth, pCurDq->iMbHeight, &fDlp->sMso );
+	else
+		iSliceCount = GetCurrentSliceNum( pCurDq->pSliceEncCtx );
+	assert( iSliceCount > 0 );
+	
+	// parameter set ids and pointers of the first slice, shared with the layer info
+	pBaseSlice->sSliceHeaderExt.sSliceHeader.iPpsId	= pDqIdc->iPpsId;
+	pCurDq->sLayerInfo.pPpsP							=
+	pBaseSlice->sSliceHeaderExt.sSliceHeader.pPps		= &pCtx->pPPSArray[pBaseSlice->sSliceHeaderExt.sSliceHeader.iPpsId];	
+	pBaseSlice->sSliceHeaderExt.sSliceHeader.iSpsId	= pDqIdc->iSpsId;
+	if ( kbUseSubsetSpsFlag )
+	{
+		// layers above the base dependency id take their SPS from the subset SPS array
+		pCurDq->sLayerInfo.pSubsetSpsP					= &pCtx->pSubsetArray[pDqIdc->iSpsId];
+		pCurDq->sLayerInfo.pSpsP						=
+		pBaseSlice->sSliceHeaderExt.sSliceHeader.pSps	= &pCurDq->sLayerInfo.pSubsetSpsP->pSps;
+	}
+	else
+	{
+		pCurDq->sLayerInfo.pSubsetSpsP					= NULL;
+		pCurDq->sLayerInfo.pSpsP						=
+		pBaseSlice->sSliceHeaderExt.sSliceHeader.pSps	= &pCtx->pSpsArray[pBaseSlice->sSliceHeaderExt.sSliceHeader.iSpsId];
+	}
+
+	// copy the parameter set binding of the first slice to the remaining slices
+	pSlice = pBaseSlice;
+	for ( iIdx = 1; iIdx < iSliceCount; ++ iIdx )
+	{
+		++ pSlice;
+		pSlice->sSliceHeaderExt.sSliceHeader.iPpsId	= pBaseSlice->sSliceHeaderExt.sSliceHeader.iPpsId;
+		pSlice->sSliceHeaderExt.sSliceHeader.pPps	= pBaseSlice->sSliceHeaderExt.sSliceHeader.pPps;
+		pSlice->sSliceHeaderExt.sSliceHeader.iSpsId	= pBaseSlice->sSliceHeaderExt.sSliceHeader.iSpsId;
+		pSlice->sSliceHeaderExt.sSliceHeader.pSps	= pBaseSlice->sSliceHeaderExt.sSliceHeader.pSps;
+	}
+
+	// NAL unit header (extension) of the layer
+	memset( pNalHdExt, 0, sizeof(SNalUnitHeaderExt) );
+	pNalHd->uiNalRefIdc					= pCtx->eNalPriority;
+	pNalHd->eNalUnitType				= pCtx->eNalType;
+
+	pNalHdExt->uiDependencyId			= kiCurDid;
+	pNalHdExt->bDiscardableFlag		= (pCtx->bNeedPrefixNalFlag) ? (pNalHd->uiNalRefIdc == NRI_PRI_LOWEST) : false;
+	pNalHdExt->bIdrFlag				= (pCtx->iFrameNum == 0) && ((pCtx->eNalType == NAL_UNIT_CODED_SLICE_IDR) || (pCtx->eSliceType == I_SLICE));
+	pNalHdExt->uiTemporalId				= pCtx->uiTemporalId;
+	
+	// slice header extension is flagged for NAL_UNIT_CODED_SLICE_EXT only, same value for all slices
+	pBaseSlice->bSliceHeaderExtFlag	= (NAL_UNIT_CODED_SLICE_EXT == pNalHd->eNalUnitType);
+	
+	pSlice = pBaseSlice;
+	for ( iIdx = 1; iIdx < iSliceCount; ++ iIdx )
+	{
+		++ pSlice;
+		pSlice->bSliceHeaderExtFlag			= pBaseSlice->bSliceHeaderExtFlag;
+	}
+
+	// pEncPic pData
+	pCurDq->pEncData[0]		= pEncPic->pData[0];
+	pCurDq->pEncData[1]		= pEncPic->pData[1];
+	pCurDq->pEncData[2]		= pEncPic->pData[2];
+	pCurDq->iEncStride[0]	= pEncPic->iLineSize[0];
+	pCurDq->iEncStride[1]	= pEncPic->iLineSize[1];
+	pCurDq->iEncStride[2]	= pEncPic->iLineSize[2];
+	// cs pData
+	pCurDq->pCsData[0]		= pDecPic->pData[0];
+	pCurDq->pCsData[1]		= pDecPic->pData[1];
+	pCurDq->pCsData[2]		= pDecPic->pData[2];
+	pCurDq->iCsStride[0]	= pDecPic->iLineSize[0];
+	pCurDq->iCsStride[1]	= pDecPic->iLineSize[1];
+	pCurDq->iCsStride[2]	= pDecPic->iLineSize[2];		
+	
+	// a base layer is available exactly when a reference layer has been attached
+	if ( pCurDq->pRefLayer != NULL )
+	{
+		pCurDq->bBaseLayerAvailableFlag	= true;
+	}
+	else
+	{
+		pCurDq->bBaseLayerAvailableFlag	= false;
+	}
+}
+
+/*!
+ * \brief	select the motion search, mode decision and cost function pointers for the current slice
+ *
+ * The choice depends on two things only:
+ *  - slice type: P slices get motion search / inter mode decision / ME cost on top of the
+ *    intra related pointers, I slices get the intra related pointers only, any other slice
+ *    type leaves the function list untouched;
+ *  - whether the current layer is the highest dependency layer (uiDependencyId + 1 ==
+ *    iNumDependencyLayer): SAD based functions with the Vaa fine partition routines for the
+ *    highest layer, SATD based functions (including intra 4x4) otherwise.
+ * Availability of a base layer makes no difference to the selection.
+ *
+ * \param	pCtx	encoder context, pFuncList of it is updated in place
+ */
+void PreprocessSliceCoding( sWelsEncCtx *pCtx )
+{
+	SDqLayer *pCurLayer			= pCtx->pCurDqLayer;
+	const bool_t kbInterSlice	= ( P_SLICE == pCtx->eSliceType );
+
+	/* function pointers conditional assignment under sWelsEncCtx, layer_mb_enc_rec (in stack) is exclusive */
+
+	if ( !kbInterSlice && I_SLICE != pCtx->eSliceType )
+		return;
+
+	const bool_t kbHighestLayer	= ( pCurLayer->sLayerInfo.sNalHeaderExt.uiDependencyId+1 == pCtx->pSvcParam->iNumDependencyLayer );
+
+	if ( kbHighestLayer )
+	{
+		// SAD based mode decision; pfIntra4x4Combined3 is left as it is on this path
+		pCtx->pFuncList->sSampleDealingFuncs.pfMdCost = pCtx->pFuncList->sSampleDealingFuncs.pfSampleSad;
+		pCtx->pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3 = pCtx->pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad;
+		pCtx->pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3 = pCtx->pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad;
+		pCtx->pFuncList->pfIntraFineMd = WelsMdIntraFinePartitionVaa;
+		if ( kbInterSlice )
+		{
+			pCtx->pFuncList->pfMotionSearch = WelsMotionEstimateSearchSad;
+			pCtx->pFuncList->pfInterFineMd = WelsMdInterFinePartitionVaa;
+		}
+	}
+	else
+	{
+		// SATD based mode decision
+		pCtx->pFuncList->sSampleDealingFuncs.pfMdCost = pCtx->pFuncList->sSampleDealingFuncs.pfSampleSatd;
+		pCtx->pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3 = pCtx->pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd;
+		pCtx->pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3 = pCtx->pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd;
+		pCtx->pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3 = pCtx->pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd;
+		pCtx->pFuncList->pfIntraFineMd = WelsMdIntraFinePartition;
+		if ( kbInterSlice )
+		{
+			pCtx->pFuncList->pfMotionSearch  = WelsMotionEstimateSearchSatd;
+			pCtx->pFuncList->pfInterFineMd = WelsMdInterFinePartition;
+		}
+	}
+
+	if ( kbInterSlice )
+	{
+		// common to both layer cases of a P slice: ME cost is always SATD
+		pCtx->pFuncList->pfFirstIntraMode = WelsMdFirstIntraMode;
+		pCtx->pFuncList->sSampleDealingFuncs.pfMeCost = pCtx->pFuncList->sSampleDealingFuncs.pfSampleSatd;
+	}
+}
+
+/*!
+ * \brief	step from the current pDq layer to the one at the next dependency id; the layer
+ *			left behind becomes the reference layer of the new current layer
+ *
+ * \param	pCtx	encoder context, pCurDqLayer of it is replaced
+ */
+
+static inline void WelsSwapDqLayers( sWelsEncCtx *pCtx )
+{
+	SDqLayer *pLowerLayer	= pCtx->pCurDqLayer;
+	SDqLayer *pUpperLayer	= pCtx->ppDqLayerList[1 + pCtx->uiDependencyId];
+
+	pCtx->pCurDqLayer	= pUpperLayer;
+	pCtx->pCurDqLayer->pRefLayer	= pLowerLayer;
+}
+
+/*!
+ * \brief	prefetch reference picture after WelsBuildRefList
+ *
+ * \param	pCtx		encoder context; pRefPic of the context and of the current layer are updated,
+ *						as well as uiRefIndex in the header of every slice of the current layer
+ * \param	keFrameType	type of the frame being coded, IDR frames get no reference picture
+ */
+static inline void PrefetchReferencePicture( sWelsEncCtx *pCtx, const EFrameType keFrameType )
+{
+	SSlice *pSliceList			= &pCtx->pCurDqLayer->sLayerInfo.pSliceInLayer[0];
+	const int32_t kiSliceNum	= GetCurrentSliceNum( pCtx->pCurDqLayer->pSliceEncCtx );
+	uint8_t uiRefIndex			= (uint8_t)-1;	// value the slices keep when no reference is used (IDR)
+	int32_t iSliceIdx			= 0;
+
+	assert( kiSliceNum > 0 );
+
+	if ( WELS_FRAME_TYPE_IDR == keFrameType )
+	{
+		// safe for IDR coding
+		pCtx->pRefPic				= NULL;
+		pCtx->pCurDqLayer->pRefPic	= NULL;
+	}
+	else
+	{
+		assert( pCtx->iNumRef0 > 0 );
+		// always get item 0 due to reordering done
+		pCtx->pRefPic				= pCtx->pRefList0[0];
+		pCtx->pCurDqLayer->pRefPic	= pCtx->pRefPic;
+		uiRefIndex					= 0;	// reordered reference index
+	}
+
+	for ( iSliceIdx = 0; iSliceIdx < kiSliceNum; ++ iSliceIdx )
+	{
+		pSliceList[iSliceIdx].sSliceHeaderExt.sSliceHeader.uiRefIndex	= uiRefIndex;
+	}
+}
+
+
+/*!
+ * \brief	move one parameter set onto the next free id of the bitstream and look up the id
+ *			to be handed out by the following call
+ *
+ * The id written to the bitstream is (encoder id + iParaSetIdDelta[encoder id]). Each call
+ * releases the bitstream id the parameter set held so far, assigns uiNextParaSetIdToUseInBs
+ * to it by rewriting its delta, and then advances uiNextParaSetIdToUseInBs to the following
+ * id which is not marked as used, wrapping to 0 on reaching kuiMaxIdInBs. As a result the
+ * id in the bitstream of a given encoder side parameter set changes from call to call.
+ *
+ * \param	sParaSetOffsetVariable	id bookkeeping of one parameter set type, updated in place
+ * \param	kiCurEncoderParaSetId	id of the parameter set on encoder side
+ * \param	kuiMaxIdInBs			ids in the bitstream stay below this value
+ */
+void ParasetIdAdditionIdAdjust( SParaSetOffsetVariable *sParaSetOffsetVariable, const int32_t kiCurEncoderParaSetId, const uint32_t kuiMaxIdInBs )
+{
+	const int32_t kiId			= kiCurEncoderParaSetId;
+	// id in the bitstream this parameter set holds up to now
+	const uint32_t kuiOldBsId	= sParaSetOffsetVariable->iParaSetIdDelta[kiId] + kiId;
+	const bool_t *kpInUse		= &sParaSetOffsetVariable->bUsedParaSetIdInBs[0];
+	uint32_t uiBsId				= sParaSetOffsetVariable->uiNextParaSetIdToUseInBs;
+
+#if _DEBUG
+	// a non-zero delta means an id was handed out before, it has to be marked as used
+	assert ( 0 == sParaSetOffsetVariable->iParaSetIdDelta[kiId] || sParaSetOffsetVariable->bUsedParaSetIdInBs[kuiOldBsId] );
+#endif
+
+	// rebind the parameter set: new delta first, then release the old id and claim the new one
+	sParaSetOffsetVariable->iParaSetIdDelta[kiId]			= uiBsId - kiId;
+	sParaSetOffsetVariable->bUsedParaSetIdInBs[kuiOldBsId]	= false;
+	sParaSetOffsetVariable->bUsedParaSetIdInBs[uiBsId]		= true;
+
+	// search the next id not in use, starting right behind the one just claimed
+	for ( ;; )
+	{
+		uiBsId = ( uiBsId + 1 < kuiMaxIdInBs ) ? ( uiBsId + 1 ) : 0;	// wrap around, never reach kuiMaxIdInBs
+		if ( !kpInUse[uiBsId] )
+			break;
+	}
+
+	sParaSetOffsetVariable->uiNextParaSetIdToUseInBs = uiBsId;
+
+#if _DEBUG
+	assert ( !sParaSetOffsetVariable->bUsedParaSetIdInBs[uiBsId] ); // the id to hand out next must be free
+#endif
+
+}
+
+/*!
+ * \brief	write all parameter sets introduced in SVC extension: every SPS / subset SPS first,
+ *			every PPS afterwards, each one as a NAL appended to pCtx->pFrameBs
+ *
+ * \param	pCtx	encoder context, iPosBsBuffer of it is advanced by the bytes written
+ * \param	pNalLen	output, length of each NAL written (one entry per parameter set)
+ * \param	pNumNal	output, count of NALs written
+ * \return	size in bytes of bitstream wrote, 0 if any of the arguments is NULL
+ */
+int32_t WelsWriteParameterSets( sWelsEncCtx *pCtx, int32_t *pNalLen, int32_t *pNumNal )
+{
+	int32_t iTotalSize	= 0;
+	int32_t iNalCnt		= 0;
+	int32_t iIdx		= 0;
+
+	if ( NULL == pCtx || NULL == pNalLen || NULL == pNumNal )
+		return 0;
+
+	/* write all SPS */
+	for ( iIdx = 0; iIdx < pCtx->iSpsNum; ++ iIdx, ++ iNalCnt )
+	{
+		const int32_t kiDid			= pCtx->pDqIdcMap[iIdx].uiSpatialId;
+		const bool_t kbSubsetSps	= (kiDid > BASE_DEPENDENCY_ID);
+		// NAL list index sampled before the NAL is loaded, used to pick the NAL to encode below
+		const int32_t kiNalPos		= pCtx->pOut->iNalIndex;
+
+		if ( pCtx->pSvcParam->bEnableSpsPpsIdAddition )
+		{
+#if _DEBUG
+			pCtx->sPSOVector.bEnableSpsPpsIdAddition = 1;
+			assert(kiDid < MAX_DEPENDENCY_LAYER);
+			assert(iIdx < MAX_DQ_LAYER_NUM);
+#endif
+
+			// give the parameter set a fresh id in the bitstream before it is written
+			if ( kbSubsetSps )
+			{
+				ParasetIdAdditionIdAdjust( &(pCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_SUBSETSPS]),
+					pCtx->pSubsetArray[iIdx - 1].pSps.uiSpsId,
+					MAX_SPS_COUNT );
+			}
+			else
+			{
+				ParasetIdAdditionIdAdjust( &(pCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_AVCSPS]),
+					pCtx->pSpsArray[0].uiSpsId,
+					MAX_SPS_COUNT );
+			}
+		}
+		else
+		{
+			memset(&(pCtx->sPSOVector), 0, sizeof(pCtx->sPSOVector)  );
+		}
+
+		if ( kbSubsetSps )
+		{
+			/* generate Subset SPS, entry (iIdx - 1) of the subset array */
+			WelsLoadNal( pCtx->pOut, NAL_UNIT_SUBSET_SPS, NRI_PRI_HIGHEST );
+			WelsWriteSubsetSpsSyntax( &pCtx->pSubsetArray[iIdx - 1], &pCtx->pOut->sBsWrite, &(pCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_SUBSETSPS].iParaSetIdDelta[0]) );
+			WelsUnloadNal( pCtx->pOut );
+		}
+		else
+		{
+			/* generate sequence parameters set, always entry 0 of the SPS array */
+			WelsLoadNal( pCtx->pOut, NAL_UNIT_SPS, NRI_PRI_HIGHEST );
+			WelsWriteSpsNal( &pCtx->pSpsArray[0], &pCtx->pOut->sBsWrite,  &(pCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_AVCSPS].iParaSetIdDelta[0]) );
+			WelsUnloadNal( pCtx->pOut );
+		}
+
+		pNalLen[iNalCnt] = WelsEncodeNal( &pCtx->pOut->sNalList[kiNalPos], pCtx->pFrameBs + pCtx->iPosBsBuffer, &pNalLen[iNalCnt] );
+
+		pCtx->iPosBsBuffer	+= pNalLen[iNalCnt];
+		iTotalSize			+= pNalLen[iNalCnt];
+	}
+
+	/* write all PPS */
+	for ( iIdx = 0; iIdx < pCtx->iPpsNum; ++ iIdx, ++ iNalCnt )
+	{
+		if ( pCtx->pSvcParam->bEnableSpsPpsIdAddition )
+		{
+			// PPS ids wrap at MAX_PPS_COUNT
+			ParasetIdAdditionIdAdjust( &pCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_PPS], pCtx->pPPSArray[iIdx].iPpsId, MAX_PPS_COUNT );
+		}
+
+		const int32_t kiNalPos	= pCtx->pOut->iNalIndex;
+
+		/* generate picture parameter set */
+		WelsLoadNal( pCtx->pOut, NAL_UNIT_PPS, NRI_PRI_HIGHEST );
+		WelsWritePpsSyntax( &pCtx->pPPSArray[iIdx], &pCtx->pOut->sBsWrite, &(pCtx->sPSOVector) );
+		WelsUnloadNal( pCtx->pOut );
+
+		pNalLen[iNalCnt] = WelsEncodeNal( &pCtx->pOut->sNalList[kiNalPos], pCtx->pFrameBs + pCtx->iPosBsBuffer, &pNalLen[iNalCnt] );
+
+		pCtx->iPosBsBuffer	+= pNalLen[iNalCnt];
+		iTotalSize			+= pNalLen[iNalCnt];
+	}
+
+	*pNumNal = iNalCnt;
+
+	return iTotalSize;
+}
+
+/*!
+ * \brief	append a prefix NAL unit for the current layer to the frame bitstream
+ *
+ * The prefix NAL RBSP syntax is written only if keNalRefIdc differs from NRI_PRI_LOWEST;
+ * otherwise the NAL is encoded without any payload syntax, but still with the NAL unit
+ * header extension.
+ *
+ * \param	pCtx			encoder context, iPosBsBuffer of it is advanced by the bytes written
+ * \param	pLayerBsInfo	layer info receiving the NAL length at index *pNalIdxInLayer
+ * \param	pNalLen			array of NAL lengths, entry *pNalIdxInLayer is handed to WelsEncodeNalExt
+ * \param	pNalIdxInLayer	in/out, index of the NAL within the layer, incremented by one
+ * \param	keNalType		NAL type of the slice which follows
+ * \param	keNalRefIdc		nal_ref_idc of the slice which follows
+ * \return	value returned by WelsEncodeNalExt() for the prefix NAL
+ */
+static inline int32_t AddPrefixNal(	sWelsEncCtx *pCtx,
+									 SLayerBSInfo *pLayerBsInfo,
+									 int32_t *pNalLen,
+									 int32_t *pNalIdxInLayer,
+									 const EWelsNalUnitType keNalType,
+									 const EWelsNalRefIdc keNalRefIdc	)
+{
+	int32_t iPrefixSize = 0;
+
+	WelsLoadNal( pCtx->pOut, NAL_UNIT_PREFIX, keNalRefIdc );
+	if ( NRI_PRI_LOWEST != keNalRefIdc )
+	{
+		// prefix NAL Unit RBSP syntax is needed for this priority only
+		WelsWriteSVCPrefixNal( &pCtx->pOut->sBsWrite, keNalRefIdc, (NAL_UNIT_CODED_SLICE_IDR == keNalType) );
+	}
+	WelsUnloadNal( pCtx->pOut );
+
+	// the NAL just unloaded is the last one of the list
+	iPrefixSize = WelsEncodeNalExt(	&pCtx->pOut->sNalList[pCtx->pOut->iNalIndex-1],
+		&pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt,
+		pCtx->pFrameBs + pCtx->iPosBsBuffer,
+		&pNalLen[*pNalIdxInLayer]	);
+
+	pCtx->iPosBsBuffer	+= iPrefixSize;
+	pLayerBsInfo->iNalLengthInByte[*pNalIdxInLayer]	= iPrefixSize;
+	++ (*pNalIdxInLayer);
+
+	return iPrefixSize;
+}
+
+/*!
+ * \brief	append a filler data NAL holding iLen bytes of 0xff to the frame bitstream
+ *
+ * \param	pCtx	encoder context, iPosBsBuffer of it is advanced by the bytes written
+ * \param	iLen	count of 0xff padding bytes to write
+ * \return	value returned by WelsEncodeNal() for the filler NAL, or 0 if nothing was written
+ *			because the bit string buffer has less than iLen bytes left or the NAL list is full
+ */
+int32_t WritePadding(sWelsEncCtx *pCtx, int32_t iLen)
+{
+	int32_t i=0;
+	int32_t iNal	= 0;
+	SBitStringAux	*pBs = NULL;	
+	int32_t iNalLen	= 0;	// handed to WelsEncodeNal() by address, so do not leave it indeterminate
+	int32_t iSize=0;
+	
+	iNal	= pCtx->pOut->iNalIndex;
+	pBs	=	&pCtx->pOut->sBsWrite;	// SBitStringAux instance the filler NAL is written with
+	
+	if((pBs->pBufEnd - pBs->pBufPtr) < iLen || iNal >= pCtx->pOut->iCountNals)
+	{
+#if GOM_TRACE_FLAG
+		// the pointer difference is wider than int on 64-bit targets, narrow it to match %d
+		WelsLog( pCtx, WELS_LOG_ERROR,"[RC] paddingcal pBuffer overflow, bufferlen=%d, paddinglen=%d, iNalIdx= %d, iCountNals= %d\n",
+			(int32_t)(pBs->pBufEnd-pBs->pBufPtr), iLen, iNal, pCtx->pOut->iCountNals);
+#endif
+		return 0;
+	}
+
+	WelsLoadNal( pCtx->pOut, NAL_UNIT_FILLER_DATA, NRI_PRI_LOWEST );
+	
+	for(i=0;i<iLen;i++)
+	{
+		BsWriteBits( pBs, 8, 0xff);
+	}
+	
+	BsRbspTrailingBits( pBs );
+
+	BsFlush( pBs );
+	
+	WelsUnloadNal( pCtx->pOut );
+	// iNal was sampled before WelsLoadNal(), it indexes the NAL written above
+	iNalLen = WelsEncodeNal( &pCtx->pOut->sNalList[iNal], pCtx->pFrameBs + pCtx->iPosBsBuffer, &iNalLen );
+	
+	pCtx->iPosBsBuffer	+= iNalLen;
+	iSize				+= iNalLen;
+	
+	return iSize;
+}
+
+/*
+ * post process of dynamic slicing bs writing in case PACKING_ONE_SLICE_PER_LAYER
+ * include: count bs size over all the slices in layer (stored to *pLayerSize) and
+ *          pack the used pLayerBsInfo items to the front of the array
+ * pCtx: encoder context
+ * pLayerBsInfo: array of layer bs info, one item per slice index, reordered in place
+ * pLayerSize: output, sum of pCountBsSizeInPartition over all partitions
+ * kiPartitionCnt: number of partitions of the picture
+ * return: count number of slices in layer
+ */
+#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
+int32_t PostProcDynamicSlicingBsWriting( sWelsEncCtx *pCtx, SLayerBSInfo *pLayerBsInfo, int32_t *pLayerSize, const int32_t kiPartitionCnt )
+{
+	SDqLayer *pCurDq		= pCtx->pCurDqLayer;
+	int32_t iPartitionIdx	= 0;
+	int32_t iCheckingIdx	= 0;	// slice index currently examined
+	int32_t iSwappingIdx	= -1;	// first free item to move the next used item to, < 0 until a gap is met
+	int32_t iSliceCount		= 0;
+	int32_t iLayerSize		= 0;
+
+	// count number of slices in layer and layer size
+	while(iPartitionIdx < kiPartitionCnt)
+	{
+		const int32_t coded_slice_cnt = pCurDq->pNumSliceCodedOfPartition[iPartitionIdx];		
+		iLayerSize += pCtx->pSliceThreading->pCountBsSizeInPartition[iPartitionIdx];
+		iSliceCount += coded_slice_cnt;
+		++ iPartitionIdx;
+	}
+	*pLayerSize	= iLayerSize;
+
+	// reordering pLayerBs pointers, but do not ensure raster scan order of picture
+	// just maintain discontinuous items,i.e,
+	// input:
+	// partition 1: uiSliceIdx: 0 2 4 6
+	// partition 2: uiSliceIdx: 1 3 5 7 9 11 13
+	// output:
+	// uiSliceIdx: 0 1 2 3 4 5 6 7 8 9 10
+	iCheckingIdx = 0;						
+	while(true)
+	{
+		bool_t bMatchFlag = false;
+		iPartitionIdx = 0;							
+		while(iPartitionIdx < kiPartitionCnt)
+		{
+			const int32_t coded_slice_cnt = pCurDq->pNumSliceCodedOfPartition[iPartitionIdx];
+			// iCheckingIdx need convert to iIndex of iPartitionIdx based to avoid linear searching
+			// belong this partition and not exceed the number of slices coded in partition
+			if ( iPartitionIdx == (iCheckingIdx % kiPartitionCnt)
+				&& iCheckingIdx / kiPartitionCnt < coded_slice_cnt )
+			{
+				if ( iSwappingIdx >= 0 )
+				{
+					// a gap was met before: move this used item down to the first free position
+					memmove(pLayerBsInfo+iSwappingIdx, pLayerBsInfo+iCheckingIdx, sizeof(SLayerBSInfo));	// confirmed_safe_unsafe_usage
+					++ iSwappingIdx;	// record iSwappingIdx
+				}
+				++ iCheckingIdx;
+				bMatchFlag = true;
+				break;
+			}
+			++ iPartitionIdx;
+		}
+		if ( !bMatchFlag )
+		{
+			// iCheckingIdx is not a coded slice: remember the first gap, then skip it
+			if ( iSwappingIdx < 0 )
+				iSwappingIdx = iCheckingIdx;
+			++ iCheckingIdx;
+		}
+		// done once the first free position is behind all the coded slices
+		if ( iSwappingIdx >= iSliceCount )
+			break;
+	}
+
+	return iSliceCount;
+}
+#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
+
+/*
+ * Force coding IDR as follows: flag the current frame to be encoded as IDR
+ * and restart the coding index from 0.
+ * return: 0 on success, 1 if pCtx is NULL
+ */
+int32_t ForceCodingIDR( sWelsEncCtx *pCtx )
+{
+	if ( pCtx != NULL )
+	{
+		pCtx->bEncCurFrmAsIdrFlag	= true;
+		pCtx->iCodingIndex			= 0;
+		return 0;
+	}
+
+	return 1;
+}
+
+/*!
+ * \brief	core svc encoding process
+ *
+ * \pParam	pCtx			sWelsEncCtx*, encoder context
+ * \pParam	pDst			FrameBSInfo*
+ * \pParam	pSrc			SSourcePicture* for need_ds = true or SSourcePicture** for need_ds = false
+ * \pParam	iConfiguredLayerNum	=1 in case need_ds = true or >1 in case need_ds = false
+ * \pParam	need_ds		Indicate whether need down sampling desired
+ *						[NO in picture list case, YES in console aplication based]
+ * \return	EFrameType (WELS_FRAME_TYPE_IDR/WELS_FRAME_TYPE_I/WELS_FRAME_TYPE_P)
+ */
+int32_t WelsEncoderEncodeExt( sWelsEncCtx *pCtx, void *pDst, const SSourcePicture **ppSrcList, const int32_t iConfiguredLayerNum )
+{
+	SFrameBSInfo *pFbi					= (SFrameBSInfo *)pDst;
+	SLayerBSInfo *pLayerBsInfo					= &pFbi->sLayerInfo[0];
+	SWelsSvcCodingParam *pSvcParam	= pCtx->pSvcParam;
+	SSpatialPicIndex *pSpatialIndexMap= &pCtx->sSpatialIndexMap[0];
+#if defined(ENABLE_FRAME_DUMP) || defined(ENABLE_PSNR_CALC)
+	SPicture *fsnr						= NULL;
+#endif//ENABLE_FRAME_DUMP || ENABLE_PSNR_CALC
+	SPicture *pEncPic						= NULL;	// to be decided later
+#if defined(MT_ENABLED) && (defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG))
+	int32_t did_list[MAX_DEPENDENCY_LAYER]	= {0};	
+#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN
+	int32_t iLayerNum					= 0;
+	int32_t iLayerSize					= 0;
+	int32_t iSpatialNum					= 0; // available count number of spatial layers due to frame size changed in this given frame
+	int32_t iSpatialIdx					= 0; // iIndex of spatial layers due to frame size changed in this given frame
+	int32_t iFrameSize					= 0;
+	int32_t iNalLen[128]				= {0};
+	int32_t iNalIdxInLayer			= 0;
+	int32_t iCountNal					= 0;
+	EFrameType eFrameType				= WELS_FRAME_TYPE_AUTO;	
+	int32_t iCurWidth					= 0;
+	int32_t iCurHeight					= 0;
+	EWelsNalUnitType eNalType			= NAL_UNIT_UNSPEC_0;
+	EWelsNalRefIdc eNalRefIdc			= NRI_PRI_LOWEST;
+	int8_t iCurDid						= 0;
+	int8_t iCurTid						= 0;
+	bool_t bAvcBased					= false;
+#if defined(ENABLE_PSNR_CALC)
+	real32_t snr_y = .0f, snr_u = .0f, snr_v = .0f;
+#endif//ENABLE_PSNR_CALC
+
+#if defined(_DEBUG)
+	int32_t i = 0, j = 0, k = 0;
+#endif//_DEBUG
+
+	pFbi->iLayerNum	= 0;	// for initialization
+
+	// perform csc/denoise/downsample/padding, generate spatial layers
+	iSpatialNum = pCtx->pVpp->WelsPreprocessStep1(pCtx, ppSrcList, iConfiguredLayerNum);	
+	if ( iSpatialNum < 1 )	// skip due to temporal layer settings (different frame rate)
+	{
+		++ pCtx->iCodingIndex;
+		return WELS_FRAME_TYPE_SKIP;
+	}
+
+	eFrameType = DecideFrameType( pCtx, iSpatialNum );
+	if (eFrameType == WELS_FRAME_TYPE_SKIP)
+		return eFrameType;
+
+	InitFrameCoding( pCtx, eFrameType );
+
+	iCurTid	= GetTemporalLevel( &pSvcParam->sDependencyLayers[pSpatialIndexMap->iDid], pCtx->iCodingIndex, pSvcParam->uiGopSize );
+	pCtx->uiTemporalId	= iCurTid;
+	
+	pLayerBsInfo->pBsBuf	= pCtx->pFrameBs ;
+
+	if ( eFrameType == WELS_FRAME_TYPE_IDR  )
+	{
+		++ pCtx->sPSOVector.uiIdrPicId;
+		//if ( pSvcParam->bEnableSSEI )
+		
+		// write parameter sets bitstream here
+		WelsWriteParameterSets( pCtx, &iNalLen[0], &iCountNal );
+
+		pLayerBsInfo->uiPriorityId	= 0;
+		pLayerBsInfo->uiSpatialId		= 0;
+		pLayerBsInfo->uiTemporalId	= 0;
+		pLayerBsInfo->uiQualityId		= 0;
+		pLayerBsInfo->uiLayerType		= NON_VIDEO_CODING_LAYER;
+		pLayerBsInfo->iNalCount		= iCountNal;
+		for (int32_t iNalIndex	= 0; iNalIndex < iCountNal; ++ iNalIndex)
+		{
+			pLayerBsInfo->iNalLengthInByte[iNalIndex]	= iNalLen[iNalIndex];
+		}
+
+		++ pLayerBsInfo;
+		pLayerBsInfo->pBsBuf			= pCtx->pFrameBs + pCtx->iPosBsBuffer;
+		++ iLayerNum;
+	}
+
+	pCtx->pCurDqLayer				= pCtx->ppDqLayerList[pSpatialIndexMap->iDid];
+	pCtx->pCurDqLayer->pRefLayer	= NULL;
+
+	while ( iSpatialIdx < iSpatialNum )
+	{		
+		const int32_t d_idx			= (pSpatialIndexMap+iSpatialIdx)->iDid;	// get iDid
+		SDLayerParam *param_d		= &pSvcParam->sDependencyLayers[d_idx];			
+
+		pCtx->uiDependencyId	= iCurDid = (int8_t)d_idx;
+		pCtx->pVpp->WelsPreprocessStep3(pCtx, d_idx);
+
+		pCtx->pEncPic	 = pEncPic = (pSpatialIndexMap+iSpatialIdx)->pSrc;
+		pCtx->pEncPic->iPictureType	= pCtx->eSliceType;
+		pCtx->pEncPic->iFramePoc		= pCtx->iPOC;
+
+		iCurWidth	= param_d->iFrameWidth;
+		iCurHeight	= param_d->iFrameHeight;
+
+#if defined(MT_ENABLED) && (defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG))
+		did_list[iSpatialIdx]	= iCurDid;
+#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN
+		
+		// Encoding this picture might mulitiple sQualityStat layers potentially be encoded as followed
+
+		switch ( param_d->sMso.uiSliceMode )
+		{
+		case SM_FIXEDSLCNUM_SLICE:
+			{
+#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN)	
+				if ( (iCurDid > 0) && (pSvcParam->iMultipleThreadIdc > 1) &&
+					(pSvcParam->sDependencyLayers[iCurDid].sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE && pSvcParam->iMultipleThreadIdc >= pSvcParam->sDependencyLayers[iCurDid].sMso.sSliceArgument.iSliceNum )
+					) 
+					AdjustEnhanceLayer( pCtx, iCurDid );
+#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN
+				break;
+			}
+		case SM_DYN_SLICE:
+			{
+				int32_t iPicIPartitionNum = PicPartitionNumDecision( pCtx );
+				// MT compatibility
+				pCtx->iActiveThreadsNum	= iPicIPartitionNum;	// we try to active number of threads, equal to number of picture partitions
+				WelsInitCurrentDlayerMltslc( pCtx, iPicIPartitionNum );
+				break;
+			}
+		default:
+			{
+				break;
+			}
+		}
+
+		/* coding each spatial layer, only one sQualityStat layer within spatial support */
+		int32_t iSliceCount	= 1;			
+		if ( iLayerNum >= MAX_LAYER_NUM_OF_FRAME )	// check available layer_bs_info writing as follows
+		{
+			WelsLog( pCtx, WELS_LOG_ERROR, "WelsEncoderEncodeExt(), iLayerNum(%d) overflow(max:%d)!", iLayerNum, MAX_LAYER_NUM_OF_FRAME);
+			return -1;
+		}
+
+		iNalIdxInLayer	= 0;
+		bAvcBased	= (iCurDid == BASE_DEPENDENCY_ID);
+		pCtx->bNeedPrefixNalFlag	= (bAvcBased && 
+			(pSvcParam->bPrefixNalAddingCtrl || 
+			(pSvcParam->iNumDependencyLayer > 1) ));
+
+		if ( eFrameType == WELS_FRAME_TYPE_P )
+		{
+			eNalType	= bAvcBased ? NAL_UNIT_CODED_SLICE : NAL_UNIT_CODED_SLICE_EXT;					
+		}
+		else if ( eFrameType == WELS_FRAME_TYPE_IDR )
+		{
+			eNalType	= bAvcBased ? NAL_UNIT_CODED_SLICE_IDR : NAL_UNIT_CODED_SLICE_EXT;
+		}
+		if ( iCurTid == 0 || pCtx->eSliceType == I_SLICE )
+			eNalRefIdc	= NRI_PRI_HIGHEST;
+		else if ( iCurTid == pSvcParam->iDecompStages )
+			eNalRefIdc	= NRI_PRI_LOWEST;
+		else if ( 1 + iCurTid == pSvcParam->iDecompStages )
+			eNalRefIdc	= NRI_PRI_LOW;
+		else	// more details for other temporal layers?
+			eNalRefIdc	= NRI_PRI_HIGHEST;
+		pCtx->eNalType		= eNalType;
+		pCtx->eNalPriority	= eNalRefIdc;				
+
+		pCtx->pDecPic					= pCtx->ppRefPicListExt[iCurDid]->pNextBuffer;
+#if defined(ENABLE_FRAME_DUMP) || defined(ENABLE_PSNR_CALC)
+		fsnr					= pCtx->pDecPic;
+#endif//#if defined(ENABLE_FRAME_DUMP) || defined(ENABLE_PSNR_CALC)
+		pCtx->pDecPic->iPictureType	= pCtx->eSliceType;				
+		pCtx->pDecPic->iFramePoc		= pCtx->iPOC;				
+
+		WelsInitCurrentLayer( pCtx, iCurWidth, iCurHeight );
+
+		WelsMarkPic(pCtx);
+		if ( !WelsBuildRefList( pCtx, pCtx->iPOC ) )
+		{
+			// Force coding IDR as followed
+			ForceCodingIDR( pCtx );
+			WelsLog(pCtx, WELS_LOG_WARNING, "WelsEncoderEncodeExt(), WelsBuildRefList failed for P frames, pCtx->iNumRef0= %d.\n", pCtx->iNumRef0);
+			return -1;
+		}
+#ifdef LONG_TERM_REF_DUMP
+		dump_ref(pCtx);
+#endif
+		WelsUpdateRefSyntax(pCtx,  pCtx->iPOC, eFrameType);	//get reordering syntax used for writing slice header and transmit to encoder.
+		PrefetchReferencePicture( pCtx, eFrameType );	// update reference picture for current pDq layer
+
+		pCtx->pFuncList->pfRc.pfWelsRcPictureInit(pCtx);
+		PreprocessSliceCoding( pCtx );	// MUST be called after pfWelsRcPictureInit() and WelsInitCurrentLayer()
+
+		iLayerSize	= 0;
+		if ( SM_SINGLE_SLICE == param_d->sMso.uiSliceMode )	// only one slice within a sQualityStat layer
+		{
+			int32_t iSliceSize = 0;					
+			
+			if ( pCtx->bNeedPrefixNalFlag )
+			{
+				iLayerSize += AddPrefixNal( pCtx, pLayerBsInfo, &iNalLen[0], &iNalIdxInLayer, eNalType, eNalRefIdc );
+			}
+			
+			WelsLoadNal( pCtx->pOut, eNalType, eNalRefIdc );
+			
+			WelsCodeOneSlice( pCtx, 0, eNalType );
+			
+			WelsUnloadNal( pCtx->pOut );
+			
+			iSliceSize = WelsEncodeNalExt(	&pCtx->pOut->sNalList[pCtx->pOut->iNalIndex-1],
+											&pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt,
+											pCtx->pFrameBs + pCtx->iPosBsBuffer,
+											&iNalLen[iNalIdxInLayer] );
+			iLayerSize += iSliceSize;
+			pCtx->iPosBsBuffer	+= iSliceSize;
+			pLayerBsInfo->uiLayerType		= VIDEO_CODING_LAYER;
+			pLayerBsInfo->uiSpatialId		= iCurDid;
+			pLayerBsInfo->uiTemporalId	= iCurTid;
+			pLayerBsInfo->uiQualityId		= 0;
+			pLayerBsInfo->uiPriorityId	= 0;
+			pLayerBsInfo->iNalLengthInByte[iNalIdxInLayer]	= iSliceSize;
+			pLayerBsInfo->iNalCount		= ++ iNalIdxInLayer;					
+		}
+		// for dynamic slicing single threading..
+#ifndef MT_ENABLED
+		else if ( SM_DYN_SLICE == param_d->sMso.uiSliceMode )
+#else	// MT_ENABLED
+		else if ( (SM_DYN_SLICE == param_d->sMso.uiSliceMode) && (pSvcParam->iMultipleThreadIdc <= 1) )
+#endif//MT_ENABLED
+		{
+			const int32_t kiLastMbInFrame = pCtx->pCurDqLayer->pSliceEncCtx->iMbNumInFrame;
+			WelsCodeOnePicPartition( pCtx, pLayerBsInfo, &iNalIdxInLayer, &iLayerSize, 0, kiLastMbInFrame, 0 );					
+		}
+		else
+		{//other multi-slice uiSliceMode			
+#if defined(MT_ENABLED)
+            int err = 0;
+			// THREAD_FULLY_FIRE_MODE/THREAD_PICK_UP_MODE for any mode of non-SM_DYN_SLICE
+			if ( (SM_DYN_SLICE != param_d->sMso.uiSliceMode) && (pSvcParam->iMultipleThreadIdc > 1) )
+			{
+				iSliceCount	= GetCurrentSliceNum( pCtx->pCurDqLayer->pSliceEncCtx );
+				if ( iLayerNum +
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+					iSliceCount
+#else
+					1
+#endif//PACKING_ONE_SLICE_PER_LAYER
+					>= MAX_LAYER_NUM_OF_FRAME )	// check available layer_bs_info for further writing as followed
+				{
+					WelsLog( pCtx, WELS_LOG_ERROR, "WelsEncoderEncodeExt(), iLayerNum(%d) overflow(max:%d) at iDid= %d uiSliceMode= %d, iSliceCount= %d!",
+						iLayerNum, MAX_LAYER_NUM_OF_FRAME, iCurDid, param_d->sMso.uiSliceMode, iSliceCount );
+					return -1;
+				}
+				if ( iSliceCount <= 1 )
+				{
+					WelsLog( pCtx, WELS_LOG_ERROR, "WelsEncoderEncodeExt(), iSliceCount(%d) from GetCurrentSliceNum() is untrusted due stack/heap crupted!\n", iSliceCount );
+					return -1;
+				}
+
+				if ( pSvcParam->iCountThreadsNum >= iSliceCount )	//THREAD_FULLY_FIRE_MODE
+				{
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+					int32_t iSliceIdx = 1;							
+					int32_t iOrgSlicePos[MAX_SLICES_NUM] = {0};
+					iOrgSlicePos[0] = pCtx->iPosBsBuffer;
+					while (uiSliceIdx < iSliceCount)
+					{
+						iOrgSlicePos[uiSliceIdx] = pCtx->pSliceBs[uiSliceIdx].uiBsPos;
+						++ uiSliceIdx;
+					}
+#elif defined(MT_DEBUG)
+					int64_t t_bs_append = 0;
+#endif//PACKING_ONE_SLICE_PER_LAYER
+					
+					pCtx->iActiveThreadsNum	= iSliceCount;
+					// to fire slice coding threads
+					err = FiredSliceThreads( &pCtx->pSliceThreading->pThreadPEncCtx[0], &pCtx->pSliceThreading->pReadySliceCodingEvent[0], pLayerBsInfo, iSliceCount, pCtx->pCurDqLayer->pSliceEncCtx, FALSE );
+					if ( err )
+					{
+						WelsLog( pCtx, WELS_LOG_ERROR, "[MT] WelsEncoderEncodeExt(), FiredSliceThreads return(%d) failed and exit encoding frame, iCountThreadsNum= %d, iSliceCount= %d, uiSliceMode= %d, iMultipleThreadIdc= %d!!\n",
+							err, pSvcParam->iCountThreadsNum, iSliceCount, param_d->sMso.uiSliceMode, pSvcParam->iMultipleThreadIdc );
+						return -1;
+					}
+				
+					WelsMultipleEventsWaitAllBlocking( iSliceCount, &pCtx->pSliceThreading->pSliceCodedEvent[0] );
+				
+
+					// all slices are finished coding here
+					// append exclusive slice 0 bs to pFrameBs
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+					iLayerSize = pCtx->iPosBsBuffer - iOrgSlicePos[0];
+					uiSliceIdx = 1;
+					while (uiSliceIdx < iSliceCount)
+					{
+						iLayerSize += pCtx->pSliceBs[uiSliceIdx].uiBsPos - iOrgSlicePos[uiSliceIdx];
+						++ uiSliceIdx;
+					}
+					iLayerNum += iSliceCount;	// each slice stickly output as layer info for performance improvement directly
+					pLayerBsInfo += iSliceCount;
+#else
+#if defined(MT_DEBUG)
+					t_bs_append = WelsTime();
+#endif//MT_DEBUG
+					iLayerSize = AppendSliceToFrameBs( pCtx, pLayerBsInfo, iSliceCount );
+#if defined(MT_DEBUG)
+					t_bs_append = WelsTime() - t_bs_append;
+					if ( pCtx->pSliceThreading->pFSliceDiff )
+					{
+						fprintf(pCtx->pSliceThreading->pFSliceDiff, 
+#if defined(WIN32)
+							"%6I64d us consumed at AppendSliceToFrameBs() for coding_idx: %d iDid: %d qid: %d\n",
+#else
+							"%6lld us consumed at AppendSliceToFrameBs() for coding_idx: %d iDid: %d qid: %d\n",
+#endif//WIN32
+							t_bs_append, pCtx->iCodingIndex, iCurDid, 0 );
+					}
+#endif//MT_DEBUG
+#endif//PACKING_ONE_SLICE_PER_LAYER
+				}
+				else	//THREAD_PICK_UP_MODE
+				{
+					int32_t iNumThreadsRunning = 0;
+					int32_t iNumThreadsScheduled = 0;
+					int32_t iIndexOfSliceToBeCoded = 0;
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+					int32_t iSliceIdx = 1;							
+					int32_t iOrgSlicePos[MAX_SLICES_NUM] = {0};
+					iOrgSlicePos[0] = pCtx->iPosBsBuffer;
+					while (uiSliceIdx < iSliceCount)
+					{
+						iOrgSlicePos[uiSliceIdx] = pCtx->pSliceBs[uiSliceIdx].uiBsPos;
+						++ uiSliceIdx;
+					}
+#endif//PACKING_ONE_SLICE_PER_LAYER
+
+					pCtx->iActiveThreadsNum	= pSvcParam->iCountThreadsNum;
+					iNumThreadsScheduled	= pCtx->iActiveThreadsNum;
+					iNumThreadsRunning		= iNumThreadsScheduled;
+					// to fire slice coding threads
+					err = FiredSliceThreads( &pCtx->pSliceThreading->pThreadPEncCtx[0], &pCtx->pSliceThreading->pReadySliceCodingEvent[0], pLayerBsInfo, iNumThreadsRunning, pCtx->pCurDqLayer->pSliceEncCtx, FALSE );
+					if ( err )
+					{
+						WelsLog( pCtx, WELS_LOG_ERROR, "[MT] WelsEncoderEncodeExt(), FiredSliceThreads return(%d) failed and exit encoding frame, iCountThreadsNum= %d, iSliceCount= %d, uiSliceMode= %d, iMultipleThreadIdc= %d!!\n",
+							err, pSvcParam->iCountThreadsNum, iSliceCount, param_d->sMso.uiSliceMode, pSvcParam->iMultipleThreadIdc );
+						return -1;
+					}
+
+					iIndexOfSliceToBeCoded = iNumThreadsRunning;
+					while (1)
+					{
+						if ( iIndexOfSliceToBeCoded >= iSliceCount && iNumThreadsRunning <= 0 )
+							break;								
+#ifdef WIN32
+						WELS_THREAD_ERROR_CODE lwait	= 0;
+						int32_t iEventId				= -1;
+						
+						lwait = WelsMultipleEventsWaitSingleBlocking(	iNumThreadsScheduled,
+																		&pCtx->pSliceThreading->pSliceCodedEvent[0],
+																		2 );	// 2 ms for one tick
+						iEventId = (int32_t)(lwait - WELS_THREAD_ERROR_WAIT_OBJECT_0);
+						if ( iEventId >= 0 && iEventId < iNumThreadsScheduled )
+						{									
+							if ( iIndexOfSliceToBeCoded < iSliceCount )
+							{		
+								// pick up succeeding slice for threading
+								// thread_id equal to iEventId per implementation here
+								pCtx->pSliceThreading->pThreadPEncCtx[iEventId].iSliceIndex	= iIndexOfSliceToBeCoded;
+#ifdef PACKING_ONE_SLICE_PER_LAYER
+								pCtx->pSliceThreading->pThreadPEncCtx[iEventId].pLayerBs	= pLayerBsInfo+iIndexOfSliceToBeCoded;
+#endif//PACKING_ONE_SLICE_PER_LAYER
+								WelsEventSignal( &pCtx->pSliceThreading->pReadySliceCodingEvent[iEventId] );
+
+								++ iIndexOfSliceToBeCoded;
+							}
+							else	// no other slices left for coding
+							{										
+								-- iNumThreadsRunning;
+							}
+						}
+						else
+						{
+							WelsSleep(1);
+						}								
+#else//__GNUC__
+						// TODO for pthread platforms
+						// alternate implementation using blocking due non-blocking with timeout mode not support at wels thread lib, tune back if available
+						WelsMultipleEventsWaitAllBlocking( iNumThreadsRunning, &pCtx->pSliceThreading->pSliceCodedEvent[0] );
+						if ( iIndexOfSliceToBeCoded < iSliceCount )
+						{
+							int32_t iThreadIdx = 0;
+							// pick up succeeding slices for threading if left
+							while ( iThreadIdx < iNumThreadsScheduled )
+							{
+								if ( iIndexOfSliceToBeCoded >= iSliceCount )
+									break;
+								pCtx->pSliceThreading->pThreadPEncCtx[iThreadIdx].iSliceIndex = iIndexOfSliceToBeCoded;
+#ifdef PACKING_ONE_SLICE_PER_LAYER
+								pCtx->pSliceThreading->pThreadPEncCtx[iThreadIdx].pLayerBs = pLayerBsInfo+iIndexOfSliceToBeCoded;
+#endif//PACKING_ONE_SLICE_PER_LAYER
+								WelsEventSignal( pCtx->pSliceThreading->pReadySliceCodingEvent[iThreadIdx] );
+
+								++ iIndexOfSliceToBeCoded;
+								++ iThreadIdx;
+							}
+							// update iNumThreadsRunning
+							iNumThreadsRunning		= iThreadIdx;									
+						}
+						else
+						{
+							iNumThreadsRunning = 0;
+						}
+#endif//WIN32
+					}//while(1)
+
+// all slices are finished coding here
+					// append exclusive slice 0 bs to pFrameBs
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+					iLayerSize = pCtx->iPosBsBuffer - iOrgSlicePos[0];
+					uiSliceIdx = 1;
+					while (uiSliceIdx < iSliceCount)
+					{
+						iLayerSize += pCtx->pSliceBs[uiSliceIdx].uiBsPos - iOrgSlicePos[uiSliceIdx];
+						++ uiSliceIdx;
+					}
+					iLayerNum += iSliceCount;	// each slice stickly output as layer info for performance improvement directly
+					pLayerBsInfo += iSliceCount;
+#else
+					iLayerSize = AppendSliceToFrameBs( pCtx, pLayerBsInfo, iSliceCount );
+#endif//PACKING_ONE_SLICE_PER_LAYER
+				}
+			}					
+			// THREAD_FULLY_FIRE_MODE && SM_DYN_SLICE
+			else if ( (SM_DYN_SLICE == param_d->sMso.uiSliceMode) && (pSvcParam->iMultipleThreadIdc > 1) )
+			{
+				const int32_t kiPartitionCnt	= pCtx->iActiveThreadsNum; //pSvcParam->iCountThreadsNum;
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+				ResetCountBsSizeInPartitions( pCtx->pSliceThreading->pCountBsSizeInPartition, kiPartitionCnt );
+				pCtx->pCurDqLayer->pSliceEncCtx->iMaxSliceNumConstraint = WELS_MIN ( MAX_SLICES_NUM, DynamicMaxSliceNumConstraint( MAX_LAYER_NUM_OF_FRAME, iLayerNum, 1 + /*( num_qlayer - 1) +*/ ( ( (iCurDid==0) && ( pSvcParam->uiGopSize>1 ) ) ? 1: 0 ) ) );  				
+#endif//PACKING_ONE_SLICE_PER_LAYER
+
+				// to fire slice coding threads
+				err = FiredSliceThreads( &pCtx->pSliceThreading->pThreadPEncCtx[0], &pCtx->pSliceThreading->pReadySliceCodingEvent[0], pLayerBsInfo, kiPartitionCnt, pCtx->pCurDqLayer->pSliceEncCtx, TRUE );
+				if ( err )
+				{
+					WelsLog( pCtx, WELS_LOG_ERROR, "[MT] WelsEncoderEncodeExt(), FiredSliceThreads return(%d) failed and exit encoding frame, iCountThreadsNum= %d, iSliceCount= %d, uiSliceMode= %d, iMultipleThreadIdc= %d!!\n",
+						err, pSvcParam->iCountThreadsNum, iSliceCount, param_d->sMso.uiSliceMode, pSvcParam->iMultipleThreadIdc );
+					return -1;
+				}
+
+				WelsMultipleEventsWaitAllBlocking( kiPartitionCnt, &pCtx->pSliceThreading->pSliceCodedEvent[0] );
+
+#if defined(PACKING_ONE_SLICE_PER_LAYER)						
+				iSliceCount = PostProcDynamicSlicingBsWriting( pCtx, pLayerBsInfo, &iLayerSize, kiPartitionCnt );
+				assert(iLayerNum + iSliceCount < MAX_LAYER_NUM_OF_FRAME);
+				pLayerBsInfo += iSliceCount;
+				iLayerNum += iSliceCount;
+#else
+				iLayerSize = AppendSliceToFrameBs( pCtx, pLayerBsInfo, kiPartitionCnt );
+#endif//PACKING_ONE_SLICE_PER_LAYER
+			}
+			else	// for non-dynamic-slicing mode single threading branch..
+#endif//MT_ENABLED
+			{
+				const bool_t bNeedPrefix	= pCtx->bNeedPrefixNalFlag;
+				int32_t iSliceIdx			= 0;
+
+				iSliceCount	= GetCurrentSliceNum( pCtx->pCurDqLayer->pSliceEncCtx );						
+				while (iSliceIdx < iSliceCount)
+				{
+					int32_t iSliceSize	= 0;
+
+					if ( bNeedPrefix )
+					{
+						iLayerSize += AddPrefixNal( pCtx, pLayerBsInfo, &iNalLen[0], &iNalIdxInLayer, eNalType, eNalRefIdc );
+					}
+					
+					WelsLoadNal( pCtx->pOut, eNalType, eNalRefIdc );
+					WelsCodeOneSlice( pCtx, iSliceIdx, eNalType );
+					WelsUnloadNal( pCtx->pOut );
+					
+					iSliceSize = WelsEncodeNalExt(	&pCtx->pOut->sNalList[pCtx->pOut->iNalIndex-1],
+													&pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt,
+													pCtx->pFrameBs + pCtx->iPosBsBuffer,
+													&iNalLen[iNalIdxInLayer] );
+					pCtx->iPosBsBuffer	+= iSliceSize;
+					iLayerSize	+= iSliceSize;
+					pLayerBsInfo->iNalLengthInByte[iNalIdxInLayer]	= iSliceSize;
+
+#if defined(SLICE_INFO_OUTPUT)
+					fprintf(	stderr,
+								"@slice=%-6d sliceType:%c idc:%d size:%-6d\n",
+								iSliceIdx,
+								(pCtx->eSliceType == P_SLICE ? 'P' : 'I'),
+								eNalRefIdc,
+								iSliceSize	);
+#endif//SLICE_INFO_OUTPUT						
+					++ iNalIdxInLayer;
+					++ iSliceIdx;						
+				}						
+
+				pLayerBsInfo->uiLayerType		= VIDEO_CODING_LAYER;
+				pLayerBsInfo->uiSpatialId		= iCurDid;
+				pLayerBsInfo->uiTemporalId	= iCurTid;
+				pLayerBsInfo->uiQualityId		= 0;
+				pLayerBsInfo->uiPriorityId	= 0;
+				pLayerBsInfo->iNalCount		= iNalIdxInLayer;
+			}
+		}			
+
+		// deblocking filter
+		if (
+#if defined(MT_ENABLED)
+			(!pCtx->pCurDqLayer->bDeblockingParallelFlag) &&
+#endif//MT_ENABLED
+#if !defined(ENABLE_FRAME_DUMP)
+			( (eNalRefIdc != NRI_PRI_LOWEST) && (param_d->iHighestTemporalId == 0 || iCurTid < param_d->iHighestTemporalId) ) &&
+#endif//!ENABLE_FRAME_DUMP
+			true
+		)
+		{
+			PerformDeblockingFilter( pCtx );
+		}
+
+		// reference picture list update				
+		if ( eNalRefIdc != NRI_PRI_LOWEST )
+		{
+			if ( !WelsUpdateRefList( pCtx ) )
+			{
+				// Force coding IDR as followed
+				ForceCodingIDR( pCtx );
+				WelsLog(pCtx, WELS_LOG_WARNING, "WelsEncoderEncodeExt(), WelsUpdateRefList failed.\n");
+				return -1;
+			}
+		}
+
+		iFrameSize += iLayerSize;				
+
+		pCtx->pFuncList->pfRc.pfWelsRcPictureInfoUpdate(pCtx, iLayerSize);
+
+#ifdef ENABLE_FRAME_DUMP
+		// Dump reconstruction picture for each sQualityStat layer
+		if ( iCurDid+1 < pSvcParam->iNumDependencyLayer )
+			DumpDependencyRec( fsnr, &param_d->sRecFileName[0], iCurDid );
+#endif//ENABLE_FRAME_DUMP
+
+#if defined(ENABLE_PSNR_CALC)
+		snr_y	= WelsCalcPsnr(	fsnr->pData[0],
+							fsnr->iLineSize[0],
+							pEncPic->pData[0],
+							pEncPic->iLineSize[0],
+							iCurWidth,
+							iCurHeight	);
+		snr_u	= WelsCalcPsnr(	fsnr->pData[1],
+							fsnr->iLineSize[1],
+							pEncPic->pData[1],
+							pEncPic->iLineSize[1],
+							(iCurWidth>>1),
+							(iCurHeight>>1)	);
+		snr_v	= WelsCalcPsnr(	fsnr->pData[2],
+							fsnr->iLineSize[2],
+							pEncPic->pData[2],
+							pEncPic->iLineSize[2],
+							(iCurWidth>>1),
+							(iCurHeight>>1)	);
+#endif//ENABLE_PSNR_CALC
+
+#if defined(LAYER_INFO_OUTPUT)
+		fprintf( stderr, "%2s %5d: %-5d %2s   T%1d D%1d Q%-2d  QP%3d   Y%2.2f  U%2.2f  V%2.2f  %8d bits\n",
+				 (iSpatialIdx == 0) ? "#AU" : "   ",
+				 pCtx->iPOC,
+				 pCtx->iFrameNum,
+				 (uiFrameType == WELS_FRAME_TYPE_I || uiFrameType == WELS_FRAME_TYPE_IDR) ? "I": "P",
+				 iCurTid,
+				 iCurDid,
+				 0,
+				 pCtx->pWelsSvcRc[pCtx->uiDependencyId].iAverageFrameQp,
+				 snr_y,
+				 snr_u,
+				 snr_v,
+				 (iLayerSize<<3)	);
+#endif//LAYER_INFO_OUTPUT
+
+#if defined(STAT_OUTPUT)
+
+#if defined(ENABLE_PSNR_CALC)
+		{
+			pCtx->sStatData[iCurDid][0].sQualityStat.rYPsnr[pCtx->eSliceType]	+= snr_y;
+			pCtx->sStatData[iCurDid][0].sQualityStat.rUPsnr[pCtx->eSliceType]	+= snr_u;
+			pCtx->sStatData[iCurDid][0].sQualityStat.rVPsnr[pCtx->eSliceType]	+= snr_v;
+		}
+#endif//ENABLE_PSNR_CALC
+		
+#if defined(MB_TYPES_CHECK) //091025, frame output
+		if (pCtx->eSliceType == P_SLICE)
+		{
+			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Intra4x4] += pCtx->sPerInfo.iMbCount[P_SLICE][Intra4x4];
+			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Intra16x16] += pCtx->sPerInfo.iMbCount[P_SLICE][Intra16x16];
+			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Inter16x16] += pCtx->sPerInfo.iMbCount[P_SLICE][Inter16x16];
+			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Inter16x8] += pCtx->sPerInfo.iMbCount[P_SLICE][Inter16x8];
+			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Inter8x16] += pCtx->sPerInfo.iMbCount[P_SLICE][Inter8x16];
+			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][Inter8x8] += pCtx->sPerInfo.iMbCount[P_SLICE][Inter8x8];
+			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][PSkip] += pCtx->sPerInfo.iMbCount[P_SLICE][PSkip];
+			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][8] += pCtx->sPerInfo.iMbCount[P_SLICE][8];
+			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][9] += pCtx->sPerInfo.iMbCount[P_SLICE][9];
+			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][10] += pCtx->sPerInfo.iMbCount[P_SLICE][10];
+			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[P_SLICE][11] += pCtx->sPerInfo.iMbCount[P_SLICE][11];
+		}
+		else if (pCtx->eSliceType == I_SLICE)
+		{
+			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[I_SLICE][Intra4x4] += pCtx->sPerInfo.iMbCount[I_SLICE][Intra4x4];
+			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[I_SLICE][Intra16x16] += pCtx->sPerInfo.iMbCount[I_SLICE][Intra16x16];
+			pCtx->sStatData[iCurDid][0].sSliceData.iMbCount[I_SLICE][7] += pCtx->sPerInfo.iMbCount[I_SLICE][7];
+		}
+		
+		memset(pCtx->sPerInfo.iMbCount[P_SLICE], 0, 18*sizeof( int32_t ));
+		memset(pCtx->sPerInfo.iMbCount[I_SLICE], 0, 18*sizeof( int32_t ));
+
+#endif//MB_TYPES_CHECK
+		{ 
+    		//no pCtx->pSvcParam->bMgsT0OnlyStrategy
+			++ pCtx->sStatData[iCurDid][0].sSliceData.iSliceCount[pCtx->eSliceType];	// for multiple slices coding
+			pCtx->sStatData[iCurDid][0].sSliceData.iSliceSize[pCtx->eSliceType]	+= (iLayerSize<<3);	// bits
+		}
+#endif//STAT_OUTPUT
+
+#if defined(MT_ENABLED) && defined(PACKING_ONE_SLICE_PER_LAYER)
+		if ( pSvcParam->iMultipleThreadIdc <= 1 || SM_SINGLE_SLICE == param_d->sMso.uiSliceMode )	// sigle thread actually used
+#else
+		if ( 1 )
+#endif//MT_ENABLED && PACKING_ONE_SLICE_PER_LAYER
+		{
+			++ iLayerNum;
+			++ pLayerBsInfo;
+		}
+
+						
+		pLayerBsInfo->pBsBuf	= pCtx->pFrameBs + pCtx->iPosBsBuffer;
+
+		if( pSvcParam->iPaddingFlag && pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize > 0 )
+		{
+			const int32_t kiPaddingNalSize = WritePadding(pCtx, pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize);
+			
+#if GOM_TRACE_FLAG
+			WelsLog( pCtx, WELS_LOG_INFO,"[RC] encoding_qp%d Padding: %d\n",pCtx->uiDependencyId, pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize);
+#endif
+			if ( kiPaddingNalSize <= 0 )
+				return -1;
+
+			pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingBitrateStat += pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize;
+			
+			pCtx->pWelsSvcRc[pCtx->uiDependencyId].iPaddingSize=0;
+
+			pLayerBsInfo->uiPriorityId	= 0;
+			pLayerBsInfo->uiSpatialId		= 0;
+			pLayerBsInfo->uiTemporalId	= 0;
+			pLayerBsInfo->uiQualityId		= 0;
+			pLayerBsInfo->uiLayerType		= NON_VIDEO_CODING_LAYER;
+			pLayerBsInfo->iNalCount		= 1;
+			pLayerBsInfo->iNalLengthInByte[0] = kiPaddingNalSize;
+			++ pLayerBsInfo;
+			pLayerBsInfo->pBsBuf	= pCtx->pFrameBs + pCtx->iPosBsBuffer;
+			++ iLayerNum;
+		}
+
+#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+		if ( param_d->sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE && pSvcParam->iMultipleThreadIdc > 1 &&
+			 pSvcParam->iMultipleThreadIdc >= param_d->sMso.sSliceArgument.iSliceNum )
+		{
+			CalcSliceComplexRatio( pCtx->pSliceThreading->pSliceComplexRatio[iCurDid], pCtx->pCurDqLayer->pSliceEncCtx, pCtx->pSliceThreading->pSliceConsumeTime[iCurDid] );
+#if defined(MT_DEBUG)
+			TrackSliceComplexities( pCtx, iCurDid );
+#endif//#if defined(MT_DEBUG)
+		}
+#endif//MT_ENABLED && DYNAMIC_SLICE_ASSIGN && TRY_SLICING_BALANCE
+
+		++ iSpatialIdx;		
+
+		if ( iCurDid+1 < pSvcParam->iNumDependencyLayer )
+		{
+			WelsSwapDqLayers( pCtx );
+		}
+
+		if ( pSvcParam->bEnableLongTermReference && (pCtx->pLtr[pCtx->uiDependencyId].bLTRMarkingFlag && (pCtx->pLtr[pCtx->uiDependencyId].iLTRMarkMode == LTR_DELAY_MARK))) 
+		{
+			pCtx->bLongTermRefFlag[d_idx][0] = true;
+		}
+
+		if ( iCurTid < pCtx->uiSpatialLayersInTemporal[d_idx] - 1 || pSvcParam->iDecompStages == 0 )
+		{
+			if ( (iCurTid >= MAX_TEMPORAL_LEVEL)||(pCtx->uiSpatialLayersInTemporal[d_idx]-1>= MAX_TEMPORAL_LEVEL))
+			{
+				ForceCodingIDR( pCtx );	// some logic error
+				return -1;
+			}
+
+			if ( pSvcParam->bEnableLongTermReference && pCtx->bLongTermRefFlag[d_idx][iCurTid] )
+			{	
+				SPicture *tmp	= pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx]+pCtx->pVaa->uiMarkLongTermPicIdx];
+				pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx]+pCtx->pVaa->uiMarkLongTermPicIdx] = pCtx->pSpatialPic[d_idx][iCurTid];
+				pCtx->pSpatialPic[d_idx][iCurTid] = pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx]-1];
+				pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx]-1] = tmp;
+				pCtx->bLongTermRefFlag[d_idx][iCurTid] = false;
+			}
+			else
+			{
+				WelsExchangeSpatialPictures( &pCtx->pSpatialPic[d_idx][pCtx->uiSpatialLayersInTemporal[d_idx]-1], &pCtx->pSpatialPic[d_idx][iCurTid] );
+			}
+		}
+
+		if ( pSvcParam->bEnableLongTermReference && ((pCtx->pLtr[pCtx->uiDependencyId].bLTRMarkingFlag && (pCtx->pLtr[pCtx->uiDependencyId].iLTRMarkMode == LTR_DIRECT_MARK)) || eFrameType == WELS_FRAME_TYPE_IDR)) 
+		{
+			pCtx->bLongTermRefFlag[d_idx][iCurTid] = true;
+		}
+	}
+
+#if defined(MT_ENABLED) && defined(MT_DEBUG)
+	TrackSliceConsumeTime( pCtx, did_list, iSpatialNum );
+#endif//MT_ENABLED && MT_DEBUG
+	
+#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN)
+	if ( pSvcParam->iMultipleThreadIdc > 1 && did_list[0] == BASE_DEPENDENCY_ID 
+		&& pSvcParam->sDependencyLayers[0].sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE && pSvcParam->iMultipleThreadIdc >= pSvcParam->sDependencyLayers[0].sMso.sSliceArgument.iSliceNum
+		&& pSvcParam->sDependencyLayers[did_list[iSpatialNum-1]].sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE && pSvcParam->iMultipleThreadIdc >= pSvcParam->sDependencyLayers[did_list[iSpatialNum-1]].sMso.sSliceArgument.iSliceNum )
+	{	
+		AdjustBaseLayer( pCtx );
+	}
+#endif//DYNAMIC_SLICE_ASSIGN
+
+#ifdef ENABLE_FRAME_DUMP
+	DumpRecFrame( fsnr, &pSvcParam->sDependencyLayers[pSvcParam->iNumDependencyLayer-1].sRecFileName[0] );	// pDecPic: final reconstruction output
+#endif//ENABLE_FRAME_DUMP
+	
+	++ pCtx->iCodingIndex;
+	pCtx->eLastNalPriority	= eNalRefIdc;
+	pFbi->iLayerNum			= iLayerNum;
+
+#if defined(X86_ASM)
+	WelsEmms();
+#endif //X86_ASM	
+ 
+	return eFrameType;
+}
+
+/*!
+ * \brief	Adjust the coding parameters of a running Wels SVC encoder
+ *			Changes that touch the layer structure, picture sizes, slicing, frame rate
+ *			ratios or long term reference usage force the encoder to be torn down and
+ *			re-created; every other change is copied into the active parameter set.
+ *
+ * \param	ppCtx		address of the encoder context pointer (the context is re-created on reset)
+ * \param	pNewParam	requested parameters; range limited fields are clipped in place when no reset is needed
+ *
+ * \return	0 on success, 1 on NULL input or failed re-initialization,
+ *			otherwise the non-zero code reported by ParamValidationExt()
+ */
+int32_t WelsEncoderParamAdjust( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pNewParam )
+{
+	if ( NULL == ppCtx || NULL == *ppCtx || NULL == pNewParam )
+		return 1;
+
+	/* Reject the request when the new parameters do not validate */
+	const int32_t kiValidation = ParamValidationExt( pNewParam );
+	if ( kiValidation != 0 )
+		return kiValidation;
+
+	SWelsSvcCodingParam *pCurParam	= (*ppCtx)->pSvcParam;
+
+	/* A reset (and thus new parameter sets) is assumed unless all sequence level settings match */
+	bool_t bMustReset	= true;
+	if ( pCurParam != NULL &&
+		 pCurParam->iNumTemporalLayer == pNewParam->iNumTemporalLayer &&
+		 pCurParam->uiGopSize == pNewParam->uiGopSize &&
+		 pCurParam->iNumDependencyLayer == pNewParam->iNumDependencyLayer &&
+		 pCurParam->iDecompStages == pNewParam->iDecompStages &&
+		 pCurParam->iActualPicWidth == pNewParam->iActualPicWidth &&
+		 pCurParam->iActualPicHeight == pNewParam->iActualPicHeight &&
+		 pCurParam->SUsedPicRect.iWidth == pNewParam->SUsedPicRect.iWidth &&
+		 pCurParam->SUsedPicRect.iHeight == pNewParam->SUsedPicRect.iHeight &&
+		 pCurParam->bEnableLongTermReference == pNewParam->bEnableLongTermReference )
+	{
+		// Sequence level settings agree, now compare each dependency layer in turn
+		int8_t iLayer	= 0;
+
+		bMustReset	= false;
+		assert( pCurParam->iNumDependencyLayer == pNewParam->iNumDependencyLayer );
+		do
+		{
+			const SDLayerParam *kpCurDlp	= &pCurParam->sDependencyLayers[iLayer];
+			const SDLayerParam *kpReqDlp	= &pNewParam->sDependencyLayers[iLayer];
+
+			// frame size or slicing configuration changed
+			if ( kpCurDlp->iFrameWidth != kpReqDlp->iFrameWidth ||
+				 kpCurDlp->iFrameHeight != kpReqDlp->iFrameHeight ||
+				 kpCurDlp->iActualWidth != kpReqDlp->iActualWidth ||
+				 kpCurDlp->iActualHeight != kpReqDlp->iActualHeight ||
+				 kpCurDlp->sMso.uiSliceMode != kpReqDlp->sMso.uiSliceMode ||
+				 kpCurDlp->sMso.sSliceArgument.iSliceNum != kpReqDlp->sMso.sSliceArgument.iSliceNum )
+			{
+				bMustReset	= true;
+				break;
+			}
+
+			// Frame rates themselves may differ; only the output/input ratio and the
+			// max/output ratio have to stay the same. A ratio is compared only when
+			// both of its denominators are above EPSN.
+			const float kfInOutDelta	= ( kpReqDlp->fInputFrameRate > EPSN && kpCurDlp->fInputFrameRate > EPSN ) ?
+				( kpReqDlp->fOutputFrameRate/kpReqDlp->fInputFrameRate - kpCurDlp->fOutputFrameRate/kpCurDlp->fInputFrameRate ) : .0f;
+			const float kfMaxOutDelta	= ( kpReqDlp->fOutputFrameRate > EPSN && kpCurDlp->fOutputFrameRate > EPSN ) ?
+				( pNewParam->fMaxFrameRate/kpReqDlp->fOutputFrameRate - pCurParam->fMaxFrameRate/kpCurDlp->fOutputFrameRate ) : .0f;
+
+			if ( kfInOutDelta > EPSN || kfInOutDelta < -EPSN ||
+				 kfMaxOutDelta > EPSN || kfMaxOutDelta < -EPSN ||
+				 kpCurDlp->iHighestTemporalId != kpReqDlp->iHighestTemporalId )
+			{
+				bMustReset	= true;
+				break;
+			}
+		} while ( ++ iLayer < pCurParam->iNumDependencyLayer );
+	}
+
+	if ( bMustReset )
+	{
+		// Parameter set offsets and the IDR picture id (needed for LTR) must survive the re-creation
+		SParaSetOffsetVariable sSavedPso[PARA_SET_TYPE];
+		memcpy( sSavedPso, (*ppCtx)->sPSOVector.sParaSetOffsetVariable, sizeof(sSavedPso) );	// confirmed_safe_unsafe_usage
+		const uint16_t kuiSavedIdrPicId	= (*ppCtx)->sPSOVector.uiIdrPicId;
+
+		WelsUninitEncoderExt( ppCtx );
+
+		/* Bring the encoder up again with the new parameters */
+		if ( WelsInitEncoderExt( ppCtx, pNewParam ) )
+			return 1;
+
+		// reset the scaled spatial picture size
+		(*ppCtx)->pVpp->WelsPreprocessReset(*ppCtx);
+
+		// for FLEXIBLE_PARASET_ID
+		memcpy( (*ppCtx)->sPSOVector.sParaSetOffsetVariable, sSavedPso, sizeof(sSavedPso) );	// confirmed_safe_unsafe_usage
+		(*ppCtx)->sPSOVector.uiIdrPicId	= kuiSavedIdrPicId;
+
+		return 0;
+	}
+
+	/* No reset: bitrate or other light-weight settings changed. */
+	/* A plain struct copy is not possible because some fields must keep their current values. */
+
+	/* Range limited settings: clip the request first, then adopt it */
+	pNewParam->fMaxFrameRate	= WELS_CLIP3(pNewParam->fMaxFrameRate, MIN_FRAME_RATE, MAX_FRAME_RATE);
+	pCurParam->fMaxFrameRate	= pNewParam->fMaxFrameRate;		// maximal frame rate [Hz / fps]
+	pNewParam->iNumRefFrame		= WELS_CLIP3(pNewParam->iNumRefFrame, MIN_REF_PIC_COUNT, MAX_REFERENCE_PICTURE_COUNT_NUM);
+	pCurParam->iNumRefFrame		= pNewParam->iNumRefFrame;		// number of reference frame used
+
+	/* Deblocking loop filter */
+	pNewParam->iLoopFilterDisableIdc	= WELS_CLIP3(pNewParam->iLoopFilterDisableIdc, 0, 6);
+	pCurParam->iLoopFilterDisableIdc	= pNewParam->iLoopFilterDisableIdc;	// 0: on, 1: off, 2: on except for slice boundaries
+	pNewParam->iLoopFilterAlphaC0Offset	= WELS_CLIP3(pNewParam->iLoopFilterAlphaC0Offset, -6, 6);
+	pCurParam->iLoopFilterAlphaC0Offset	= pNewParam->iLoopFilterAlphaC0Offset;	// AlphaOffset: valid range [-6, 6], default 0
+	pNewParam->iLoopFilterBetaOffset	= WELS_CLIP3(pNewParam->iLoopFilterBetaOffset, -6, 6);
+	pCurParam->iLoopFilterBetaOffset	= pNewParam->iLoopFilterBetaOffset;	// BetaOffset:	valid range [-6, 6], default 0
+
+	/* Inter-layer deblocking loop filter, same value ranges as above */
+	pNewParam->iInterLayerLoopFilterDisableIdc		= WELS_CLIP3(pNewParam->iInterLayerLoopFilterDisableIdc, 0, 6);
+	pCurParam->iInterLayerLoopFilterDisableIdc		= pNewParam->iInterLayerLoopFilterDisableIdc;
+	pNewParam->iInterLayerLoopFilterAlphaC0Offset	= WELS_CLIP3(pNewParam->iInterLayerLoopFilterAlphaC0Offset, -6, 6);
+	pCurParam->iInterLayerLoopFilterAlphaC0Offset	= pNewParam->iInterLayerLoopFilterAlphaC0Offset;
+	pNewParam->iInterLayerLoopFilterBetaOffset		= WELS_CLIP3(pNewParam->iInterLayerLoopFilterBetaOffset, -6, 6);
+	pCurParam->iInterLayerLoopFilterBetaOffset		= pNewParam->iInterLayerLoopFilterBetaOffset;
+
+	/* Input and parameter set / NAL signalling */
+	pCurParam->iInputCsp				= pNewParam->iInputCsp;			// color space of input sequence
+	pCurParam->uiIntraPeriod			= pNewParam->uiIntraPeriod;		// intra period (multiple of GOP size as desired)
+	pCurParam->bEnableSpsPpsIdAddition	= pNewParam->bEnableSpsPpsIdAddition;
+	pCurParam->bPrefixNalAddingCtrl		= pNewParam->bPrefixNalAddingCtrl;
+	pCurParam->bEnableSSEI				= pNewParam->bEnableSSEI;
+	pCurParam->bEnableFrameCroppingFlag	= pNewParam->bEnableFrameCroppingFlag;	// enable frame cropping flag
+
+	/* Pre-processing tools: denoise, background detection, adaptive quantization */
+	pCurParam->bEnableDenoise				= pNewParam->bEnableDenoise;
+	pCurParam->bEnableBackgroundDetection	= pNewParam->bEnableBackgroundDetection;
+	pCurParam->bEnableAdaptiveQuant			= pNewParam->bEnableAdaptiveQuant;
+
+	/* Long term reference control */
+	pCurParam->bEnableLongTermReference	= pNewParam->bEnableLongTermReference;
+	pCurParam->uiLtrMarkPeriod			= pNewParam->uiLtrMarkPeriod;
+
+	/* Rate Control */
+	pCurParam->bEnableRc		= pNewParam->bEnableRc;
+	pCurParam->iRCMode			= pNewParam->iRCMode;
+	pCurParam->iTargetBitrate	= pNewParam->iTargetBitrate;	// overall target bitrate introduced in RC module
+	pCurParam->iPaddingFlag		= pNewParam->iPaddingFlag;
+
+	/* Per dependency layer settings */
+	int8_t iLayer	= 0;
+	do
+	{
+		SDLayerParam *pCurDlp	= &pCurParam->sDependencyLayers[iLayer];
+		SDLayerParam *pReqDlp	= &pNewParam->sDependencyLayers[iLayer];
+
+		pCurDlp->fInputFrameRate	= pReqDlp->fInputFrameRate;		// input frame rate
+		pCurDlp->fOutputFrameRate	= pReqDlp->fOutputFrameRate;	// output frame rate
+		pCurDlp->iSpatialBitrate	= pReqDlp->iSpatialBitrate;
+		pCurDlp->uiProfileIdc		= pReqDlp->uiProfileIdc;		// value of profile IDC (0 for auto-detection)
+
+		/* Derived variants below */
+		pCurDlp->iTemporalResolution	= pReqDlp->iTemporalResolution;
+		pCurDlp->iDecompositionStages	= pReqDlp->iDecompositionStages;
+		memcpy(pCurDlp->uiCodingIdx2TemporalId, pReqDlp->uiCodingIdx2TemporalId, sizeof(pCurDlp->uiCodingIdx2TemporalId));	// confirmed_safe_unsafe_usage
+	} while ( ++ iLayer < pCurParam->iNumDependencyLayer );
+
+	/* Any else initialization/reset for rate control here? */
+
+	return 0;
+}
+
+
+/*!
+ * \brief	Code all slices of one picture partition (dynamic slicing)
+ *			Slices are coded one after another until no macroblock is left in
+ *			[iFirstMbInPartition, iEndMbInPartition). Each slice is written as a NAL
+ *			to pCtx->pFrameBs at pCtx->iPosBsBuffer, optionally preceded by a prefix NAL.
+ *
+ * \param	pCtx				encoder context
+ * \param	pLayerBsInfo		layer bs info that receives the NAL lengths and layer header fields
+ * \param	pNalIdxInLayer		in: index of the first NAL to write in the layer; out: index after the last NAL written
+ * \param	pLayerSize			out: bytes written by this call (overwritten, not accumulated)
+ * \param	iFirstMbInPartition	first mb inclusive in partition
+ * \param	iEndMbInPartition	end mb exclusive in partition
+ * \param	iStartSliceIdx		index of the first slice; following slices step by pCtx->iActiveThreadsNum
+ *
+ * \return	0 on success; 1 when the next slice index reaches pSliceCtx->iMaxSliceNumConstraint.
+ *			In the failure case *pLayerSize, *pNalIdxInLayer and the layer header fields are
+ *			NOT updated, although slices coded before have already advanced pCtx->iPosBsBuffer.
+ */
+int32_t WelsCodeOnePicPartition(	sWelsEncCtx *pCtx,
+									SLayerBSInfo *pLayerBsInfo,
+									int32_t *pNalIdxInLayer,
+									int32_t* pLayerSize,
+									int32_t iFirstMbInPartition,	// first mb inclusive in partition
+									int32_t iEndMbInPartition,	// end mb exclusive in partition
+									int32_t iStartSliceIdx
+								  )
+{
+
+	SDqLayer * pCurLayer			= pCtx->pCurDqLayer;
+	SSliceCtx * pSliceCtx		= pCurLayer->pSliceEncCtx;
+	int32_t iNalLen[MAX_NAL_UNITS_IN_LAYER]			= {0};
+	int32_t iNalIdxInLayer		= *pNalIdxInLayer;
+	int32_t iSliceIdx				= iStartSliceIdx;
+	const int32_t kiSliceStep		= pCtx->iActiveThreadsNum;
+	const int32_t kiPartitionId		= iStartSliceIdx % kiSliceStep;
+	int32_t iPartitionBsSize		= 0;
+	int32_t iAnyMbLeftInPartition= iEndMbInPartition - iFirstMbInPartition;
+	const EWelsNalUnitType keNalType	= pCtx->eNalType;
+	const EWelsNalRefIdc keNalRefIdc	= pCtx->eNalPriority;
+	const bool_t kbNeedPrefix		= pCtx->bNeedPrefixNalFlag;
+
+	//init
+	{
+		pSliceCtx->pFirstMbInSlice[iSliceIdx]		= iFirstMbInPartition;
+		pCurLayer->pNumSliceCodedOfPartition[kiPartitionId]	= 1;	// one slice per partition initialized, dynamic slicing inside
+		pCurLayer->pLastMbIdxOfPartition[kiPartitionId]		= iEndMbInPartition-1;
+	}
+	pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId] = 0;
+
+	while ( iAnyMbLeftInPartition > 0 )
+	{
+		int32_t iSliceSize	= 0;
+
+		if ( iSliceIdx >= pSliceCtx->iMaxSliceNumConstraint )	// insufficient memory in pSliceInLayer[]
+		{
+			// TODO: need exception handler for not large enough of MAX_SLICES_NUM related memory usage
+			// No idea about its solution due to MAX_SLICES_NUM being a fixed length in the relevant data structure
+			return 1;
+		}
+
+		if ( kbNeedPrefix )
+		{
+			iPartitionBsSize += AddPrefixNal( pCtx, pLayerBsInfo, &iNalLen[0], &iNalIdxInLayer, keNalType, keNalRefIdc );
+		}
+
+		WelsLoadNal( pCtx->pOut, keNalType, keNalRefIdc );
+		WelsCodeOneSlice( pCtx, iSliceIdx, keNalType );
+		WelsUnloadNal( pCtx->pOut );
+
+		// pack the NAL just coded at the current write position of the frame bs buffer
+		iSliceSize = WelsEncodeNalExt(	&pCtx->pOut->sNalList[pCtx->pOut->iNalIndex-1],
+			&pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt,
+			pCtx->pFrameBs + pCtx->iPosBsBuffer,
+			&iNalLen[iNalIdxInLayer]	);
+		pCtx->iPosBsBuffer	+= iSliceSize;
+		iPartitionBsSize	+= iSliceSize;
+		pLayerBsInfo->iNalLengthInByte[iNalIdxInLayer]	= iSliceSize;
+
+#if defined(SLICE_INFO_OUTPUT)
+		// the local is keNalRefIdc; this function declares no eNalRefIdc
+		fprintf(	stderr,
+			"@slice=%-6d sliceType:%c idc:%d size:%-6d\n",
+			iSliceIdx,
+			(pCtx->eSliceType == P_SLICE ? 'P' : 'I'),
+			keNalRefIdc,
+			iSliceSize	);
+#endif//SLICE_INFO_OUTPUT
+
+		++ iNalIdxInLayer;
+		iSliceIdx += kiSliceStep;	// slice indices of one partition are not continuous
+		// NOTE(review): pLastCodedMbIdxOfPartition is presumably advanced inside WelsCodeOneSlice() - confirm
+		iAnyMbLeftInPartition = iEndMbInPartition - (1 + pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId]);
+	}
+
+	*pLayerSize			= iPartitionBsSize;
+	*pNalIdxInLayer	= iNalIdxInLayer;
+
+	// slice based packing???
+	pLayerBsInfo->uiLayerType		= VIDEO_CODING_LAYER;
+	pLayerBsInfo->uiSpatialId		= pCtx->uiDependencyId;
+	pLayerBsInfo->uiTemporalId	= pCtx->uiTemporalId;
+	pLayerBsInfo->uiQualityId		= 0;
+	pLayerBsInfo->uiPriorityId	= 0;
+	pLayerBsInfo->iNalCount		= iNalIdxInLayer;
+
+	return 0;
+}
+} // namespace WelsSVCEnc
--- /dev/null
+++ b/codec/encoder/core/src/expand_pic.cpp
@@ -1,0 +1,166 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <string.h>
+#include "expand_pic.h"
+#include "cpu_core.h"
+#include "wels_func_ptr_def.h"
+
+namespace WelsSVCEnc {
+
+// Rewritten 9/27/2009: split into separate luma and chroma routines, which helps the MMX/SSE2 optimized versions.
+static inline void ExpandPictureLuma_c( uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH )
+{	// Replicates the border pixels of the plane at pDst into a PADDING_LENGTH-wide margin on all four sides.
+	// kiStride is the byte distance between rows; kiPicW x kiPicH is the picture area that gets extended.
+	const int32_t kiPad			= PADDING_LENGTH;
+	uint8_t *pFirstRow			= pDst;
+	uint8_t *pLastRow			= pDst + (kiPicH-1) * kiStride;
+	// the four corner pixels are sampled once, before any margin is written
+	const uint8_t kuiCornerTL	= pFirstRow[0];
+	const uint8_t kuiCornerTR	= pFirstRow[kiPicW-1];
+	const uint8_t kuiCornerBL	= pLastRow[0];
+	const uint8_t kuiCornerBR	= pLastRow[kiPicW-1];
+	uint8_t *pAbove				= pFirstRow;	// walks upwards, one margin row per iteration
+	uint8_t *pBelow				= pLastRow;		// walks downwards, one margin row per iteration
+	uint8_t *pRow				= pDst;
+	int32_t iCount				= 0;
+
+	// margin rows above and below the picture (do/while: the body always runs at least once)
+	do {
+		pAbove -= kiStride;
+		pBelow += kiStride;
+
+		memcpy(pAbove, pFirstRow, kiPicW);		// confirmed_safe_unsafe_usage
+		memcpy(pBelow, pLastRow, kiPicW);		// confirmed_safe_unsafe_usage
+
+		memset(pAbove-kiPad, kuiCornerTL, kiPad);	// top-left corner
+		memset(pAbove+kiPicW, kuiCornerTR, kiPad);	// top-right corner
+		memset(pBelow-kiPad, kuiCornerBL, kiPad);	// bottom-left corner
+		memset(pBelow+kiPicW, kuiCornerBR, kiPad);	// bottom-right corner
+	} while( ++iCount < kiPad );
+
+	// left and right margins of every picture row
+	iCount = 0;
+	do {
+		memset(pRow-kiPad, pRow[0], kiPad);
+		memset(pRow+kiPicW, pRow[kiPicW-1], kiPad);
+
+		pRow += kiStride;
+	} while( ++iCount < kiPicH );
+}
+
+static inline void ExpandPictureChroma_c( uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH )
+{	// Replicates the border pixels of the plane at pDst into a (PADDING_LENGTH>>1)-wide margin on all four sides.
+	// kiStride is the byte distance between rows; kiPicW x kiPicH is the picture area that gets extended.
+	const int32_t kiPad			= (PADDING_LENGTH>>1);
+	uint8_t *pFirstRow			= pDst;
+	uint8_t *pLastRow			= pDst + (kiPicH-1) * kiStride;
+	// the four corner pixels are sampled once, before any margin is written
+	const uint8_t kuiCornerTL	= pFirstRow[0];
+	const uint8_t kuiCornerTR	= pFirstRow[kiPicW-1];
+	const uint8_t kuiCornerBL	= pLastRow[0];
+	const uint8_t kuiCornerBR	= pLastRow[kiPicW-1];
+	uint8_t *pAbove				= pFirstRow;	// walks upwards, one margin row per iteration
+	uint8_t *pBelow				= pLastRow;		// walks downwards, one margin row per iteration
+	uint8_t *pRow				= pDst;
+	int32_t iCount				= 0;
+
+	// margin rows above and below the picture (do/while: the body always runs at least once)
+	do {
+		pAbove -= kiStride;
+		pBelow += kiStride;
+
+		memcpy(pAbove, pFirstRow, kiPicW);		// confirmed_safe_unsafe_usage
+		memcpy(pBelow, pLastRow, kiPicW);		// confirmed_safe_unsafe_usage
+
+		memset(pAbove-kiPad, kuiCornerTL, kiPad);	// top-left corner
+		memset(pAbove+kiPicW, kuiCornerTR, kiPad);	// top-right corner
+		memset(pBelow-kiPad, kuiCornerBL, kiPad);	// bottom-left corner
+		memset(pBelow+kiPicW, kuiCornerBR, kiPad);	// bottom-right corner
+	} while( ++iCount < kiPad );
+
+	// left and right margins of every picture row
+	iCount = 0;
+	do {
+		memset(pRow-kiPad, pRow[0], kiPad);
+		memset(pRow+kiPicW, pRow[kiPicW-1], kiPad);
+
+		pRow += kiStride;
+	} while( ++iCount < kiPicH );
+}
+
+void InitExpandPictureFunc( void *pL, const uint32_t kuiCPUFlag )	// Installs the picture-expansion routines into the function table passed as pL.
+{
+	SWelsFuncPtrList *pFuncList = (SWelsFuncPtrList *)pL;	// pL is an untyped pointer to a SWelsFuncPtrList
+	pFuncList->pfExpandLumaPicture		= ExpandPictureLuma_c;	// portable C defaults come first
+	pFuncList->pfExpandChromaPicture[0]	= ExpandPictureChroma_c;	// slot 0: used when the chroma width is not a multiple of 16 (see ExpandReferencingPicture)
+	pFuncList->pfExpandChromaPicture[1]	= ExpandPictureChroma_c;	// slot 1: used when the chroma width is a multiple of 16
+	// With X86_ASM defined and every WELS_CPU_SSE2 bit set in kuiCPUFlag, all three entries are replaced by SSE2 versions.
+#if defined(X86_ASM)
+	if ( (kuiCPUFlag & WELS_CPU_SSE2) == WELS_CPU_SSE2 )
+	{
+		pFuncList->pfExpandLumaPicture	= ExpandPictureLuma_sse2;
+		pFuncList->pfExpandChromaPicture[0]= ExpandPictureChromaUnalign_sse2;
+		pFuncList->pfExpandChromaPicture[1]= ExpandPictureChromaAlign_sse2;
+	}
+#endif//X86_ASM
+}
+
+
+void ExpandReferencingPicture( SPicture *pPic, PExpandPictureFunc pExpLuma, PExpandPictureFunc pExpChrom[2] )
+{	// Pads the three planes of pPic: luma through pExpLuma, both chroma planes through pExpChrom[] or the C fallback.
+	/* plane pointers and dimensions; the chroma planes are half the luma width and height */
+	uint8_t *pPicY	= pPic->pData[0];
+	uint8_t *pPicCb = pPic->pData[1];
+	uint8_t *pPicCr = pPic->pData[2];	
+	const int32_t kiWidthY	= pPic->iWidthInPixel;
+	const int32_t kiHeightY	= pPic->iHeightInPixel;
+	const int32_t kiWidthUV	= kiWidthY >> 1;
+	const int32_t kiHeightUV	= kiHeightY >> 1;	
+
+	pExpLuma(pPicY, pPic->iLineSize[0], kiWidthY, kiHeightY);
+	if ( kiWidthUV >= 16 )
+	{
+		// chroma at least 16 pixels wide: dispatch through the table, index 1 when the width is a multiple of 16, else 0
+		const bool_t kbChrAligned= /*(iWidthUV >= 16) && */((kiWidthUV & 0x0F) == 0);	// chroma planes: (16+iWidthUV) & 15
+		pExpChrom[kbChrAligned](pPicCb, pPic->iLineSize[1], kiWidthUV, kiHeightUV);
+		pExpChrom[kbChrAligned](pPicCr, pPic->iLineSize[2], kiWidthUV, kiHeightUV);
+	}
+	else
+	{
+		// chroma narrower than 16 pixels: always use the C routine (NOTE(review): presumably the asm versions need >= 16 — confirm)
+		ExpandPictureChroma_c(pPicCb, pPic->iLineSize[1], kiWidthUV, kiHeightUV);
+		ExpandPictureChroma_c(pPicCr, pPic->iLineSize[2], kiWidthUV, kiHeightUV);
+	}
+}
+
+}
--- /dev/null
+++ b/codec/encoder/core/src/get_intra_predictor.cpp
@@ -1,0 +1,789 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	get_intra_predictor.cpp
+ *
+ * \brief	Intra predictor implementations for 16x16 luma, 4x4 luma and 8x8 chroma blocks.
+ *
+ * \date	4/2/2009 Created
+ *			9/14/2009 C level based optimization with high performance gained.
+ *				[const, using ST32/ST64 to replace memset, memcpy and memmove etc.]
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include "macros.h"
+#include "ls_defines.h"
+#include "cpu_core.h"
+#include "get_intra_predictor.h"
+#include "wels_common_basis.h"
+#include "array_stack_align.h"
+
+namespace WelsSVCEnc {
+#define I4x4_COUNT 4	// NOTE(review): the three *_COUNT macros are not referenced anywhere in this file — confirm before removing
+#define I8x8_COUNT 8
+#define I16x16_COUNT 16
+
+typedef void (*PFillingPred)( uint8_t *pPred, uint8_t *pSrc );	// filler that copies bytes from pSrc into the 16-byte area at pPred
+typedef void (*PFillingPred1to16) ( uint8_t *pPred, const uint8_t kuiSrc );	// filler that replicates the single byte kuiSrc over 16 bytes at pPred
+
+static inline void WelsFillingPred8to16_c( uint8_t *pPred, uint8_t *pSrc )	// C fallback: writes the same 8-byte group from pSrc twice, covering pPred[0..15]
+{
+	ST64( pPred  , LD64(pSrc) );	// first half of the output (ST64/LD64 are project macros, presumably a 64-bit store/load — confirm)
+	ST64( pPred+8, LD64(pSrc) );	// second half: the same source value again
+}
+static inline void WelsFillingPred8x2to16_c( uint8_t *pPred, uint8_t *pSrc )	// C fallback: copies two consecutive 8-byte groups (pSrc[0..15]) to pPred[0..15]
+{
+	ST64( pPred  , LD64(pSrc  ) );	// first 8-byte group
+	ST64( pPred+8, LD64(pSrc+8) );	// second 8-byte group
+}
+static inline void WelsFillingPred1to16_c( uint8_t *pPred, const uint8_t kuiSrc )	// C fallback: replicates the byte kuiSrc over pPred[0..15]
+{
+	const uint8_t kuiSrc8[8] = { kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc, kuiSrc };	// 8 copies, so one LD64 yields the replicated pattern
+	ST64( pPred  , LD64(kuiSrc8));
+	ST64( pPred+8, LD64(kuiSrc8));
+}
+
+PFillingPred					WelsFillingPred8to16;	// active 8-byte-to-16 filler; assigned by WelsInitFillingPredFuncs()
+PFillingPred					WelsFillingPred8x2to16;	// active 16-byte copy filler; assigned by WelsInitFillingPredFuncs()
+PFillingPred1to16 WelsFillingPred1to16;	// active single-byte replicating filler; assigned by WelsInitFillingPredFuncs()
+
+void WelsInitFillingPredFuncs( const uint32_t kuiCpuFlag )	// Selects the implementations behind the three global WelsFillingPred* pointers.
+{
+	WelsFillingPred8to16	= WelsFillingPred8to16_c;	// portable C versions are the baseline
+	WelsFillingPred8x2to16	= WelsFillingPred8x2to16_c;
+	WelsFillingPred1to16	= WelsFillingPred1to16_c;
+	// With X86_ASM defined, the CPU feature bits in kuiCpuFlag upgrade the pointers; later checks override earlier ones.
+#if defined(X86_ASM)
+	if ( kuiCpuFlag & WELS_CPU_MMXEXT )
+	{
+		WelsFillingPred8to16		= WelsFillingPred8to16_mmx;
+		WelsFillingPred8x2to16	    = WelsFillingPred8x2to16_mmx;
+		WelsFillingPred1to16		= WelsFillingPred1to16_mmx;
+	}
+	if ( kuiCpuFlag & WELS_CPU_SSE2 )	// SSE2 replaces only two of the three; WelsFillingPred8to16 keeps its earlier choice
+	{		
+		WelsFillingPred8x2to16	    = WelsFillingPred8x2to16_sse2;
+		WelsFillingPred1to16		= WelsFillingPred1to16_sse2;
+	}
+#endif//X86_ASM
+}
+
+
+
+#define I4x4_PRED_STRIDE 4	// NOTE(review): the I4x4_PRED_STRIDE* macros are not referenced anywhere in this file — confirm before removing
+#define I4x4_PRED_STRIDE2 8	// 2 * I4x4_PRED_STRIDE
+#define I4x4_PRED_STRIDE3 12	// 3 * I4x4_PRED_STRIDE
+
+void WelsI4x4LumaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)	// Vertical 4x4 luma prediction: the 4 pixels above pRef are repeated over the 16-byte output.
+{
+	const uint32_t kuiSrc = LD32(&pRef[-kiStride]);	// the four pixels of the row directly above the block (LD32 is presumably a 32-bit load — confirm)
+	ENFORCE_STACK_ALIGN_1D(uint32_t, uiSrcx2, 2, 16)	// project macro; presumably declares uiSrcx2[2] with 16-byte alignment — confirm
+	uiSrcx2[0] = uiSrcx2[1] = kuiSrc;	// two copies = 8 bytes, which the filler below duplicates to 16
+	
+	WelsFillingPred8to16( pPred, (uint8_t*)&uiSrcx2[0] );
+}
+
+void WelsI4x4LumaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{	// Horizontal 4x4 luma prediction: each 4-byte group of the 16-byte output is filled with the pixel left of the matching picture row.
+	const int32_t kiStridex2Left = (kiStride<<1)-1;	// offset of the left neighbour of row 2; kept signed like kiStride so the index cannot wrap
+	const int32_t kiStridex3Left = kiStride + kiStridex2Left;	// offset of the left neighbour of row 3
+	const uint8_t kuiHor1 = pRef[-1];
+	const uint8_t kuiHor2 = pRef[kiStride-1];	
+	const uint8_t kuiHor3 = pRef[kiStridex2Left];
+	const uint8_t kuiHor4 = pRef[kiStridex3Left];
+	const uint8_t kuiVec1[4] = {kuiHor1, kuiHor1, kuiHor1, kuiHor1};	// one replicated 4-byte group per left pixel
+	const uint8_t kuiVec2[4] = {kuiHor2, kuiHor2, kuiHor2, kuiHor2};
+	const uint8_t kuiVec3[4] = {kuiHor3, kuiHor3, kuiHor3, kuiHor3};
+	const uint8_t kuiVec4[4] = {kuiHor4, kuiHor4, kuiHor4, kuiHor4};
+	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// 16-byte staging buffer (project macro, presumably an aligned array — confirm)
+	ST32(&uiSrc[0], LD32(kuiVec1));
+	ST32(&uiSrc[4], LD32(kuiVec2));
+	ST32(&uiSrc[8], LD32(kuiVec3));
+	ST32(&uiSrc[12], LD32(kuiVec4));
+	
+	WelsFillingPred8x2to16( pPred, uiSrc );
+}
+void WelsI4x4LumaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{	// DC mode with both neighbours: the fill value is the rounded mean of the 4 left and 4 top reference pixels.
+	const uint8_t *pLeftCol	= pRef - 1;			// column just left of the block, one entry every kiStride bytes
+	const uint8_t *pTopRow	= pRef - kiStride;	// row just above the block
+	const int32_t iSum		= pLeftCol[0] + pLeftCol[kiStride] + pLeftCol[kiStride*2] + pLeftCol[kiStride*3] + pTopRow[0] + pTopRow[1] + pTopRow[2] + pTopRow[3];
+	WelsFillingPred1to16( pPred, (uint8_t)((iSum + 4) >> 3) );
+}
+
+void WelsI4x4LumaPredDcLeft_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{	// DC mode using only the left neighbours: the fill value is the rounded mean of the 4 pixels in the column left of pRef.
+	const uint8_t *pLeftCol	= pRef - 1;
+	const int32_t iLeftSum	= pLeftCol[0] + pLeftCol[kiStride] + pLeftCol[kiStride*2] + pLeftCol[kiStride*3];
+	WelsFillingPred1to16( pPred, (uint8_t)((iLeftSum + 2) >> 2) );
+}
+
+void WelsI4x4LumaPredDcTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{	// DC mode using only the top neighbours: the fill value is the rounded mean of the 4 pixels in the row above pRef.
+	const uint8_t *pTopRow	= pRef - kiStride;
+	const int32_t iTopSum	= pTopRow[0] + pTopRow[1] + pTopRow[2] + pTopRow[3];
+	WelsFillingPred1to16( pPred, (uint8_t)((iTopSum + 2) >> 2) );
+}
+
+void WelsI4x4LumaPredDcNA_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)	// DC mode with no neighbours: pRef and kiStride are not used
+{
+	const uint8_t kuiDcValue = 0x80;	// fixed fill value 128
+
+	WelsFillingPred1to16( pPred, kuiDcValue );
+}
+
+/* Diagonal down-left 4x4 luma prediction, built from the 8 pixels T0..T7 of the row above pRef. */
+void WelsI4x4LumaPredDDL_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{
+	/* read the top row: pRef[-kiStride] .. pRef[7-kiStride] */
+	const uint8_t kuiT0		= pRef[-kiStride];
+	const uint8_t kuiT1		= pRef[1-kiStride];
+	const uint8_t kuiT2		= pRef[2-kiStride];
+	const uint8_t kuiT3		= pRef[3-kiStride];
+	const uint8_t kuiT4		= pRef[4-kiStride];
+	const uint8_t kuiT5		= pRef[5-kiStride];
+	const uint8_t kuiT6		= pRef[6-kiStride];
+	const uint8_t kuiT7		= pRef[7-kiStride];
+	const uint8_t kuiDDL0	= (2 + kuiT0 + kuiT2 + (kuiT1<<1))>>2;	// uiDDL0: rounded (1,2,1)/4 filter centred on T1
+	const uint8_t kuiDDL1	= (2 + kuiT1 + kuiT3 + (kuiT2<<1))>>2;	// uiDDL1: centred on T2
+	const uint8_t kuiDDL2	= (2 + kuiT2 + kuiT4 + (kuiT3<<1))>>2;	// uiDDL2: centred on T3
+	const uint8_t kuiDDL3	= (2 + kuiT3 + kuiT5 + (kuiT4<<1))>>2;	// uiDDL3: centred on T4
+	const uint8_t kuiDDL4	= (2 + kuiT4 + kuiT6 + (kuiT5<<1))>>2;	// uiDDL4: centred on T5
+	const uint8_t kuiDDL5	= (2 + kuiT5 + kuiT7 + (kuiT6<<1))>>2;	// uiDDL5: centred on T6
+	const uint8_t kuiDDL6	= (2 + kuiT6 + kuiT7 + (kuiT7<<1))>>2;	// uiDDL6: T7 is used twice, i.e. (T6 + 3*T7 + 2) >> 2
+	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// 16-byte staging buffer (project macro, presumably an aligned array — confirm)
+	uiSrc[0] = kuiDDL0;	// indices that differ by 3 share a value: anti-diagonals when uiSrc is read as 4 rows of 4
+	uiSrc[1] = uiSrc[4] = kuiDDL1;
+	uiSrc[2] = uiSrc[5] = uiSrc[8] = kuiDDL2;
+	uiSrc[3] = uiSrc[6] = uiSrc[9] = uiSrc[12] = kuiDDL3;
+	uiSrc[7] = uiSrc[10]= uiSrc[13]= kuiDDL4;
+	uiSrc[11]= uiSrc[14]= kuiDDL5;
+	uiSrc[15] = kuiDDL6;
+
+	WelsFillingPred8x2to16( pPred, uiSrc );
+}
+
+/* Diagonal down-left 4x4 luma prediction that reads only T0..T3; T3 stands in wherever the full variant reads T4..T7. */
+void WelsI4x4LumaPredDDLTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{
+	/* read the top row: pRef[-kiStride] .. pRef[3-kiStride] */
+	const uint8_t kuiT0	= pRef[-kiStride];
+	const uint8_t kuiT1	= pRef[1-kiStride];
+	const uint8_t kuiT2	= pRef[2-kiStride];
+	const uint8_t kuiT3	= pRef[3-kiStride];
+	const uint8_t kuiDLT0	= (2 + kuiT0 + kuiT2 + (kuiT1<<1))>>2;	// uiDLT0: rounded (1,2,1)/4 filter centred on T1
+	const uint8_t kuiDLT1	= (2 + kuiT1 + kuiT3 + (kuiT2<<1))>>2;	// uiDLT1: centred on T2
+	const uint8_t kuiDLT2	= (2 + kuiT2 + kuiT3 + (kuiT3<<1))>>2;	// uiDLT2: (T2 + 3*T3 + 2) >> 2
+	const uint8_t kuiDLT3	= (2 + (kuiT3<<2))>>2;				// uiDLT3: evaluates to T3 itself
+	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// 16-byte staging buffer (project macro, presumably an aligned array — confirm)
+	memset(&uiSrc[6], kuiDLT3, 10*sizeof(uint8_t));	// pre-fill uiSrc[6..15] with T3; uiSrc[8] is overwritten below
+	uiSrc[0] = kuiDLT0;
+	uiSrc[1] = uiSrc[4] = kuiDLT1;
+	uiSrc[2] = uiSrc[5] = uiSrc[8] = kuiDLT2;
+	uiSrc[3] = kuiDLT3;
+
+	WelsFillingPred8x2to16( pPred, uiSrc );
+}
+
+
+/* Diagonal down-right 4x4 luma prediction, built from the top-left pixel, 4 left pixels and 4 top pixels around pRef. */
+void WelsI4x4LumaPredDDR_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{
+	const int32_t kiStridex2	= kiStride<<1;
+	const int32_t kiStridex3	= kiStride + kiStridex2;
+	const uint8_t kuiLT			= pRef[-kiStride-1];	// pixel above-left of the block
+	/* left column L0..L3 and top row T0..T3 */
+	const uint8_t kuiL0			= pRef[-1];
+	const uint8_t kuiL1			= pRef[kiStride-1];
+	const uint8_t kuiL2			= pRef[kiStridex2-1];
+	const uint8_t kuiL3			= pRef[kiStridex3-1];
+	const uint8_t kuiT0			= pRef[-kiStride];
+	const uint8_t kuiT1			= pRef[1-kiStride];
+	const uint8_t kuiT2			= pRef[2-kiStride];
+	const uint8_t kuiT3			= pRef[3-kiStride];
+	const uint16_t kuiTL0		= 1 + kuiLT + kuiL0;	// pair sums, each carrying +1; two of them added give the +2 rounding term
+	const uint16_t kuiLT0		= 1 + kuiLT + kuiT0;
+	const uint16_t kuiT01		= 1 + kuiT0 + kuiT1;
+	const uint16_t kuiT12		= 1 + kuiT1 + kuiT2;
+	const uint16_t kuiT23		= 1 + kuiT2 + kuiT3;
+	const uint16_t kuiL01		= 1 + kuiL0 + kuiL1;
+	const uint16_t kuiL12		= 1 + kuiL1 + kuiL2;
+	const uint16_t kuiL23		= 1 + kuiL2 + kuiL3;
+	const uint8_t kuiDDR0		= (kuiTL0 + kuiLT0) >> 2;	// (L0 + 2*LT + T0 + 2) >> 2
+	const uint8_t kuiDDR1		= (kuiLT0 + kuiT01) >> 2;	// (LT + 2*T0 + T1 + 2) >> 2
+	const uint8_t kuiDDR2		= (kuiT01 + kuiT12) >> 2;	// (T0 + 2*T1 + T2 + 2) >> 2
+	const uint8_t kuiDDR3		= (kuiT12 + kuiT23) >> 2;	// (T1 + 2*T2 + T3 + 2) >> 2
+	const uint8_t kuiDDR4		= (kuiTL0 + kuiL01) >> 2;	// (LT + 2*L0 + L1 + 2) >> 2
+	const uint8_t kuiDDR5		= (kuiL01 + kuiL12) >> 2;	// (L0 + 2*L1 + L2 + 2) >> 2
+	const uint8_t kuiDDR6		= (kuiL12 + kuiL23) >> 2;	// (L1 + 2*L2 + L3 + 2) >> 2
+	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// 16-byte staging buffer (project macro, presumably an aligned array — confirm)
+	uiSrc[0] = uiSrc[5] = uiSrc[10] = uiSrc[15] = kuiDDR0;	// indices that differ by 5 share a value: diagonals when uiSrc is read as 4 rows of 4
+	uiSrc[1] = uiSrc[6] = uiSrc[11] = kuiDDR1;
+	uiSrc[2] = uiSrc[7] = kuiDDR2;
+	uiSrc[3] = kuiDDR3;
+	uiSrc[4] = uiSrc[9] = uiSrc[14] = kuiDDR4;
+	uiSrc[8] = uiSrc[13] = kuiDDR5;
+	uiSrc[12]= kuiDDR6;
+
+	WelsFillingPred8x2to16( pPred, uiSrc );
+}
+
+
+/* Vertical-left 4x4 luma prediction, built from the 7 pixels T0..T6 of the row above pRef. */
+void WelsI4x4LumaPredVL_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{
+	/* read the top row: pRef[-kiStride] .. pRef[6-kiStride] */
+	const uint8_t kuiT0		= pRef[-kiStride];
+	const uint8_t kuiT1		= pRef[1-kiStride];
+	const uint8_t kuiT2		= pRef[2-kiStride];
+	const uint8_t kuiT3		= pRef[3-kiStride];
+	const uint8_t kuiT4		= pRef[4-kiStride];
+	const uint8_t kuiT5		= pRef[5-kiStride];
+	const uint8_t kuiT6		= pRef[6-kiStride];
+	const uint8_t kuiVL0	= (1 + kuiT0 + kuiT1)>>1;				// uiVL0: VL0..VL4 are rounded 2-pixel averages
+	const uint8_t kuiVL1	= (1 + kuiT1 + kuiT2)>>1;				// uiVL1
+	const uint8_t kuiVL2	= (1 + kuiT2 + kuiT3)>>1;				// uiVL2
+	const uint8_t kuiVL3	= (1 + kuiT3 + kuiT4)>>1;				// uiVL3
+	const uint8_t kuiVL4	= (1 + kuiT4 + kuiT5)>>1;				// uiVL4
+	const uint8_t kuiVL5	= (2 + kuiT0 + (kuiT1<<1) + kuiT2)>>2;	// uiVL5: VL5..VL9 are rounded (1,2,1)/4 filters
+	const uint8_t kuiVL6	= (2 + kuiT1 + (kuiT2<<1) + kuiT3)>>2;	// uiVL6
+	const uint8_t kuiVL7	= (2 + kuiT2 + (kuiT3<<1) + kuiT4)>>2;	// uiVL7
+	const uint8_t kuiVL8	= (2 + kuiT3 + (kuiT4<<1) + kuiT5)>>2;	// uiVL8
+	const uint8_t kuiVL9	= (2 + kuiT4 + (kuiT5<<1) + kuiT6)>>2;	// uiVL9
+	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// 16-byte staging buffer (project macro, presumably an aligned array — confirm)
+	uiSrc[0] = kuiVL0;	// uiSrc[0..3] and [8..11] take the 2-pixel averages, [4..7] and [12..15] the 3-tap values
+	uiSrc[1] = uiSrc[8] = kuiVL1;
+	uiSrc[2] = uiSrc[9] = kuiVL2;
+	uiSrc[3] = uiSrc[10]= kuiVL3;
+	uiSrc[4] = kuiVL5;
+	uiSrc[5] = uiSrc[12] = kuiVL6;
+	uiSrc[6] = uiSrc[13] = kuiVL7;
+	uiSrc[7] = uiSrc[14] = kuiVL8;
+	uiSrc[11]= kuiVL4;
+	uiSrc[15]= kuiVL9;
+
+	WelsFillingPred8x2to16( pPred, uiSrc );
+}
+
+
+
+/* Vertical-left 4x4 luma prediction that reads only T0..T3; T3 stands in wherever the full variant reads T4..T6. */
+void WelsI4x4LumaPredVLTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{
+	uint8_t *pTopLeft		= &pRef[-kiStride-1];	// pixel above-left of the block; T0..T3 follow at +1..+4
+	/* read the top row T0..T3 */
+	const uint8_t kuiT0		= *(pTopLeft+1);
+	const uint8_t kuiT1		= *(pTopLeft+2);
+	const uint8_t kuiT2		= *(pTopLeft+3);
+	const uint8_t kuiT3		= *(pTopLeft+4);
+	const uint8_t kuiVLT0	= (1 + kuiT0 + kuiT1)>>1;				// uiVLT0: rounded 2-pixel average
+	const uint8_t kuiVLT1	= (1 + kuiT1 + kuiT2)>>1;				// uiVLT1
+	const uint8_t kuiVLT2	= (1 + kuiT2 + kuiT3)>>1;				// uiVLT2
+	const uint8_t kuiVLT3	= (1 + (kuiT3<<1))>>1;				// uiVLT3: evaluates to T3 itself
+	const uint8_t kuiVLT4	= (2 + kuiT0 + (kuiT1<<1) + kuiT2)>>2;	// uiVLT4: rounded (1,2,1)/4 filter
+	const uint8_t kuiVLT5	= (2 + kuiT1 + (kuiT2<<1) + kuiT3)>>2;	// uiVLT5
+	const uint8_t kuiVLT6	= (2 + kuiT2 + (kuiT3<<1) + kuiT3)>>2;	// uiVLT6: (T2 + 3*T3 + 2) >> 2
+	const uint8_t kuiVLT7	= (2 + (kuiT3<<2))>>2;				// uiVLT7: evaluates to T3 itself
+	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// 16-byte staging buffer (project macro, presumably an aligned array — confirm)
+	uiSrc[0] = kuiVLT0;
+	uiSrc[1] = uiSrc[8] = kuiVLT1;
+	uiSrc[2] = uiSrc[9] = kuiVLT2;
+	uiSrc[3] = uiSrc[10]= uiSrc[11] = kuiVLT3;
+	uiSrc[4] = kuiVLT4;
+	uiSrc[5] = uiSrc[12] = kuiVLT5;
+	uiSrc[6] = uiSrc[13] = kuiVLT6;
+	uiSrc[7] = uiSrc[14] = uiSrc[15] = kuiVLT7;
+
+	WelsFillingPred8x2to16( pPred, uiSrc );
+}
+
+/* Vertical-right 4x4 luma prediction, built from the top-left pixel, left pixels L0..L2 and top pixels T0..T3. */
+void WelsI4x4LumaPredVR_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{
+	const int32_t kiStridex2	= kiStride<<1;
+	const uint8_t kuiLT			= pRef[-kiStride-1];	// pixel above-left of the block
+	/* left column L0..L2 and top row T0..T3 */
+	const uint8_t kuiL0			= pRef[-1];
+	const uint8_t kuiL1			= pRef[kiStride-1];
+	const uint8_t kuiL2			= pRef[kiStridex2-1];
+	const uint8_t kuiT0			= pRef[-kiStride];
+	const uint8_t kuiT1			= pRef[1-kiStride];
+	const uint8_t kuiT2			= pRef[2-kiStride];
+	const uint8_t kuiT3			= pRef[3-kiStride];
+	const uint8_t kuiVR0		= (1 + kuiLT + kuiT0) >> 1;	// VR0..VR3: rounded 2-pixel averages along the top row
+	const uint8_t kuiVR1		= (1 + kuiT0 + kuiT1) >> 1;
+	const uint8_t kuiVR2		= (1 + kuiT1 + kuiT2) >> 1;
+	const uint8_t kuiVR3		= (1 + kuiT2 + kuiT3) >> 1;
+	const uint8_t kuiVR4		= (2 + kuiL0 + (kuiLT << 1) + kuiT0) >> 2;	// VR4..VR7: rounded (1,2,1)/4 filters from L0 across the top row
+	const uint8_t kuiVR5		= (2 + kuiLT + (kuiT0 << 1) + kuiT1) >> 2;
+	const uint8_t kuiVR6		= (2 + kuiT0 + (kuiT1 << 1) + kuiT2) >> 2;
+	const uint8_t kuiVR7		= (2 + kuiT1 + (kuiT2 << 1) + kuiT3) >> 2; 
+	const uint8_t kuiVR8		= (2 + kuiLT + (kuiL0 << 1) + kuiL1) >> 2;	// VR8, VR9: the same filter running down the left column
+	const uint8_t kuiVR9		= (2 + kuiL0 + (kuiL1 << 1) + kuiL2) >> 2;
+	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// 16-byte staging buffer (project macro, presumably an aligned array — confirm)
+	uiSrc[0] = uiSrc[9] = kuiVR0;	// each value is reused 9 entries later: two rows down and one column right when read as 4 rows of 4
+	uiSrc[1] = uiSrc[10] = kuiVR1;
+	uiSrc[2] = uiSrc[11] = kuiVR2;
+	uiSrc[3] = kuiVR3;
+	uiSrc[4] = uiSrc[13] = kuiVR4;
+	uiSrc[5] = uiSrc[14] = kuiVR5;
+	uiSrc[6] = uiSrc[15] = kuiVR6;
+	uiSrc[7] = kuiVR7;
+	uiSrc[8] = kuiVR8;
+	uiSrc[12]= kuiVR9;
+
+	WelsFillingPred8x2to16( pPred, uiSrc );
+}
+
+
+/* Horizontal-up 4x4 luma prediction, built only from the 4 pixels L0..L3 in the column left of pRef. */
+void WelsI4x4LumaPredHU_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{
+	const int32_t kiStridex2	= kiStride<<1;
+	const int32_t kiStridex3	= kiStride + kiStridex2;
+	/* read the left column L0..L3 */
+	const uint8_t kuiL0			= pRef[-1];
+	const uint8_t kuiL1			= pRef[kiStride-1];
+	const uint8_t kuiL2			= pRef[kiStridex2-1];
+	const uint8_t kuiL3			= pRef[kiStridex3-1];
+	const uint16_t kuiL01		= (1 + kuiL0 + kuiL1);	// pair sums, each carrying +1 for rounding
+	const uint16_t kuiL12		= (1 + kuiL1 + kuiL2);
+	const uint16_t kuiL23		= (1 + kuiL2 + kuiL3);
+	const uint8_t kuiHU0		= kuiL01 >> 1;	// (L0 + L1 + 1) >> 1
+	const uint8_t kuiHU1		= (kuiL01 + kuiL12) >> 2;	// (L0 + 2*L1 + L2 + 2) >> 2
+	const uint8_t kuiHU2		= kuiL12 >> 1;	// (L1 + L2 + 1) >> 1
+	const uint8_t kuiHU3		= (kuiL12 + kuiL23) >> 2;	// (L1 + 2*L2 + L3 + 2) >> 2
+	const uint8_t kuiHU4		= kuiL23 >> 1;	// (L2 + L3 + 1) >> 1
+	const uint8_t kuiHU5		= (1 + kuiL23 + (kuiL3 << 1)) >> 2;	// (L2 + 3*L3 + 2) >> 2
+	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// 16-byte staging buffer (project macro, presumably an aligned array — confirm)
+	uiSrc[0] = kuiHU0;
+	uiSrc[1] = kuiHU1;
+	uiSrc[2] = uiSrc[4] = kuiHU2;
+	uiSrc[3] = uiSrc[5] = kuiHU3;
+	uiSrc[6] = uiSrc[8] = kuiHU4;
+	uiSrc[7] = uiSrc[9] = kuiHU5;
+	memset(&uiSrc[10], kuiL3, 6*sizeof(uint8_t));	// the remaining entries uiSrc[10..15] are plain copies of L3
+
+	WelsFillingPred8x2to16( pPred, uiSrc );
+}
+
+
+/* Horizontal-down 4x4 luma prediction, built from the top-left pixel, left pixels L0..L3 and top pixels T0..T2. */
+void WelsI4x4LumaPredHD_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{
+	const int32_t kiStridex2	= kiStride<<1;
+	const int32_t kiStridex3	= kiStride + kiStridex2;
+	const uint8_t kuiLT		= pRef[-kiStride-1];	// pixel above-left of the block
+	/* left column L0..L3 and top row T0..T2 */
+	const uint8_t kuiL0		= pRef[-1];
+	const uint8_t kuiL1		= pRef[kiStride-1];
+	const uint8_t kuiL2		= pRef[kiStridex2-1];
+	const uint8_t kuiL3		= pRef[kiStridex3-1];
+	const uint8_t kuiT0		= pRef[-kiStride];
+	const uint8_t kuiT1		= pRef[1-kiStride];
+	const uint8_t kuiT2		= pRef[2-kiStride];
+	const uint8_t kuiHD0		= (1 + kuiLT + kuiL0)>>1;				// uiHD0: rounded 2-pixel average
+	const uint8_t kuiHD1		= (2 + kuiL0 + (kuiLT<<1) + kuiT0)>>2;	// uiHD1: rounded (1,2,1)/4 filter centred on LT
+	const uint8_t kuiHD2		= (2 + kuiLT + (kuiT0<<1) + kuiT1)>>2;	// uiHD2: centred on T0
+	const uint8_t kuiHD3		= (2 + kuiT0 + (kuiT1<<1) + kuiT2)>>2;	// uiHD3: centred on T1
+	const uint8_t kuiHD4		= (1 + kuiL0 + kuiL1)>>1;				// uiHD4: rounded 2-pixel average
+	const uint8_t kuiHD5		= (2 + kuiLT + (kuiL0<<1) + kuiL1)>>2;	// uiHD5: centred on L0
+	const uint8_t kuiHD6		= (1 + kuiL1 + kuiL2)>>1;				// uiHD6: rounded 2-pixel average
+	const uint8_t kuiHD7		= (2 + kuiL0 + (kuiL1<<1) + kuiL2)>>2;	// uiHD7: centred on L1
+	const uint8_t kuiHD8		= (1 + kuiL2 + kuiL3)>>1;				// uiHD8: rounded 2-pixel average
+	const uint8_t kuiHD9		= (2 + kuiL1 + (kuiL2<<1) + kuiL3)>>2;	// uiHD9: centred on L2
+	ENFORCE_STACK_ALIGN_1D(uint8_t, uiSrc, 16, 16)	// 16-byte staging buffer (project macro, presumably an aligned array — confirm)
+	uiSrc[0] = uiSrc[6] = kuiHD0;	// each value is reused 6 entries later: one row down and two columns right when read as 4 rows of 4
+	uiSrc[1] = uiSrc[7] = kuiHD1;
+	uiSrc[2] = kuiHD2;
+	uiSrc[3] = kuiHD3;
+	uiSrc[4] = uiSrc[10] = kuiHD4;
+	uiSrc[5] = uiSrc[11] = kuiHD5;
+	uiSrc[8] = uiSrc[14] = kuiHD6;
+	uiSrc[9] = uiSrc[15] = kuiHD7;
+	uiSrc[12] = kuiHD8;
+	uiSrc[13] = kuiHD9;
+
+	WelsFillingPred8x2to16( pPred, uiSrc );
+}
+
+
+
+#define I8x8_PRED_STRIDE 8	// byte distance between rows of the 8x8 chroma prediction written through pPred
+
+void WelsIChormaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)	// Vertical 8x8 chroma prediction: the 8 pixels above pRef are copied into each of the 8 output rows.
+{
+	const uint64_t kuiSrc64 = LD64(&pRef[-kiStride]);	// the row directly above the block, loaded as one 64-bit value
+
+	ST64( pPred   , kuiSrc64 );	// output rows are packed 8 bytes apart
+	ST64( pPred+8 , kuiSrc64 );
+	ST64( pPred+16, kuiSrc64 );
+	ST64( pPred+24, kuiSrc64 );
+	ST64( pPred+32, kuiSrc64 );
+	ST64( pPred+40, kuiSrc64 );
+	ST64( pPred+48, kuiSrc64 );
+	ST64( pPred+56, kuiSrc64 );	
+}
+
+void WelsIChormaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{	// Horizontal 8x8 chroma prediction: each 8-byte output row is filled with the pixel left of the matching picture row.
+	int32_t iStridex7 = (kiStride<<3)-kiStride;	// picture offset of row 7; the loop walks from the last row up to row 0
+	int32_t iI8x8Stridex7 = (I8x8_PRED_STRIDE<<3)-I8x8_PRED_STRIDE;	// matching offset of row 7 in the prediction buffer
+	uint8_t i = 7;
+	
+	do
+	{
+		const uint8_t kuiLeft = pRef[iStridex7-1];	// left neighbour of the current row
+#ifdef _MSC_VER
+		uint64_t kuiSrc64 = (uint64_t)(0x0101010101010101U * kuiLeft);
+#else
+		uint64_t kuiSrc64 = (uint64_t)(0x0101010101010101ULL * kuiLeft);	// unsigned constant: the signed LL product overflows for kuiLeft >= 128
+#endif
+		ST64( pPred+iI8x8Stridex7, kuiSrc64 );	// the multiply replicated kuiLeft into all 8 bytes
+
+		iStridex7 -= kiStride;
+		iI8x8Stridex7 -= I8x8_PRED_STRIDE;
+	}while(i-->0);	// post-decrement test: the body runs 8 times in total
+}
+
+
+void WelsIChormaPredPlane_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)	// Plane 8x8 chroma prediction: fits a linear gradient to the top row and left column and evaluates it per pixel.
+{
+	int32_t iLTshift=0, iTopshift=0, iLeftshift=0, iTopSum=0, iLeftSum=0;
+	int32_t i, j;
+	uint8_t *pTop = &pRef[-kiStride];	// row above the block; pTop[-1] is the above-left pixel
+	uint8_t *pLeft = &pRef[-1];	// column left of the block; pLeft[-kiStride] is the above-left pixel
+
+	for(i = 0 ; i < 4 ; i ++)	// weighted differences of samples mirrored around the centre (weights 1..4)
+	{
+		iTopSum += (i + 1) * (pTop[4 + i] - pTop[2 - i]);
+		iLeftSum += (i + 1) * (pLeft[(4 + i)*kiStride] - pLeft[(2 - i)*kiStride]);
+	}
+
+	iLTshift = (pLeft[7*kiStride] + pTop[7]) << 4;	// constant term: 16 * (last left pixel + last top pixel)
+	iTopshift = (17 * iTopSum + 16) >> 5;	// horizontal slope, applied per column below
+	iLeftshift = (17 * iLeftSum + 16) >> 5;	// vertical slope, applied per row below
+
+	for(i = 0 ; i < 8 ; i ++)	// i = row, j = column of the prediction
+	{
+		for(j = 0 ; j < 8 ; j ++)
+		{			
+			pPred[j] = (uint8_t)WELS_CLIP1((iLTshift + iTopshift * (j - 3) + iLeftshift * (i - 3) + 16) >> 5);	// WELS_CLIP1 presumably clamps to 0..255 — confirm
+		}
+		pPred += I8x8_PRED_STRIDE;
+	}
+}
+
+
+void WelsIChormaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)	// DC 8x8 chroma prediction with both neighbours: one mean per 4x4 quadrant.
+{
+	const int32_t kuiL1 = kiStride-1;	// kuiL1..kuiL7: offsets of the left neighbours of rows 1..7 (row 0 is pRef[-1])
+	const int32_t kuiL2 = kuiL1 + kiStride;
+	const int32_t kuiL3 = kuiL2 + kiStride;
+	const int32_t kuiL4 = kuiL3 + kiStride;
+	const int32_t kuiL5 = kuiL4 + kiStride;
+	const int32_t kuiL6 = kuiL5 + kiStride;
+	const int32_t kuiL7 = kuiL6 + kiStride;
+	/* calculate the four quadrant means */
+	const uint8_t kuiMean1	= (	pRef[-kiStride] + pRef[1-kiStride] + pRef[2-kiStride] + pRef[3-kiStride] +
+							pRef[-1] + pRef[kuiL1] + pRef[kuiL2] + pRef[kuiL3] + 4 ) >> 3;	// top-left quadrant: top[0..3] and left[0..3]
+	const uint32_t kuiSum2 = pRef[4-kiStride] + pRef[5-kiStride] + pRef[6-kiStride] + pRef[7-kiStride];	// top[4..7]
+	const uint32_t kuiSum3 = pRef[kuiL4] + pRef[kuiL5] + pRef[kuiL6] + pRef[kuiL7];	// left[4..7]
+	const uint8_t kuiMean2 = (kuiSum2 + 2) >> 2;	// top-right quadrant: top[4..7] only
+	const uint8_t kuiMean3 = (kuiSum3 + 2) >> 2;	// bottom-left quadrant: left[4..7] only
+	const uint8_t kuiMean4 = (kuiSum2 + kuiSum3 + 4) >> 3;	// bottom-right quadrant: top[4..7] and left[4..7]
+
+	const uint8_t kuiTopMean[8] = {kuiMean1, kuiMean1, kuiMean1, kuiMean1, kuiMean2, kuiMean2, kuiMean2, kuiMean2};	// pattern for rows 0..3
+	const uint8_t kuiBottomMean[8] = {kuiMean3, kuiMean3, kuiMean3, kuiMean3, kuiMean4, kuiMean4, kuiMean4, kuiMean4};	// pattern for rows 4..7
+	const uint64_t kuiTopMean64 = LD64(kuiTopMean);
+	const uint64_t kuiBottomMean64 = LD64(kuiBottomMean);
+
+	ST64( pPred   , kuiTopMean64 );	// output rows are packed 8 bytes apart
+	ST64( pPred+8 , kuiTopMean64 );
+	ST64( pPred+16, kuiTopMean64 );
+	ST64( pPred+24, kuiTopMean64 );
+	ST64( pPred+32, kuiBottomMean64 );
+	ST64( pPred+40, kuiBottomMean64 );
+	ST64( pPred+48, kuiBottomMean64 );
+	ST64( pPred+56, kuiBottomMean64 );
+}
+
+void WelsIChormaPredDcLeft_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{	// DC 8x8 chroma prediction using only the left neighbours: rows 0..3 get the mean of left[0..3], rows 4..7 the mean of left[4..7].
+	const int32_t kuiL1	= kiStride-1;	// kuiL1..kuiL7: offsets of the left neighbours of rows 1..7 (row 0 is pRef[-1])
+	const int32_t kuiL2	= kuiL1 + kiStride;
+	const int32_t kuiL3	= kuiL2 + kiStride;
+	const int32_t kuiL4	= kuiL3 + kiStride;
+	const int32_t kuiL5	= kuiL4 + kiStride;
+	const int32_t kuiL6	= kuiL5 + kiStride;
+	const int32_t kuiL7	= kuiL6 + kiStride;
+	/* calculate the two rounded means */
+	const uint8_t kuiTopMean	= (pRef[-1] + pRef[kuiL1] + pRef[kuiL2] + pRef[kuiL3] + 2)>>2 ;
+	const uint8_t kuiBottomMean	= (pRef[kuiL4] + pRef[kuiL5] + pRef[kuiL6] + pRef[kuiL7] + 2)>>2;
+#ifdef _MSC_VER
+	const uint64_t kuiTopMean64	= (uint64_t)(0x0101010101010101U * kuiTopMean);
+	const uint64_t kuiBottomMean64	= (uint64_t)(0x0101010101010101U * kuiBottomMean);
+#else
+	const uint64_t kuiTopMean64	= (uint64_t)(0x0101010101010101ULL * kuiTopMean);	// unsigned constant: the signed LL product overflows for means >= 128
+	const uint64_t kuiBottomMean64	= (uint64_t)(0x0101010101010101ULL * kuiBottomMean);	// the multiply replicates the mean into all 8 bytes
+#endif
+	ST64( pPred   , kuiTopMean64 );	// output rows are packed 8 bytes apart
+	ST64( pPred+8 , kuiTopMean64 );
+	ST64( pPred+16, kuiTopMean64 );
+	ST64( pPred+24, kuiTopMean64 );
+	ST64( pPred+32, kuiBottomMean64 );
+	ST64( pPred+40, kuiBottomMean64 );
+	ST64( pPred+48, kuiBottomMean64 );
+	ST64( pPred+56, kuiBottomMean64 );	
+}
+
+void WelsIChormaPredDcTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)	// DC 8x8 chroma prediction using only the top neighbours: one mean per 4-column half, same for all 8 rows.
+{
+	/* calculate the two rounded means */
+	const uint8_t kuiMean1 = (pRef[-kiStride] + pRef[1-kiStride] + pRef[2-kiStride] + pRef[3-kiStride]+2)>>2;	// columns 0..3: mean of top[0..3]
+	const uint8_t kuiMean2 = (pRef[4-kiStride] + pRef[5-kiStride] + pRef[6-kiStride] + pRef[7-kiStride] + 2)>>2;	// columns 4..7: mean of top[4..7]
+	const uint8_t kuiMean[8] = {kuiMean1, kuiMean1, kuiMean1, kuiMean1, kuiMean2, kuiMean2, kuiMean2, kuiMean2};
+	const uint64_t kuiMean64 = LD64(kuiMean);
+
+	ST64( pPred   , kuiMean64 );	// output rows are packed 8 bytes apart
+	ST64( pPred+8 , kuiMean64 );
+	ST64( pPred+16, kuiMean64 );
+	ST64( pPred+24, kuiMean64 );
+	ST64( pPred+32, kuiMean64 );
+	ST64( pPred+40, kuiMean64 );
+	ST64( pPred+48, kuiMean64 );
+	ST64( pPred+56, kuiMean64 );	
+}
+
+void WelsIChormaPredDcNA_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)	// DC 8x8 chroma prediction with no neighbours: every byte is 0x80; pRef and kiStride are not used.
+{
+#ifdef _MSC_VER
+	const uint64_t kuiDcValue64 = (uint64_t)0x8080808080808080U;
+#else
+	const uint64_t kuiDcValue64 = (uint64_t)0x8080808080808080LL;
+#endif
+	ST64( pPred   , kuiDcValue64 );	// output rows are packed 8 bytes apart
+	ST64( pPred+8 , kuiDcValue64 );
+	ST64( pPred+16, kuiDcValue64 );
+	ST64( pPred+24, kuiDcValue64 );
+	ST64( pPred+32, kuiDcValue64 );
+	ST64( pPred+40, kuiDcValue64 );
+	ST64( pPred+48, kuiDcValue64 );
+	ST64( pPred+56, kuiDcValue64 );
+}
+
+
+void WelsI16x16LumaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{	// Vertical 16x16 luma prediction: each of the 16 output rows (16 bytes each) is a copy of the row above pRef.
+	const int8_t *kpTopRow		= (int8_t*)&pRef[-kiStride];
+	const uint64_t kuiLeft8		= LD64(kpTopRow  );		// first half of the top row
+	const uint64_t kuiRight8	= LD64(kpTopRow+8);		// second half of the top row
+	int32_t iRow;
+
+	for ( iRow = 0; iRow < 16; ++iRow )
+	{
+		uint8_t *pLine = pPred + (iRow << 4);	// prediction rows are packed 16 bytes apart
+
+		ST64(pLine  , kuiLeft8);
+		ST64(pLine+8, kuiRight8);
+	}
+}
+
+void WelsI16x16LumaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{	// Horizontal 16x16 luma prediction: each 16-byte output row is filled with the pixel left of the matching picture row.
+	int32_t iStridex15 = (kiStride<<4)-kiStride;	// picture offset of row 15; the loop walks from the last row up to row 0
+	int32_t iPredStride = 16;
+	int32_t iPredStridex15 = 240;	//(iPredStride<<4)-iPredStride;
+	uint8_t i = 15;
+	
+	do
+	{
+		const uint8_t kuiSrc8	= pRef[iStridex15-1];	// left neighbour of the current row
+#ifdef _MSC_VER
+		const uint64_t kuiV64	= (uint64_t)(0x0101010101010101U * kuiSrc8);
+#else
+		const uint64_t kuiV64	= (uint64_t)(0x0101010101010101ULL * kuiSrc8);	// unsigned constant: the signed LL product overflows for kuiSrc8 >= 128
+#endif			
+		ST64( &pPred[iPredStridex15], kuiV64 );	// the multiply replicated kuiSrc8 into all 8 bytes; two stores cover the row
+		ST64( &pPred[iPredStridex15+8], kuiV64 );		
+
+		iStridex15 -= kiStride;
+		iPredStridex15 -= iPredStride;
+	}while(i-->0);	// post-decrement test: the body runs 16 times in total
+}
+
+void WelsI16x16LumaPredPlane_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)	// Plane 16x16 luma prediction: fits a linear gradient to the top row and left column and evaluates it per pixel.
+{
+	int32_t iLTshift=0, iTopshift=0, iLeftshift=0, iTopSum=0, iLeftSum=0;
+	int32_t i, j;
+	uint8_t *pTop = &pRef[-kiStride];	// row above the block; pTop[-1] is the above-left pixel
+	uint8_t *pLeft = &pRef[-1];	// column left of the block; pLeft[-kiStride] is the above-left pixel
+	int32_t iPredStride = 16;	// prediction rows are packed 16 bytes apart
+
+	for(i = 0 ; i < 8 ; i ++)	// weighted differences of samples mirrored around the centre (weights 1..8)
+	{
+		iTopSum += (i + 1) * (pTop[8 + i] - pTop[6 - i]);
+		iLeftSum += (i + 1) * (pLeft[(8 + i)*kiStride] - pLeft[(6 - i)*kiStride]);
+	}
+
+	iLTshift = (pLeft[15*kiStride] + pTop[15]) << 4;	// constant term: 16 * (last left pixel + last top pixel)
+	iTopshift = (5 * iTopSum + 32) >> 6;	// horizontal slope, applied per column below
+	iLeftshift = (5 * iLeftSum + 32) >> 6;	// vertical slope, applied per row below
+
+	for(i = 0 ; i < 16 ; i ++)	// i = row, j = column of the prediction
+	{
+		for(j = 0 ; j < 16 ; j ++)
+		{			
+			pPred[j] = (uint8_t)WELS_CLIP1((iLTshift + iTopshift * (j - 7) + iLeftshift * (i - 7) + 16) >> 5);	// WELS_CLIP1 presumably clamps to 0..255 — confirm
+		}
+		pPred += iPredStride;
+	}
+}
+
+void WelsI16x16LumaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{	// DC mode with both neighbours: fills the 256-byte prediction with the rounded mean of the 16 left and 16 top pixels.
+	const uint8_t *pTopRow	= pRef - kiStride;	// row just above the block
+	const uint8_t *pLeftCol	= pRef - 1;			// column just left of the block
+	int32_t iTotal			= 0;
+	int32_t iIdx;
+
+	/* accumulate the 32 neighbouring reference pixels */
+	for ( iIdx = 0; iIdx < 16; ++iIdx )
+	{
+		iTotal += pTopRow[iIdx];
+		iTotal += pLeftCol[iIdx * kiStride];
+	}
+
+	memset(pPred, (uint8_t)(( 16 + iTotal ) >> 5), 256);
+}
+
+
+void WelsI16x16LumaPredDcTop_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{	// DC mode using only the top neighbours: fills the 256-byte prediction with the rounded mean of the 16 pixels above pRef.
+	const uint8_t *pTopRow	= pRef - kiStride;	// row just above the block
+	int32_t iTotal			= 0;
+	int32_t iIdx;
+
+	/* accumulate the 16 top reference pixels */
+	for ( iIdx = 0; iIdx < 16; ++iIdx )
+	{
+		iTotal += pTopRow[iIdx];
+	}
+
+	memset(pPred, (uint8_t)(( 8 + iTotal ) >> 4), 256);
+}
+
+void WelsI16x16LumaPredDcLeft_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)
+{	// DC mode using only the left neighbours: fills the 256-byte prediction with the rounded mean of the 16 pixels left of pRef.
+	const uint8_t *pLeftCol	= pRef - 1;	// column just left of the block
+	int32_t iTotal			= 0;
+	int32_t iIdx;
+
+	/* accumulate the 16 left reference pixels, one per picture row */
+	for ( iIdx = 0; iIdx < 16; ++iIdx )
+	{
+		iTotal += pLeftCol[iIdx * kiStride];
+	}
+
+	// rounded mean of the 16 samples, replicated over the whole prediction block
+
+	memset(pPred, (uint8_t)(( 8 + iTotal ) >> 4), 256);
+}
+
+void WelsI16x16LumaPredDcNA_c(uint8_t *pPred, uint8_t *pRef, const int32_t kiStride)	// DC mode with no neighbours: pRef and kiStride are not used
+{
+	memset(pPred, 0x80, 256);	// all 256 prediction bytes are set to the fixed value 0x80
+}
+
+void WelsInitIntraPredFuncs(SWelsFuncPtrList *pFuncList, const uint32_t kuiCpuFlag )	// Fills the intra-prediction tables of pFuncList: C versions first, then asm overrides chosen by kuiCpuFlag.
+{
+	pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] =      WelsI16x16LumaPredV_c;	// 16x16 luma predictors
+	pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] =      WelsI16x16LumaPredH_c;
+	pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] =     WelsI16x16LumaPredDc_c;
+	pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] =      WelsI16x16LumaPredPlane_c;
+	pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_L] =   WelsI16x16LumaPredDcLeft_c;
+	pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_T] =   WelsI16x16LumaPredDcTop_c;
+	pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC_128] = WelsI16x16LumaPredDcNA_c;
+
+	pFuncList->pfGetLumaI4x4Pred[I4_PRED_V] = WelsI4x4LumaPredV_c;	// 4x4 luma predictors
+	pFuncList->pfGetLumaI4x4Pred[I4_PRED_H] = WelsI4x4LumaPredH_c;
+	pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC] = WelsI4x4LumaPredDc_c;
+	pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_L] = WelsI4x4LumaPredDcLeft_c;
+	pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_T] = WelsI4x4LumaPredDcTop_c;
+	pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC_128] = WelsI4x4LumaPredDcNA_c;
+
+	pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL] = WelsI4x4LumaPredDDL_c;
+	pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL_TOP] = WelsI4x4LumaPredDDLTop_c;
+	pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDR] = WelsI4x4LumaPredDDR_c;
+
+	pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL] = WelsI4x4LumaPredVL_c;
+	pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL_TOP] = WelsI4x4LumaPredVLTop_c;
+	pFuncList->pfGetLumaI4x4Pred[I4_PRED_VR] = WelsI4x4LumaPredVR_c;
+	pFuncList->pfGetLumaI4x4Pred[I4_PRED_HU] = WelsI4x4LumaPredHU_c;
+	pFuncList->pfGetLumaI4x4Pred[I4_PRED_HD] = WelsI4x4LumaPredHD_c;
+
+	pFuncList->pfGetChromaPred[C_PRED_DC] = WelsIChormaPredDc_c;	// 8x8 chroma predictors
+	pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChormaPredH_c;
+	pFuncList->pfGetChromaPred[C_PRED_V] = WelsIChormaPredV_c;
+	pFuncList->pfGetChromaPred[C_PRED_P] = WelsIChormaPredPlane_c;
+	pFuncList->pfGetChromaPred[C_PRED_DC_L] = WelsIChormaPredDcLeft_c;
+	pFuncList->pfGetChromaPred[C_PRED_DC_T] = WelsIChormaPredDcTop_c;
+	pFuncList->pfGetChromaPred[C_PRED_DC_128] = WelsIChormaPredDcNA_c;
+#ifdef X86_ASM
+	if( kuiCpuFlag & WELS_CPU_MMXEXT )	// MMX overrides for six 4x4 luma modes and chroma H
+	{
+		pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDR] = WelsI4x4LumaPredDDR_mmx;
+		pFuncList->pfGetLumaI4x4Pred[I4_PRED_HD]  = WelsI4x4LumaPredHD_mmx;
+		pFuncList->pfGetLumaI4x4Pred[I4_PRED_HU]  = WelsI4x4LumaPredHU_mmx;
+		pFuncList->pfGetLumaI4x4Pred[I4_PRED_VR]  = WelsI4x4LumaPredVR_mmx;
+		pFuncList->pfGetLumaI4x4Pred[I4_PRED_DDL] = WelsI4x4LumaPredDDL_mmx;
+		pFuncList->pfGetLumaI4x4Pred[I4_PRED_VL]  = WelsI4x4LumaPredVL_mmx;
+ 		pFuncList->pfGetChromaPred[C_PRED_H] = WelsIChromaPredH_mmx;
+	}
+	if ( kuiCpuFlag & WELS_CPU_SSE2 )	// SSE2 overrides; these entries do not overlap with the MMX ones above
+	{
+		pFuncList->pfGetLumaI4x4Pred[I4_PRED_H] = WelsI4x4LumaPredH_sse2;
+		pFuncList->pfGetLumaI4x4Pred[I4_PRED_DC] = WelsI4x4LumaPredDc_sse2;
+		pFuncList->pfGetLumaI4x4Pred[I4_PRED_V] = WelsI4x4LumaPredV_sse2;
+
+		pFuncList->pfGetLumaI16x16Pred[I16_PRED_V] = WelsI16x16LumaPredV_sse2;
+		pFuncList->pfGetLumaI16x16Pred[I16_PRED_H] = WelsI16x16LumaPredH_sse2;
+		pFuncList->pfGetLumaI16x16Pred[I16_PRED_DC] = WelsI16x16LumaPredDc_sse2;
+		pFuncList->pfGetLumaI16x16Pred[I16_PRED_P] = WelsI16x16LumaPredPlane_sse2;
+
+		pFuncList->pfGetChromaPred[C_PRED_DC]	= WelsIChromaPredDc_sse2;
+		pFuncList->pfGetChromaPred[C_PRED_V]	= WelsIChromaPredV_sse2;
+		pFuncList->pfGetChromaPred[C_PRED_P]	= WelsIChromaPredPlane_sse2;
+	}
+#endif
+}
+}
--- /dev/null
+++ b/codec/encoder/core/src/mc.cpp
@@ -1,0 +1,595 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	mc.cpp
+ *
+ * \brief	Interfaces implementation for motion compensation
+ *
+ * \date	03/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include <string.h>
+#include "as264_common.h"
+#include "typedefs.h"
+#include "wels_const.h"
+#include "macros.h"
+#include "mc.h"
+#include "sample.h"
+#include "cpu_core.h"
+#include "array_stack_align.h"
+
+namespace WelsSVCEnc {
+/*------------------weight for chroma fraction pixel interpolation------------------*/
+// Bilinear weights for eighth-sample chroma interpolation, indexed g_kuiABCD[dy][dx][k]
+// with dx, dy in [0, 7] (the low three bits of the motion vector components):
+//   k = 0: kuiA = (8 - dx) * (8 - dy)   weight of the sample at the current position
+//   k = 1: kuiB = dx * (8 - dy)         weight of the sample one column to the right
+//   k = 2: kuiC = (8 - dx) * dy         weight of the sample one row below
+//   k = 3: kuiD = dx * dy               weight of the sample one row below, one column right
+// The four weights of every entry sum to 64, hence the "+ 32 >> 6" rounding in McChroma_c.
+static const uint8_t g_kuiABCD[8][8][4] = // [dy][dx][A, B, C, D]
+{
+	{	// dy = 0
+		{64, 0, 0, 0},{56, 8, 0, 0},{48, 16, 0, 0},{40, 24, 0, 0},
+		{32, 32, 0, 0},{24, 40, 0, 0},{16, 48, 0, 0},{8, 56, 0, 0}
+	},
+	{	// dy = 1
+		{56, 0, 8, 0},{49, 7, 7, 1},{42, 14, 6, 2},{35, 21, 5, 3},
+		{28, 28, 4, 4},{21, 35, 3, 5},{14, 42, 2, 6},{7, 49, 1, 7}
+	},
+	{	// dy = 2
+		{48, 0, 16, 0},{42, 6, 14, 2},{36, 12, 12, 4},{30, 18, 10, 6},
+		{24, 24, 8, 8},{18, 30, 6, 10},{12, 36, 4, 12},{6, 42, 2, 14}
+	},
+	{	// dy = 3
+		{40, 0, 24, 0},{35, 5, 21, 3},{30, 10, 18, 6},{25, 15, 15, 9},
+		{20, 20, 12, 12},{15, 25, 9, 15},{10, 30, 6, 18},{5, 35, 3, 21}
+	},
+	{	// dy = 4
+		{32, 0, 32, 0},{28, 4, 28, 4},{24, 8, 24, 8},{20, 12, 20, 12},
+		{16, 16, 16, 16},{12, 20, 12, 20},{8, 24, 8, 24},{4, 28, 4, 28}
+	},
+	{	// dy = 5
+		{24, 0, 40, 0},{21, 3, 35, 5},{18, 6, 30, 10},{15, 9, 25, 15},
+		{12, 12, 20, 20},{9, 15, 15, 25},{6, 18, 10, 30},{3, 21, 5, 35}
+	},
+	{	// dy = 6
+		{16, 0, 48, 0},{14, 2, 42, 6},{12, 4, 36, 12},{10, 6, 30, 18},
+		{8, 8, 24, 24},{6, 10, 18, 30},{4, 12, 12, 36},{2, 14, 6, 42}
+	},
+	{	// dy = 7
+		{8, 0, 56, 0},{7, 1, 49, 7},{6, 2, 42, 14},{5, 3, 35, 21},
+		{4, 4, 28, 28},{3, 5, 21, 35},{2, 6, 14, 42},{1, 7, 7, 49}
+	}
+};
+// Signatures of the 6-tap filter primitives called by the C luma interpolation routines below.
+typedef int32_t (*VerFilterFunc)(uint8_t* pSrc, const int32_t kiSrcStride);
+typedef int32_t (*HorFilterFunc)(uint8_t* pSrc);
+typedef int32_t (*HorFilterFuncInput16Bits)(int16_t* pSrc);
+
+// Filter primitives currently in use; NULL until WelsInitMcFuncs() assigns them.
+VerFilterFunc fpVerFilter			= NULL;
+HorFilterFunc fpHorFilter			= NULL;
+HorFilterFuncInput16Bits fpHorFilterInput16Bits = NULL;
+
+// WelsMcFunc0: single-source block routine (src, src stride, dst, dst stride, height).
+// WelsMcFunc1: two-source block routine (dst, dst stride, source A + stride, source B + stride, height).
+typedef void (*WelsMcFunc0) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,int32_t iHeight);
+typedef void (*WelsMcFunc1) (uint8_t* pDst, int32_t iDstStride, uint8_t* psrcA, int32_t iSrcAStride,  uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+// Dispatch pointers used inside this file; NULL until WelsInitMcFuncs() selects an implementation.
+// NOTE(review): these are mutable namespace-scope variables shared by every user of this
+// translation unit, so the last WelsInitMcFuncs() call decides what all of them point to.
+WelsMcFunc0 McCopyWidthEq16 = NULL;
+WelsMcFunc0 McCopyWidthEq8 = NULL;
+WelsMcFunc0 McCopyWidthEq4 = NULL;
+WelsMcFunc0 pfMcHorVer02WidthEq16 = NULL;
+WelsMcFunc1 pfPixelAvgWidthEq16  = NULL;
+WelsMcFunc0 pfMcHorVer20WidthEq16 = NULL;
+WelsMcFunc0 pfMcHorVer22WidthEq16 = NULL;
+
+//***************************************************************************//
+//                          C code implementation                            //
+//***************************************************************************//
+// Copies iHeight rows of 4 bytes each from pSrc to pDst, advancing each pointer by its own stride.
+static inline void McCopyWidthEq4_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	for (int32_t iRowsLeft = iHeight; iRowsLeft > 0; --iRowsLeft)
+	{
+		memcpy(pDst, pSrc, 4);	// confirmed_safe_unsafe_usage
+		pSrc += iSrcStride;
+		pDst += iDstStride;
+	}
+}
+
+// Copies iHeight rows of 8 bytes each from pSrc to pDst, advancing each pointer by its own stride.
+static inline void McCopyWidthEq8_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	for (int32_t iRowsLeft = iHeight; iRowsLeft > 0; --iRowsLeft)
+	{
+		memcpy(pDst, pSrc, 8);	// confirmed_safe_unsafe_usage
+		pSrc += iSrcStride;
+		pDst += iDstStride;
+	}
+}
+// Copies iHeight rows of 16 bytes each from pSrc to pDst, advancing each pointer by its own stride.
+static inline void McCopyWidthEq16_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	for (int32_t iRowsLeft = iHeight; iRowsLeft > 0; --iRowsLeft)
+	{
+		memcpy(pDst, pSrc, 16);	// confirmed_safe_unsafe_usage
+		pSrc += iSrcStride;
+		pDst += iDstStride;
+	}
+}
+
+//--------------------Luma sample MC------------------//
+// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to pSrc[-2] .. pSrc[3] and returns the
+// raw sum; rounding, shifting and clipping are left to the caller.
+static inline int32_t HorFilter_c(uint8_t* pSrc)
+{
+	const int32_t kiOuter  = pSrc[-2] + pSrc[3];
+	const int32_t kiMiddle = pSrc[-1] + pSrc[2];
+	const int32_t kiInner  = pSrc[0] + pSrc[1];
+
+	return kiOuter - 5 * kiMiddle + 20 * kiInner;
+}
+
+// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) to the signed 16-bit intermediates
+// pSrc[-2] .. pSrc[3] and returns the raw sum; rounding and clipping are left to the caller.
+// The inputs may be negative, so the weights are applied by multiplication: left-shifting
+// a negative value is undefined behaviour.
+static inline int32_t HorFilterInput16bit1_c(int16_t* pSrc)
+{
+	const int32_t kiPix05 = pSrc[-2] + pSrc[3];
+	const int32_t kiPix14 = pSrc[-1] + pSrc[2];
+	const int32_t kiPix23 = pSrc[ 0] + pSrc[1];
+
+	return kiPix05 - 5 * kiPix14 + 20 * kiPix23;
+}
+// Applies the 6-tap kernel (1, -5, 20, 20, -5, 1) down a column: the taps are the samples
+// at pSrc - 2*stride .. pSrc + 3*stride. Returns the raw sum; rounding and clipping are left
+// to the caller.
+// The sum is accumulated in signed arithmetic because it can be negative; computing it in
+// unsigned and converting back relied on wrap-around plus an implementation-defined conversion.
+// The row offsets use multiplication so that a negative stride is not left-shifted.
+static inline int32_t VerFilter_c(uint8_t* pSrc, const int32_t kiSrcStride)
+{
+	const int32_t kiLine1	= kiSrcStride;
+	const int32_t kiLine2	= kiSrcStride * 2;
+	const int32_t kiLine3	= kiLine1 + kiLine2;
+	const int32_t kiPix05	= *(pSrc - kiLine2) + *(pSrc + kiLine3);
+	const int32_t kiPix14	= *(pSrc - kiLine1) + *(pSrc + kiLine2);
+	const int32_t kiPix23	= *(pSrc        ) + *(pSrc + kiLine1);
+
+	return kiPix05 - 5 * kiPix14 + 20 * kiPix23;
+}
+
+// Writes the rounded-up average (a + b + 1) >> 1 of two 8-pixel-wide blocks to pDst,
+// for iHeight rows; each of the three buffers has its own stride.
+static inline void PixelAvgWidthEq8_c(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, int32_t iSrcAStride,
+								uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight)
+{
+	for (int32_t iRowsLeft = iHeight; iRowsLeft > 0; --iRowsLeft)
+	{
+		for (int32_t iCol = 0; iCol != 8; ++iCol)
+		{
+			const int32_t kiSum = pSrcA[iCol] + pSrcB[iCol] + 1;
+			pDst[iCol] = kiSum >> 1;
+		}
+		pSrcA += iSrcAStride;
+		pSrcB += iSrcBStride;
+		pDst  += iDstStride;
+	}
+}
+// Writes the rounded-up average (a + b + 1) >> 1 of two 16-pixel-wide blocks to pDst,
+// for iHeight rows; each of the three buffers has its own stride.
+static inline void PixelAvgWidthEq16_c(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, int32_t iSrcAStride,
+								 uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight)
+{
+	for (int32_t iRowsLeft = iHeight; iRowsLeft > 0; --iRowsLeft)
+	{
+		for (int32_t iCol = 0; iCol != 16; ++iCol)
+		{
+			const int32_t kiSum = pSrcA[iCol] + pSrcB[iCol] + 1;
+			pDst[iCol] = kiSum >> 1;
+		}
+		pSrcA += iSrcAStride;
+		pSrcB += iSrcBStride;
+		pDst  += iDstStride;
+	}
+}
+
+//horizontal filter to gain half sample, that is (2, 0) location in quarter sample
+// 16-pixel-wide version: every output is the fpHorFilter sum, rounded (+16 >> 5) and clipped.
+static inline void McHorVer20WidthEq16_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	for (int32_t iRow = 0; iRow < iHeight; ++iRow)
+	{
+		for (int32_t iCol = 0; iCol < 16; ++iCol)
+		{
+			pDst[iCol] = WELS_CLIP1((fpHorFilter(pSrc + iCol) + 16) >> 5);
+		}
+		pSrc += iSrcStride;
+		pDst += iDstStride;
+	}
+}
+//vertical filter to gain half sample, that is (0, 2) location in quarter sample
+// 16-pixel-wide version: every output is the fpVerFilter sum, rounded (+16 >> 5) and clipped.
+static inline void McHorVer02WidthEq16_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	for (int32_t iRow = 0; iRow < iHeight; ++iRow)
+	{
+		for (int32_t iCol = 0; iCol < 16; ++iCol)
+		{
+			pDst[iCol] = WELS_CLIP1((fpVerFilter(pSrc + iCol, iSrcStride) + 16) >> 5);
+		}
+		pSrc += iSrcStride;
+		pDst += iDstStride;
+	}
+}
+//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
+// 16-pixel-wide version. Per row: run fpVerFilter on 16+5 columns starting at pSrc-2, then run
+// fpHorFilterInput16Bits across those intermediates, round (+512 >> 10) and clip.
+static inline void McHorVer22WidthEq16_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	int16_t iVerTaps[16+5] = {0};	// 16 outputs need 5 extra columns of vertical results
+
+	for (int32_t iRow = 0; iRow < iHeight; ++iRow)
+	{
+		for (int32_t iTap = 0; iTap < 16 + 5; ++iTap)
+		{
+			iVerTaps[iTap] = fpVerFilter(pSrc - 2 + iTap, iSrcStride);
+		}
+		for (int32_t iCol = 0; iCol < 16; ++iCol)
+		{
+			pDst[iCol] = WELS_CLIP1((fpHorFilterInput16Bits(&iVerTaps[2 + iCol]) + 512) >> 10);
+		}
+		pDst += iDstStride;
+		pSrc += iSrcStride;
+	}
+}
+
+/////////////////////luma MC////////////////////////// 
+
+// Quarter-sample position x=0, y=1: averages the full-sample block at pSrc with the block
+// produced by pfMcHorVer02WidthEq16 (vertical half-sample filter).
+static inline void McHorVer01WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 256, 16)
+	uint8_t* const kpVerHalf = pTmp;	// scratch block, 16 bytes per row
+
+	pfMcHorVer02WidthEq16(pSrc, iSrcStride, kpVerHalf, 16, iHeight);
+	pfPixelAvgWidthEq16(pDst, iDstStride, pSrc, iSrcStride, kpVerHalf, 16, iHeight);
+}
+// Quarter-sample position x=0, y=3: averages the full-sample block one row below pSrc with
+// the block produced by pfMcHorVer02WidthEq16 (vertical half-sample filter).
+static inline void McHorVer03WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 256, 16)
+	uint8_t* const kpVerHalf = pTmp;	// scratch block, 16 bytes per row
+
+	pfMcHorVer02WidthEq16(pSrc, iSrcStride, kpVerHalf, 16, iHeight);
+	pfPixelAvgWidthEq16(pDst, iDstStride, pSrc + iSrcStride, iSrcStride, kpVerHalf, 16, iHeight);
+}
+// Quarter-sample position x=1, y=0: averages the full-sample block at pSrc with the block
+// produced by pfMcHorVer20WidthEq16 (horizontal half-sample filter).
+static inline void McHorVer10WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 256, 16)
+	uint8_t* const kpHorHalf = pTmp;	// scratch block, 16 bytes per row
+
+	pfMcHorVer20WidthEq16(pSrc, iSrcStride, kpHorHalf, 16, iHeight);
+	pfPixelAvgWidthEq16(pDst, iDstStride, pSrc, iSrcStride, kpHorHalf, 16, iHeight);
+}
+// Quarter-sample position x=1, y=1: averages the horizontal half-sample block at pSrc with
+// the vertical half-sample block at pSrc.
+static inline void McHorVer11WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+	uint8_t* const kpHorHalf = pTmp;		// first 256-byte scratch block
+	uint8_t* const kpVerHalf = pTmp + 256;	// second 256-byte scratch block
+
+	pfMcHorVer20WidthEq16(pSrc, iSrcStride, kpHorHalf, 16, iHeight);
+	pfMcHorVer02WidthEq16(pSrc, iSrcStride, kpVerHalf, 16, iHeight);
+	pfPixelAvgWidthEq16(pDst, iDstStride, kpHorHalf, 16, kpVerHalf, 16, iHeight);
+}
+// Quarter-sample position x=1, y=2: averages the vertical half-sample block at pSrc with
+// the centre (2, 2) half-sample block at pSrc.
+static inline void McHorVer12WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+	uint8_t* const kpVerHalf = pTmp;		// first 256-byte scratch block
+	uint8_t* const kpCenHalf = pTmp + 256;	// second 256-byte scratch block
+
+	pfMcHorVer02WidthEq16(pSrc, iSrcStride, kpVerHalf, 16, iHeight);
+	pfMcHorVer22WidthEq16(pSrc, iSrcStride, kpCenHalf, 16, iHeight);
+	pfPixelAvgWidthEq16(pDst, iDstStride, kpVerHalf, 16, kpCenHalf, 16, iHeight);
+}
+// Quarter-sample position x=1, y=3: averages the horizontal half-sample block one row below
+// pSrc with the vertical half-sample block at pSrc.
+static inline void McHorVer13WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+	uint8_t* const kpHorHalf = pTmp;		// first 256-byte scratch block
+	uint8_t* const kpVerHalf = pTmp + 256;	// second 256-byte scratch block
+
+	pfMcHorVer20WidthEq16(pSrc + iSrcStride, iSrcStride, kpHorHalf, 16, iHeight);
+	pfMcHorVer02WidthEq16(pSrc, iSrcStride, kpVerHalf, 16, iHeight);
+	pfPixelAvgWidthEq16(pDst, iDstStride, kpHorHalf, 16, kpVerHalf, 16, iHeight);
+}
+// Quarter-sample position x=2, y=1: averages the horizontal half-sample block at pSrc with
+// the centre (2, 2) half-sample block at pSrc.
+static inline void McHorVer21WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+	uint8_t* const kpHorHalf = pTmp;		// first 256-byte scratch block
+	uint8_t* const kpCenHalf = pTmp + 256;	// second 256-byte scratch block
+
+	pfMcHorVer20WidthEq16(pSrc, iSrcStride, kpHorHalf, 16, iHeight);
+	pfMcHorVer22WidthEq16(pSrc, iSrcStride, kpCenHalf, 16, iHeight);
+	pfPixelAvgWidthEq16(pDst, iDstStride, kpHorHalf, 16, kpCenHalf, 16, iHeight);
+}
+// Quarter-sample position x=2, y=3: averages the horizontal half-sample block one row below
+// pSrc with the centre (2, 2) half-sample block at pSrc.
+static inline void McHorVer23WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+	uint8_t* const kpHorHalf = pTmp;		// first 256-byte scratch block
+	uint8_t* const kpCenHalf = pTmp + 256;	// second 256-byte scratch block
+
+	pfMcHorVer20WidthEq16(pSrc + iSrcStride, iSrcStride, kpHorHalf, 16, iHeight);
+	pfMcHorVer22WidthEq16(pSrc, iSrcStride, kpCenHalf, 16, iHeight);
+	pfPixelAvgWidthEq16(pDst, iDstStride, kpHorHalf, 16, kpCenHalf, 16, iHeight);
+}
+// Quarter-sample position x=3, y=0: averages the full-sample block one column right of pSrc
+// with the block produced by pfMcHorVer20WidthEq16 (horizontal half-sample filter).
+static inline void McHorVer30WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 256, 16)
+	uint8_t* const kpHorHalf = pTmp;	// scratch block, 16 bytes per row
+
+	pfMcHorVer20WidthEq16(pSrc, iSrcStride, kpHorHalf, 16, iHeight);
+	pfPixelAvgWidthEq16(pDst, iDstStride, pSrc + 1, iSrcStride, kpHorHalf, 16, iHeight);
+}
+// Quarter-sample position x=3, y=1: averages the horizontal half-sample block at pSrc with
+// the vertical half-sample block one column right of pSrc.
+static inline void McHorVer31WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+	uint8_t* const kpHorHalf = pTmp;		// first 256-byte scratch block
+	uint8_t* const kpVerHalf = pTmp + 256;	// second 256-byte scratch block
+
+	pfMcHorVer20WidthEq16(pSrc, iSrcStride, kpHorHalf, 16, iHeight);
+	pfMcHorVer02WidthEq16(pSrc + 1, iSrcStride, kpVerHalf, 16, iHeight);
+	pfPixelAvgWidthEq16(pDst, iDstStride, kpHorHalf, 16, kpVerHalf, 16, iHeight);
+}
+// Quarter-sample position x=3, y=2: averages the vertical half-sample block one column right
+// of pSrc with the centre (2, 2) half-sample block at pSrc.
+static inline void McHorVer32WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+	uint8_t* const kpVerHalf = pTmp;		// first 256-byte scratch block
+	uint8_t* const kpCenHalf = pTmp + 256;	// second 256-byte scratch block
+
+	pfMcHorVer02WidthEq16(pSrc + 1, iSrcStride, kpVerHalf, 16, iHeight);
+	pfMcHorVer22WidthEq16(pSrc, iSrcStride, kpCenHalf, 16, iHeight);
+	pfPixelAvgWidthEq16(pDst, iDstStride, kpVerHalf, 16, kpCenHalf, 16, iHeight);
+}
+// Quarter-sample position x=3, y=3: averages the horizontal half-sample block one row below
+// pSrc with the vertical half-sample block one column right of pSrc.
+static inline void McHorVer33WidthEq16(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight)
+{
+	ENFORCE_STACK_ALIGN_1D(uint8_t, pTmp, 512, 16)
+	uint8_t* const kpHorHalf = pTmp;		// first 256-byte scratch block
+	uint8_t* const kpVerHalf = pTmp + 256;	// second 256-byte scratch block
+
+	pfMcHorVer20WidthEq16(pSrc + iSrcStride, iSrcStride, kpHorHalf, 16, iHeight);
+	pfMcHorVer02WidthEq16(pSrc + 1, iSrcStride, kpVerHalf, 16, iHeight);
+	pfPixelAvgWidthEq16(pDst, iDstStride, kpHorHalf, 16, kpVerHalf, 16, iHeight);
+}
+
+//horizontal filter to gain half sample, that is (2, 0) location in quarter sample
+// Variable-width version: every output is the fpHorFilter sum, rounded (+16 >> 5) and clipped.
+static inline void McHorVer20_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	for (int32_t iRow = 0; iRow < iHeight; ++iRow)
+	{
+		for (int32_t iCol = 0; iCol < iWidth; ++iCol)
+		{
+			pDst[iCol] = WELS_CLIP1((fpHorFilter(pSrc + iCol) + 16) >> 5);
+		}
+		pSrc += iSrcStride;
+		pDst += iDstStride;
+	}
+}
+//vertical filter to gain half sample, that is (0, 2) location in quarter sample
+// Variable-width version: every output is the fpVerFilter sum, rounded (+16 >> 5) and clipped.
+static inline void McHorVer02_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	for (int32_t iRow = 0; iRow < iHeight; ++iRow)
+	{
+		for (int32_t iCol = 0; iCol < iWidth; ++iCol)
+		{
+			pDst[iCol] = WELS_CLIP1((fpVerFilter(pSrc + iCol, iSrcStride) + 16) >> 5);
+		}
+		pSrc += iSrcStride;
+		pDst += iDstStride;
+	}
+}
+//horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
+// Variable-width version. Per row: run fpVerFilter on iWidth+5 columns starting at pSrc-2, then
+// run fpHorFilterInput16Bits across those intermediates, round (+512 >> 10) and clip.
+// The intermediate row holds 17+5 entries, so iWidth must not exceed 17.
+static inline void McHorVer22_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	int16_t iVerTaps[17+5] = {0};	// w+1 outputs plus 5 extra columns
+
+	for (int32_t iRow = 0; iRow < iHeight; ++iRow)
+	{
+		for (int32_t iTap = 0; iTap < iWidth + 5; ++iTap)
+		{
+			iVerTaps[iTap] = fpVerFilter(pSrc - 2 + iTap, iSrcStride);
+		}
+		for (int32_t iCol = 0; iCol < iWidth; ++iCol)
+		{
+			pDst[iCol] = WELS_CLIP1((fpHorFilterInput16Bits(&iVerTaps[2 + iCol]) + 512) >> 10);
+		}
+		pDst += iDstStride;
+		pSrc += iSrcStride;
+	}
+}
+// Copies an iWidth x iHeight block. Widths 16, 8 and 4 go through the matching McCopyWidthEq*
+// dispatch pointer when it has been set; every other case falls back to a row-by-row memcpy.
+static inline void McCopy(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight)
+{
+	if (16 == iWidth && NULL != McCopyWidthEq16)
+	{
+		McCopyWidthEq16(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		return;
+	}
+	if (8 == iWidth && NULL != McCopyWidthEq8)
+	{
+		McCopyWidthEq8(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		return;
+	}
+	if (4 == iWidth && NULL != McCopyWidthEq4)
+	{
+		McCopyWidthEq4(pSrc, iSrcStride, pDst, iDstStride, iHeight);
+		return;
+	}
+
+	for (int32_t iRow = 0; iRow < iHeight; ++iRow)
+	{
+		memcpy(pDst, pSrc, iWidth);	// confirmed_safe_unsafe_usage
+		pSrc += iSrcStride;
+		pDst += iDstStride;
+	}
+}
+
+// Chroma motion compensation in plain C.
+// The low three bits of each mv component select the eighth-sample fraction. A zero fraction
+// is a plain block copy; otherwise every output is the g_kuiABCD-weighted sum of the 2x2
+// neighbourhood starting at the source sample, rounded with +32 and shifted right by 6.
+void McChroma_c(uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+			        SMVUnitXY mv, int32_t iWidth, int32_t iHeight)
+					//the mv offset has already been added to pSrc by the caller
+{
+	const int32_t kiDx = mv.iMvX & 0x07;
+	const int32_t kiDy = mv.iMvY & 0x07;
+
+	if ( 0 == kiDx && 0 == kiDy )
+	{
+		McCopy(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+		return;
+	}
+
+	const uint8_t* kpWeight = g_kuiABCD[kiDy][kiDx];
+	const int32_t kiDA = kpWeight[0];
+	const int32_t kiDB = kpWeight[1];
+	const int32_t kiDC = kpWeight[2];
+	const int32_t kiDD = kpWeight[3];
+
+	uint8_t* pRowTop    = pSrc;
+	uint8_t* pRowBottom = pSrc + iSrcStride;
+
+	for (int32_t iRow = 0; iRow < iHeight; ++iRow)
+	{
+		for (int32_t iCol = 0; iCol < iWidth; ++iCol)
+		{
+			const int32_t kiSum = kiDA * pRowTop[iCol] + kiDB * pRowTop[iCol+1]
+								+ kiDC * pRowBottom[iCol] + kiDD * pRowBottom[iCol+1];
+			pDst[iCol] = (kiSum + 32) >> 6;
+		}
+		pDst += iDstStride;
+		pRowTop = pRowBottom;
+		pRowBottom += iSrcStride;
+	}
+}
+//***************************************************************************//
+//                       MMXEXT and SSE2 implementation                      //
+//***************************************************************************//
+#if defined(X86_ASM)
+
+// Centre (2, 2) half-sample for an 8-pixel-wide block: the first assembly pass fills a
+// 21x8 int16_t scratch array (16 bytes per row) for iHeight+5 rows, the second pass turns
+// it into iHeight rows of 8 output pixels.
+static inline void McHorVer22WidthEq8_sse2 ( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
+{
+	ENFORCE_STACK_ALIGN_2D(int16_t, pTap, 21, 8, 16)
+	uint8_t* const kpTapBytes = (uint8_t *)pTap;
+
+	McHorVer22Width8HorFirst_sse2(pSrc - 2, iSrcStride, kpTapBytes, 16, iHeight + 5);
+	McHorVer22VerLastAlign_sse2(kpTapBytes, 16, pDst, iDstStride, 8, iHeight);
+}
+
+//2010.2.5
+
+// 16-pixel-wide vertical half-sample filter, built from two 8-pixel-wide calls
+// (columns 0..7, then columns 8..15).
+static inline void McHorVer02WidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *PDst, int32_t iDstStride, int32_t iHeight )
+{
+	McHorVer02WidthEq8_sse2( pSrc, iSrcStride, PDst, iDstStride, iHeight );
+	McHorVer02WidthEq8_sse2( pSrc + 8, iSrcStride, PDst + 8, iDstStride, iHeight );
+}
+// 16-pixel-wide centre (2, 2) half-sample filter, built from two 8-pixel-wide calls
+// (columns 0..7, then columns 8..15).
+static inline void McHorVer22WidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
+{
+	McHorVer22WidthEq8_sse2( pSrc, iSrcStride, pDst, iDstStride, iHeight );
+	McHorVer22WidthEq8_sse2( pSrc + 8, iSrcStride, pDst + 8, iDstStride, iHeight );
+}
+// Variable-width centre (2, 2) half-sample filter using the SSE2 assembly helpers.
+// The first pass fills a 22x24 int16_t scratch array (48 bytes per row) for iHeight+5 rows.
+// The output is then produced by two passes over that array: an aligned one that is given
+// width iWidth-1, and an unaligned one over the last 8 columns (offset by 2 bytes per column).
+void McHorVer22_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iWidth,int32_t iHeight )
+{
+	ENFORCE_STACK_ALIGN_2D(int16_t, pTap, 22, 24, 16)
+	uint8_t* const kpTapBytes = (uint8_t *)pTap;
+	const int32_t kiLastColsByteOffset = 2 * (iWidth - 8);
+
+	McHorVer22HorFirst_sse2(pSrc - 2, iSrcStride, kpTapBytes, 48, iWidth, iHeight + 5);
+	McHorVer22VerLastAlign_sse2(kpTapBytes, 48, pDst, iDstStride, iWidth - 1, iHeight);
+	McHorVer22VerLastUnAlign_sse2(kpTapBytes + kiLastColsByteOffset, 48, pDst + iWidth - 8, iDstStride, 8, iHeight);
+}
+
+typedef void (*McChromaWidthEqx)(uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, const uint8_t *pABCD,int32_t iHeigh);
+// Chroma motion compensation using the MMX/SSE2 assembly kernels.
+// A zero eighth-sample fraction is a plain copy. Otherwise iWidth >> 3 picks the kernel:
+// index 0 is the 4-wide MMX routine, index 1 the 8-wide SSE2 routine. The table has only
+// these two entries, so the width must be below 16.
+void McChroma_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
+					SMVUnitXY sMv, int32_t iWidth, int32_t iHeight )
+{
+	static const McChromaWidthEqx kpfFuncs[2] =
+	{
+		McChromaWidthEq4_mmx,
+		McChromaWidthEq8_sse2
+	};
+	const int32_t kiD8x = sMv.iMvX & 0x07;
+	const int32_t kiD8y = sMv.iMvY & 0x07;
+
+	if ((kiD8x | kiD8y) == 0)
+	{
+		McCopy(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+		return;
+	}
+
+	kpfFuncs[iWidth >> 3](pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
+}
+
+// Chroma motion compensation using the MMX/SSSE3 assembly kernels.
+// A zero eighth-sample fraction is a plain copy. Otherwise iWidth >> 3 picks the kernel:
+// index 0 is the 4-wide MMX routine, index 1 the 8-wide SSSE3 routine. The table has only
+// these two entries, so the width must be below 16.
+void McChroma_ssse3( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
+					 SMVUnitXY sMv, int32_t iWidth, int32_t iHeight )
+{
+	static const McChromaWidthEqx kpfFuncs[2] =
+	{
+		McChromaWidthEq4_mmx,
+		McChromaWidthEq8_ssse3
+	};
+	const int32_t kiD8x = sMv.iMvX & 0x07;
+	const int32_t kiD8y = sMv.iMvY & 0x07;
+
+	if ((kiD8x | kiD8y) == 0)
+	{
+		McCopy(pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
+		return;
+	}
+
+	kpfFuncs[iWidth >> 3](pSrc, iSrcStride, pDst, iDstStride, g_kuiABCD[kiD8y][kiD8x], iHeight);
+}
+
+#endif //X86_ASM
+typedef void (*PixelAvgFunc) ( uint8_t *, int32_t, uint8_t *, int32_t, uint8_t *, int32_t, int32_t );
+/*!
+ * \brief	Install the motion-compensation routines into pFuncList and into this file's
+ *			internal dispatch pointers.
+ *
+ * The plain C implementations are installed first. When built with X86_ASM, the entries
+ * are then overridden with the SSE2 and SSSE3 versions according to uiCpuFlag.
+ *
+ * \param	pFuncList	function pointer list whose sMcFuncs member is filled in
+ * \param	uiCpuFlag	bit mask tested against WELS_CPU_SSE2 and WELS_CPU_SSSE3
+ *
+ * \note	The tables handed to pFuncList and the dispatch pointers of this file have static
+ *			storage, so the most recent call decides what every user of them sees.
+ */
+void WelsInitMcFuncs( SWelsFuncPtrList *pFuncList, uint32_t uiCpuFlag )
+{
+	static PixelAvgFunc pfPixAvgFunc[2] ={PixelAvgWidthEq8_c,PixelAvgWidthEq16_c};
+
+	static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16[16] =  //[y*4+x]   
+	{
+		McCopyWidthEq16_c,  McHorVer10WidthEq16, McHorVer20WidthEq16_c,     McHorVer30WidthEq16,   
+		McHorVer01WidthEq16, McHorVer11WidthEq16, McHorVer21WidthEq16, McHorVer31WidthEq16, 
+		McHorVer02WidthEq16_c,     McHorVer12WidthEq16, McHorVer22WidthEq16_c,    McHorVer32WidthEq16,     
+		McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16
+	};
+#if defined (X86_ASM)
+	static PWelsLumaQuarpelMcFunc pWelsMcFuncWidthEq16_sse2[16] = 
+	{
+		McCopyWidthEq16_sse2,  McHorVer10WidthEq16, McHorVer20WidthEq16_sse2,     McHorVer30WidthEq16,   
+		McHorVer01WidthEq16, McHorVer11WidthEq16, McHorVer21WidthEq16, McHorVer31WidthEq16, 
+		McHorVer02WidthEq16_sse2,     McHorVer12WidthEq16, McHorVer22WidthEq16_sse2,    McHorVer32WidthEq16,     
+		McHorVer03WidthEq16, McHorVer13WidthEq16, McHorVer23WidthEq16, McHorVer33WidthEq16
+	};
+#endif
+
+	// pfPixAvgFunc is static and is patched in place by the SIMD branches below. Restore the
+	// C entries on every call, otherwise a later call with fewer CPU flags would still publish
+	// the SIMD routines written by an earlier call.
+	pfPixAvgFunc[0] = PixelAvgWidthEq8_c;
+	pfPixAvgFunc[1] = PixelAvgWidthEq16_c;
+
+	// Defaults: plain C implementations.
+	pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_c;
+	pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_c;
+	pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22_c;
+	pFuncList->sMcFuncs.pfSampleAveraging = pfPixAvgFunc;
+	pFuncList->sMcFuncs.pfChromaMc	= McChroma_c;
+	fpVerFilter				= VerFilter_c;	
+	fpHorFilter				= HorFilter_c;
+	fpHorFilterInput16Bits			= HorFilterInput16bit1_c;
+	McCopyWidthEq4 = McCopyWidthEq4_c;
+	McCopyWidthEq8 = McCopyWidthEq8_c;
+	McCopyWidthEq16 = McCopyWidthEq16_c;
+	pfPixelAvgWidthEq16 = PixelAvgWidthEq16_c;
+	pfMcHorVer02WidthEq16 = McHorVer02WidthEq16_c;
+	pfMcHorVer20WidthEq16 = McHorVer20WidthEq16_c;
+	pfMcHorVer22WidthEq16 = McHorVer22WidthEq16_c;
+	pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16;
+#if defined (X86_ASM)
+	if ( uiCpuFlag & WELS_CPU_SSE2 )
+	{
+		pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_sse2;
+		pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_sse2;
+		pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22_sse2;
+		// These two stores write into the static pfPixAvgFunc table set above.
+		pFuncList->sMcFuncs.pfSampleAveraging[0] = PixelAvgWidthEq8_mmx;
+		pFuncList->sMcFuncs.pfSampleAveraging[1] = PixelAvgWidthEq16_sse2;
+		pFuncList->sMcFuncs.pfChromaMc = McChroma_sse2;
+		McCopyWidthEq4 = McCopyWidthEq4_mmx;
+		McCopyWidthEq8 = McCopyWidthEq8_mmx;
+		McCopyWidthEq16 = McCopyWidthEq16_sse2;
+		pfPixelAvgWidthEq16 = PixelAvgWidthEq16_sse2;
+		pfMcHorVer02WidthEq16 = McHorVer02WidthEq16_sse2;
+		pfMcHorVer20WidthEq16 = McHorVer20WidthEq16_sse2;
+		pfMcHorVer22WidthEq16 = McHorVer22WidthEq16_sse2;		
+		pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16_sse2;
+	}
+
+	if ( uiCpuFlag & WELS_CPU_SSSE3 )
+	{
+		pFuncList->sMcFuncs.pfChromaMc = McChroma_ssse3;
+		pFuncList->sMcFuncs.pfSampleAveraging[1] = PixelAvgWidthEq16_ssse3;
+	}
+
+#endif //(X86_ASM)
+}
+}
--- /dev/null
+++ b/codec/encoder/core/src/md.cpp
@@ -1,0 +1,1034 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	md.cpp
+ *
+ * \brief	mode decision
+ *
+ * \date	2009.05.14 Created
+ *
+ *************************************************************************************
+ */
+
+#include <string.h>
+#include "ls_defines.h"
+#include "encoder_context.h"
+#include "svc_enc_slice_segment.h"
+#include "md.h"
+#include "mc.h"
+#include "mv_pred.h"
+#include "cpu_core.h"
+#include "svc_enc_golomb.h"
+#include "sample.h"
+#include "array_stack_align.h"
+
+namespace WelsSVCEnc {
+// SAD thresholds. NOTE(review): their users lie outside this part of the file; confirm the
+// exact meaning and units where they are referenced.
+#define INTRA_VARIANCE_SAD_THRESHOLD 150
+#define INTER_VARIANCE_SAD_THRESHOLD 20
+
+//fill cache of neighbor MB, containing pNonZeroCount, sample_avail, pIntra4x4PredMode
+// Copies the left and top neighbours' non-zero coefficient counts and intra 4x4 prediction
+// modes into the border entries of pMbCache, and records the available neighbours in
+// pMbCache->uiNeighborIntra.
+//   pMbCache : cache to fill (iNonZeroCoeffCount, iIntraPredMode, uiNeighborIntra are written)
+//   pCurMb   : current macroblock; its uiNeighborAvail bits select which branches run
+//   iMbWidth : number of SMB entries to step back to reach the macroblock above pCurMb
+void FillNeighborCacheIntra(SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth)
+{
+	uint32_t uiNeighborAvail = pCurMb->uiNeighborAvail;
+	uint32_t uiNeighborIntra = 0;
+
+	if (uiNeighborAvail & LEFT_MB_POS) //LEFT MB
+	{
+		// the left MB's counts are read MB_LUMA_CHROMA_BLOCK4x4_NUM entries before the current MB's
+		int8_t* pLeftMbNonZeroCount = pCurMb->pNonZeroCount - MB_LUMA_CHROMA_BLOCK4x4_NUM;
+		// entries 3, 7, 11, 15 of the left MB go to cache slots 8, 16, 24, 32
+		pMbCache->iNonZeroCoeffCount[8] = pLeftMbNonZeroCount[ 3];
+		pMbCache->iNonZeroCoeffCount[16] = pLeftMbNonZeroCount[ 7];
+		pMbCache->iNonZeroCoeffCount[24] = pLeftMbNonZeroCount[11];
+		pMbCache->iNonZeroCoeffCount[32] = pLeftMbNonZeroCount[15];
+
+		// entries 17, 21, 19, 23 go to cache slots 13, 21, 37, 45
+		// NOTE(review): presumably the chroma blocks bordering the current MB; confirm against the cache layout
+		pMbCache->iNonZeroCoeffCount[ 13] = pLeftMbNonZeroCount[17]; 
+		pMbCache->iNonZeroCoeffCount[21] = pLeftMbNonZeroCount[21];
+		pMbCache->iNonZeroCoeffCount[37] = pLeftMbNonZeroCount[19]; 
+		pMbCache->iNonZeroCoeffCount[45] = pLeftMbNonZeroCount[23];
+
+        uiNeighborIntra |= LEFT_MB_POS;
+
+		if ( IS_INTRA4x4((pCurMb-1)->uiMbType ) ) 
+		{
+			// the left MB's modes are read INTRA_4x4_MODE_NUM entries before the current MB's
+			int8_t* pLeftMbIntra4x4PredMode = pCurMb->pIntra4x4PredMode - INTRA_4x4_MODE_NUM;
+			pMbCache->iIntraPredMode[8] = pLeftMbIntra4x4PredMode[4];
+			pMbCache->iIntraPredMode[16] = pLeftMbIntra4x4PredMode[5];
+			pMbCache->iIntraPredMode[24] = pLeftMbIntra4x4PredMode[6];
+			pMbCache->iIntraPredMode[32] = pLeftMbIntra4x4PredMode[3];
+		}
+		else// if ( 0 == constrained_intra_pred_flag || IS_INTRA16x16((pCurMb-1)->uiMbType )) 
+		{
+			// left MB is available but not intra 4x4: its modes count as DC
+			pMbCache->iIntraPredMode[8] = 
+			pMbCache->iIntraPredMode[16] = 
+			pMbCache->iIntraPredMode[24] = 
+			pMbCache->iIntraPredMode[32] = 2; //DC		
+		}
+	}
+	else
+	{
+		// no left MB: mark every left-border entry with -1
+		pMbCache->iNonZeroCoeffCount[ 8] = 
+		pMbCache->iNonZeroCoeffCount[16] = 
+		pMbCache->iNonZeroCoeffCount[24] =
+		pMbCache->iNonZeroCoeffCount[32] = -1;//unavailable
+		pMbCache->iNonZeroCoeffCount[13] = 
+		pMbCache->iNonZeroCoeffCount[21] =
+		pMbCache->iNonZeroCoeffCount[37] =
+		pMbCache->iNonZeroCoeffCount[45] = -1;//unavailable
+
+		pMbCache->iIntraPredMode[8] = 
+		pMbCache->iIntraPredMode[16] = 
+		pMbCache->iIntraPredMode[24] = 
+		pMbCache->iIntraPredMode[32] = -1;//unavailable
+	}
+
+	if (uiNeighborAvail & TOP_MB_POS)//TOP MB
+	{
+		SMB* pTopMb = pCurMb - iMbWidth;
+		// four counts starting at entry 12 of the top MB go to cache slots 1..4 in one 32-bit copy
+		ST32(&pMbCache->iNonZeroCoeffCount[1], LD32(&pTopMb->pNonZeroCount[12]));
+
+		// two counts each from entries 20 and 22 go to cache slots 6..7 and 30..31
+		ST16(&pMbCache->iNonZeroCoeffCount[6], LD16(&pTopMb->pNonZeroCount[20]));
+		ST16(&pMbCache->iNonZeroCoeffCount[30], LD16(&pTopMb->pNonZeroCount[22]));
+		
+        uiNeighborIntra |= TOP_MB_POS;
+
+		if ( IS_INTRA4x4( pTopMb->uiMbType ) ) 
+		{
+			// the first four stored modes of the top MB go to cache slots 1..4
+			ST32(pMbCache->iIntraPredMode+1, LD32(&pTopMb->pIntra4x4PredMode[0]));
+		}
+		else// if ( 0 == constrained_intra_pred_flag || IS_INTRA16x16( pTopMb->uiMbType )) 
+		{
+			// top MB is available but not intra 4x4: four DC (2) modes in one store
+			const uint32_t kuiDc32 = 0x02020202;
+			ST32( pMbCache->iIntraPredMode+1 , kuiDc32 );
+		}
+	}
+	else
+	{
+		// no top MB: every top-border byte becomes 0xff, i.e. -1 in the int8_t entries
+		const uint32_t kuiUnavail32 = 0xffffffff;
+		ST32( pMbCache->iIntraPredMode+1 , kuiUnavail32 );
+		ST32( &pMbCache->iNonZeroCoeffCount[1], kuiUnavail32 );
+
+		ST16( &pMbCache->iNonZeroCoeffCount[6], 0xffff );
+		ST16( &pMbCache->iNonZeroCoeffCount[30], 0xffff );
+	}
+
+	// NOTE(review): 0x04 and 0x08 are written as literals here; presumably they equal
+	// TOPLEFT_MB_POS and TOPRIGHT_MB_POS - confirm against the definitions.
+	if (uiNeighborAvail & TOPLEFT_MB_POS)
+	{
+        uiNeighborIntra |= 0x04;
+	}
+
+	
+	if (uiNeighborAvail & TOPRIGHT_MB_POS)
+    {
+        uiNeighborIntra |= 0x08;
+	}
+	pMbCache->uiNeighborIntra = uiNeighborIntra;
+}
+//fill cache of neighbor MB, containing motion_vector and uiRefIndex
+// For each of the left, top, top-left and top-right neighbours of pCurMb, copies motion
+// vectors, reference indices and SAD cost into pMbCache when the neighbour is available and
+// IS_SVC_INTER; otherwise the entries are zeroed and the reference index is set to
+// REF_NOT_IN_LIST (neighbour available) or REF_NOT_AVAIL (neighbour unavailable).
+//   pMbCache     : cache to fill (sMvComponents, iSadCost, bMbTypeSkip, iSadCostSkip)
+//   pCurMb       : current macroblock; neighbours are addressed relative to it
+//   iMbWidth     : number of SMB entries to step back to reach the macroblock above pCurMb
+//   pVaaBgMbFlag : not used in this variant
+void FillNeighborCacheInterWithoutBGD(SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth, int8_t *pVaaBgMbFlag)
+{	
+	uint32_t uiNeighborAvail = pCurMb->uiNeighborAvail;
+	SMB* pLeftMb = pCurMb -1 ;
+	SMB* pTopMb = pCurMb -iMbWidth;
+	SMB* pLeftTopMb = pCurMb - iMbWidth - 1 ;
+	SMB* iRightTopMb = pCurMb -iMbWidth + 1 ;	// a pointer, despite the 'i' prefix
+	SMVComponentUnit *pMvComp = &pMbCache->sMvComponents;
+	if( (uiNeighborAvail & LEFT_MB_POS) && IS_SVC_INTER(pLeftMb->uiMbType) )	
+	{
+		// left MB: sMv[3], [7], [11], [15] go to cache slots 6, 12, 18, 24
+		pMvComp->sMotionVectorCache[ 6] = pLeftMb->sMv[ 3];
+		pMvComp->sMotionVectorCache[12] = pLeftMb->sMv[ 7];
+		pMvComp->sMotionVectorCache[18] = pLeftMb->sMv[11];
+		pMvComp->sMotionVectorCache[24] = pLeftMb->sMv[15];
+		// pRefIndex[1] feeds the upper two slots, pRefIndex[3] the lower two
+		pMvComp->iRefIndexCache[ 6] = pLeftMb->pRefIndex[1];
+		pMvComp->iRefIndexCache[12] = pLeftMb->pRefIndex[1];
+		pMvComp->iRefIndexCache[18] = pLeftMb->pRefIndex[3];			
+		pMvComp->iRefIndexCache[24] = pLeftMb->pRefIndex[3];			
+		pMbCache->iSadCost[3] = pLeftMb->pSadCost[0];
+
+		if ( pLeftMb->uiMbType == MB_TYPE_SKIP )
+		{
+			pMbCache->bMbTypeSkip[3] = 1;
+			// NOTE(review): pEncSad[-1] is presumably the left MB's entry - confirm
+			pMbCache->iSadCostSkip[3] = pMbCache->pEncSad[-1];
+		}
+		else
+		{
+			pMbCache->bMbTypeSkip[3] = 0;
+			pMbCache->iSadCostSkip[3] = 0;
+		}
+	}
+	else //unavailable or non-inter
+	{
+		ST32(&pMvComp->sMotionVectorCache[ 6], 0);
+		ST32(&pMvComp->sMotionVectorCache[12], 0);
+		ST32(&pMvComp->sMotionVectorCache[18], 0);
+		ST32(&pMvComp->sMotionVectorCache[24], 0);
+		pMvComp->iRefIndexCache[ 6] =
+			pMvComp->iRefIndexCache[12] =
+			pMvComp->iRefIndexCache[18] =		
+			pMvComp->iRefIndexCache[24] = (uiNeighborAvail & LEFT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;			
+		pMbCache->iSadCost[3] = 0;
+		pMbCache->bMbTypeSkip[3] = 0;
+		pMbCache->iSadCostSkip[3] = 0;
+	}
+
+	if ( (uiNeighborAvail & TOP_MB_POS) && IS_SVC_INTER(pTopMb->uiMbType) ) //TOP MB	
+	{
+		// top MB: two 64-bit copies starting at sMv[12] and sMv[14] into cache slots 1 and 3
+		// NOTE(review): presumably each copy covers two 32-bit motion vectors (slots 1..4) - confirm
+		ST64(&pMvComp->sMotionVectorCache[1], LD64(&pTopMb->sMv[12]));
+		ST64(&pMvComp->sMotionVectorCache[3], LD64(&pTopMb->sMv[14]));
+		pMvComp->iRefIndexCache[1] = pTopMb->pRefIndex[2];
+		pMvComp->iRefIndexCache[2] = pTopMb->pRefIndex[2];
+		pMvComp->iRefIndexCache[3] = pTopMb->pRefIndex[3];
+		pMvComp->iRefIndexCache[4] = pTopMb->pRefIndex[3];
+		pMbCache->iSadCost[1] = pTopMb->pSadCost[0];	
+
+		if ( pTopMb->uiMbType == MB_TYPE_SKIP )
+		{
+			pMbCache->bMbTypeSkip[1] = 1;
+			pMbCache->iSadCostSkip[1] = pMbCache->pEncSad[-iMbWidth];
+		}
+		else
+		{
+			pMbCache->bMbTypeSkip[1] = 0;
+			pMbCache->iSadCostSkip[1] = 0;
+		}			
+	}
+	else //unavailable or non-inter
+	{
+		ST64(&pMvComp->sMotionVectorCache[1], 0);
+		ST64(&pMvComp->sMotionVectorCache[3], 0);
+		pMvComp->iRefIndexCache[1] = 
+			pMvComp->iRefIndexCache[2] = 
+			pMvComp->iRefIndexCache[3] = 
+			pMvComp->iRefIndexCache[4] = (uiNeighborAvail & TOP_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+		pMbCache->iSadCost[1] = 0; 
+
+		pMbCache->bMbTypeSkip[1] = 0;
+		pMbCache->iSadCostSkip[1] = 0;	
+	}
+
+	if ( (uiNeighborAvail & TOPLEFT_MB_POS) && IS_SVC_INTER(pLeftTopMb->uiMbType) ) //LEFT_TOP MB	
+	{
+		// top-left MB: its sMv[15] goes to cache slot 0
+		pMvComp->sMotionVectorCache[0] = pLeftTopMb->sMv[15];
+		pMvComp->iRefIndexCache[0] = pLeftTopMb->pRefIndex[3];		
+		pMbCache->iSadCost[0] = pLeftTopMb->pSadCost[0];
+
+		if ( pLeftTopMb->uiMbType == MB_TYPE_SKIP )
+		{
+			pMbCache->bMbTypeSkip[0] = 1;
+			pMbCache->iSadCostSkip[0] = pMbCache->pEncSad[-iMbWidth-1];
+		}
+		else
+		{
+			pMbCache->bMbTypeSkip[0] = 0;
+			pMbCache->iSadCostSkip[0] = 0;
+		}
+	}
+	else //unavailable or non-inter
+	{
+		ST32(&pMvComp->sMotionVectorCache[0], 0);
+		pMvComp->iRefIndexCache[0] = (uiNeighborAvail & TOPLEFT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+		pMbCache->iSadCost[0] = 0;
+		pMbCache->bMbTypeSkip[0] = 0;
+		pMbCache->iSadCostSkip[0] = 0;
+	}
+
+	if ((uiNeighborAvail & TOPRIGHT_MB_POS) && IS_SVC_INTER(iRightTopMb->uiMbType) ) //RIGHT_TOP MB	
+	{
+		// top-right MB: its sMv[12] goes to cache slot 5
+		pMvComp->sMotionVectorCache[5] = iRightTopMb->sMv[12];
+		pMvComp->iRefIndexCache[5] = iRightTopMb->pRefIndex[2];
+		pMbCache->iSadCost[2] = iRightTopMb->pSadCost[0];	
+
+		if ( iRightTopMb->uiMbType == MB_TYPE_SKIP )
+		{
+			pMbCache->bMbTypeSkip[2] = 1;
+			pMbCache->iSadCostSkip[2] = pMbCache->pEncSad[-iMbWidth+1];
+		}
+		else
+		{
+			pMbCache->bMbTypeSkip[2] = 0;
+			pMbCache->iSadCostSkip[2] = 0;
+		}		
+	}
+	else //unavailable or non-inter
+	{
+		ST32(&pMvComp->sMotionVectorCache[5], 0);
+		pMvComp->iRefIndexCache[5] = (uiNeighborAvail & TOPRIGHT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+		pMbCache->iSadCost[2] = 0;
+		pMbCache->bMbTypeSkip[2] = 0;
+		pMbCache->iSadCostSkip[2] = 0;
+	}
+
+	//right-top 4*4 pBlock unavailable
+	// cache slots 9, 11, 17, 21 and 23 are always zeroed and marked REF_NOT_AVAIL
+	ST32(&pMvComp->sMotionVectorCache[ 9], 0);
+	ST32(&pMvComp->sMotionVectorCache[21], 0);
+	ST32(&pMvComp->sMotionVectorCache[11], 0);
+	ST32(&pMvComp->sMotionVectorCache[17], 0);
+	ST32(&pMvComp->sMotionVectorCache[23], 0);
+	pMvComp->iRefIndexCache[ 9] = 
+	pMvComp->iRefIndexCache[11] =
+	pMvComp->iRefIndexCache[17] =
+	pMvComp->iRefIndexCache[21] = 
+	pMvComp->iRefIndexCache[23] = REF_NOT_AVAIL;
+}
+
+// Fills the inter-prediction neighbour caches of pMbCache for the current MB,
+// taking the background-detection flags into account.
+//
+// For each of the four neighbours (left, top, top-left, top-right) the function
+// copies motion vectors, reference indices and the SAD cost into the cache when
+// the neighbour is flagged in pCurMb->uiNeighborAvail and is an SVC inter MB.
+// Otherwise the MV slots are zeroed and the reference index becomes
+// REF_NOT_IN_LIST (neighbour present but not inter) or REF_NOT_AVAIL
+// (neighbour absent).
+// A neighbour is recorded as "skip" only when its type is MB_TYPE_SKIP and its
+// entry in pVaaBgMbFlag is 0; this flag test is the only difference from
+// FillNeighborCacheInterWithoutBGD.
+//
+// pMbCache      cache to fill (sMvComponents, iSadCost, bMbTypeSkip, iSadCostSkip)
+// pCurMb        current MB; neighbours are addressed relative to this pointer
+// iMbWidth      MB distance between vertically adjacent MBs in the MB array
+// pVaaBgMbFlag  flag pointer indexed with the same relative offsets as pCurMb
+//               (-1, -iMbWidth, -iMbWidth-1, -iMbWidth+1)
+void FillNeighborCacheInterWithBGD(SMbCache* pMbCache, SMB* pCurMb, int32_t iMbWidth, int8_t *pVaaBgMbFlag)
+{	
+	uint32_t uiNeighborAvail = pCurMb->uiNeighborAvail;
+	SMB* pLeftMb = pCurMb -1 ;
+	SMB* pTopMb = pCurMb -iMbWidth;
+	SMB* pLeftTopMb = pCurMb - iMbWidth - 1 ;
+	SMB* iRightTopMb = pCurMb -iMbWidth + 1 ;
+	SMVComponentUnit *pMvComp = &pMbCache->sMvComponents;
+
+	// LEFT MB: cache slots 6/12/18/24 take the left MB's MVs 3/7/11/15
+	if( (uiNeighborAvail & LEFT_MB_POS) && IS_SVC_INTER(pLeftMb->uiMbType) )	
+	{
+		pMvComp->sMotionVectorCache[ 6] = pLeftMb->sMv[ 3];
+		pMvComp->sMotionVectorCache[12] = pLeftMb->sMv[ 7];
+		pMvComp->sMotionVectorCache[18] = pLeftMb->sMv[11];
+		pMvComp->sMotionVectorCache[24] = pLeftMb->sMv[15];
+		pMvComp->iRefIndexCache[ 6] = pLeftMb->pRefIndex[1];
+		pMvComp->iRefIndexCache[12] = pLeftMb->pRefIndex[1];
+		pMvComp->iRefIndexCache[18] = pLeftMb->pRefIndex[3];			
+		pMvComp->iRefIndexCache[24] = pLeftMb->pRefIndex[3];			
+		pMbCache->iSadCost[3] = pLeftMb->pSadCost[0];
+
+		// only a skip MB whose background flag is 0 contributes a skip SAD
+		if ( pLeftMb->uiMbType == MB_TYPE_SKIP && pVaaBgMbFlag[-1] == 0)
+		{
+			pMbCache->bMbTypeSkip[3] = 1;
+			pMbCache->iSadCostSkip[3] = pMbCache->pEncSad[-1];
+		}
+		else
+		{
+			pMbCache->bMbTypeSkip[3] = 0;
+			pMbCache->iSadCostSkip[3] = 0;
+		}
+	}
+	else //unavailable or non-inter
+	{
+		ST32(&pMvComp->sMotionVectorCache[ 6], 0);
+		ST32(&pMvComp->sMotionVectorCache[12], 0);
+		ST32(&pMvComp->sMotionVectorCache[18], 0);
+		ST32(&pMvComp->sMotionVectorCache[24], 0);
+		pMvComp->iRefIndexCache[ 6] =
+		pMvComp->iRefIndexCache[12] =
+		pMvComp->iRefIndexCache[18] =		
+		pMvComp->iRefIndexCache[24] = (uiNeighborAvail & LEFT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+		pMbCache->iSadCost[3] = 0;
+		pMbCache->bMbTypeSkip[3] = 0;
+		pMbCache->iSadCostSkip[3] = 0;
+	}
+
+	// TOP MB: cache slots 1..4 take the top MB's MVs 12..15, copied as two 64-bit pairs
+	if ( (uiNeighborAvail & TOP_MB_POS) && IS_SVC_INTER(pTopMb->uiMbType) ) //TOP MB	
+	{
+		ST64(&pMvComp->sMotionVectorCache[1], LD64(&pTopMb->sMv[12]));
+		ST64(&pMvComp->sMotionVectorCache[3], LD64(&pTopMb->sMv[14]));
+		pMvComp->iRefIndexCache[1] = pTopMb->pRefIndex[2];
+		pMvComp->iRefIndexCache[2] = pTopMb->pRefIndex[2];
+		pMvComp->iRefIndexCache[3] = pTopMb->pRefIndex[3];
+		pMvComp->iRefIndexCache[4] = pTopMb->pRefIndex[3];
+		pMbCache->iSadCost[1] = pTopMb->pSadCost[0];	
+		if ( pTopMb->uiMbType == MB_TYPE_SKIP  && pVaaBgMbFlag[-iMbWidth] == 0 )
+		{
+			pMbCache->bMbTypeSkip[1] = 1;
+			pMbCache->iSadCostSkip[1] = pMbCache->pEncSad[-iMbWidth];
+		}
+		else
+		{
+			pMbCache->bMbTypeSkip[1] = 0;
+			pMbCache->iSadCostSkip[1] = 0;
+		}				
+	}
+	else //unavailable or non-inter
+	{
+		ST64(&pMvComp->sMotionVectorCache[1], 0);
+		ST64(&pMvComp->sMotionVectorCache[3], 0);
+		pMvComp->iRefIndexCache[1] = 
+			pMvComp->iRefIndexCache[2] = 
+			pMvComp->iRefIndexCache[3] = 
+			pMvComp->iRefIndexCache[4] = (uiNeighborAvail & TOP_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+		pMbCache->iSadCost[1] = 0; 
+		pMbCache->bMbTypeSkip[1] = 0;
+		pMbCache->iSadCostSkip[1] = 0;	
+	}
+
+
+	// TOP-LEFT MB: cache slot 0 takes the top-left MB's MV 15
+	if ( (uiNeighborAvail & TOPLEFT_MB_POS) && IS_SVC_INTER(pLeftTopMb->uiMbType) ) //LEFT_TOP MB	
+	{
+		pMvComp->sMotionVectorCache[0] = pLeftTopMb->sMv[15];
+		pMvComp->iRefIndexCache[0] = pLeftTopMb->pRefIndex[3];		
+		pMbCache->iSadCost[0] = pLeftTopMb->pSadCost[0];
+
+		if ( pLeftTopMb->uiMbType == MB_TYPE_SKIP  && pVaaBgMbFlag[-iMbWidth-1] == 0 )
+		{
+			pMbCache->bMbTypeSkip[0] = 1;
+			pMbCache->iSadCostSkip[0] = pMbCache->pEncSad[-iMbWidth-1];
+		}
+		else
+		{
+			pMbCache->bMbTypeSkip[0] = 0;
+			pMbCache->iSadCostSkip[0] = 0;
+		}
+	}
+	else //unavailable or non-inter
+	{
+		ST32(&pMvComp->sMotionVectorCache[0], 0);
+		pMvComp->iRefIndexCache[0] = (uiNeighborAvail & TOPLEFT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+		pMbCache->iSadCost[0] = 0;
+		pMbCache->bMbTypeSkip[0] = 0;
+		pMbCache->iSadCostSkip[0] = 0;
+	}
+
+	// TOP-RIGHT MB: cache slot 5 takes the top-right MB's MV 12
+	if ((uiNeighborAvail & TOPRIGHT_MB_POS) && IS_SVC_INTER(iRightTopMb->uiMbType) ) //RIGHT_TOP MB	
+	{
+		pMvComp->sMotionVectorCache[5] = iRightTopMb->sMv[12];
+		pMvComp->iRefIndexCache[5] = iRightTopMb->pRefIndex[2];
+		pMbCache->iSadCost[2] = iRightTopMb->pSadCost[0];	
+
+		if ( iRightTopMb->uiMbType == MB_TYPE_SKIP  && pVaaBgMbFlag[-iMbWidth+1] == 0 )
+		{
+			pMbCache->bMbTypeSkip[2] = 1;
+			pMbCache->iSadCostSkip[2] = pMbCache->pEncSad[-iMbWidth+1];
+		}
+		else
+		{
+			pMbCache->bMbTypeSkip[2] = 0;
+			pMbCache->iSadCostSkip[2] = 0;
+		}		
+	}
+	else //unavailable or non-inter
+	{
+		ST32(&pMvComp->sMotionVectorCache[5], 0);
+		pMvComp->iRefIndexCache[5] = (uiNeighborAvail & TOPRIGHT_MB_POS) ? REF_NOT_IN_LIST : REF_NOT_AVAIL;
+		pMbCache->iSadCost[2] = 0;
+		pMbCache->bMbTypeSkip[2] = 0;
+		pMbCache->iSadCostSkip[2] = 0;	
+	}
+
+	//right-top 4*4 pBlock unavailable
+	// slots 9, 11, 17, 21 and 23 are always reset to zero MV / REF_NOT_AVAIL
+	ST32(&pMvComp->sMotionVectorCache[ 9], 0);
+	ST32(&pMvComp->sMotionVectorCache[21], 0);
+	ST32(&pMvComp->sMotionVectorCache[11], 0);
+	ST32(&pMvComp->sMotionVectorCache[17], 0);
+	ST32(&pMvComp->sMotionVectorCache[23], 0);
+	pMvComp->iRefIndexCache[ 9] = 
+		pMvComp->iRefIndexCache[11] =
+		pMvComp->iRefIndexCache[17] =
+		pMvComp->iRefIndexCache[21] = 
+		pMvComp->iRefIndexCache[23] = REF_NOT_AVAIL;
+}
+
+// Selects the neighbour-cache filler: the background-detection aware variant
+// when kiFlag is non-zero, the plain variant otherwise.
+void InitFillNeighborCacheInterFunc( SWelsFuncPtrList *pFuncList, const int32_t kiFlag )
+{
+	if ( kiFlag )
+	{
+		pFuncList->pfFillInterNeighborCache = FillNeighborCacheInterWithBGD;
+	}
+	else
+	{
+		pFuncList->pfFillInterNeighborCache = FillNeighborCacheInterWithoutBGD;
+	}
+}
+
+// Broadcasts one motion vector into the MB's 4x4-block MV entries of pMvBuffer,
+// four entries per loop iteration.
+void UpdateMbMv_c( SMVUnitXY *pMvBuffer, const SMVUnitXY ksMv )
+{
+	for ( int32_t iFirst = 0; iFirst < MB_BLOCK4x4_NUM; iFirst += 4 )
+	{
+		SMVUnitXY *pQuad = pMvBuffer + iFirst;
+
+		pQuad[3] = ksMv;
+		pQuad[2] = ksMv;
+		pQuad[1] = ksMv;
+		pQuad[0] = ksMv;
+	}
+}
+
+
+// Classifies the four 8x8 SADs of a macroblock.
+// Returns 15 when the (coarsely quantised) spread of the four SADs around their
+// mean is below INTER_VARIANCE_SAD_THRESHOLD; otherwise returns a 4-bit mask in
+// which bit 3..0 is set when block 0..3 has a SAD above the mean.
+uint8_t MdInterAnalysisVaaInfo_c( int32_t *pSad8x8 )
+{
+	const int32_t kiSad[4]		= { pSad8x8[0], pSad8x8[1], pSad8x8[2], pSad8x8[3] };
+	const int32_t kiMeanSad		= ( kiSad[0] + kiSad[1] + kiSad[2] + kiSad[3] ) >> 2;
+	const int32_t kiMeanCoarse	= kiMeanSad >> 6;
+	int32_t iSpread				= 0;
+	int32_t iBlk;
+
+	// sum of squared deviations, computed on values scaled down by 64
+	for ( iBlk = 0; iBlk < 4; ++iBlk )
+	{
+		const int32_t kiDelta = ( kiSad[iBlk] >> 6 ) - kiMeanCoarse;
+		iSpread += kiDelta * kiDelta;
+	}
+
+	if ( iSpread < INTER_VARIANCE_SAD_THRESHOLD )
+	{
+		return 15;
+	}
+
+	uint8_t uiMbSign = 0;
+	for ( iBlk = 0; iBlk < 4; ++iBlk )
+	{
+		if ( kiSad[iBlk] > kiMeanSad )
+		{
+			uiMbSign |= ( 0x08 >> iBlk );	// block 0 -> 0x08 ... block 3 -> 0x01
+		}
+	}
+	return uiMbSign;
+}
+
+// Computes a variance-like measure over the sixteen 4x4 block averages of a
+// 16x16 luma area: sum(avg^2) - (sum(avg)^2 >> 4).
+// pDataY points at the top-left sample, kiLineSize is the distance between rows.
+static inline int32_t AnalysisVaaInfoIntra_c( uint8_t *pDataY, const int32_t kiLineSize )
+{
+	ENFORCE_STACK_ALIGN_1D(uint16_t, uiAvgBlock, 16, 16)
+	const int32_t kiBlkRowStep	= kiLineSize << 2;	// four sample rows per block row
+	uint8_t *pBlkRow			= pDataY;
+	int32_t iSumAvg				= 0;
+	int32_t iSumSqr				= 0;
+	int32_t iBlkIdx				= 0;
+
+	for ( int32_t iBlkY = 0; iBlkY < 4; ++iBlkY )
+	{
+		for ( int32_t iBlkX = 0; iBlkX < 4; ++iBlkX, ++iBlkIdx )
+		{
+			const uint8_t *pLine	= pBlkRow + ( iBlkX << 2 );
+			int32_t iBlkSum			= 0;
+
+			// add up the 4x4 samples of this block
+			for ( int32_t iLine = 0; iLine < 4; ++iLine )
+			{
+				iBlkSum += pLine[0] + pLine[1] + pLine[2] + pLine[3];
+				pLine += kiLineSize;
+			}
+			uiAvgBlock[iBlkIdx] = (uint16_t)( iBlkSum >> 4 );
+
+			iSumAvg += uiAvgBlock[iBlkIdx];
+			iSumSqr += uiAvgBlock[iBlkIdx] * uiAvgBlock[iBlkIdx];
+		}
+		pBlkRow += kiBlkRowStep;
+	}
+
+	return /*variance =*/ ( iSumSqr - ( ( iSumAvg * iSumAvg ) >> 4 ) );
+}
+
+// Installs the VAA analysis / MV update function pointers: the C versions first,
+// then, when X86_ASM is defined, whichever SIMD versions kuiCpuFlag allows.
+// Later checks override earlier ones, so the order SSE2 -> SSSE3 -> SSE4.1 matters.
+void InitIntraAnalysisVaaInfo( SWelsFuncPtrList *pFuncList, const uint32_t kuiCpuFlag )
+{
+	// portable C defaults
+	pFuncList->pfUpdateMbMv					= UpdateMbMv_c;
+	pFuncList->pfGetMbSignFromInterVaa		= MdInterAnalysisVaaInfo_c;
+	pFuncList->pfGetVarianceFromIntraVaa	= AnalysisVaaInfoIntra_c;
+
+#if defined(X86_ASM)
+	if ( WELS_CPU_SSE2 == ( kuiCpuFlag & WELS_CPU_SSE2 ) )
+	{
+		pFuncList->pfUpdateMbMv					= UpdateMbMv_sse2;
+		pFuncList->pfGetMbSignFromInterVaa		= MdInterAnalysisVaaInfo_sse2;
+		pFuncList->pfGetVarianceFromIntraVaa	= AnalysisVaaInfoIntra_sse2;
+	}
+	if ( WELS_CPU_SSSE3 == ( kuiCpuFlag & WELS_CPU_SSSE3 ) )
+	{
+		pFuncList->pfGetVarianceFromIntraVaa	= AnalysisVaaInfoIntra_ssse3;
+	}
+	if ( WELS_CPU_SSE41 == ( kuiCpuFlag & WELS_CPU_SSE41 ) )
+	{
+		pFuncList->pfGetMbSignFromInterVaa		= MdInterAnalysisVaaInfo_sse41;
+	}
+#endif//X86_ASM
+}
+
+// Returns whether the intra variance measure of the MB at pEncMb, computed with
+// the luma stride of the current DQ layer, reaches INTRA_VARIANCE_SAD_THRESHOLD.
+BOOL_T MdIntraAnalysisVaaInfo( sWelsEncCtx* pEncCtx, uint8_t* pEncMb )
+{
+	const int32_t kiLumaStride	= pEncCtx->pCurDqLayer->iEncStride[0];
+	const int32_t kiMbVariance	= pEncCtx->pFuncList->pfGetVarianceFromIntraVaa( pEncMb, kiLumaStride );
+
+	return ( kiMbVariance >= INTRA_VARIANCE_SAD_THRESHOLD );
+}
+
+// Carves pBufferInterPredMe into four work areas that start 640 bytes apart
+// (offsets 0, 640, 1280, 1920), each entry pointer advanced by iStride.
+void InitMeRefinePointer(SMeRefinePointer* pMeRefine, SMbCache* pMbCache, int32_t iStride)
+{
+	uint8_t *pArea = &pMbCache->pBufferInterPredMe[0];
+
+	pMeRefine->pHalfPixH	= pArea + iStride;
+	pArea += 640;
+	pMeRefine->pHalfPixV	= pArea + iStride;
+	pArea += 640;
+	pMeRefine->pQuarPixBest	= pArea + iStride;
+	pArea += 640;
+	pMeRefine->pQuarPixTmp	= pArea + iStride;
+}
+// Parameter/result bundle handed from MeRefineFracPixel to MeRefineQuarPixel.
+typedef struct TagQuarParams
+{	
+	int32_t iBestCost;		// in: best cost so far; out: lowered when a quarter-pel candidate wins
+	int32_t iBestHalfPix;	// half-pel decision taken by the caller
+	int32_t iStrideA;		// stride passed with pSrcB[0] and pSrcB[1]
+	int32_t iStrideB;		// stride passed with pSrcB[2] and pSrcB[3]
+	uint8_t * pRef;			// reference block at the integer-pel position
+	uint8_t * pSrcB[4];		// second averaging source per candidate (top, bottom, left, right)
+	uint8_t * pSrcA[4];		// first averaging source per candidate, read with ME_REFINE_BUF_STRIDE
+	int32_t iLms[4];		// MVD cost added to each candidate's distortion
+	int32_t iBestQuarPix;	// out: winning quarter-pel code, ME_NO_BEST_QUAR_PIXEL if none
+}SQuarRefineParams;
+
+// Records iCurCost as the new best cost and swaps the two buffer pointers, so
+// the candidate just evaluated becomes "best" and the old best becomes scratch.
+// Relies on locals of the expanding function: pParams, iCurCost and pTmp.
+#define SWITCH_BEST_TMP_BUF(prev_best, curr_best){\
+	pParams->iBestCost = iCurCost;\
+	pTmp = prev_best;\
+	prev_best = curr_best;\
+	curr_best = pTmp;\
+}
+// Distortion of me_buf against the source block plus the MVD cost lm.
+// Relies on locals of the expanding function: pFunc, kuiPixel, pEncMb and iStrideEnc.
+#define CALC_COST(me_buf, lm) ( pFunc->sSampleDealingFuncs.pfMeCost[kuiPixel](pEncMb, iStrideEnc, me_buf, ME_REFINE_BUF_STRIDE) + lm )
+
+// Evaluates the four quarter-pel candidates (top, bottom, left, right) around
+// the current best position.
+// Each candidate is the average of pParams->pSrcA[i] and pParams->pSrcB[i]
+// written to pMeRefine->pQuarPixTmp; its cost is the distortion against
+// pMe->pEncMb plus pParams->iLms[i]. When a candidate beats pParams->iBestCost
+// the best cost and iBestQuarPix are updated and pQuarPixBest / pQuarPixTmp are
+// swapped, so pQuarPixBest always holds the winning prediction.
+inline void MeRefineQuarPixel( SWelsFuncPtrList *pFunc, SWelsME* pMe, SMeRefinePointer* pMeRefine, const int32_t kiWidth, const int32_t kiHeight,SQuarRefineParams *pParams, int32_t iStrideEnc )
+{
+	PWelsSampleAveragingFunc *pSampleAvg	= pFunc->sMcFuncs.pfSampleAveraging;
+	const int32_t kiAvgIndex		= kiWidth >> 4;	// 0 for widths below 16, 1 for width 16
+	int32_t iCurCost;
+	uint8_t *pEncMb				= pMe->pEncMb;
+	uint8_t *pTmp				= NULL;	// scratch for SWITCH_BEST_TMP_BUF
+	const uint8_t kuiPixel		= pMe->uiPixel;
+	
+	//=========================(0, -1)=======================//
+	pSampleAvg[kiAvgIndex](pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE, pParams->pSrcA[0], ME_REFINE_BUF_STRIDE,pParams->pSrcB[0], pParams->iStrideA, kiHeight);	
+
+	iCurCost = CALC_COST(pMeRefine->pQuarPixTmp,pParams->iLms[0]);
+	if (iCurCost < pParams->iBestCost)
+	{
+		pParams->iBestQuarPix =	ME_QUAR_PIXEL_TOP;
+		SWITCH_BEST_TMP_BUF(pMeRefine->pQuarPixBest,pMeRefine->pQuarPixTmp);
+	}
+	//=========================(0, 1)=======================//
+	pSampleAvg[kiAvgIndex](pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE, pParams->pSrcA[1], 
+		ME_REFINE_BUF_STRIDE,pParams->pSrcB[1], pParams->iStrideA, kiHeight);
+	iCurCost = CALC_COST(pMeRefine->pQuarPixTmp,pParams->iLms[1]);
+	if (iCurCost < pParams->iBestCost)
+	{
+		pParams->iBestQuarPix = ME_QUAR_PIXEL_BOTTOM;
+		SWITCH_BEST_TMP_BUF(pMeRefine->pQuarPixBest,pMeRefine->pQuarPixTmp);
+	}
+	//==========================(-1, 0)=========================//
+	pSampleAvg[kiAvgIndex](pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE,pParams->pSrcA[2], 
+		ME_REFINE_BUF_STRIDE,pParams->pSrcB[2], pParams->iStrideB, kiHeight);	
+	iCurCost = CALC_COST(pMeRefine->pQuarPixTmp,pParams->iLms[2]);
+	if (iCurCost < pParams->iBestCost)
+	{
+		pParams->iBestQuarPix = ME_QUAR_PIXEL_LEFT;
+		SWITCH_BEST_TMP_BUF(pMeRefine->pQuarPixBest,pMeRefine->pQuarPixTmp);
+	}
+	//==========================(1, 0)=========================//
+	pSampleAvg[kiAvgIndex](pMeRefine->pQuarPixTmp, ME_REFINE_BUF_STRIDE,pParams->pSrcA[3], 
+		ME_REFINE_BUF_STRIDE,	pParams->pSrcB[3], pParams->iStrideB,  kiHeight);
+
+	iCurCost = CALC_COST(pMeRefine->pQuarPixTmp,pParams->iLms[3]);
+	if (iCurCost < pParams->iBestCost)
+	{
+		pParams->iBestQuarPix = ME_QUAR_PIXEL_RIGHT;
+		SWITCH_BEST_TMP_BUF(pMeRefine->pQuarPixBest,pMeRefine->pQuarPixTmp);
+	}
+}
+
+// Refines the integer-pel motion vector in pMe to half- and then quarter-pel
+// precision and writes the resulting prediction block to pMemPredInterMb.
+//
+// Flow:
+//   1. cost of the integer position (reuses pMe->uSadPredISatd.uiSatd when both
+//      ME and MD cost functions are SATD);
+//   2. four half-pel candidates (top, bottom, left, right) from the vertical
+//      and horizontal half-pel filters;
+//   3. set up the averaging sources for the quarter-pel search around the best
+//      position so far and run MeRefineQuarPixel;
+//   4. update pMe->sMv / pMe->uiSatdCost and copy the winning block.
+//
+// pEncCtx          encoder context (function table, strides of the current layer)
+// pMemPredInterMb  destination of the prediction, written with stride MB_WIDTH_LUMA
+// pMe              in: integer MV, predictor, source/reference pointers; out: refined MV and cost
+// pMeRefine        work buffers for half- and quarter-pel samples
+// iWidth, iHeight  block size; the copy routine is chosen from these
+void MeRefineFracPixel(sWelsEncCtx* pEncCtx, uint8_t* pMemPredInterMb, SWelsME* pMe, 
+						  SMeRefinePointer* pMeRefine, int32_t iWidth, int32_t iHeight)
+{
+	SWelsFuncPtrList *pFunc= pEncCtx->pFuncList;
+	int16_t iMvx = pMe->sMv.iMvX;
+	int16_t iMvy = pMe->sMv.iMvY;
+
+	// centre of the quarter-pel search; moved by +/-2 when a half-pel candidate wins
+	int16_t iHalfMvx = iMvx;
+	int16_t iHalfMvy = iMvy;
+	const int32_t kiStrideEnc = pEncCtx->pCurDqLayer->iEncStride[0];
+	const int32_t kiStrideRef = pEncCtx->pCurDqLayer->pRefPic->iLineSize[0];
+    
+	uint8_t* pEncData = pMe->pEncMb;
+	uint8_t* pRef = pMe->pRefMb;//091010
+
+	int32_t iBestQuarPix = ME_NO_BEST_QUAR_PIXEL;
+
+	SQuarRefineParams sParams;
+	// Quarter-pel MV deltas indexed by the quarter-pel code: the X delta is
+	// iMvQuarAddX[code], the Y delta is read from the same table shifted by 3.
+	static int32_t iMvQuarAddX[10] = {0,0,-1,1,0,0,0,-1,1,0};
+	int32_t *pMvQuarAddY = iMvQuarAddX + 3;
+	uint8_t* pBestPredInter = pRef;
+	int32_t iInterBlk4Stride = ME_REFINE_BUF_STRIDE;
+
+	int32_t iBestCost;
+	int32_t iCurCost;
+	int32_t iBestHalfPix;
+
+	// cost at the integer-pel position
+	if ((pFunc->sSampleDealingFuncs.pfMeCost == pFunc->sSampleDealingFuncs.pfSampleSatd) && (pFunc->sSampleDealingFuncs.pfMdCost == pFunc->sSampleDealingFuncs.pfSampleSatd))
+	{
+		iBestCost = pMe->uSadPredISatd.uiSatd + COST_MVD(pMe->pMvdCost, iMvx - pMe->sMvp.iMvX, iMvy - pMe->sMvp.iMvY);
+	}
+	else
+	{
+		iBestCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiPixel]( pEncData, kiStrideEnc, pRef, kiStrideRef ) +
+			COST_MVD(pMe->pMvdCost, iMvx - pMe->sMvp.iMvX, iMvy - pMe->sMvp.iMvY);
+	}
+
+	iBestHalfPix = REFINE_ME_NO_BEST_HALF_PIXEL;
+
+	pFunc->sMcFuncs.pfLumaHalfpelVer( pRef-kiStrideRef, kiStrideRef, pMeRefine->pHalfPixV, ME_REFINE_BUF_STRIDE, iWidth, iHeight+1 );
+
+	//step 1: get [iWidth][iHeight+1] half pixel from vertical filter
+	//===========================(0, -2)==============================//
+	iCurCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiPixel](pEncData, kiStrideEnc, pMeRefine->pHalfPixV, ME_REFINE_BUF_STRIDE) +
+		COST_MVD( pMe->pMvdCost, iMvx - pMe->sMvp.iMvX, iMvy - 2 - pMe->sMvp.iMvY );
+	if(iCurCost < iBestCost)
+	{
+		iBestCost = iCurCost;
+		iBestHalfPix = REFINE_ME_HALF_PIXEL_TOP;
+       	pBestPredInter = pMeRefine->pHalfPixV;
+	}
+	//===========================(0, 2)==============================//
+	iCurCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiPixel](pEncData, kiStrideEnc, pMeRefine->pHalfPixV+ME_REFINE_BUF_STRIDE, ME_REFINE_BUF_STRIDE) +
+		COST_MVD( pMe->pMvdCost, iMvx - pMe->sMvp.iMvX, iMvy + 2 - pMe->sMvp.iMvY );
+	if(iCurCost < iBestCost)
+	{
+		iBestCost = iCurCost;
+		iBestHalfPix = REFINE_ME_HALF_PIXEL_BOTTOM;
+       	pBestPredInter = pMeRefine->pHalfPixV+ME_REFINE_BUF_STRIDE;
+	}
+	pFunc->sMcFuncs.pfLumaHalfpelHor( pRef-1, kiStrideRef, pMeRefine->pHalfPixH, ME_REFINE_BUF_STRIDE, iWidth+1, iHeight );
+	//step 2: get [iWidth+1][iHeight] half pixel from horizontal filter
+	
+	//===========================(-2, 0)==============================//
+	iCurCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiPixel](pEncData, kiStrideEnc, pMeRefine->pHalfPixH, ME_REFINE_BUF_STRIDE) +
+		COST_MVD( pMe->pMvdCost, iMvx - 2 - pMe->sMvp.iMvX, iMvy - pMe->sMvp.iMvY );
+	if(iCurCost < iBestCost)
+	{
+		iBestCost = iCurCost;
+		iBestHalfPix = REFINE_ME_HALF_PIXEL_LEFT;
+       	pBestPredInter = pMeRefine->pHalfPixH;
+	}
+	//===========================(2, 0)===============================//
+	iCurCost = pFunc->sSampleDealingFuncs.pfMeCost[pMe->uiPixel](pEncData, kiStrideEnc, pMeRefine->pHalfPixH+1, ME_REFINE_BUF_STRIDE) +
+		COST_MVD( pMe->pMvdCost, iMvx + 2 - pMe->sMvp.iMvX, iMvy - pMe->sMvp.iMvY );
+	if(iCurCost < iBestCost)
+	{
+		iBestCost = iCurCost;
+		iBestHalfPix = REFINE_ME_HALF_PIXEL_RIGHT;
+       	pBestPredInter = pMeRefine->pHalfPixH+1;
+	}
+
+	sParams.iBestCost = iBestCost;
+	sParams.iBestHalfPix = iBestHalfPix;
+	sParams.pRef = pRef;
+	sParams.iBestQuarPix = ME_NO_BEST_QUAR_PIXEL;
+
+	//step 3: if no best half-pixel prediction, try quarter pixel prediction around the integer position
+	//        if yes, must get [X+1][X+1] half-pixel from (2, 2) horizontal and vertical filter
+	if (REFINE_ME_NO_BEST_HALF_PIXEL == iBestHalfPix)
+	{
+		// quarter-pel candidates are averages of the integer block and a half-pel block
+		sParams.iStrideA = kiStrideRef;
+		sParams.iStrideB = kiStrideRef;
+		sParams.pSrcA[0] = pMeRefine->pHalfPixV;
+		sParams.pSrcA[1] = pMeRefine->pHalfPixV+ME_REFINE_BUF_STRIDE;
+		sParams.pSrcA[2] = pMeRefine->pHalfPixH;
+		sParams.pSrcA[3] = pMeRefine->pHalfPixH+1;
+
+		sParams.pSrcB[0] = sParams.pSrcB[1] = sParams.pSrcB[2] = sParams.pSrcB[3] = pRef;
+
+		sParams.iLms[0] = COST_MVD( pMe->pMvdCost, iHalfMvx - pMe->sMvp.iMvX, iHalfMvy - 1 - pMe->sMvp.iMvY ); 
+		sParams.iLms[1] = COST_MVD( pMe->pMvdCost, iHalfMvx - pMe->sMvp.iMvX, iHalfMvy + 1 - pMe->sMvp.iMvY );
+		sParams.iLms[2] = COST_MVD( pMe->pMvdCost, iHalfMvx - 1 - pMe->sMvp.iMvX, iHalfMvy - pMe->sMvp.iMvY );
+		sParams.iLms[3] = COST_MVD( pMe->pMvdCost, iHalfMvx + 1 - pMe->sMvp.iMvX, iHalfMvy - pMe->sMvp.iMvY );
+	}	
+	else //must get [X+1][X+1] half-pixel from (2, 2) horizontal and vertical filter
+	{
+		// In every case the centre (HV) half-pel plane overwrites the half-pel
+		// buffer that did not win, and the search centre moves by two quarter-pel
+		// units towards the winning half-pel position.
+		switch(iBestHalfPix)
+		{
+		case REFINE_ME_HALF_PIXEL_LEFT:
+			{
+                pMeRefine->pHalfPixHV = pMeRefine->pHalfPixV;//reuse pBuffer, here only h&hv
+				pFunc->sMcFuncs.pfLumaHalfpelCen( pRef-1-kiStrideRef, kiStrideRef, pMeRefine->pHalfPixHV,ME_REFINE_BUF_STRIDE,iWidth+1, iHeight+1 );
+				
+				iHalfMvx -= 2;
+				sParams.iStrideA = ME_REFINE_BUF_STRIDE;
+				sParams.iStrideB = kiStrideRef;
+				sParams.pSrcA[0] = pMeRefine->pHalfPixH;
+				sParams.pSrcA[3] = sParams.pSrcA[2] = sParams.pSrcA[1] = sParams.pSrcA[0];
+				sParams.pSrcB[0] = pMeRefine->pHalfPixHV;
+				sParams.pSrcB[1] = pMeRefine->pHalfPixHV+ME_REFINE_BUF_STRIDE;
+				sParams.pSrcB[2] = pRef - 1;
+				sParams.pSrcB[3] = pRef;
+
+			}break;
+		case REFINE_ME_HALF_PIXEL_RIGHT:
+			{
+                pMeRefine->pHalfPixHV = pMeRefine->pHalfPixV;//reuse pBuffer, here only h&hv
+				pFunc->sMcFuncs.pfLumaHalfpelCen( pRef-1-kiStrideRef, kiStrideRef, pMeRefine->pHalfPixHV,ME_REFINE_BUF_STRIDE,iWidth+1, iHeight+1 );
+ 				iHalfMvx += 2;
+				sParams.iStrideA = ME_REFINE_BUF_STRIDE;
+				sParams.iStrideB = kiStrideRef;
+				sParams.pSrcA[0] = pMeRefine->pHalfPixH+1;
+				sParams.pSrcA[3] = sParams.pSrcA[2] = sParams.pSrcA[1] = sParams.pSrcA[0];
+				sParams.pSrcB[0] = pMeRefine->pHalfPixHV+1;
+				sParams.pSrcB[1] = pMeRefine->pHalfPixHV+1+ ME_REFINE_BUF_STRIDE;
+				sParams.pSrcB[2] = pRef;
+				sParams.pSrcB[3] = pRef + 1;
+			}break;
+		case REFINE_ME_HALF_PIXEL_TOP:
+			{
+                pMeRefine->pHalfPixHV = pMeRefine->pHalfPixH;//reuse pBuffer, here only v&hv
+				pFunc->sMcFuncs.pfLumaHalfpelCen( pRef-1-kiStrideRef, kiStrideRef, pMeRefine->pHalfPixHV,ME_REFINE_BUF_STRIDE,iWidth+1, iHeight+1 );
+		
+               	iHalfMvy -= 2;
+				sParams.iStrideA = kiStrideRef;
+				sParams.iStrideB = ME_REFINE_BUF_STRIDE;
+				sParams.pSrcA[0] = pMeRefine->pHalfPixV;				
+				sParams.pSrcA[3] = sParams.pSrcA[2] = sParams.pSrcA[1] = sParams.pSrcA[0];
+				sParams.pSrcB[0] = pRef - kiStrideRef;
+				sParams.pSrcB[1] = pRef;
+				sParams.pSrcB[2] = pMeRefine->pHalfPixHV;
+				sParams.pSrcB[3] = pMeRefine->pHalfPixHV+1;		
+			}break;
+		case REFINE_ME_HALF_PIXEL_BOTTOM:
+			{
+                pMeRefine->pHalfPixHV = pMeRefine->pHalfPixH;//reuse pBuffer, here only v&hv
+				pFunc->sMcFuncs.pfLumaHalfpelCen( pRef-1-kiStrideRef, kiStrideRef, pMeRefine->pHalfPixHV,ME_REFINE_BUF_STRIDE,iWidth+1, iHeight+1 );
+			    iHalfMvy += 2;
+				sParams.iStrideA = kiStrideRef;
+				sParams.iStrideB = ME_REFINE_BUF_STRIDE;
+				sParams.pSrcA[0] = pMeRefine->pHalfPixV + ME_REFINE_BUF_STRIDE;
+				sParams.pSrcA[3] = sParams.pSrcA[2] = sParams.pSrcA[1] = sParams.pSrcA[0];
+				sParams.pSrcB[0] = pRef;
+				sParams.pSrcB[1] = pRef + kiStrideRef;
+				sParams.pSrcB[2] = pMeRefine->pHalfPixHV + ME_REFINE_BUF_STRIDE;
+				sParams.pSrcB[3] = pMeRefine->pHalfPixHV + ME_REFINE_BUF_STRIDE + 1;	
+			}break;
+		default:
+			// NOTE(review): reaching here would leave sParams.pSrcA/pSrcB/iStrideA/iStrideB
+			// unset before MeRefineQuarPixel reads them; the four cases above cover every
+			// value this function assigns to iBestHalfPix.
+			break;
+		}
+		sParams.iLms[0] = COST_MVD( pMe->pMvdCost, iHalfMvx - pMe->sMvp.iMvX, iHalfMvy - 1 - pMe->sMvp.iMvY );
+		sParams.iLms[1] = COST_MVD( pMe->pMvdCost, iHalfMvx - pMe->sMvp.iMvX, iHalfMvy + 1 - pMe->sMvp.iMvY );
+		sParams.iLms[2] = COST_MVD( pMe->pMvdCost, iHalfMvx - 1 - pMe->sMvp.iMvX, iHalfMvy - pMe->sMvp.iMvY );
+		sParams.iLms[3] = COST_MVD( pMe->pMvdCost, iHalfMvx + 1 - pMe->sMvp.iMvX, iHalfMvy - pMe->sMvp.iMvY );
+	}
+	MeRefineQuarPixel(pFunc, pMe, pMeRefine, iWidth, iHeight, &sParams, kiStrideEnc);
+	
+	// a quarter-pel candidate won: its samples are in pQuarPixBest
+	if(iBestCost > sParams.iBestCost)
+	{
+		pBestPredInter = pMeRefine->pQuarPixBest;
+		iBestCost = sParams.iBestCost;
+	}
+	iBestQuarPix = sParams.iBestQuarPix;
+
+	//update final best MV
+	pMe->sMv.iMvX = iHalfMvx + iMvQuarAddX[iBestQuarPix];
+	pMe->sMv.iMvY = iHalfMvy + pMvQuarAddY[iBestQuarPix];
+	pMe->uiSatdCost = iBestCost;
+
+	//No half or quarter pixel best, so do MC with integer pixel MV
+	if ( iBestHalfPix+iBestQuarPix == NO_BEST_FRAC_PIX )
+	{
+		pBestPredInter = pRef;
+		iInterBlk4Stride = kiStrideRef;
+	}	
+	// copy the winning prediction into pMemPredInterMb with the routine matching the block size
+	if ( MB_WIDTH_LUMA == iWidth && MB_HEIGHT_LUMA == iHeight ) //P16x16
+	{
+		pFunc->pfCopy16x16NotAligned( pMemPredInterMb, MB_WIDTH_LUMA, pBestPredInter, iInterBlk4Stride );	// dst can be align with 16 bytes, but not sure at pSrc, 12/29/2011
+	}
+	else if ( MB_WIDTH_LUMA == iWidth && MB_HEIGHT_CHROMA == iHeight ) //P16x8
+	{
+		pFunc->pfCopy16x8NotAligned( pMemPredInterMb, MB_WIDTH_LUMA, pBestPredInter, iInterBlk4Stride );	// dst can be align with 16 bytes, but not sure at pSrc, 12/29/2011
+	}
+	else if ( MB_WIDTH_CHROMA == iWidth && MB_HEIGHT_LUMA == iHeight ) //P8x16
+	{
+		pFunc->pfCopy8x16Aligned( pMemPredInterMb, MB_WIDTH_LUMA, pBestPredInter, iInterBlk4Stride );		
+	}
+	else //P8x8
+	{
+		pFunc->pfCopy8x8Aligned( pMemPredInterMb, MB_WIDTH_LUMA, pBestPredInter, iInterBlk4Stride );
+	}	
+}
+
+// Fills pBlkStride[0..15] with the sample offset of each 4x4 block inside a
+// 16x16 area whose rows are kiStrideRef apart: offset = x + y * kiStrideRef.
+void InitBlkStrideWithRef(int32_t* pBlkStride, const int32_t kiStrideRef)
+{
+	// horizontal sample position of each 4x4 block
+	static const uint8_t kuiBlkPosX[16] =
+	{
+		0, 4 , 0, 4 ,
+		8, 12, 8, 12,
+		0, 4 , 0, 4 ,
+		8, 12, 8, 12
+	};
+	// vertical sample position of each 4x4 block
+	static const uint8_t kuiBlkPosY[16] =
+	{
+		0, 0, 4 , 4 ,
+		0, 0, 4 , 4 ,
+		8, 8, 12, 12,
+		8, 8, 12, 12
+	};
+
+	for ( int32_t iBlk = 0; iBlk < 16; ++iBlk )
+	{
+		const int32_t kiRowOffset = kuiBlkPosY[iBlk] * kiStrideRef;
+		pBlkStride[iBlk] = kuiBlkPosX[iBlk] + kiRowOffset;
+	}
+}
+
+/*
+ * Builds the MVD cost table: 52 consecutive rows (one per g_kiQpCostTable
+ * entry) of kiMvdSz entries each, kiMvdSz = (648*2+1) or (972*2+1).
+ * Within a row the entry for MVD value v sits at index (kiMvdSz >> 1) + v and
+ * holds lambda * BsSizeSE(v); the centre entry (v == 0) holds lambda itself.
+ */
+void MvdCostInit( uint16_t* pMvdCostInter, const int32_t kiMvdSz )
+{
+	const int32_t kiHalfRange	= kiMvdSz >> 1;
+	uint16_t *pBelowZero		= pMvdCostInter;					// walks the negative MVD half
+	uint16_t *pAboveZero		= pMvdCostInter + kiHalfRange + 1;	// walks the positive MVD half
+	int32_t iQp, iDone, iLane;
+
+	for ( iQp = 0; iQp < 52; ++iQp )
+	{
+		const uint16_t kuiLambda	= g_kiQpCostTable[iQp];
+		int32_t iNegMvd				= -kiHalfRange;
+		int32_t iPosMvd				= 1;
+
+		// four negative then four positive entries per pass
+		for ( iDone = 0; iDone < kiHalfRange; iDone += 4 )
+		{
+			for ( iLane = 0; iLane < 4; ++iLane )
+			{
+				*pBelowZero++ = kuiLambda * BsSizeSE(iNegMvd++);
+			}
+			for ( iLane = 0; iLane < 4; ++iLane )
+			{
+				*pAboveZero++ = kuiLambda * BsSizeSE(iPosMvd++);
+			}
+		}
+
+		// centre entry, then step both cursors over the other half into the next row
+		*pBelowZero = kuiLambda;
+		pBelowZero += kiHalfRange + 1;
+		pAboveZero += kiHalfRange + 1;
+	}
+}
+
+// Predicts the SAD of the current MB from its neighbours and stores about
+// 0.90625 times that prediction in *pSadPred.
+// Neighbours: A = left (ref slot 6 / SAD slot 3), B = top (1 / 1),
+// C = top-right (5 / 2), replaced by top-left (0 / 0) when C is REF_NOT_AVAIL.
+// When only A is available its SAD is used; when exactly one neighbour uses
+// reference uiRef that neighbour's SAD is used; otherwise the median of the three.
+void PredictSad( int8_t* pRefIndexCache, int32_t* pSadCostCache, int32_t uiRef, int32_t * pSadPred )
+{
+	const int32_t kiLeftRef	= pRefIndexCache[6];
+	const int32_t kiTopRef	= pRefIndexCache[1];
+	int32_t iDiagRef		= pRefIndexCache[5];
+	const int32_t kiLeftSad	= pSadCostCache[3];
+	const int32_t kiTopSad	= pSadCostCache[1];
+	int32_t iDiagSad		= pSadCostCache[2];
+	int32_t iPredSad;
+
+	// top-right missing: fall back to top-left
+	if ( REF_NOT_AVAIL == iDiagRef )
+	{
+		iDiagRef = pRefIndexCache[0];
+		iDiagSad = pSadCostCache[0];
+	}
+
+	if ( kiLeftRef != REF_NOT_AVAIL && kiTopRef == REF_NOT_AVAIL && iDiagRef == REF_NOT_AVAIL )
+	{
+		iPredSad = kiLeftSad;
+	}
+	else
+	{
+		// one bit per neighbour that uses the same reference as the current block
+		int32_t iSameRefMask = 0;
+		if ( uiRef == kiLeftRef )
+			iSameRefMask |= 1 << MB_LEFT_BIT;
+		if ( uiRef == kiTopRef )
+			iSameRefMask |= 1 << MB_TOP_BIT;
+		if ( uiRef == iDiagRef )
+			iSameRefMask |= 1 << MB_TOPRIGHT_BIT;
+
+		if ( LEFT_MB_POS == iSameRefMask )			// A
+			iPredSad = kiLeftSad;
+		else if ( TOP_MB_POS == iSameRefMask )		// B
+			iPredSad = kiTopSad;
+		else if ( TOPRIGHT_MB_POS == iSameRefMask )	// C or D
+			iPredSad = iDiagSad;
+		else
+			iPredSad = WELS_MEDIAN( kiLeftSad, kiTopSad, iDiagSad );
+	}
+
+	// x - x/8 + x/32 = 0.90625 * x, done on the value scaled by 64 and rounded back
+	const int32_t kiScaled = iPredSad << 6;
+	*pSadPred = ( ( kiScaled - ( kiScaled >> 3 ) + ( kiScaled >> 5 ) ) + 32 ) >> 6;
+}
+
+
+// Predicts the skip-mode SAD of the current MB from neighbours that were coded
+// as skip. A neighbour contributes its SAD only when its pMbSkipCache entry is
+// 1, otherwise 0. Neighbour slots: left = ref 6 / skip+SAD 3, top = 1 / 1,
+// top-right = 5 / 2, replaced by top-left (0 / 0) when top-right is REF_NOT_AVAIL.
+void PredictSadSkip( int8_t* pRefIndexCache, bool_t* pMbSkipCache, int32_t* pSadCostCache, int32_t uiRef, int32_t * iSadPredSkip )
+{
+	const int32_t kiLeftRef		= pRefIndexCache[6];
+	const int32_t kiTopRef		= pRefIndexCache[1];
+	int32_t iDiagRef			= pRefIndexCache[5];
+	const int32_t kiLeftSad		= ( pMbSkipCache[3] == 1 ) ? pSadCostCache[3] : 0;
+	const int32_t kiTopSad		= ( pMbSkipCache[1] == 1 ) ? pSadCostCache[1] : 0;
+	int32_t iDiagSad			= ( pMbSkipCache[2] == 1 ) ? pSadCostCache[2] : 0;
+	int32_t iDiagSkip			= pMbSkipCache[2];
+
+	// top-right missing: fall back to top-left
+	if ( REF_NOT_AVAIL == iDiagRef )
+	{
+		iDiagRef	= pRefIndexCache[0];
+		iDiagSad	= ( pMbSkipCache[0] == 1 ) ? pSadCostCache[0] : 0;
+		iDiagSkip	= pMbSkipCache[0];
+	}
+
+	if ( kiLeftRef != REF_NOT_AVAIL && kiTopRef == REF_NOT_AVAIL && iDiagRef == REF_NOT_AVAIL )
+	{
+		*iSadPredSkip = kiLeftSad;
+		return;
+	}
+
+	// one bit per neighbour that is skip and uses the same reference
+	int32_t iSameRefMask = 0;
+	if ( ( uiRef == kiLeftRef ) && ( pMbSkipCache[3] == 1 ) )
+		iSameRefMask |= 1 << MB_LEFT_BIT;
+	if ( ( uiRef == kiTopRef ) && ( pMbSkipCache[1] == 1 ) )
+		iSameRefMask |= 1 << MB_TOP_BIT;
+	if ( ( uiRef == iDiagRef ) && ( iDiagSkip == 1 ) )
+		iSameRefMask |= 1 << MB_TOPRIGHT_BIT;
+
+	if ( LEFT_MB_POS == iSameRefMask )			// A
+		*iSadPredSkip = kiLeftSad;
+	else if ( TOP_MB_POS == iSameRefMask )		// B
+		*iSadPredSkip = kiTopSad;
+	else if ( TOPRIGHT_MB_POS == iSameRefMask )	// C or D
+		*iSadPredSkip = iDiagSad;
+	else
+		*iSadPredSkip = WELS_MEDIAN( kiLeftSad, kiTopSad, iDiagSad );
+}
+}
--- /dev/null
+++ b/codec/encoder/core/src/memory_align.cpp
@@ -1,0 +1,161 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "memory_align.h"
+#include "macros.h"
+
+namespace WelsSVCEnc {
+
+// Stores the alignment used by WelsMalloc. A requested cache line size of 0 or
+// one that is not a multiple of 16 is replaced by 16.
+// With MEMORY_CHECK defined, also opens the allocation trace file.
+CMemoryAlign::CMemoryAlign( const uint32_t kuiCacheLineSize )
+#ifdef MEMORY_MONITOR
+:	m_nMemoryUsageInBytes( 0 )
+#endif//MEMORY_MONITOR
+{
+	const bool kbMultipleOf16 = ( kuiCacheLineSize != 0 ) && ( ( kuiCacheLineSize & 0x0f ) == 0 );
+
+	m_nCacheLineSize = kbMultipleOf16 ? kuiCacheLineSize : 0x10;
+
+#ifdef MEMORY_CHECK
+	m_nCountRequestNum	= 0;
+	m_fpMemChkPoint		= fopen("./enc_mem_check_point.txt",  "wt+");
+#endif//MEMORY_CHECK
+}
+
+// With MEMORY_MONITOR defined, asserts that every allocation was released.
+// With MEMORY_CHECK defined, closes the allocation trace file if it was opened.
+CMemoryAlign::~CMemoryAlign()
+{
+#ifdef MEMORY_MONITOR
+	assert( m_nMemoryUsageInBytes == 0 );
+#endif//MEMORY_MONITOR
+
+#ifdef MEMORY_CHECK	
+	// fopen() in the constructor may have failed; fclose(NULL) is undefined behaviour
+	if ( m_fpMemChkPoint != NULL )
+	{
+		fclose(m_fpMemChkPoint);
+		m_fpMemChkPoint = NULL;
+	}
+
+	m_nCountRequestNum	= 0;
+#endif//MEMORY_CHECK
+}
+
+// Same as WelsMalloc, but the kuiSize returned bytes are set to zero.
+// Returns NULL when the allocation fails.
+void* CMemoryAlign::WelsMallocz( const uint32_t kuiSize, const str_t *kpTag )
+{
+	void *pZeroed = WelsMalloc( kuiSize, kpTag );
+
+	if ( pZeroed != NULL )
+	{
+		memset( pZeroed, 0, kuiSize );
+	}
+	return pZeroed;
+}
+
+// Allocates kuiSize bytes aligned to m_nCacheLineSize.
+//
+// Layout of the underlying malloc() block:
+//   [padding][int32_t payload size][void* raw block pointer][aligned payload ...]
+// The two header fields sit immediately in front of the returned pointer and
+// are read back by WelsFree.
+//
+// kuiSize  number of usable bytes requested
+// kpTag    optional label, only written to the MEMORY_CHECK trace
+// Returns the aligned pointer, or NULL when the request is too large to be
+// represented or malloc() fails.
+void* CMemoryAlign::WelsMalloc( const uint32_t kuiSize, const str_t *kpTag )
+{
+	const int32_t kiSizeOfVoidPointer	= sizeof( void ** );
+	const int32_t kiSizeOfInt				= sizeof( int32_t );
+	const int32_t kiAlignedBytes		= m_nCacheLineSize - 1;
+	const uint32_t kuiOverhead			= kiAlignedBytes + kiSizeOfVoidPointer + kiSizeOfInt;
+#if MEMORY_REQUEST_ALIGN_BYTES
+	const uint32_t kuiRoundingSlack		= MEMORY_REQUEST_ALIGN_BYTES;
+#else
+	const uint32_t kuiRoundingSlack		= 0;
+#endif//MEMORY_REQUEST_ALIGN_BYTES
+
+	// Reject sizes whose total would wrap around or exceed int32_t: a wrapped
+	// total would make malloc() return a block smaller than the header + payload.
+	if ( kuiSize > 0x7fffffffU - kuiOverhead - kuiRoundingSlack )
+		return NULL;
+
+	const int32_t kiTrialRequestedSize	= kuiSize + kiAlignedBytes + kiSizeOfVoidPointer + kiSizeOfInt;
+#if MEMORY_REQUEST_ALIGN_BYTES
+	// ensure 4 bytes boundary aligned memory request, unused extra bytes padding in pData payload
+	const int32_t kiActualRequestedSize	= WELS_ALIGN(kiTrialRequestedSize, MEMORY_REQUEST_ALIGN_BYTES);
+#else
+	const int32_t kiActualRequestedSize	= kiTrialRequestedSize;
+#endif//MEMORY_REQUEST_ALIGN_BYTES
+	// Payload = requested bytes plus exactly the padding added by the rounding
+	// above, so that payload + overhead always equals the size given to malloc().
+	const uint32_t kiPayloadSize			= kuiSize + ( kiActualRequestedSize - kiTrialRequestedSize );
+
+    uint8_t* pBuf		= (uint8_t *) malloc( kiActualRequestedSize );
+#ifdef MEMORY_CHECK	
+	if (m_fpMemChkPoint != NULL)
+	{
+		// %p for the pointer and %u for the unsigned size: the arguments must match the conversions
+		if ( kpTag != NULL )
+            fprintf( m_fpMemChkPoint, "WelsMalloc(), %p : actual uiSize:\t%d\tbytes, input uiSize: %u bytes, %d - %s\n", (void *)pBuf, kiActualRequestedSize, kuiSize, m_nCountRequestNum++, kpTag );
+		else
+			fprintf( m_fpMemChkPoint, "WelsMalloc(), %p : actual uiSize:\t%d\tbytes, input uiSize: %u bytes, %d \n", (void *)pBuf, kiActualRequestedSize, kuiSize, m_nCountRequestNum++ );
+		fflush( m_fpMemChkPoint);
+	}
+#endif
+	uint8_t* pAlignedBuffer;
+	
+	if ( NULL == pBuf )
+		return NULL;
+
+	// Skip far enough to leave room for the header, then round the address down
+	// to the alignment. Only the low address bits are needed; size_t is used
+	// because casting a pointer to int32_t does not compile on 64-bit targets.
+    pAlignedBuffer = pBuf + kiAlignedBytes + kiSizeOfVoidPointer + kiSizeOfInt;
+    pAlignedBuffer -= ( (size_t) pAlignedBuffer & kiAlignedBytes );
+    *( (void **) ( pAlignedBuffer - kiSizeOfVoidPointer ) ) = pBuf;
+    *( (int32_t *) ( pAlignedBuffer - (kiSizeOfVoidPointer + kiSizeOfInt) ) ) = kiPayloadSize;
+
+#ifdef MEMORY_MONITOR
+	m_nMemoryUsageInBytes += kiActualRequestedSize;
+#endif//MEMORY_MONITOR
+
+    return pAlignedBuffer;
+}
+
+// Releases a pointer obtained from WelsMalloc / WelsMallocz. NULL is ignored.
+//
+// The raw malloc() block pointer is stored directly in front of pPointer and
+// the payload size in the int32_t before that; both are read from there.
+// kpTag is an optional label, only written to the MEMORY_CHECK trace.
+void CMemoryAlign::WelsFree( void* pPointer, const str_t *kpTag )
+{
+	if( pPointer )
+    {
+#if defined(MEMORY_MONITOR) || defined(MEMORY_CHECK)
+		// needed by both the usage counter and the trace, so it must exist when either is enabled
+		const int32_t kiMemoryLength = *((int32_t *)((uint8_t *)pPointer- sizeof(void **) - sizeof(int32_t))) + m_nCacheLineSize - 1 + sizeof(void **) + sizeof(int32_t);
+#endif//MEMORY_MONITOR || MEMORY_CHECK
+#ifdef MEMORY_MONITOR
+		m_nMemoryUsageInBytes -= kiMemoryLength;
+#endif//MEMORY_MONITOR
+#ifdef MEMORY_CHECK		
+		if (m_fpMemChkPoint != NULL)
+		{
+			// %p: the argument is a pointer, not an unsigned int
+			if ( kpTag != NULL )
+				fprintf( m_fpMemChkPoint, "WelsFree(), %p - %s: \t%d\t bytes \n", (void *)(*( ( ( void **) pPointer ) - 1 )), kpTag, kiMemoryLength );
+			else
+				fprintf( m_fpMemChkPoint, "WelsFree(), %p \n", (void *)(*( ( ( void **) pPointer ) - 1 )) );
+			fflush( m_fpMemChkPoint);
+		}
+#endif
+        free( *( ( ( void **) pPointer ) - 1 ) );
+    }
+}
+
+// Returns the alignment chosen by the constructor.
+const uint32_t CMemoryAlign::WelsGetCacheLineSize() const
+{
+	return m_nCacheLineSize;
+}
+
+#if defined(MEMORY_MONITOR)
+// Returns the running total maintained by WelsMalloc (adds) and WelsFree (subtracts).
+const uint32_t CMemoryAlign::WelsGetMemoryUsage() const
+{
+	return m_nMemoryUsageInBytes;
+}
+#endif//MEMORY_MONITOR
+
+} // end of namespace WelsSVCEnc
--- /dev/null
+++ b/codec/encoder/core/src/mv_pred.cpp
@@ -1,0 +1,389 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	mv_pred.cpp
+ *
+ * \brief	Get MV predictor and update motion vector of mb cache
+ *
+ * \date	05/22/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "mv_pred.h"
+#include "ls_defines.h"
+namespace WelsSVCEnc {
+/*!
+ * \brief	basic MV prediction unit for partition widths 4, 2 and 1
+ * \param	kpMvComp	cache of neighbouring reference indices and motion vectors
+ * \param	iPartIdx	partition index, mapped to a cache position by g_kuiCache30ScanIdx
+ * \param	iPartW		partition width, used as the offset from the top entry to the top-right entry
+ * \param	iRef		reference index of the partition being predicted
+ * \param	sMvp		output: predicted motion vector
+ */
+void PredMv(const SMVComponentUnit* kpMvComp, int8_t iPartIdx, int8_t iPartW, int32_t iRef, SMVUnitXY* sMvp)
+{
+	const uint8_t kuiCurIdx	= g_kuiCache30ScanIdx[iPartIdx];
+	const uint8_t kuiLeftIdx	= kuiCurIdx - 1;	// one entry to the left
+	const uint8_t kuiTopIdx	= kuiCurIdx - 6;	// one cache row (6 entries) above
+
+	const int32_t kiLeftRef	= kpMvComp->iRefIndexCache[kuiLeftIdx];
+	const int32_t kiTopRef		= kpMvComp->iRefIndexCache[kuiTopIdx];
+	const int32_t kiRightTopRef	= kpMvComp->iRefIndexCache[kuiTopIdx + iPartW];
+
+	// candidate C: right-top neighbour, or the left-top one when right-top is not available
+	const int32_t kiDiagIdx	= ( REF_NOT_AVAIL == kiRightTopRef ) ? ( kuiTopIdx - 1 ) : ( kuiTopIdx + iPartW );
+	const int32_t kiDiagRef	= ( REF_NOT_AVAIL == kiRightTopRef ) ? kpMvComp->iRefIndexCache[kiDiagIdx] : kiRightTopRef;
+
+	const SMVUnitXY sMvLeft( kpMvComp->sMotionVectorCache[kuiLeftIdx] );
+	const SMVUnitXY sMvTop( kpMvComp->sMotionVectorCache[kuiTopIdx] );
+	const SMVUnitXY sMvDiag( kpMvComp->sMotionVectorCache[kiDiagIdx] );
+
+	// only the left neighbour is available: take its MV directly
+	if ( REF_NOT_AVAIL == kiTopRef && REF_NOT_AVAIL == kiDiagRef && REF_NOT_AVAIL != kiLeftRef )
+	{
+		*sMvp = sMvLeft;
+		return;
+	}
+
+	// one bit per neighbour whose reference index equals iRef
+	const int32_t kiMatchMask	= ( (iRef == kiLeftRef) << MB_LEFT_BIT )
+								| ( (iRef == kiTopRef) << MB_TOP_BIT )
+								| ( (iRef == kiDiagRef) << MB_TOPRIGHT_BIT );
+
+	if ( LEFT_MB_POS == kiMatchMask )			// only A matches
+	{
+		*sMvp = sMvLeft;
+	}
+	else if ( TOP_MB_POS == kiMatchMask )		// only B matches
+	{
+		*sMvp = sMvTop;
+	}
+	else if ( TOPRIGHT_MB_POS == kiMatchMask )	// only C (or D) matches
+	{
+		*sMvp = sMvDiag;
+	}
+	else										// none or several match: component-wise median
+	{
+		sMvp->iMvX = WELS_MEDIAN(sMvLeft.iMvX, sMvTop.iMvX, sMvDiag.iMvX);
+		sMvp->iMvY = WELS_MEDIAN(sMvLeft.iMvY, sMvTop.iMvY, sMvDiag.iMvY);
+	}
+}
+/*!
+ * \brief	MV prediction for an 8x16 partition
+ * \param	pMbCache	MB cache holding the neighbouring reference indices and MVs
+ * \param	iPartIdx	0 selects the first partition, any other value the second one
+ * \param	iRef		reference index of the partition
+ * \param	sMvp		output: predicted motion vector
+ *
+ * A directional candidate is tried first (cache entry 6 for the first partition,
+ * entry 5 or, when that is unavailable, entry 2 for the second); if its reference
+ * index differs from iRef the generic PredMv() is used.
+ */
+void PredInter8x16Mv(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* sMvp)
+{
+	const SMVComponentUnit *kpMvComp = &pMbCache->sMvComponents;
+	int8_t iCandIdx;
+
+	if ( 0 != iPartIdx )
+	{
+		// top-right entry, falling back to the top-left one when not available
+		const int8_t kiTopRightRef = kpMvComp->iRefIndexCache[5];
+		iCandIdx = ( REF_NOT_AVAIL == kiTopRightRef ) ? 2 : 5;
+	}
+	else
+	{
+		iCandIdx = 6;	// left entry
+	}
+
+	const int8_t kiCandRef = kpMvComp->iRefIndexCache[iCandIdx];
+	if ( iRef != kiCandRef )
+	{
+		PredMv(kpMvComp, iPartIdx, 2, iRef, sMvp);
+		return;
+	}
+
+	*sMvp = kpMvComp->sMotionVectorCache[iCandIdx];
+}
+/*!
+ * \brief	MV prediction for a 16x8 partition
+ * \param	pMbCache	MB cache holding the neighbouring reference indices and MVs
+ * \param	iPartIdx	0 selects the first partition, any other value the second one
+ * \param	iRef		reference index of the partition
+ * \param	sMvp		output: predicted motion vector
+ *
+ * The directional candidate is cache entry 1 (top) for the first partition and
+ * entry 18 (left) for the second; when its reference index differs from iRef the
+ * generic PredMv() is used.
+ */
+void PredInter16x8Mv(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* sMvp)
+{
+	const SMVComponentUnit *kpMvComp = &pMbCache->sMvComponents;
+	const int8_t kiCandIdx	= ( 0 == iPartIdx ) ? 1 : 18;
+	const int8_t kiCandRef	= kpMvComp->iRefIndexCache[kiCandIdx];
+
+	if ( iRef != kiCandRef )
+	{
+		PredMv(kpMvComp, iPartIdx, 4, iRef, sMvp);
+		return;
+	}
+
+	*sMvp = kpMvComp->sMotionVectorCache[kiCandIdx];
+}
+/*!
+ * \brief	MV prediction for a skipped MB
+ * \param	pMbCache	MB cache holding the neighbouring reference indices and MVs
+ * \param	sMvp		output: predicted motion vector
+ *
+ * The result is the zero vector when the left (entry 6) or top (entry 1) neighbour
+ * is unavailable, or when either of them has reference index 0 together with an
+ * all-zero MV; otherwise PredMv() is used with reference index 0.
+ */
+void PredSkipMv(SMbCache* pMbCache, SMVUnitXY* sMvp)
+{
+	const SMVComponentUnit *kpMvComp = &pMbCache->sMvComponents;
+	const int8_t kiRefA = kpMvComp->iRefIndexCache[6]; // left
+	const int8_t kiRefB = kpMvComp->iRefIndexCache[1]; // top
+
+	if ( REF_NOT_AVAIL != kiRefA && REF_NOT_AVAIL != kiRefB )
+	{
+		// both MV components are tested at once through a 32-bit read
+		const bool kbLeftStill	= ( 0 == kiRefA ) && ( 0 == *(int32_t*)(&kpMvComp->sMotionVectorCache[6]) );
+		const bool kbTopStill	= ( 0 == kiRefB ) && ( 0 == *(int32_t*)(&kpMvComp->sMotionVectorCache[1]) );
+
+		if ( !kbLeftStill && !kbTopStill )
+		{
+			PredMv(kpMvComp, 0, 4, 0, sMvp);
+			return;
+		}
+	}
+
+	ST32( sMvp, 0 );
+}
+
+/*!
+ * \brief	update MV and reference index of the current MB and of the MB cache, only for P_16*16 (SKIP inclusive)
+ * \param	pMbCache	MB cache; its sMvComponents reference and MV caches are written
+ * \param	pCurMb		current MB; pRefIndex and sMv are written
+ * \param	kiRef		reference index stored for the whole MB
+ * \param	pMv			motion vector stored for the whole MB
+ */
+void UpdateP16x16MotionInfo(SMbCache* pMbCache, SMB* pCurMb, const int8_t kiRef, SMVUnitXY* pMv)
+{
+	// optimized 11/25/2011
+	SMVComponentUnit *pMvComp	= &pMbCache->sMvComponents;
+	const uint32_t kuiMv32			= LD32(pMv);	// whole MV read as one 32-bit word
+	const uint64_t kuiMv64			= BUTTERFLY4x8(kuiMv32);	// presumably the MV replicated into both 32-bit halves -- TODO confirm against the macro
+	uint64_t uiMvBuf[8]			= { kuiMv64, kuiMv64, kuiMv64, kuiMv64, kuiMv64, kuiMv64, kuiMv64, kuiMv64 };	
+	const uint16_t kuiRef16		= BUTTERFLY1x2(kiRef);	// presumably kiRef replicated into two bytes -- TODO confirm
+	const uint32_t kuiRef32		= BUTTERFLY2x4(kuiRef16);	// presumably kiRef replicated into four bytes -- TODO confirm
+
+	ST32( pCurMb->pRefIndex, kuiRef32 );
+	// update pMv range from 0~15
+	memcpy( pCurMb->sMv, uiMvBuf, sizeof(uiMvBuf) );	// confirmed_safe_unsafe_usage
+	
+	/*
+	 * reference index cache, four entries per row: rows start at 7, 13, 19 and 25
+	 * (entries 7~10, 13~16, 19~22, 25~28); the two middle entries of each row are
+	 * written with a single 16-bit store
+	 */
+	pMvComp->iRefIndexCache[7]	= kiRef;
+	ST16(&pMvComp->iRefIndexCache[8], kuiRef16);
+	pMvComp->iRefIndexCache[10]	= kiRef;
+	pMvComp->iRefIndexCache[13]	= kiRef;
+	ST16(&pMvComp->iRefIndexCache[14], kuiRef16);
+	pMvComp->iRefIndexCache[16]	= kiRef;
+	pMvComp->iRefIndexCache[19]	= kiRef;
+	ST16(&pMvComp->iRefIndexCache[20], kuiRef16);
+	pMvComp->iRefIndexCache[22]	= kiRef;
+	pMvComp->iRefIndexCache[25]	= kiRef;
+	ST16(&pMvComp->iRefIndexCache[26], kuiRef16);
+	pMvComp->iRefIndexCache[28]	= kiRef;
+
+	/*
+	* motion vector cache, same entries as above (7~10, 13~16, 19~22, 25~28); the two
+	* middle entries of each row are written with a single 64-bit store
+	*/
+	pMvComp->sMotionVectorCache[7]	= *pMv;
+	ST64( &pMvComp->sMotionVectorCache[8], kuiMv64 );
+	pMvComp->sMotionVectorCache[10] = *pMv;	
+	pMvComp->sMotionVectorCache[13] = *pMv;
+	ST64( &pMvComp->sMotionVectorCache[14], kuiMv64 );
+	pMvComp->sMotionVectorCache[16] = *pMv;
+	pMvComp->sMotionVectorCache[19] = *pMv;
+	ST64( &pMvComp->sMotionVectorCache[20], kuiMv64 );
+	pMvComp->sMotionVectorCache[22] = *pMv;
+	pMvComp->sMotionVectorCache[25] = *pMv;
+	ST64( &pMvComp->sMotionVectorCache[26], kuiMv64 );
+	pMvComp->sMotionVectorCache[28] = *pMv;
+}
+
+/*!
+ * \brief	update reference index and MV of both the current MB and the MB cache, only for P16x8
+ * \param	pMbCache	MB cache; its sMvComponents reference and MV caches are written
+ * \param	pCurMb		current MB; pRefIndex and sMv are written
+ * \param	kiPartIdx	partition index, used to look up g_kuiMbCountScan4Idx and g_kuiCache30ScanIdx
+ * \param	kiRef		reference index of the partition
+ * \param	pMv			motion vector of the partition
+ */
+void UpdateP16x8MotionInfo(SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef, SMVUnitXY* pMv)
+{
+	// optimized 11/25/2011
+	SMVComponentUnit *pMvComp	= &pMbCache->sMvComponents;
+	const uint32_t kuiMv32			= LD32(pMv);	// whole MV read as one 32-bit word
+	const uint64_t kuiMv64			= BUTTERFLY4x8(kuiMv32);	// presumably the MV replicated into both 32-bit halves -- TODO confirm against the macro
+	uint64_t uiMvBuf[4]			= { kuiMv64, kuiMv64, kuiMv64, kuiMv64 };
+	const int16_t kiScan4Idx		= g_kuiMbCountScan4Idx[kiPartIdx];
+	const int16_t kiCacheIdx		= g_kuiCache30ScanIdx[kiPartIdx];
+	const int16_t kiCacheIdx1	= 1+kiCacheIdx;
+	const int16_t kiCacheIdx3	= 3+kiCacheIdx;
+	const int16_t kiCacheIdx6	= 6+kiCacheIdx;
+	const int16_t kiCacheIdx7	= 7+kiCacheIdx;
+	const int16_t kiCacheIdx9	= 9+kiCacheIdx;
+	const uint16_t kuiRef16		= BUTTERFLY1x2(kiRef);	// presumably kiRef replicated into two bytes -- TODO confirm
+
+	// two reference indices written at once, starting at pRefIndex[kiPartIdx>>2]
+	ST16( &pCurMb->pRefIndex[(kiPartIdx>>2)], kuiRef16 );
+	memcpy( &pCurMb->sMv[kiScan4Idx], uiMvBuf, sizeof(uiMvBuf) );	// confirmed_safe_unsafe_usage
+
+	/*
+	* reference index cache, two rows of four entries:
+	* row 0: kiCacheIdx ~ kiCacheIdx+3, row 1: kiCacheIdx+6 ~ kiCacheIdx+9
+	*/
+	pMvComp->iRefIndexCache[kiCacheIdx]		= kiRef;
+	ST16(&pMvComp->iRefIndexCache[kiCacheIdx1], kuiRef16);
+	pMvComp->iRefIndexCache[kiCacheIdx3]	= kiRef;
+	pMvComp->iRefIndexCache[kiCacheIdx6]	= kiRef;
+	ST16(&pMvComp->iRefIndexCache[kiCacheIdx7], kuiRef16);
+	pMvComp->iRefIndexCache[kiCacheIdx9]	= kiRef;
+
+	/*
+	* motion vector cache, same entries:
+	* row 0: kiCacheIdx ~ kiCacheIdx+3, row 1: kiCacheIdx+6 ~ kiCacheIdx+9
+	*/
+	pMvComp->sMotionVectorCache[kiCacheIdx]	= *pMv;
+	ST64( &pMvComp->sMotionVectorCache[kiCacheIdx1], kuiMv64 );
+	pMvComp->sMotionVectorCache[kiCacheIdx3]= *pMv;	
+	pMvComp->sMotionVectorCache[kiCacheIdx6]= *pMv;
+	ST64( &pMvComp->sMotionVectorCache[kiCacheIdx7], kuiMv64 );
+	pMvComp->sMotionVectorCache[kiCacheIdx9]= *pMv;
+}
+/*!
+ * \brief	update reference index and MV of both the current MB and the MB cache, only for P8x16
+ * \param	pMbCache	MB cache; its sMvComponents reference and MV caches are written
+ * \param	pCurMb		current MB; pRefIndex and sMv are written
+ * \param	kiPartIdx	partition index, used to look up g_kuiMbCountScan4Idx and g_kuiCache30ScanIdx
+ * \param	kiRef		reference index of the partition
+ * \param	pMv			motion vector of the partition
+ */
+void update_P8x16_motion_info(SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef, SMVUnitXY* pMv)
+{
+	// optimized 11/25/2011
+	SMVComponentUnit *pMvComp	= &pMbCache->sMvComponents;
+	const uint32_t kuiMv32			= LD32(pMv);	// whole MV read as one 32-bit word
+	const uint64_t kuiMv64			= BUTTERFLY4x8(kuiMv32);	// presumably the MV replicated into both 32-bit halves -- TODO confirm against the macro
+	const int16_t kiScan4Idx		= g_kuiMbCountScan4Idx[kiPartIdx];
+	const int16_t kiCacheIdx		= g_kuiCache30ScanIdx[kiPartIdx];
+	const int16_t kiCacheIdx1	= 1+kiCacheIdx;
+	const int16_t kiCacheIdx3	= 3+kiCacheIdx;
+	const int16_t kiCacheIdx12	= 12+kiCacheIdx;
+	const int16_t kiCacheIdx13	= 13+kiCacheIdx;
+	const int16_t kiCacheIdx15	= 15+kiCacheIdx;
+	const int16_t kiBlkIdx		= kiPartIdx>>2;
+	const uint16_t kuiRef16		= BUTTERFLY1x2(kiRef);	// presumably kiRef replicated into two bytes -- TODO confirm
+		
+	// two reference indices, two entries apart
+	pCurMb->pRefIndex[kiBlkIdx]	= kiRef;
+	pCurMb->pRefIndex[2+kiBlkIdx]= kiRef;
+	// one 64-bit store at each of four positions spaced four entries apart in sMv
+	ST64( &pCurMb->sMv[kiScan4Idx], kuiMv64 );
+	ST64( &pCurMb->sMv[4+kiScan4Idx], kuiMv64 );
+	ST64( &pCurMb->sMv[8+kiScan4Idx], kuiMv64 );
+	ST64( &pCurMb->sMv[12+kiScan4Idx], kuiMv64 );
+
+	/*
+	* reference index cache, two rows of four entries:
+	* kiCacheIdx ~ kiCacheIdx+3 and kiCacheIdx+12 ~ kiCacheIdx+15
+	* NOTE(review): UpdateP8x16Motion2Cache() writes offsets 0, 1, 6, 7 for each of two
+	* partition indices instead -- confirm that the four-entry rows written here are intended.
+	*/
+	pMvComp->iRefIndexCache[kiCacheIdx]	= kiRef;
+	ST16(&pMvComp->iRefIndexCache[kiCacheIdx1], kuiRef16);
+	pMvComp->iRefIndexCache[kiCacheIdx3]	= kiRef;
+	pMvComp->iRefIndexCache[kiCacheIdx12]	= kiRef;
+	ST16(&pMvComp->iRefIndexCache[kiCacheIdx13], kuiRef16);
+	pMvComp->iRefIndexCache[kiCacheIdx15]	= kiRef;
+
+	/*
+	* motion vector cache, same entries:
+	* kiCacheIdx ~ kiCacheIdx+3 and kiCacheIdx+12 ~ kiCacheIdx+15
+	*/
+	pMvComp->sMotionVectorCache[kiCacheIdx]	= *pMv;
+	ST64( &pMvComp->sMotionVectorCache[kiCacheIdx1], kuiMv64 );
+	pMvComp->sMotionVectorCache[kiCacheIdx3] = *pMv;	
+	pMvComp->sMotionVectorCache[kiCacheIdx12] = *pMv;
+	ST64( &pMvComp->sMotionVectorCache[kiCacheIdx13], kuiMv64 );
+	pMvComp->sMotionVectorCache[kiCacheIdx15] = *pMv;
+}
+/*!
+ * \brief	update MV of the current MB, and reference index and MV of the MB cache, only for P8x8
+ * \param	pMbCache	MB cache; its sMvComponents reference and MV caches are written
+ * \param	pCurMb		current MB; only sMv is written here (pRefIndex is left untouched)
+ * \param	kiPartIdx	partition index, used to look up g_kuiMbCountScan4Idx and g_kuiCache30ScanIdx
+ * \param	kiRef		reference index of the partition
+ * \param	pMv			motion vector of the partition
+ */
+void UpdateP8x8MotionInfo(SMbCache* pMbCache, SMB* pCurMb, const int32_t kiPartIdx, const int8_t kiRef, SMVUnitXY* pMv)
+{
+	// cache offsets of the four entries covered by the partition (two rows of two)
+	static const int16_t kiCacheOffset[4] = { 0, 1, 6, 7 };
+
+	SMVComponentUnit *pMvComp	= &pMbCache->sMvComponents;
+	const int16_t kiMbMvIdx		= g_kuiMbCountScan4Idx[kiPartIdx];
+	const int16_t kiCacheBase	= g_kuiCache30ScanIdx[kiPartIdx];
+	const uint64_t kuiMvPair	= BUTTERFLY4x8( LD32(pMv) );
+	int32_t k;
+
+	// MB: one 64-bit store on each of the two rows
+	ST64( &pCurMb->sMv[kiMbMvIdx], kuiMvPair );
+	ST64( &pCurMb->sMv[4+kiMbMvIdx], kuiMvPair );
+
+	// cache: reference indices first, then motion vectors
+	for ( k = 0; k < 4; ++k )
+	{
+		pMvComp->iRefIndexCache[kiCacheBase + kiCacheOffset[k]] = kiRef;
+	}
+	for ( k = 0; k < 4; ++k )
+	{
+		pMvComp->sMotionVectorCache[kiCacheBase + kiCacheOffset[k]] = *pMv;
+	}
+}
+
+//=========================update motion info(MV and ref_idx) into Mb_cache==========================
+//update pMv and uiRefIndex cache only for Mb_cache, only for P_16*16 (SKIP inclusive)
+
+/*!
+ * \brief	update reference index and MV of the MB cache only, for P16x8
+ * \param	pMbCache	MB cache; its sMvComponents reference and MV caches are written
+ * \param	iPartIdx	index of the first covered entry; the second one is iPartIdx + 4
+ * \param	iRef		reference index of the partition
+ * \param	pMv			motion vector of the partition
+ */
+void UpdateP16x8Motion2Cache(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv)
+{
+	// cache offsets written for each of the two indices (two rows of two)
+	static const int32_t kiCacheOffset[4] = { 0, 1, 6, 7 };
+
+	SMVComponentUnit *pMvComp = &pMbCache->sMvComponents;
+	const int32_t kiPartEnd = iPartIdx + 8;
+
+	for ( int32_t iIdx = iPartIdx; iIdx < kiPartEnd; iIdx += 4 )
+	{
+		const uint8_t kuiBase = g_kuiCache30ScanIdx[iIdx];
+
+		for ( int32_t k = 0; k < 4; ++k )
+		{
+			const int32_t kiEntry = kiCacheOffset[k] + kuiBase;
+
+			pMvComp->iRefIndexCache[kiEntry]		= iRef;
+			pMvComp->sMotionVectorCache[kiEntry]	= *pMv;
+		}
+	}
+}
+/*!
+ * \brief	update reference index and MV of the MB cache only, for P8x16
+ * \param	pMbCache	MB cache; its sMvComponents reference and MV caches are written
+ * \param	iPartIdx	index of the first covered entry; the second one is iPartIdx + 8
+ * \param	iRef		reference index of the partition
+ * \param	pMv			motion vector of the partition
+ */
+void UpdateP8x16Motion2Cache(SMbCache* pMbCache, int32_t iPartIdx, int8_t iRef, SMVUnitXY* pMv)
+{
+	// cache offsets written for each of the two indices (two rows of two)
+	static const int32_t kiCacheOffset[4] = { 0, 1, 6, 7 };
+
+	SMVComponentUnit *pMvComp = &pMbCache->sMvComponents;
+	const int32_t kiPartEnd = iPartIdx + 16;
+
+	for ( int32_t iIdx = iPartIdx; iIdx < kiPartEnd; iIdx += 8 )
+	{
+		const uint8_t kuiBase = g_kuiCache30ScanIdx[iIdx];
+
+		for ( int32_t k = 0; k < 4; ++k )
+		{
+			const int32_t kiEntry = kiCacheOffset[k] + kuiBase;
+
+			pMvComp->iRefIndexCache[kiEntry]		= iRef;
+			pMvComp->sMotionVectorCache[kiEntry]	= *pMv;
+		}
+	}
+}
+
+/*!
+ * \brief	update reference index and MV of the MB cache only, for P8x8
+ * \param	pMbCache	MB cache; its sMvComponents reference and MV caches are written
+ * \param	iPartIdx	partition index, mapped to a cache position by g_kuiCache30ScanIdx
+ * \param	pRef		reference index of the partition (a value, despite the prefix)
+ * \param	pMv			motion vector of the partition
+ */
+void UpdateP8x8Motion2Cache(SMbCache* pMbCache, int32_t iPartIdx, int8_t pRef, SMVUnitXY* pMv)
+{
+	// cache offsets of the four entries covered by the partition (two rows of two)
+	static const int32_t kiCacheOffset[4] = { 0, 1, 6, 7 };
+
+	SMVComponentUnit *pMvComp = &pMbCache->sMvComponents;
+	const uint8_t kuiBase = g_kuiCache30ScanIdx[iPartIdx];
+
+	for ( int32_t k = 0; k < 4; ++k )
+	{
+		const int32_t kiEntry = kiCacheOffset[k] + kuiBase;
+
+		pMvComp->iRefIndexCache[kiEntry]		= pRef;
+		pMvComp->sMotionVectorCache[kiEntry]	= *pMv;
+	}
+}
+
+} // namespace WelsSVCEnc 
--- /dev/null
+++ b/codec/encoder/core/src/nal_encap.cpp
@@ -1,0 +1,248 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	nal_encap.cpp
+ *
+ * \brief	NAL pRawNal pData encapsulation
+ *
+ * \date	5/25/2009	Created
+ *
+ *************************************************************************************/
+#include "nal_encap.h"
+#include "svc_enc_golomb.h"
+#include "ls_defines.h"
+namespace WelsSVCEnc {
+/*!
+ * \brief	load and initialize the raw NAL at the current NAL index
+ * \param	pEncoderOuput	encoder output holding the NAL list, bit writer and bitstream buffer
+ * \param	kiType			NAL unit type stored in the NAL header
+ * \param	kiNalRefIdc		nal_ref_idc stored in the NAL header
+ *
+ * The raw data pointer is set to the current byte position of the bit writer
+ * inside pBsBuffer and the payload size is reset to 0.
+ */
+void WelsLoadNal( SWelsEncoderOutput *pEncoderOuput, const int32_t/*EWelsNalUnitType*/ kiType, const int32_t/*EWelsNalRefIdc*/ kiNalRefIdc )
+{
+	SWelsNalRaw *pNal			= &pEncoderOuput->sNalList[ pEncoderOuput->iNalIndex ];
+	SNalUnitHeader *pHeader	= &pNal->sNalExt.sNalHeader;
+	const int32_t kiByteOffset	= BsGetBitsPos( &pEncoderOuput->sBsWrite ) >> 3;	// bits -> bytes
+
+	pHeader->uiForbiddenZeroBit	= 0;
+	pHeader->uiNalRefIdc		= (EWelsNalRefIdc)kiNalRefIdc;
+	pHeader->eNalUnitType		= (EWelsNalUnitType)kiType;
+
+	pNal->iPayloadSize	= 0;
+	pNal->pRawData		= &pEncoderOuput->pBsBuffer[kiByteOffset];
+}
+
+/*!
+ * \brief	close the raw NAL at the current NAL index and advance the index
+ * \param	pEncoderOuput	encoder output holding the NAL list, bit writer and bitstream buffer
+ *
+ * The payload size becomes the distance between the bit writer's current byte
+ * position in pBsBuffer and the raw data pointer of the NAL.
+ */
+void WelsUnloadNal( SWelsEncoderOutput *pEncoderOuput )
+{
+	SWelsNalRaw *pNal			= &pEncoderOuput->sNalList[ pEncoderOuput->iNalIndex ];
+	const int32_t kiByteOffset	= BsGetBitsPos( &pEncoderOuput->sBsWrite ) >> 3;	// bits -> bytes
+
+	pNal->iPayloadSize	= &pEncoderOuput->pBsBuffer[kiByteOffset] - pNal->pRawData;
+
+	// next load targets the following slot of sNalList
+	++ pEncoderOuput->iNalIndex;
+}
+
+/*!
+ * \brief	load and initialize the raw NAL at the current NAL index of a slice bitstream
+ * \param	pSliceBsIn		slice bitstream holding the NAL list, bit writer and buffer
+ * \param	kiType			NAL unit type stored in the NAL header
+ * \param	kiNalRefIdc		nal_ref_idc stored in the NAL header
+ *
+ * The raw data pointer is set to the current byte position of the bit writer
+ * inside pBsBuffer and the payload size is reset to 0.
+ */
+void WelsLoadNalForSlice( SWelsSliceBs *pSliceBsIn, const int32_t/*EWelsNalUnitType*/ kiType, const int32_t/*EWelsNalRefIdc*/ kiNalRefIdc )
+{
+	SWelsNalRaw *pNal			= &pSliceBsIn->sNalList[ pSliceBsIn->iNalIndex ];
+	SNalUnitHeader *pHeader	= &pNal->sNalExt.sNalHeader;
+	const int32_t kiByteOffset	= BsGetBitsPos( &pSliceBsIn->sBsWrite ) >> 3;	// bits -> bytes
+
+	pHeader->uiForbiddenZeroBit	= 0;
+	pHeader->uiNalRefIdc		= (EWelsNalRefIdc)kiNalRefIdc;
+	pHeader->eNalUnitType		= (EWelsNalUnitType)kiType;
+
+	pNal->iPayloadSize	= 0;
+	pNal->pRawData		= &pSliceBsIn->pBsBuffer[kiByteOffset];
+}
+
+/*!
+ * \brief	close the raw NAL at the current NAL index of a slice bitstream and advance the index
+ * \param	pSliceBsIn	slice bitstream holding the NAL list, bit writer and buffer
+ *
+ * The payload size becomes the distance between the bit writer's current byte
+ * position in pBsBuffer and the raw data pointer of the NAL.
+ */
+void WelsUnloadNalForSlice( SWelsSliceBs *pSliceBsIn )
+{
+	SWelsNalRaw *pNal			= &pSliceBsIn->sNalList[ pSliceBsIn->iNalIndex ];
+	const int32_t kiByteOffset	= BsGetBitsPos( &pSliceBsIn->sBsWrite ) >> 3;	// bits -> bytes
+
+	pNal->iPayloadSize	= &pSliceBsIn->pBsBuffer[kiByteOffset] - pNal->pRawData;
+
+	// next load targets the following slot of sNalList
+	++ pSliceBsIn->iNalIndex;
+}
+
+/*!
+ * \brief	encode a NAL: 4-byte start code, 1-byte NAL header, then the payload with
+ *			emulation prevention bytes inserted
+ * \param	pRawNal		raw NAL (header fields, pRawData and iPayloadSize are read)
+ * \param	pDst		destination buffer; no size check is performed here
+ * \param	pDstLen		receives the encoded length when not NULL
+ * \return	number of bytes written to pDst
+ */
+int32_t WelsEncodeNal( SWelsNalRaw *pRawNal, void *pDst, int32_t *pDstLen )
+{
+	static const uint8_t kuiStartCodePrefix[4] = { 0, 0, 0, 1 };
+
+	const SNalUnitHeader *kpHeader	= &pRawNal->sNalExt.sNalHeader;
+	uint8_t * const pOutBegin		= (uint8_t *)pDst;
+	uint8_t *pOut					= pOutBegin;
+	uint8_t *pIn					= pRawNal->pRawData;
+	uint8_t * const pInEnd			= pIn + pRawNal->iPayloadSize;
+	int32_t iZeroRun				= 0;
+
+	/* start code */
+	ST32( pOut, LD32(&kuiStartCodePrefix[0]) );
+	pOut += 4;
+
+	/* NAL Unit Header */
+	*pOut++	= ( kpHeader->uiNalRefIdc << 5 ) | ( kpHeader->eNalUnitType & 0x1f );
+
+	/* payload: a 0x03 byte is inserted after two zero bytes whenever the next byte is <= 3 */
+	for ( ; pIn < pInEnd; ++ pIn )
+	{
+		if ( 2 == iZeroRun && *pIn <= 3 )
+		{
+			*pOut++		= 3;
+			iZeroRun	= 0;
+		}
+		iZeroRun	= ( 0 == *pIn ) ? ( iZeroRun + 1 ) : 0;
+		*pOut++		= *pIn;
+	}
+
+	/* count length of NAL Unit */
+	const int32_t kiNalLength	= pOut - pOutBegin;
+	if ( NULL != pDstLen )
+		*pDstLen	= kiNalLength;
+
+	return kiNalLength;
+}
+
+/*!
+ * \brief	encode any type of NAL into a buffer; NAL types other than NAL_UNIT_PREFIX and
+ *			NAL_UNIT_CODED_SLICE_EXT are forwarded to WelsEncodeNal()
+ *
+ * \param	pRawNal			raw NAL (header fields, pRawData and iPayloadSize are read)
+ * \param	pNalHeaderExt	pointer to the SNalUnitHeaderExt used for the 3 extension header bytes
+ * \param	pDst			destination buffer; no size check is performed here
+ * \param	pDstLen			receives the encoded length when not NULL
+ *
+ * \return	number of bytes written to pDst
+ */
+int32_t WelsEncodeNalExt( SWelsNalRaw *pRawNal, void *pNalHeaderExt, void *pDst, int32_t *pDstLen )
+{
+	static const uint8_t kuiStartCodePrefixExt[4]= { 0, 0, 0, 1 };
+
+	const bool kbHasExtHeader	= ( NAL_UNIT_PREFIX == pRawNal->sNalExt.sNalHeader.eNalUnitType )
+								|| ( NAL_UNIT_CODED_SLICE_EXT == pRawNal->sNalExt.sNalHeader.eNalUnitType );
+	if ( !kbHasExtHeader )
+	{
+		return WelsEncodeNal( pRawNal, pDst, pDstLen );
+	}
+
+	/* FIXME this code doesn't check overflow */
+
+	const SNalUnitHeader *kpHeader	= &pRawNal->sNalExt.sNalHeader;
+	SNalUnitHeaderExt *pExt			= (SNalUnitHeaderExt *)pNalHeaderExt;
+	uint8_t * const pOutBegin		= (uint8_t *)pDst;
+	uint8_t *pOut					= pOutBegin;
+	uint8_t *pIn					= pRawNal->pRawData;
+	uint8_t * const pInEnd			= pIn + pRawNal->iPayloadSize;
+	int32_t iZeroRun				= 0;
+
+	/* start code */
+	ST32( pOut, LD32(&kuiStartCodePrefixExt[0]) );
+	pOut += 4;
+
+	/* NAL Unit Header */
+	*pOut++	= ( kpHeader->uiNalRefIdc << 5 ) | ( kpHeader->eNalUnitType & 0x1f );
+
+	/* NAL UNIT Extension Header, 3 bytes */
+	*pOut++	= (0x80) | (pExt->bIdrFlag << 6);
+	*pOut++	= (0x80) | (pExt->uiDependencyId << 4);
+	*pOut++	= (pExt->uiTemporalId << 5) | (pExt->bDiscardableFlag << 3) | (0x07);
+
+	/* payload: a 0x03 byte is inserted after two zero bytes whenever the next byte is <= 3 */
+	for ( ; pIn < pInEnd; ++ pIn )
+	{
+		if ( 2 == iZeroRun && *pIn <= 3 )
+		{
+			*pOut++		= 3;
+			iZeroRun	= 0;
+		}
+		iZeroRun	= ( 0 == *pIn ) ? ( iZeroRun + 1 ) : 0;
+		*pOut++		= *pIn;
+	}
+
+	/* count length of NAL Unit */
+	const int32_t kiNalLength	= pOut - pOutBegin;
+	if ( NULL != pDstLen )
+		*pDstLen	= kiNalLength;
+
+	return kiNalLength;
+}
+
+/*!
+ * \brief	write the payload of a prefix NAL
+ * \param	pBitStringAux	bit writer receiving the payload
+ * \param	kiNalRefIdc		nal_ref_idc of the NAL; nothing is written unless it is greater than 0
+ * \param	kbIdrFlag		not used by this function
+ * \return	always 0
+ */
+int32_t WelsWriteSVCPrefixNal( SBitStringAux *pBitStringAux, const int32_t kiNalRefIdc,
+						  const bool_t kbIdrFlag )
+{
+	if ( kiNalRefIdc <= 0 )
+		return 0;
+
+	// two zero flag bits, then the RBSP trailing bits
+	BsWriteOneBit( pBitStringAux, false/*bStoreRefBasePicFlag*/ );
+	BsWriteOneBit( pBitStringAux, false );
+	BsRbspTrailingBits( pBitStringAux );
+	BsFlush( pBitStringAux );
+
+	return 0;
+}
+
+} // namespace WelsSVCEnc
--- /dev/null
+++ b/codec/encoder/core/src/picture_handle.cpp
@@ -1,0 +1,193 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	picture_handle.cpp
+ *
+ * \brief	picture pData handling
+ *
+ * \date	5/20/2009 Created
+ *
+ *************************************************************************************/
+#include <string.h>
+#include <assert.h>
+#include "picture_handle.h"
+#include "wels_const.h"
+#include "utils.h"
+#include "macros.h"
+
+namespace WelsSVCEnc {
+/*!
+ * \brief	allocate a picture with padded borders around each plane
+ * \param	pMa				memory allocator used for every allocation
+ * \param	kiWidth			width of picture in pixels
+ * \param	kiHeight		height of picture in pixels
+ * \param	bNeedMbInfo		also allocate the per-MB arrays (uiRefMbType, pRefMbQp, sMvList, pMbSkipSad)
+ * \return	picture pointer on success; NULL when any allocation fails (the partially
+ *			built picture is released through FreePicture() first)
+ */
+SPicture *AllocPicture( CMemoryAlign *pMa, const int32_t kiWidth , const int32_t kiHeight, bool_t bNeedMbInfo )
+{
+	SPicture *pPic	= static_cast<SPicture*>(pMa->WelsMallocz( sizeof(SPicture), "pPic" ));
+
+	WELS_VERIFY_RETURN_IF( NULL, NULL == pPic );
+
+	// MB-aligned dimensions plus PADDING_LENGTH on each side
+	const int32_t kiPaddedWidth		= WELS_ALIGN(kiWidth, MB_WIDTH_LUMA) + (PADDING_LENGTH<<1);
+	const int32_t kiPaddedHeight	= WELS_ALIGN(kiHeight, MB_HEIGHT_LUMA) + (PADDING_LENGTH<<1);
+	const int32_t kiHalfWidth		= kiPaddedWidth >> 1;
+	const int32_t kiChromaHeight	= kiPaddedHeight >> 1;
+	// 32 (or 16 for chroma) to match original imp. here instead of cache_line_size
+	const int32_t kiLumaStride		= WELS_ALIGN( kiPaddedWidth, 32 );
+	const int32_t kiChromaStride	= WELS_ALIGN( kiHalfWidth, 16 );
+	const int32_t kiLumaSize		= kiLumaStride * kiPaddedHeight;
+	const int32_t kiChromaSize		= kiChromaStride * kiChromaHeight;
+
+	// a single buffer holds luma followed by Cb and Cr
+	pPic->pBuffer	= (uint8_t*)pMa->WelsMalloc( kiLumaSize + (kiChromaSize << 1), "pPic->pBuffer" );
+	WELS_VERIFY_RETURN_PROC_IF( NULL, NULL == pPic->pBuffer, FreePicture(pMa, &pPic) );
+
+	pPic->iLineSize[0]	= kiLumaStride;
+	pPic->iLineSize[1]	= pPic->iLineSize[2]	= kiChromaStride;
+
+	// plane pointers skip the top/left padding of each plane
+	uint8_t *pCbPlane	= pPic->pBuffer + kiLumaSize;
+	uint8_t *pCrPlane	= pCbPlane + kiChromaSize;
+	pPic->pData[0]	= pPic->pBuffer + (1+pPic->iLineSize[0]) * PADDING_LENGTH;
+	pPic->pData[1]	= pCbPlane + ( ((1+pPic->iLineSize[1]) * PADDING_LENGTH) >> 1 );
+	pPic->pData[2]	= pCrPlane + ( ((1+pPic->iLineSize[2]) * PADDING_LENGTH) >> 1 );
+
+	pPic->iWidthInPixel		= kiWidth;
+	pPic->iHeightInPixel	= kiHeight;
+	pPic->iFrameNum			= -1;
+
+	pPic->bIsLongRef			= false;
+	pPic->iLongTermPicNum		= -1;
+	pPic->uiRecieveConfirmed	= 0;
+	pPic->iMarkFrameNum			= -1;
+
+	if ( !bNeedMbInfo )
+		return pPic;
+
+	// one entry per MB in each of the per-MB arrays
+	const uint32_t kuiCountMbNum = ((15+kiWidth) >> 4) * ((15+kiHeight) >> 4);
+
+	pPic->uiRefMbType	= (uint32_t *)pMa->WelsMallocz( kuiCountMbNum * sizeof(uint32_t), "pPic->uiRefMbType" );
+	WELS_VERIFY_RETURN_PROC_IF( NULL, NULL == pPic->uiRefMbType, FreePicture(pMa, &pPic) );
+
+	pPic->pRefMbQp		= (uint8_t *)pMa->WelsMallocz( kuiCountMbNum * sizeof(uint8_t), "pPic->bgd_mb_qp" );
+	WELS_VERIFY_RETURN_PROC_IF( NULL, NULL == pPic->pRefMbQp, FreePicture(pMa, &pPic) );
+
+	pPic->sMvList		= static_cast<SMVUnitXY *>(pMa->WelsMallocz( kuiCountMbNum*sizeof(SMVUnitXY), "pPic->sMvList" ));
+	WELS_VERIFY_RETURN_PROC_IF( NULL, NULL == pPic->sMvList, FreePicture(pMa, &pPic) );
+
+	pPic->pMbSkipSad	= (int32_t *)pMa->WelsMallocz( kuiCountMbNum*sizeof(int32_t), "pPic->pMbSkipSad" );
+	WELS_VERIFY_RETURN_PROC_IF( NULL, NULL == pPic->pMbSkipSad, FreePicture(pMa, &pPic) );
+
+	return pPic;
+}
+
+/*!
+ * \brief	free a picture: its plane buffer, its per-MB arrays and the picture structure itself
+ * \param	pMa		memory allocator the picture was allocated from
+ * \param	ppPic	address of the picture pointer to be destroyed; set to NULL on return.
+ *					Nothing happens when ppPic or *ppPic is NULL.
+ * \return	none
+ */
+void FreePicture( CMemoryAlign *pMa, SPicture **ppPic )
+{
+	if ( NULL == ppPic || NULL == *ppPic )
+		return;
+
+	SPicture *pPic = *ppPic;
+
+	// plane buffer shared by Y, Cb and Cr
+	if ( NULL != pPic->pBuffer )
+	{
+		pMa->WelsFree( pPic->pBuffer, "pPic->pBuffer" );
+	}
+	pPic->pBuffer	= NULL;
+
+	for ( int32_t iPlane = 0; iPlane < 3; ++ iPlane )
+	{
+		pPic->pData[iPlane]		= NULL;
+		pPic->iLineSize[iPlane]	= 0;
+	}
+
+	pPic->iWidthInPixel		= 0;
+	pPic->iHeightInPixel	= 0;
+	pPic->iFrameNum			= -1;
+
+	pPic->bIsLongRef			= false;
+	pPic->uiRecieveConfirmed	= 0;
+	pPic->iLongTermPicNum		= -1;
+	pPic->iMarkFrameNum			= -1;
+
+	// per-MB arrays, present only when the picture was allocated with MB info
+	if ( pPic->uiRefMbType )
+	{
+		pMa->WelsFree( pPic->uiRefMbType, "pPic->bgd_mb_type" );
+		pPic->uiRefMbType = NULL;
+	}
+	if ( pPic->pRefMbQp )
+	{
+		pMa->WelsFree( pPic->pRefMbQp, "pPic->bgd_mb_qp" );
+		pPic->pRefMbQp = NULL;
+	}
+	if ( pPic->sMvList )
+	{
+		pMa->WelsFree( pPic->sMvList, "pPic->sMvList" );
+		pPic->sMvList = NULL;
+	}
+	if ( pPic->pMbSkipSad )
+	{
+		pMa->WelsFree( pPic->pMbSkipSad, "pPic->pMbSkipSad" );
+		pPic->pMbSkipSad = NULL;
+	}
+
+	// finally the picture structure itself
+	pMa->WelsFree( pPic, "pPic" );
+	*ppPic = NULL;
+}
+/*!
+* \brief	exchange two picture pointers
+* \param	ppPic1		address of the pointer to picture 1
+* \param	ppPic2		address of the pointer to picture 2
+* \return	none
+*
+* The two pointers are asserted to refer to different pictures.
+*/
+void WelsExchangeSpatialPictures( SPicture **ppPic1, SPicture **ppPic2 )
+{
+	SPicture * const pFirst		= *ppPic1;
+	SPicture * const pSecond	= *ppPic2;
+
+	assert( pFirst != pSecond );
+
+	*ppPic1 = pSecond;
+	*ppPic2 = pFirst;
+}
+
+} // namespace WelsSVCEnc
+
--- /dev/null
+++ b/codec/encoder/core/src/property.cpp
@@ -1,0 +1,149 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	property.cpp
+ *
+ * \brief	CODE name, library module and corresponding version are included
+ *
+ * \date	03/10/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include <stdlib.h>
+#include "property.h"
+#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross_platforms
+namespace WelsSVCEnc {
+// Name pieces reported by GetCodeName() and GetLibName()
+#define WELS_CODE_NAME	"Wels"
+#define WELS_LIB_NAME	"Encoder"
+
+// Library version, once as a number and once as text (reported by GetVerNum())
+#define WELS_VERSION_INT	0x000001	// v 0.0.1
+#define WELS_VERSION_STR	"0.0.1"
+
+// Build stamp embedded in the identification string
+#define WELS_BUILD_NUM		"090420"	// yymmdd
+
+//////////////summary information//////////////
+
+// Identification string built by literal concatenation without separators:
+// <code name><lib name>"v"<version>"b"<build>, i.e. "WelsEncoderv0.0.1b090420" (24 characters)
+#define WELS_IDENT		WELS_CODE_NAME WELS_LIB_NAME "v" WELS_VERSION_STR "b" WELS_BUILD_NUM
+
+/*!
+ * \brief	copy the code name (WELS_CODE_NAME) into a caller supplied buffer
+ * \param	pBuf	buffer that receives the '\0' terminated code name
+ * \param	iSize	overall size of pBuf; must exceed the name length
+ * \return	number of characters copied, terminator excluded; 0 if pBuf is NULL or too small
+ */
+int32_t GetCodeName(str_t *pBuf, int32_t iSize)
+{
+	if ( NULL != pBuf )
+	{
+		const int32_t kiNameLen = STRNLEN( WELS_CODE_NAME, 4 );	// confirmed_safe_unsafe_usage
+
+		// room is needed for the name plus its terminating '\0'
+		if ( kiNameLen < iSize )
+		{
+			pBuf[kiNameLen]	= '\0';
+			STRNCPY( pBuf, iSize, WELS_CODE_NAME, kiNameLen );	// confirmed_safe_unsafe_usage
+			return kiNameLen;
+		}
+	}
+
+	return 0;
+}
+
+/*!
+ * \brief	copy the library/module name (WELS_LIB_NAME) into a caller supplied buffer
+ * \param	pBuf	buffer that receives the '\0' terminated module name
+ * \param	iSize	overall size of pBuf; must exceed the name length
+ * \return	number of characters copied, terminator excluded; 0 if pBuf is NULL or too small
+ */
+int32_t GetLibName(str_t *pBuf, int32_t iSize)
+{
+	if ( NULL != pBuf )
+	{
+		const int32_t kiNameLen = STRNLEN( WELS_LIB_NAME, 7 );	// confirmed_safe_unsafe_usage
+
+		// room is needed for the name plus its terminating '\0'
+		if ( kiNameLen < iSize )
+		{
+			pBuf[kiNameLen]	= '\0';
+			STRNCPY( pBuf, iSize, WELS_LIB_NAME, kiNameLen );	// confirmed_safe_unsafe_usage
+			return kiNameLen;
+		}
+	}
+
+	return 0;
+}
+
+/*!
+ * \brief	copy the version text (WELS_VERSION_STR) into a caller supplied buffer
+ * \param	pBuf	buffer that receives the '\0' terminated version text
+ * \param	iSize	overall size of pBuf; must exceed the text length
+ * \return	number of characters copied, terminator excluded; 0 if pBuf is NULL or too small
+ */
+int32_t GetVerNum(str_t *pBuf, int32_t iSize)
+{
+	if ( NULL != pBuf )
+	{
+		const int32_t kiVersionLen = STRNLEN( WELS_VERSION_STR, 5 );	// confirmed_safe_unsafe_usage
+
+		// room is needed for the text plus its terminating '\0'
+		if ( kiVersionLen < iSize )
+		{
+			pBuf[kiVersionLen]	= '\0';
+			STRNCPY( pBuf, iSize, WELS_VERSION_STR, kiVersionLen );	// confirmed_safe_unsafe_usage
+			return kiVersionLen;
+		}
+	}
+
+	return 0;
+}
+
+/*!
+ * \brief	copy the identification string (WELS_IDENT) into a caller supplied buffer
+ * \param	pBuf	buffer that receives the '\0' terminated identification string
+ * \param	iSize	overall size of pBuf; must exceed the string length
+ * \return	number of characters copied, terminator excluded; 0 if pBuf is NULL or too small
+ */
+int32_t GetIdentInfo(str_t *pBuf, int32_t iSize)
+{
+	if ( NULL != pBuf )
+	{
+		const int32_t kiIdentLen = STRNLEN( WELS_IDENT, 30 );	// confirmed_safe_unsafe_usage
+
+		// room is needed for the string plus its terminating '\0'
+		if ( kiIdentLen < iSize )
+		{
+			pBuf[kiIdentLen]	= '\0';
+			STRNCPY( pBuf, iSize, WELS_IDENT, kiIdentLen );	// confirmed_safe_unsafe_usage
+			return kiIdentLen;
+		}
+	}
+
+	return 0;
+}
+
+} // namespace WelsSVCEnc
--- /dev/null
+++ b/codec/encoder/core/src/ratectl.cpp
@@ -1,0 +1,1049 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ *  ratectl.cpp
+ *
+ *  Abstract
+ *      Rate Control
+ *
+ *  History
+ *      9/8/2009 Created
+ *    12/26/2011 Modified
+ *  
+ *
+ *
+ *************************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "rc.h"
+#include "encoder_context.h"
+#include "utils.h"
+#include "svc_enc_golomb.h"
+
+
+namespace WelsSVCEnc {
+
+// Debug switch, normally disabled: when defined, RcInitSequenceParameter() opens
+// "testRC.dat" and "vgop.dat" and keeps the handles in the two globals below.
+//#define _TEST_TEMP_RC_
+#ifdef _TEST_TEMP_RC_
+// When defined, the adaptive-quantisation adjustments guarded by
+// #ifndef _NOT_USE_AQ_FOR_TEST_ are compiled out.
+//#define _NOT_USE_AQ_FOR_TEST_
+FILE *fp_test_rc = NULL;
+FILE *fp_vgop = NULL;
+#endif
+// Value stored into SWelsSvcRc::iRcVaryPercentage by RcInitSequenceParameter()
+#define _BITS_RANGE 0
+
+/*!
+ * \brief	allocate the per-layer rate control arrays as one block and hand out the slices
+ * \param	pWelsSvcRc	layer rate control state; iSliceNum and iGomSize must already be set
+ * \param	pMA			memory manager used for the allocation
+ * \param	kiMaxTl		number of SRCTemporal entries to reserve
+ * \return	none; when the allocation fails the pointers in pWelsSvcRc are left untouched
+ */
+void RcInitLayerMemory(SWelsSvcRc *pWelsSvcRc, CMemoryAlign *pMA, const int32_t kiMaxTl)
+{
+	const int32_t kiSlices		= pWelsSvcRc->iSliceNum;
+	const int32_t kiGoms		= pWelsSvcRc->iGomSize;
+	const int32_t kiDoubleBytes	= kiGoms * sizeof(double);
+	const int32_t kiIntBytes	= kiGoms * sizeof(int32_t);
+	// layout: 1 double array, 3 int32_t arrays, slicing records, temporal records
+	const int32_t kiTotalBytes	= kiDoubleBytes + (kiIntBytes*3) + sizeof(SRCSlicing)*kiSlices + sizeof(SRCTemporal)*kiMaxTl;
+	uint8_t *pCursor			= (uint8_t *)pMA->WelsMalloc(kiTotalBytes, "rc_layer_memory");
+
+	if (NULL != pCursor)
+	{
+		// pGomComplexity is the block base; RcFreeLayerMemory() frees through it
+		pWelsSvcRc->pGomComplexity			= (double *)pCursor;
+		pCursor += kiDoubleBytes;
+
+		pWelsSvcRc->pGomForegroundBlockNum	= (int32_t *)pCursor;
+		pCursor += kiIntBytes;
+
+		pWelsSvcRc->pCurrentFrameGomSad		= (int32_t *)pCursor;
+		pCursor += kiIntBytes;
+
+		pWelsSvcRc->pGomCost				= (int32_t *)pCursor;
+		pCursor += kiIntBytes;
+
+		pWelsSvcRc->pSlicingOverRc			= (SRCSlicing *)pCursor;
+		pCursor += sizeof(SRCSlicing)*kiSlices;
+
+		pWelsSvcRc->pTemporalOverRc			= (SRCTemporal *)pCursor;
+	}
+}
+
+/*!
+ * \brief	release the block obtained by RcInitLayerMemory() and clear every pointer into it
+ * \param	pWelsSvcRc	layer rate control state; may be NULL
+ * \param	pMA			memory manager used for the release
+ * \return	none
+ */
+void RcFreeLayerMemory(SWelsSvcRc *pWelsSvcRc, CMemoryAlign *pMA)
+{
+	// pGomComplexity doubles as the base address of the whole block
+	if (NULL == pWelsSvcRc || NULL == pWelsSvcRc->pGomComplexity)
+		return;
+
+	pMA->WelsFree(pWelsSvcRc->pGomComplexity, "rc_layer_memory");
+
+	pWelsSvcRc->pGomComplexity			= NULL;
+	pWelsSvcRc->pGomForegroundBlockNum	= NULL;
+	pWelsSvcRc->pCurrentFrameGomSad		= NULL;
+	pWelsSvcRc->pGomCost				= NULL;
+	pWelsSvcRc->pSlicingOverRc			= NULL;
+	pWelsSvcRc->pTemporalOverRc			= NULL;
+}
+
+/*!
+ * \brief	convert a QP into a quantiser step: 2^((QP-4)/6)
+ * \param	dQP	quantisation parameter
+ * \return	quantiser step; 1.0 for QP 4, doubling every 6 QP
+ */
+static inline double RcConvertQp2QStep(double dQP)
+{
+	const double kdExponent = (dQP-4.0)/6.0;
+
+	return pow( 2.0, kdExponent );
+}
+/*!
+ * \brief	convert a quantiser step back into a QP: 6*log2(step)+4
+ * \param	dQpStep	quantiser step
+ * \return	matching (fractional) QP; 4.0 for a step of 1.0
+ */
+static inline double RcConvertQStep2Qp(double dQpStep)
+{
+	const double kdLogStep = log(dQpStep);
+
+	return (6 * kdLogStep / log(2.0) + 4.0);
+}
+
+/*!
+ * \brief	fill in the sequence level rate control state of every dependency layer
+ * \param	pEncCtx	encoder context; pWelsSvcRc[] is initialised for each dependency layer
+ * \return	none
+ */
+void RcInitSequenceParameter(sWelsEncCtx *pEncCtx)
+{
+#ifdef _TEST_TEMP_RC_
+	fp_test_rc = fopen("testRC.dat","w");
+	fp_vgop = fopen("vgop.dat","w");
+#endif
+	int32_t iRowsMode0 = 1;
+	int32_t iRowsMode1 = 1;
+
+	for( int32_t iDid=0; iDid<pEncCtx->pSvcParam->iNumDependencyLayer; ++iDid )
+	{
+		SSliceCtx *pLayerSliceCtx	= &pEncCtx->pSliceCtxList[iDid];
+		SWelsSvcRc *pRc				= &pEncCtx->pWelsSvcRc[iDid];
+		SDLayerParam *pLayer		= &pEncCtx->pSvcParam->sDependencyLayers[iDid];
+		const int32_t kiMbWidth		= (pLayer->iFrameWidth>>4);
+
+		pRc->iNumberMbFrame	= kiMbWidth*(pLayer->iFrameHeight>>4);
+		pRc->iSliceNum		= pLayerSliceCtx->iSliceNumInFrame;
+
+		pRc->iRcVaryPercentage	= _BITS_RANGE;	// % -- for temp
+		pRc->dRcVaryRatio		= (double)pRc->iRcVaryPercentage/MAX_BITS_VARY_PERCENTAGE;
+
+		pRc->dSkipBufferRatio	= SKIP_RATIO;
+
+		// QP window inside a frame: MODE1 value moved towards MODE0 by the vary ratio
+		pRc->iQpRangeUpperInFrame = QP_RANGE_UPPER_MODE1 - (int32_t)((QP_RANGE_UPPER_MODE1 - QP_RANGE_MODE0)*pRc->dRcVaryRatio + 0.5);
+		pRc->iQpRangeLowerInFrame = QP_RANGE_LOWER_MODE1 - (int32_t)((QP_RANGE_LOWER_MODE1 - QP_RANGE_MODE0)*pRc->dRcVaryRatio + 0.5);
+
+		// skip QP and GOM row counts are chosen by the picture width in macroblocks
+		if( kiMbWidth<=MB_WIDTH_THRESHOLD_90P )
+		{
+			pRc->iSkipQpValue	= SKIP_QP_90P;
+			iRowsMode0			= GOM_ROW_MODE0_90P;
+			iRowsMode1			= GOM_ROW_MODE1_90P;
+		}
+		else if( kiMbWidth<=MB_WIDTH_THRESHOLD_180P )
+		{
+			pRc->iSkipQpValue	= SKIP_QP_180P;
+			iRowsMode0			= GOM_ROW_MODE0_180P;
+			iRowsMode1			= GOM_ROW_MODE1_180P;
+		}
+		else if( kiMbWidth<=MB_WIDTH_THRESHOLD_360P )
+		{
+			pRc->iSkipQpValue	= SKIP_QP_360P;
+			iRowsMode0			= GOM_ROW_MODE0_360P;
+			iRowsMode1			= GOM_ROW_MODE1_360P;
+		}
+		else
+		{
+			pRc->iSkipQpValue	= SKIP_QP_720P;
+			iRowsMode0			= GOM_ROW_MODE0_720P;
+			iRowsMode1			= GOM_ROW_MODE1_720P;
+		}
+		iRowsMode0 = iRowsMode1 + (int32_t)((iRowsMode0 - iRowsMode1)*pRc->dRcVaryRatio + 0.5);
+
+		pRc->iNumberMbGom	= kiMbWidth*iRowsMode0;
+
+		pRc->iMinQp = GOM_MIN_QP_MODE;
+		pRc->iMaxQp = GOM_MAX_QP_MODE;
+
+		pRc->iFrameDeltaQpUpper = LAST_FRAME_QP_RANGE_UPPER_MODE1 - (int32_t)((LAST_FRAME_QP_RANGE_UPPER_MODE1 - LAST_FRAME_QP_RANGE_UPPER_MODE0)*pRc->dRcVaryRatio + 0.5);
+		pRc->iFrameDeltaQpLower = LAST_FRAME_QP_RANGE_LOWER_MODE1 - (int32_t)((LAST_FRAME_QP_RANGE_LOWER_MODE1 - LAST_FRAME_QP_RANGE_LOWER_MODE0)*pRc->dRcVaryRatio + 0.5);
+
+		pRc->iSkipFrameNum	= 0;
+		// number of GOMs per frame, rounded up
+		pRc->iGomSize		= (pRc->iNumberMbFrame+pRc->iNumberMbGom-1)/pRc->iNumberMbGom;
+
+		// needs iSliceNum and iGomSize, so it must run after the assignments above
+		RcInitLayerMemory( pRc, pEncCtx->pMemAlign, 1+pLayer->iHighestTemporalId );
+
+		// in the multi slice modes a GOM covers the whole frame
+		if( (SM_RASTER_SLICE == pLayer->sMso.uiSliceMode) ||
+			(SM_ROWMB_SLICE	 == pLayer->sMso.uiSliceMode) ||
+			(SM_DYN_SLICE	 == pLayer->sMso.uiSliceMode) )
+		{
+			pRc->iNumberMbGom = pRc->iNumberMbFrame;
+		}
+	}
+}
+
+
+/*!
+ * \brief	assign the bit weights of the temporal layers and map every frame slot of the
+ *			virtual GOP to its temporal layer
+ * \param	pEncCtx	encoder context; the current dependency layer is updated
+ * \return	none
+ */
+void RcInitTlWeight(sWelsEncCtx *pEncCtx)
+{
+	//Row: number of decomposition stages, Column: temporal layer id
+	static const double kdTlWeights[4][4] = { {1.0, 0, 0, 0}, {0.6, 0.4, 0, 0}, {0.4, 0.3, 0.15, 0}, {0.25, 0.15, 0.125, 0.0875}};
+
+	SWelsSvcRc *pRc			= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+	SRCTemporal *pTemporal	= pRc->pTemporalOverRc;
+	SDLayerParam *pLayer	= &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId];
+	const int32_t kiStages	= pLayer->iDecompositionStages;
+	const int32_t kiTopTid	= pLayer->iHighestTemporalId;
+	const int32_t kiGopLen	= (1<<kiStages);
+
+	for( int32_t iTid=0; iTid<=kiTopTid; ++iTid )
+		pTemporal[iTid].dTlayerWeight = kdTlWeights[kiStages][iTid];
+
+	//Calculate the temporal layer of every frame slot, one GOP after the other
+	for( int32_t iGopStart=0; iGopStart<VGOP_SIZE; iGopStart+=kiGopLen )
+	{
+		pRc->iTlOfFrames[iGopStart] = 0;
+		for( int32_t iLayer=1; iLayer<=kiStages; ++iLayer )
+		{
+			const int32_t kiFirstSlot	= 1<<(kiStages-iLayer);
+			const int32_t kiSlotStep	= (kiGopLen>>(iLayer-1));
+
+			for( int32_t iSlot=kiFirstSlot; iSlot<kiGopLen; iSlot+=kiSlotStep )
+				pRc->iTlOfFrames[iSlot+iGopStart] = iLayer;
+		}
+	}
+
+	pRc->iPreviousGopSize	= kiGopLen;
+	pRc->iGopNumberInVGop	= VGOP_SIZE/kiGopLen;
+}
+
+/*!
+ * \brief	take over the layer's bitrate and frame rate and rescale everything derived from them
+ * \param	pEncCtx	encoder context; the current dependency layer is updated
+ * \return	none
+ */
+void RcUpdateBitrateFps(sWelsEncCtx *pEncCtx)
+{
+	SWelsSvcRc *pRc			= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+	SRCTemporal *pTemporal	= pRc->pTemporalOverRc;
+	SDLayerParam *pLayer	= &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId];
+	const int32_t kiGopLen	= (1<<pLayer->iDecompositionStages);
+	const int32_t kiTopTid	= pLayer->iHighestTemporalId;
+	const double kdNewBitsPerFrame	= pLayer->iSpatialBitrate / pLayer->fInputFrameRate;
+	const int32_t kiBitsPerGop		= (int32_t)(kdNewBitsPerFrame*kiGopLen);
+
+	pRc->iBitRate	= pLayer->iSpatialBitrate;
+	pRc->fFrameRate	= pLayer->fInputFrameRate;
+
+	// lower bound shrinks with the vary ratio, upper bound always uses the full range
+	const double kdLowerRatio	= 1.0 - FRAME_iTargetBits_VARY_RANGE*(1.0 - pRc->dRcVaryRatio);
+	const double kdUpperRatio	= 1.0 + FRAME_iTargetBits_VARY_RANGE;
+
+	int32_t iTid = 0;
+	while( iTid<=kiTopTid )
+	{
+		const double kdLayerBits = kiBitsPerGop*pTemporal[iTid].dTlayerWeight;
+
+		pTemporal[iTid].iMinBitsTl = (int32_t)(kdLayerBits*kdLowerRatio);
+		pTemporal[iTid].iMaxBitsTl = (int32_t)(kdLayerBits*kdUpperRatio);
+		++ iTid;
+	}
+
+	//When bitrate is changed, the buffer sizes follow
+	pRc->iBufferSizeSkip	= (int32_t)(pRc->iBitRate * pRc->dSkipBufferRatio);
+	pRc->iBufferSizePadding	= (int32_t)(pRc->iBitRate * PADDING_BUFFER_RATIO);
+
+	//rescale the remaining bits to the new per-frame budget (skipped while no budget was set yet)
+	if( pRc->dBitsPerFrame > 0.1 )
+		pRc->iRemainingBits = (int32_t)(pRc->iRemainingBits*kdNewBitsPerFrame/pRc->dBitsPerFrame);
+	pRc->dBitsPerFrame = kdNewBitsPerFrame;
+}
+
+
+/*!
+ * \brief	restart the bookkeeping of a virtual GOP for the current dependency layer
+ * \param	pEncCtx	encoder context
+ * \return	none
+ */
+void RcInitVGop(sWelsEncCtx *pEncCtx)
+{
+	const int32_t kiLayerId	= pEncCtx->uiDependencyId;
+	SWelsSvcRc *pRc			= &pEncCtx->pWelsSvcRc[kiLayerId];
+	SRCTemporal *pTemporal	= pRc->pTemporalOverRc;
+	const int32_t kiTopTid	= pEncCtx->pSvcParam->sDependencyLayers[kiLayerId].iHighestTemporalId;
+
+	// full bit budget and one weight unit per GOP of the virtual GOP
+	pRc->iRemainingBits		= (int32_t)(VGOP_SIZE*pRc->dBitsPerFrame);
+	pRc->dRemainingWeights	= pRc->iGopNumberInVGop;
+
+	pRc->iFrameCodedInVGop	= 0;
+	pRc->iGopIndexInVGop	= 0;
+
+	int32_t iTid = 0;
+	while( iTid <= kiTopTid )
+	{
+		pTemporal[iTid].iGopBitsDq = 0;
+		++ iTid;
+	}
+	pRc->iSkipFrameInVGop	= 0;
+}
+
+/*!
+ * \brief	reset the rate control models and buffers of the current dependency layer and
+ *			re-derive weights, bit budgets and virtual GOP state
+ * \param	pEncCtx	encoder context
+ * \return	none
+ */
+void RcInitRefreshParameter(sWelsEncCtx *pEncCtx)
+{
+	const int32_t kiLayerId	= pEncCtx->uiDependencyId;
+	SWelsSvcRc *pRc			= &pEncCtx->pWelsSvcRc[kiLayerId];
+	SRCTemporal *pTemporal	= pRc->pTemporalOverRc;
+	SDLayerParam *pLayer	= &pEncCtx->pSvcParam->sDependencyLayers[kiLayerId];
+	const int32_t kiTopTid	= pLayer->iHighestTemporalId;
+
+	//I frame R-Q Model
+	pRc->iIntraComplexity	= 0;
+	pRc->iIntraMbCount		= 0;
+
+	//P frame R-Q Model, one per temporal layer
+	for( int32_t iTid=0; iTid<=kiTopTid; ++iTid )
+	{
+		SRCTemporal *pModel = &pTemporal[iTid];
+
+		pModel->iPFrameNum		= 0;
+		pModel->dLinearCmplx	= 0.0;
+		pModel->iFrameCmplxMean	= 0;
+	}
+
+	pRc->iBufferFullnessSkip	= 0;
+	pRc->iBufferFullnessPadding	= 0;
+
+	pRc->iGopIndexInVGop	= 0;
+	pRc->iRemainingBits		= 0;
+	pRc->dBitsPerFrame		= 0.0;
+
+	//Backup the initial bitrate and fps
+	pRc->iPreviousBitrate	= pLayer->iSpatialBitrate;
+	pRc->dPreviousFps		= pLayer->fInputFrameRate;
+
+	memset( pRc->pCurrentFrameGomSad, 0, pRc->iGomSize*sizeof(int32_t) );
+
+	// order matters: weights feed the bit limits, dBitsPerFrame feeds the VGOP budget
+	RcInitTlWeight(pEncCtx);
+	RcUpdateBitrateFps(pEncCtx);
+	RcInitVGop(pEncCtx);
+}
+
+/*!
+ * \brief	detect a change of bitrate or input frame rate since the last check
+ * \param	pEncCtx	encoder context; the current dependency layer is inspected
+ * \return	true when either value changed (the remembered values are refreshed), false otherwise
+ */
+bool_t RcJudgeBitrateFpsUpdate(sWelsEncCtx *pEncCtx)
+{
+	const int32_t kiLayerId	= pEncCtx->uiDependencyId;
+	SWelsSvcRc *pRc			= &pEncCtx->pWelsSvcRc[kiLayerId];
+	SDLayerParam *pLayer	= &pEncCtx->pSvcParam->sDependencyLayers[kiLayerId];
+
+	// frame rates count as equal while they differ by no more than EPSN
+	if( (pRc->iPreviousBitrate == pLayer->iSpatialBitrate) &&
+		!((pRc->dPreviousFps-pLayer->fInputFrameRate)>EPSN) &&
+		!((pRc->dPreviousFps-pLayer->fInputFrameRate)<-EPSN) )
+	{
+		return false;
+	}
+
+	pRc->iPreviousBitrate	= pLayer->iSpatialBitrate;
+	pRc->dPreviousFps		= pLayer->fInputFrameRate;
+	return true;
+}
+
+#if GOM_TRACE_FLAG
+/*!
+ * \brief	log the bitrate reached over the current virtual GOP and the share of each temporal layer
+ * \param	pEncCtx	encoder context; the current dependency layer is reported
+ * \return	none; nothing is logged while no frame has been coded in the virtual GOP
+ */
+void RcTraceVGopBitrate(sWelsEncCtx *pEncCtx)
+{
+	const int32_t kiDid				= pEncCtx->uiDependencyId;
+	SWelsSvcRc *pWelsSvcRc			= &pEncCtx->pWelsSvcRc[kiDid];
+
+	if( pWelsSvcRc->iFrameCodedInVGop )
+	{
+		const int32_t kiHighestTid	= pEncCtx->pSvcParam->sDependencyLayers[kiDid].iHighestTemporalId;
+		SRCTemporal *pTOverRc			= pWelsSvcRc->pTemporalOverRc;
+		int32_t iVGopBitrate = 0;	// defined value even when no frame is counted below
+		int32_t	iTotalBits = pWelsSvcRc->iPaddingBitrateStat;
+		int32_t iTid = 0;
+		while (iTid <= kiHighestTid)
+		{
+			iTotalBits += pTOverRc[iTid].iGopBitsDq;
+			++ iTid;
+		}
+		int32_t iFrameInVGop = pWelsSvcRc->iFrameCodedInVGop+pWelsSvcRc->iSkipFrameInVGop;
+		if(0 != iFrameInVGop)
+		{
+			iVGopBitrate = (int32_t)( iTotalBits/iFrameInVGop *pWelsSvcRc->fFrameRate );
+#ifdef _TEST_TEMP_RC_
+			// same switch that declares and opens fp_vgop; the handle is NULL when fopen failed
+			if( NULL != fp_vgop )
+				fprintf(fp_vgop,"%d\n",(int32_t)((double)iTotalBits/iFrameInVGop));
+#endif
+		}
+		WelsLog( pEncCtx, WELS_LOG_INFO,"[Rc] VGOPbitrate%d: %d \n", kiDid, iVGopBitrate);
+		if ( iTotalBits > 0 )
+		{
+			iTid = 0;
+			while (iTid <= kiHighestTid)
+			{
+				// convert before dividing, otherwise the share is truncated to 0 or 1
+				WelsLog( pEncCtx, WELS_LOG_INFO,"T%d=%8.3f \n", iTid, (double)pTOverRc[iTid].iGopBitsDq/iTotalBits );
+				++ iTid;
+			}
+		}
+	}
+}
+#endif
+
+/*!
+ * \brief	restart the virtual GOP when the GOP size changed, the virtual GOP is complete or an
+ *			I slice is coded, then advance the GOP index
+ * \param	pEncCtx	encoder context; the current dependency layer is updated
+ * \return	none
+ */
+void RcUpdateTemporalZero(sWelsEncCtx *pEncCtx)
+{
+	const int32_t kiLayerId	= pEncCtx->uiDependencyId;
+	SWelsSvcRc *pRc			= &pEncCtx->pWelsSvcRc[kiLayerId];
+	SDLayerParam *pLayer	= &pEncCtx->pSvcParam->sDependencyLayers[kiLayerId];
+	const int32_t kiGopLen	= (1<<pLayer->iDecompositionStages);
+
+	const bool_t kbGopSizeChanged	= ( pRc->iPreviousGopSize != kiGopLen );
+	const bool_t kbVGopDone		= ( pRc->iGopIndexInVGop == pRc->iGopNumberInVGop || pEncCtx->eSliceType == I_SLICE );
+
+	if( kbGopSizeChanged || kbVGopDone )
+	{
+#if GOM_TRACE_FLAG
+		RcTraceVGopBitrate(pEncCtx);
+#endif
+		// a new GOP size additionally invalidates the layer weights and frame mapping
+		if( kbGopSizeChanged )
+			RcInitTlWeight(pEncCtx);
+		RcInitVGop(pEncCtx);
+	}
+
+	++ pRc->iGopIndexInVGop;
+}
+
+
+/*!
+ * \brief	pick the first IDR QP from the bits per pixel of the current dependency layer
+ * \param	pEncCtx	encoder context; iInitialQp, dQStep, iLastCalculatedQScale and iGlobalQp are set
+ * \return	none
+ */
+void RcInitIdrQp(sWelsEncCtx *pEncCtx)
+{
+	//64k@6fps for 90p:     bpp 0.74    QP:24
+	//192k@12fps for 180p:  bpp 0.28    QP:26
+	//512k@24fps for 360p:  bpp 0.09    QP:30
+	//1500k@30fps for 720p: bpp 0.05    QP:32
+	static const double kdBppBounds[4][3] = {{0.5, 0.75, 1.0}, {0.2, 0.3, 0.4}, {0.05, 0.09, 0.13}, {0.03, 0.06, 0.1}};
+	static const int32_t kiStartQp[4][4] = {{28, 26, 24, 22}, {30, 28, 26, 24}, {32, 30, 28, 26}, {34, 32, 30, 28}};
+
+	SWelsSvcRc *pRc			= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+	SDLayerParam *pLayer	= &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId];
+
+	// bits per pixel, with a fixed fallback when frame rate or size is unusable
+	double dBitsPerPixel = 0.1;
+	if (pLayer->fOutputFrameRate > EPSN && pLayer->iFrameWidth && pLayer->iFrameHeight)
+		dBitsPerPixel = (double)(pLayer->iSpatialBitrate) / (double)(pLayer->fOutputFrameRate * pLayer->iFrameWidth * pLayer->iFrameHeight);
+
+	// table row by picture area; thresholds are twice the nominal area (Area*2)
+	const int32_t kiArea = pLayer->iFrameWidth*pLayer->iFrameHeight;
+	int32_t iRow = 3;
+	if ( kiArea <= 28800 ) // 90p video:160*90
+		iRow = 0;
+	else if ( kiArea <= 115200 ) // 180p video:320*180
+		iRow = 1;
+	else if ( kiArea <= 460800 ) // 360p video:640*360
+		iRow = 2;
+
+	// first bound that is not below the bpp; column 3 when all three are exceeded
+	int32_t iCol = 0;
+	while( iCol<3 && !( dBitsPerPixel<=kdBppBounds[iRow][iCol] ) )
+		++ iCol;
+
+	pRc->iInitialQp	= kiStartQp[iRow][iCol];
+	pRc->iInitialQp	= (int32_t)WELS_CLIP3( pRc->iInitialQp, MIN_IDR_QP, MAX_IDR_QP );
+	pEncCtx->iGlobalQp			= pRc->iInitialQp;
+	pRc->dQStep					= RcConvertQp2QStep(pEncCtx->iGlobalQp);
+	pRc->iLastCalculatedQScale	= pEncCtx->iGlobalQp;
+}
+
+/*!
+ * \brief	derive the IDR QP from the stored intra complexity and the current target bits
+ * \param	pEncCtx	encoder context; iInitialQp, dQStep, iLastCalculatedQScale and iGlobalQp are set
+ * \return	none
+ * \note	divides by iIntraMbCount and iTargetBits without checking them for zero
+ */
+void RcCalculateIdrQp(sWelsEncCtx *pEncCtx)
+{
+	SWelsSvcRc *pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+
+	//scale the previous idr complexity to the whole frame when it covered another MB count
+	if( pRc->iIntraMbCount != pRc->iNumberMbFrame )
+	{
+		pRc->iIntraComplexity = (int32_t)((double)pRc->iIntraComplexity*pRc->iNumberMbFrame/pRc->iIntraMbCount + 0.5);
+	}
+
+	const double kdStep = (double)pRc->iIntraComplexity/pRc->iTargetBits;
+
+	pRc->iInitialQp	= (int32_t)RcConvertQStep2Qp( kdStep );
+	pRc->iInitialQp	= (int32_t)WELS_CLIP3( pRc->iInitialQp, MIN_IDR_QP, MAX_IDR_QP );
+	pEncCtx->iGlobalQp			= pRc->iInitialQp;
+	pRc->dQStep					= RcConvertQp2QStep(pEncCtx->iGlobalQp);
+	pRc->iLastCalculatedQScale	= pEncCtx->iGlobalQp;
+}
+
+
+/*!
+ * \brief	compute the picture level QP of the current frame from the temporal layer's linear model
+ * \param	pEncCtx	encoder context; dQStep, iLastCalculatedQScale and iGlobalQp are updated
+ * \return	none
+ */
+void RcCalculatePictureQp(sWelsEncCtx *pEncCtx)
+{
+	SWelsSvcRc *pRc			= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+	const int32_t kiTid		= pEncCtx->uiTemporalId;
+	SRCTemporal *pModel		= &pRc->pTemporalOverRc[kiTid];
+	// the initial QP is used until the layer's model has seen a P frame
+	int32_t iQp				= pRc->iInitialQp;
+
+	if( 0 != pModel->iPFrameNum )
+	{
+		double dRatio = (double)pEncCtx->pVaa->sComplexityAnalysisParam.iFrameComplexity/pModel->iFrameCmplxMean;
+		dRatio = WELS_CLIP3(dRatio, 1.0-FRAME_CMPLX_RATIO_RANGE, 1.0+FRAME_CMPLX_RATIO_RANGE);
+
+		pRc->dQStep	= pModel->dLinearCmplx*dRatio / pRc->iTargetBits;
+		iQp			= (int32_t)( RcConvertQStep2Qp( pRc->dQStep )+0.5 );
+
+		//limit QP relative to the previously coded frame of the virtual GOP
+		int32_t iPrevSlot = pRc->iFrameCodedInVGop - 1;
+		if( iPrevSlot < 0 )
+			iPrevSlot += VGOP_SIZE;
+
+		const int32_t kiPrevTid	= pRc->iTlOfFrames[iPrevSlot];
+		int32_t iTemporalDelta	= kiTid - kiPrevTid;
+		// entering or leaving temporal layer 0 widens the allowed shift by 3
+		if( 0 == kiPrevTid && kiTid > 0 )
+			iTemporalDelta += 3;
+		else if( 0 == kiTid && kiPrevTid > 0 )
+			iTemporalDelta -= 3;
+
+		const int32_t kiQpFloor	= pRc->iLastCalculatedQScale - pRc->iFrameDeltaQpLower +iTemporalDelta;
+		const int32_t kiQpCeil	= pRc->iLastCalculatedQScale + pRc->iFrameDeltaQpUpper + iTemporalDelta;
+		iQp = WELS_CLIP3(iQp, kiQpFloor, kiQpCeil);
+	}
+
+	iQp = WELS_CLIP3(iQp,  GOM_MIN_QP_MODE, GOM_MAX_QP_MODE);
+
+	pRc->dQStep					= RcConvertQp2QStep(iQp);
+	pRc->iLastCalculatedQScale	= iQp;
+#ifndef _NOT_USE_AQ_FOR_TEST_
+	if( pEncCtx->pSvcParam->bEnableAdaptiveQuant )
+	{
+		// the adaptive quantisation offset only affects the global QP, not the stored scale
+		iQp = (int32_t)WELS_CLIP3(iQp - pEncCtx->pVaa->sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp, pRc->iMinQp, pRc->iMaxQp);
+	}
+#endif
+	pEncCtx->iGlobalQp = iQp;
+}
+
+/*!
+ * \brief	reset the per-slice rate control records and split the frame target bits by MB count
+ * \param	pEncCtx	encoder context; the current dependency layer's slices are initialised
+ * \return	none
+ */
+void RcInitSliceInformation(sWelsEncCtx *pEncCtx)
+{
+	SSliceCtx *pSliceCtx	= pEncCtx->pCurDqLayer->pSliceEncCtx;
+	SWelsSvcRc *pRc			= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+	SRCSlicing *pSlices		= &pRc->pSlicingOverRc[0];
+	const int32_t kiSlices	= pSliceCtx->iSliceNumInFrame;
+	const double kdMbBits	= (double)pRc->iTargetBits / pRc->iNumberMbFrame;
+
+	for( int32_t iSlice=0; iSlice<kiSlices; ++iSlice )
+	{
+		SRCSlicing *pSlice = &pSlices[iSlice];
+
+		// MB range covered by the slice: [first, first + count - 1]
+		pSlice->iEndMbSlice			= pSliceCtx->pFirstMbInSlice[iSlice];
+		pSlice->iStartMbSlice		= pSlice->iEndMbSlice;
+		pSlice->iEndMbSlice			+= (pSliceCtx->pCountMbNumInSlice[iSlice]-1);
+
+		pSlice->iTotalQpSlice		= 0;
+		pSlice->iTotalMbSlice		= 0;
+		pSlice->iTargetBitsSlice	= (int32_t)(kdMbBits * pSliceCtx->pCountMbNumInSlice[iSlice]);
+		pSlice->iFrameBitsSlice		= 0;
+		pSlice->iGomBitsSlice		= 0;
+	}
+}
+
+/*!
+ * \brief	decide the target bits of the current frame and consume its temporal layer weight
+ * \param	pEncCtx	encoder context; iTargetBits and dRemainingWeights are updated
+ * \return	none
+ */
+void RcDecideTargetBits(sWelsEncCtx *pEncCtx)
+{
+	SWelsSvcRc *pRc		= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+	SRCTemporal *pModel	= &pRc->pTemporalOverRc[pEncCtx->uiTemporalId];
+
+	if( pEncCtx->eSliceType != I_SLICE )
+	{
+		// share of the remaining VGOP bits by layer weight, kept inside the layer's limits
+		pRc->iTargetBits = (int32_t)( pRc->iRemainingBits*pModel->dTlayerWeight/pRc->dRemainingWeights );
+		pRc->iTargetBits = WELS_CLIP3( pRc->iTargetBits, pModel->iMinBitsTl,	pModel->iMaxBitsTl);
+	}
+	else
+	{
+		// intra frames get a fixed multiple of the per-frame budget
+		pRc->iTargetBits = (int32_t)( pRc->dBitsPerFrame * IDR_BITRATE_RATIO );
+	}
+
+	pRc->dRemainingWeights -= pModel->dTlayerWeight;
+}
+
+
+/*!
+ * \brief	reset the GOM level state before a frame is coded
+ * \param	pEncCtx	encoder context; slices start from the global QP, GOM statistics are cleared
+ * \return	none
+ */
+void RcInitGoomParameters(sWelsEncCtx *pEncCtx)
+{
+	SWelsSvcRc *pRc			= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+	SRCSlicing *pSlices		= &pRc->pSlicingOverRc[0];
+	const int32_t kiSlices	= pRc->iSliceNum;
+	const int32_t kiFrameQp	= pEncCtx->iGlobalQp;
+
+	pRc->iAverageFrameQp = 0;
+
+	for( int32_t iSlice=0; iSlice<kiSlices; ++iSlice )
+	{
+		pSlices[iSlice].iComplexityIndexSlice	= 0;
+		pSlices[iSlice].iCalculatedQpSlice		= kiFrameQp;
+	}
+
+	memset( pRc->pGomComplexity, 0, pRc->iGomSize*sizeof(double) );
+	memset( pRc->pGomCost, 0, pRc->iGomSize*sizeof(int32_t) );
+}
+
+/*!
+ * \brief	set the luma and chroma QP of a macroblock from its slice QP
+ * \param	pEncCtx		encoder context
+ * \param	pCurMb		macroblock that receives uiLumaQp and uiChromaQp
+ * \param	kiSliceId	index of the slice the macroblock belongs to
+ * \return	none
+ */
+void RcCalculateMbQp(sWelsEncCtx *pEncCtx,SMB* pCurMb, const int32_t kiSliceId)
+{
+	SWelsSvcRc *pRc		= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+	SRCSlicing *pSlice	= &pRc->pSlicingOverRc[kiSliceId];
+	int32_t iMbQp		= pSlice->iCalculatedQpSlice;
+
+#ifndef _NOT_USE_AQ_FOR_TEST_
+	if ( pEncCtx->pSvcParam->bEnableAdaptiveQuant )
+	{
+		// add the per-MB adaptive quantisation offset, bounded by iMinQp and 51
+		iMbQp = (int8_t)WELS_CLIP3(iMbQp +
+		pEncCtx->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp[pCurMb->iMbXY], pRc->iMinQp, 51);
+	}
+#endif
+	pCurMb->uiChromaQp	= g_kuiChromaQpTable[iMbQp];
+	pCurMb->uiLumaQp	= iMbQp;
+}
+
+/*!
+ * \brief	check whether the rate control data of the dependency layer below can be reused
+ * \param	pEncCtx	encoder context
+ * \return	rate control state of the lower layer when the current temporal id exists there and
+ *			both layers have the same pixels-per-GOM-MB quotient; NULL otherwise
+ */
+SWelsSvcRc* RcJudgeBaseUsability(sWelsEncCtx *pEncCtx)
+{
+	// the lowest layer has nothing below it
+	if( pEncCtx->uiDependencyId<=0 )
+		return NULL;
+
+	SDLayerParam *pBaseLayer	= &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId-1];
+	SWelsSvcRc *pBaseRc			= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId-1];
+
+	if( !( pEncCtx->uiTemporalId<=pBaseLayer->iDecompositionStages ) )
+		return NULL;
+
+	SDLayerParam *pCurLayer	= &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId];
+	SWelsSvcRc *pCurRc		= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+
+	if( (pCurLayer->iFrameWidth*pCurLayer->iFrameHeight/pCurRc->iNumberMbGom) !=
+		(pBaseLayer->iFrameWidth*pBaseLayer->iFrameHeight/pBaseRc->iNumberMbGom) )
+		return NULL;
+
+	return pBaseRc;
+}
+
+/*!
+ * \brief	decide the target bits of the next GOM of a slice
+ * \param	pEncCtx		encoder context
+ * \param	kiSliceId	index of the slice whose iGomTargetBits is updated
+ * \return	none
+ *
+ * The bits still left for the slice are shared among the GOMs after iComplexityIndexSlice,
+ * in proportion to their SAD (taken from the lower dependency layer when
+ * RcJudgeBaseUsability() allows it). Without SAD information they are shared equally.
+ */
+void RcGomTargetBits(sWelsEncCtx *pEncCtx, const int32_t kiSliceId)
+{
+	SWelsSvcRc *pWelsSvcRc			= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+	SWelsSvcRc *pWelsSvcRc_Base	= NULL;
+	SRCSlicing *pSOverRc				= &pWelsSvcRc->pSlicingOverRc[kiSliceId];
+
+	double dAllocateBits = 0;
+	int32_t iSumSad = 0;
+	const int32_t kiComplexityIndex	= pSOverRc->iComplexityIndexSlice;
+	const int32_t kiLastGomIndex		= pSOverRc->iEndMbSlice/pWelsSvcRc->iNumberMbGom;
+	const int32_t kiLeftBits			= pSOverRc->iTargetBitsSlice-pSOverRc->iFrameBitsSlice;
+	int32_t i;
+
+	if(kiLeftBits <= 0)
+	{
+		// budget already used up
+		pSOverRc->iGomTargetBits = 0;
+		return;
+	}
+	else if( kiComplexityIndex >= kiLastGomIndex)
+	{
+		// last GOM of the slice takes whatever is left
+		dAllocateBits = kiLeftBits;
+	}
+	else
+	{
+		pWelsSvcRc_Base = RcJudgeBaseUsability(pEncCtx);
+		pWelsSvcRc_Base = (pWelsSvcRc_Base) ? pWelsSvcRc_Base : pWelsSvcRc;
+		// Sum only the GOMs that still have to be coded (kiComplexityIndex+1 .. last).
+		// Starting at kiComplexityIndex would add one GOM to the denominator that neither
+		// the numerator below nor the equal-split divisor accounts for.
+		for( i=kiComplexityIndex+1; i<=kiLastGomIndex; i++ )
+		{
+			iSumSad += pWelsSvcRc_Base->pCurrentFrameGomSad[i];
+		}
+		if(0 == iSumSad)
+			dAllocateBits = (double)kiLeftBits/(kiLastGomIndex-kiComplexityIndex);
+		else
+			dAllocateBits = (double)kiLeftBits*pWelsSvcRc_Base->pCurrentFrameGomSad[kiComplexityIndex+1]/iSumSad;
+
+	}
+	pSOverRc->iGomTargetBits = int32_t(dAllocateBits + 0.5);
+}
+
+
+
+/*!
+ * \brief	adjust the slice QP for the next GOM from the bits spent so far
+ * \param	pEncCtx		encoder context
+ * \param	pCurMb		current macroblock (not used by this function)
+ * \param	iSliceId	index of the slice whose iCalculatedQpSlice is updated
+ * \return	none
+ *
+ * The QP moves by at most 2 per call and is then kept inside the in-frame window around
+ * the global QP and inside [iMinQp, iMaxQp]. iGomBitsSlice is reset for the next GOM.
+ */
+void RcCalculateGomQp(sWelsEncCtx *pEncCtx, SMB* pCurMb, int32_t iSliceId)
+{
+	SWelsSvcRc *pWelsSvcRc			= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+	SRCSlicing *pSOverRc				= &pWelsSvcRc->pSlicingOverRc[iSliceId];
+	double dBitsRatio = 1.0;
+
+	int32_t iLeftBits = pSOverRc->iTargetBitsSlice - pSOverRc->iFrameBitsSlice;
+	int32_t iTargetLeftBits = iLeftBits + pSOverRc->iGomBitsSlice - pSOverRc->iGomTargetBits;
+
+	if(iLeftBits <= 0)
+	{
+		// slice budget exhausted: raise the QP by the maximum step
+		pSOverRc->iCalculatedQpSlice += 2;
+	}
+	else
+	{
+		//globe decision: actual left bits against the left bits expected by the plan
+		dBitsRatio = iLeftBits / (iTargetLeftBits + 0.1);
+		if(dBitsRatio < 0.8409)		//2^(-1.5/6)
+			pSOverRc->iCalculatedQpSlice += 2;
+		else if(dBitsRatio < 0.9439)	//2^(-0.5/6)
+			pSOverRc->iCalculatedQpSlice += 1;
+		// the larger threshold has to be tested first, otherwise the -2 step is unreachable
+		else if(dBitsRatio > 1.19)		//2^(1.5/6)
+			pSOverRc->iCalculatedQpSlice -= 2;
+		else if(dBitsRatio > 1.06)		//2^(0.5/6)
+			pSOverRc->iCalculatedQpSlice -= 1;
+	}
+
+	pSOverRc->iCalculatedQpSlice = WELS_CLIP3( pSOverRc->iCalculatedQpSlice,
+		pEncCtx->iGlobalQp-pWelsSvcRc->iQpRangeLowerInFrame, pEncCtx->iGlobalQp+pWelsSvcRc->iQpRangeUpperInFrame );
+	pSOverRc->iCalculatedQpSlice = WELS_CLIP3(pSOverRc->iCalculatedQpSlice, pWelsSvcRc->iMinQp, pWelsSvcRc->iMaxQp);
+
+	pSOverRc->iGomBitsSlice = 0;
+
+}
+
+/*!
+ * \brief	update the skip buffer with the frame just coded and decide whether to skip a frame
+ * \param	pEncCtx	encoder context; iSkipFrameFlag is set to 1 when a skip is requested
+ * \return	none
+ */
+void RcVBufferCalculationSkip(sWelsEncCtx *pEncCtx)
+{
+	SWelsSvcRc *pRc				= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+	SRCTemporal *pTemporal		= pRc->pTemporalOverRc;
+	const int32_t kiFrameBudget	= (int32_t)(pRc->dBitsPerFrame + 0.5);
+	const int32_t kiVGopBudget	= (int32_t)(pRc->dBitsPerFrame * VGOP_SIZE);
+
+	//condition 1: whole buffer fullness
+	pRc->iBufferFullnessSkip += (pRc->iFrameDqBits - kiFrameBudget);
+
+	//condition 2: VGOP bits constraint -- minimum bits of the frame slots after the next one
+	//compared with the bits that remain
+	int32_t iPredictedOverrun = 0;
+	for( int32_t iSlot = pRc->iFrameCodedInVGop+1; iSlot<VGOP_SIZE; ++iSlot )
+		iPredictedOverrun += pTemporal[pRc->iTlOfFrames[iSlot]].iMinBitsTl;
+	iPredictedOverrun -= pRc->iRemainingBits;
+	const double kdOverrunPercent = iPredictedOverrun*100.0/kiVGopBudget - (double)VGOP_BITS_PERCENTAGE_DIFF;
+
+	if( (pRc->iBufferFullnessSkip > pRc->iBufferSizeSkip &&	pRc->iAverageFrameQp > pRc->iSkipQpValue)
+		|| (kdOverrunPercent > pRc->iRcVaryPercentage))
+	{
+		pEncCtx->iSkipFrameFlag = 1;
+		// the skipped frame drains one frame budget from the buffer
+		pRc->iBufferFullnessSkip -= kiFrameBudget;
+#ifdef FRAME_INFO_OUTPUT
+		fprintf(stderr, "skip one frame\n");
+#endif
+	}
+
+	if( pRc->iBufferFullnessSkip<0 )
+		pRc->iBufferFullnessSkip = 0;
+
+	if( 1 == pEncCtx->iSkipFrameFlag )
+	{
+		pRc->iRemainingBits += kiFrameBudget;
+		++ pRc->iSkipFrameNum;
+		++ pRc->iSkipFrameInVGop;
+	}
+}
+
+/*!
+ * \brief	update the padding buffer with the frame just coded and derive the padding size
+ * \param	pEncCtx	encoder context; iBufferFullnessPadding and iPaddingSize are updated
+ * \return	none
+ */
+void RcVBufferCalculationPadding(sWelsEncCtx *pEncCtx)
+{
+	SWelsSvcRc *pRc				= &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+	const int32_t kiFrameBudget	= (int32_t)(pRc->dBitsPerFrame + 0.5);
+	const int32_t kiUnderflowLimit	= (int32_t)(PADDING_THRESHOLD*(-pRc->iBufferSizePadding));
+
+	pRc->iBufferFullnessPadding += (pRc->iFrameDqBits - kiFrameBudget);
+
+	if( !( pRc->iBufferFullnessPadding < kiUnderflowLimit ) )
+	{
+		pRc->iPaddingSize = 0;
+		return;
+	}
+
+	// the whole deficit is padded: the fullness is in bits, the padding size is 1/8 of it
+	pRc->iPaddingSize = -pRc->iBufferFullnessPadding;
+	pRc->iPaddingSize >>= 3;	// /8
+	pRc->iBufferFullnessPadding = 0;
+}
+
+
+/*!
+ * \brief	log the rate control figures of the frame just coded
+ * \param	pEncCtx	encoder context; the current dependency layer is reported
+ * \return	none
+ *
+ * One WELS_LOG_INFO line with: dependency id, average frame QP, frame index, temporal id,
+ * bits used by the frame, target bits and the remaining bits.
+ */
+void RcTraceFrameBits(sWelsEncCtx *pEncCtx)
+{
+	SWelsSvcRc *pWelsSvcRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+
+	WelsLog( pEncCtx, WELS_LOG_INFO,"[Rc] encoding_qp%d, qp = %3d, index = %8d, iTid = %1d, used = %8d, target = %8d, remaingbits = %8d\n",
+		pEncCtx->uiDependencyId, pWelsSvcRc->iAverageFrameQp, pEncCtx->uiFrameIdxRc, pEncCtx->uiTemporalId, pWelsSvcRc->iFrameDqBits,
+		pWelsSvcRc->iTargetBits,pWelsSvcRc->iRemainingBits);
+}
+
+/*!
+ * \brief	record the average QP and the coded bits of the frame just finished
+ * \param	pEncCtx		encoder context; the current dependency layer is updated
+ * \param	iCodedBits	number of bits the frame produced
+ * \return	none
+ */
+void RcUpdatePictureQpBits(sWelsEncCtx *pEncCtx, int32_t iCodedBits)
+{
+	SWelsSvcRc *pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+
+	// the global QP stands in unless a P slice frame delivers per-MB statistics
+	pRc->iAverageFrameQp = pEncCtx->iGlobalQp;
+
+	if( pEncCtx->eSliceType == P_SLICE )
+	{
+		SRCSlicing *pSlices		= &pRc->pSlicingOverRc[0];
+		SSliceCtx *pSliceCtx	= pEncCtx->pCurDqLayer->pSliceEncCtx;
+		int32_t iQpSum = 0;
+		int32_t iMbSum = 0;
+
+		for( int32_t iSlice=0; iSlice<pSliceCtx->iSliceNumInFrame; ++iSlice )
+		{
+			iQpSum += pSlices[iSlice].iTotalQpSlice;
+			iMbSum += pSlices[iSlice].iTotalMbSlice;
+		}
+		if( iMbSum > 0 )
+			pRc->iAverageFrameQp = (int32_t)(1.0*iQpSum/iMbSum+0.5);
+	}
+
+	pRc->iFrameDqBits = iCodedBits;
+	pRc->pTemporalOverRc[pEncCtx->uiTemporalId].iGopBitsDq += pRc->iFrameDqBits;
+}
+
+/*
+ *	fold the complexity of the frame just coded (dQStep * iFrameDqBits) into the
+ *	running intra complexity of the current dependency layer
+ */
+void RcUpdateIntraComplexity(sWelsEncCtx *pEncCtx)
+{
+	SWelsSvcRc *pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+
+	// smoothing weight 1/(1+iIdrNum), never below 0.25
+	double dWeight = 1.0/(1+pRc->iIdrNum);
+	dWeight = (dWeight < 0.25) ? 0.25 : dWeight;
+
+	double dCmplx = pRc->dQStep*pRc->iFrameDqBits;
+	dCmplx = (1.0-dWeight)*pRc->iIntraComplexity + dWeight*dCmplx;
+	pRc->iIntraComplexity = (int32_t)(dCmplx + 0.5);
+	pRc->iIntraMbCount = pRc->iNumberMbFrame;
+
+	// counter saturates at 255
+	if( ++pRc->iIdrNum > 255 )
+		pRc->iIdrNum = 255;
+}
+
+/*
+ *	update the linear complexity model and the mean frame complexity of the
+ *	current temporal layer after a frame has been coded
+ */
+void RcUpdateFrameComplexity(sWelsEncCtx *pEncCtx)
+{
+	SWelsSvcRc *pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+	const int32_t kiTid = pEncCtx->uiTemporalId;
+	SRCTemporal *pTlRc = &pRc->pTemporalOverRc[kiTid];
+
+	// linear model: seeded while iPFrameNum is still 0, decayed afterwards
+	if( pTlRc->iPFrameNum != 0 )
+	{
+		pTlRc->dLinearCmplx = LINEAR_MODEL_DECAY_FACTOR*pTlRc->dLinearCmplx
+			+ (1.0-LINEAR_MODEL_DECAY_FACTOR)*(pRc->iFrameDqBits * pRc->dQStep);
+	}
+	else
+	{
+		pTlRc->dLinearCmplx = pRc->iFrameDqBits * pRc->dQStep;
+	}
+
+	// smoothing weight 1/(1+iPFrameNum), never below SMOOTH_FACTOR_MIN_VALUE
+	double dWeight = 1.0/(1+pTlRc->iPFrameNum);
+	if( dWeight < SMOOTH_FACTOR_MIN_VALUE )
+	{
+		dWeight = SMOOTH_FACTOR_MIN_VALUE;
+	}
+	pTlRc->iFrameCmplxMean = (int32_t)((1.0-dWeight)*pTlRc->iFrameCmplxMean + dWeight*pEncCtx->pVaa->sComplexityAnalysisParam.iFrameComplexity + 0.5);
+
+	// counter saturates at 255
+	if( ++pTlRc->iPFrameNum > 255 )
+		pTlRc->iPFrameNum = 255;
+}
+
+/*
+ *	derive the QP of the current temporal layer from a base QP
+ *
+ *	returns iQp unchanged when iDecompStages is 0; otherwise the QP is lowered
+ *	depending on uiTemporalId and clipped to [1, 51]
+ */
+int32_t RcCalculateCascadingQp(struct TagWelsEncCtx *pEncCtx, int32_t iQp)
+{
+	if( !pEncCtx->pSvcParam->iDecompStages )
+		return iQp;
+
+	int32_t iCascadedQp = 0;
+	if( pEncCtx->uiTemporalId!=0 )
+		iCascadedQp = iQp - (pEncCtx->pSvcParam->iDecompStages - pEncCtx->uiTemporalId);
+	else
+		iCascadedQp = iQp - 3 - (pEncCtx->pSvcParam->iDecompStages-1);
+
+	iCascadedQp = WELS_CLIP3( iCascadedQp, 1, 51 );
+	return iCascadedQp;
+}
+
+/*
+ *	picture level initialisation of the GOM rate control:
+ *	refresh parameters, pick up bitrate/fps changes, decide target bits and the
+ *	frame QP, then reset the slice and GOM state
+ */
+void  WelsRcPictureInitGom(void *pCtx)
+{
+	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
+	SWelsSvcRc *pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+
+	// iIdrNum == 0 marks a freshly initialised encoder
+	if( (pEncCtx->eSliceType == I_SLICE) && (0 == pRc->iIdrNum) )
+		RcInitRefreshParameter(pEncCtx);
+
+	if( RcJudgeBitrateFpsUpdate(pEncCtx) )
+		RcUpdateBitrateFps(pEncCtx);
+
+	if( 0 == pEncCtx->uiTemporalId )
+		RcUpdateTemporalZero(pEncCtx);
+
+	RcDecideTargetBits(pEncCtx);
+
+	// decide the global QP of the picture
+	if( pEncCtx->eSliceType != I_SLICE )
+		RcCalculatePictureQp(pEncCtx);
+	else if( 0 == pRc->iIdrNum )
+		RcInitIdrQp(pEncCtx);
+	else
+		RcCalculateIdrQp(pEncCtx);
+
+	RcInitSliceInformation(pEncCtx);
+	RcInitGoomParameters(pEncCtx);
+}
+
+
+
+/*
+ *	picture level update of the GOM rate control after a layer has been coded
+ *
+ *	layer_size is multiplied by 8 (<<3) to obtain the coded bit count
+ */
+void  WelsRcPictureInfoUpdateGom(void *pCtx, int32_t layer_size)
+{
+	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
+	SWelsSvcRc *pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+	const int32_t kiLayerBits = (layer_size<<3);
+
+	RcUpdatePictureQpBits(pEncCtx, kiLayerBits);
+
+	if ( pEncCtx->eSliceType != P_SLICE )
+		RcUpdateIntraComplexity(pEncCtx);
+	else
+		RcUpdateFrameComplexity(pEncCtx);
+
+	pRc->iRemainingBits -= pRc->iFrameDqBits;
+
+#if GOM_TRACE_FLAG
+	RcTraceFrameBits(pEncCtx);
+#endif
+
+#if SKIP_FRAME_FLAG
+	// the skip buffer is only evaluated on the highest dependency layer
+	if ( pEncCtx->uiDependencyId == pEncCtx->pSvcParam->iNumDependencyLayer - 1 )
+		RcVBufferCalculationSkip(pEncCtx);
+#endif
+
+	if ( pEncCtx->pSvcParam->iPaddingFlag )
+	{
+		RcVBufferCalculationPadding(pEncCtx);
+	}
+	pRc->iFrameCodedInVGop++;
+
+#ifdef _TEST_TEMP_Rc_
+	fprintf(fp_test_rc, "%d\n", pRc->iFrameDqBits);
+	if(pEncCtx->iSkipFrameFlag)
+	{
+		fprintf(fp_test_rc, "0\n");
+	}
+	fflush(fp_test_rc);
+#endif
+}
+
+/*
+ *	macroblock level initialisation of the GOM rate control
+ *
+ *	records the current bitstream position of the slice; for non-I slices it also
+ *	refreshes the GOM QP / target bits at the start of each GOM and sets the MB QP
+ */
+void WelsRcMbInitGom(void *pCtx, SMB* pCurMb, SSlice *pSlice)
+{
+	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
+	SWelsSvcRc *pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+	const int32_t kiSliceIdx = pSlice->uiSliceIdx;
+	SRCSlicing *pSliceRc = &pRc->pSlicingOverRc[kiSliceIdx];
+
+	// remembered so that the MB update hook can measure the bits spent on this MB
+	pSliceRc->iBsPosSlice = BsGetBitsPos(pSlice->pSliceBsa);
+
+	if( pEncCtx->eSliceType!=I_SLICE )
+	{
+		// first MB of a GOM: calculate GOM QP and target bits
+		if( (pCurMb->iMbXY%pRc->iNumberMbGom) == 0 )
+		{
+			// the first GOM of a slice keeps the QP it started with
+			if( pSliceRc->iStartMbSlice != pCurMb->iMbXY )
+			{
+				pSliceRc->iComplexityIndexSlice++;
+				RcCalculateGomQp(pEncCtx, pCurMb, kiSliceIdx);
+			}
+			RcGomTargetBits(pEncCtx, kiSliceIdx);
+		}
+
+		RcCalculateMbQp(pEncCtx,pCurMb,kiSliceIdx);
+	}
+}
+
+/*
+ *	macroblock level update of the GOM rate control
+ *
+ *	adds the bits written since WelsRcMbInitGom() to the slice/GOM totals, adds
+ *	iCostLuma to the cost of the current GOM, and counts the MB in the QP
+ *	statistics when it produced at least one bit
+ */
+void WelsRcMbInfoUpdateGom(void *pCtx, SMB* pCurMb, int32_t iCostLuma, SSlice *pSlice)
+{
+	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
+	SWelsSvcRc *pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+	int32_t iSliceIdx = pSlice->uiSliceIdx;
+	SRCSlicing *pSliceRc = &pRc->pSlicingOverRc[iSliceIdx];
+	const int32_t kiGomIdx = pSliceRc->iComplexityIndexSlice;
+	const int32_t kiMbBits = BsGetBitsPos(pSlice->pSliceBsa) - pSliceRc->iBsPosSlice;
+
+	pSliceRc->iFrameBitsSlice += kiMbBits;
+	pSliceRc->iGomBitsSlice += kiMbBits;
+	pRc->pGomCost[kiGomIdx] += iCostLuma;
+
+	if( kiMbBits <= 0 )
+		return;
+
+	pSliceRc->iTotalQpSlice += pCurMb->uiLumaQp;
+	pSliceRc->iTotalMbSlice++;
+}
+
+/*
+ *	picture level initialisation when rate control is disabled:
+ *	the frame QP comes from the configured layer QP (cascaded per temporal layer),
+ *	optionally shifted by the adaptive quantization delta on P slices
+ */
+void  WelsRcPictureInitDisable(void *pCtx)
+{
+	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
+	SWelsSvcRc *pRc = &pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId];
+	SDLayerParam *pLayerCfg = &pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId];
+	const int32_t kiLayerQp = pLayerCfg->iDLayerQp;
+
+	pEncCtx->iGlobalQp = RcCalculateCascadingQp( pEncCtx, kiLayerQp );
+
+	if ( (pEncCtx->eSliceType == P_SLICE) && pEncCtx->pSvcParam->bEnableAdaptiveQuant )
+	{
+		pEncCtx->iGlobalQp = (int32_t)WELS_CLIP3(pEncCtx->iGlobalQp -
+			pEncCtx->pVaa->sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp, GOM_MIN_QP_MODE, GOM_MAX_QP_MODE);
+	}
+
+	pRc->iAverageFrameQp = pEncCtx->iGlobalQp;
+}
+
+/*
+ *	picture level update hook installed by WelsRcInitModule() for WELS_RC_DISABLE;
+ *	intentionally a no-op, both arguments are ignored
+ */
+void  WelsRcPictureInfoUpdateDisable(void *pCtx, int32_t layer_size)
+{
+}
+
+/*
+ *	macroblock level initialisation when rate control is disabled:
+ *	every MB uses the frame QP, plus the per-MB adaptive quantization delta on
+ *	P slices when adaptive quantization is enabled (pSlice is not used)
+ */
+void  WelsRcMbInitDisable(void *pCtx, SMB* pCurMb, SSlice *pSlice)
+{
+	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
+	int32_t iMbQp = pEncCtx->iGlobalQp;
+
+	if ( (pEncCtx->eSliceType == P_SLICE) && pEncCtx->pSvcParam->bEnableAdaptiveQuant )
+	{
+		iMbQp = (int8_t)WELS_CLIP3(iMbQp +
+			pEncCtx->pVaa->sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp[pCurMb->iMbXY], GOM_MIN_QP_MODE, 51);
+	}
+
+	pCurMb->uiLumaQp = iMbQp;
+	pCurMb->uiChromaQp = g_kuiChromaQpTable[iMbQp];
+}
+
+/*
+ *	macroblock level update hook installed by WelsRcInitModule() for WELS_RC_DISABLE;
+ *	intentionally a no-op, all arguments are ignored
+ */
+void  WelsRcMbInfoUpdateDisable(void *pCtx, SMB* pCurMb, int32_t iCostLuma, SSlice *pSlice)
+{
+}
+
+
+/*
+ *	install the rate control callbacks selected by iModule and initialise the
+ *	sequence level rate control parameters
+ *
+ *	WELS_RC_DISABLE selects the "Disable" callbacks; WELS_RC_GOM and every other
+ *	value select the GOM callbacks
+ */
+void  WelsRcInitModule(void *pCtx,  int32_t iModule)
+{
+	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
+	SWelsRcFunc  * pRcHooks = &pEncCtx->pFuncList->pfRc;
+
+	if( WELS_RC_DISABLE == iModule )
+	{
+		pRcHooks->pfWelsRcPictureInit = WelsRcPictureInitDisable;
+		pRcHooks->pfWelsRcPictureInfoUpdate = WelsRcPictureInfoUpdateDisable;
+		pRcHooks->pfWelsRcMbInit = WelsRcMbInitDisable;
+		pRcHooks->pfWelsRcMbInfoUpdate = WelsRcMbInfoUpdateDisable;
+	}
+	else
+	{
+		pRcHooks->pfWelsRcPictureInit = WelsRcPictureInitGom;
+		pRcHooks->pfWelsRcPictureInfoUpdate = WelsRcPictureInfoUpdateGom;
+		pRcHooks->pfWelsRcMbInit = WelsRcMbInitGom;
+		pRcHooks->pfWelsRcMbInfoUpdate = WelsRcMbInfoUpdateGom;
+	}
+
+	RcInitSequenceParameter(pEncCtx);
+}
+
+/*
+ *	release the rate control memory of every dependency layer
+ *	(and close the debug trace files when _TEST_TEMP_Rc_ is defined)
+ */
+void  WelsRcFreeMemory(void *pCtx)
+{
+	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
+#ifdef _TEST_TEMP_Rc_
+	if(fp_test_rc)
+	{
+		fclose(fp_test_rc);
+	}
+	fp_test_rc = NULL;
+	if(fp_vgop)
+	{
+		fclose(fp_vgop);
+	}
+	fp_vgop = NULL;
+#endif
+	int32_t iLayerIdx = 0;
+	while( iLayerIdx < pEncCtx->pSvcParam->iNumDependencyLayer )
+	{
+		RcFreeLayerMemory(&pEncCtx->pWelsSvcRc[iLayerIdx], pEncCtx->pMemAlign);
+		++iLayerIdx;
+	}
+}
+
+}//end of namespace
--- /dev/null
+++ b/codec/encoder/core/src/ref_list_mgr_svc.cpp
@@ -1,0 +1,631 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+// ref_list_mgr_svc.cpp
+#include "ref_list_mgr_svc.h"
+#include "encoder_context.h"
+#include "svc_enc_frame.h"
+#include "picture.h"
+#include "expand_pic.h"
+#include <assert.h>
+#include "utils.h"
+#include "extern.h"
+namespace WelsSVCEnc {
+/*
+ *	reset the reference bookkeeping of a picture so that it counts as
+ *	unreferenced; a NULL picture is ignored
+ */
+void SetUnref( SPicture *pRef )
+{
+	if ( NULL == pRef )
+		return;
+
+	pRef->bUsedAsRef	= false;
+	pRef->bIsLongRef	= false;
+	pRef->uiRecieveConfirmed = RECIEVE_FAILED;
+
+	pRef->iFramePoc		= -1;
+	pRef->iFrameNum		= -1;
+	pRef->iMarkFrameNum = -1;
+
+	// each id receives the value already stored in the field before it, exactly as a chained assignment would
+	pRef->iLongTermPicNum = -1;
+	pRef->uiSpatialId	= pRef->iLongTermPicNum;
+	pRef->uiTemporalId	= pRef->uiSpatialId;
+}
+
+/*
+ *	restore the LTR marking, recovery and feedback state to its defaults
+ */
+void ResetLtrState(SLTRState* pLtr )
+{
+	// recovery
+	pLtr->bReceivedT0LostFlag = FALSE;
+	pLtr->iLastRecoverFrameNum = 0;
+	pLtr->iLastCorFrameNumDec = -1;
+	pLtr->iCurFrameNumInDec = -1;
+
+	// marking
+	pLtr->iLTRMarkMode = LTR_DIRECT_MARK;
+	pLtr->iLTRMarkSuccessNum = 0;	// number of successful marks
+	pLtr->bLTRMarkingFlag = FALSE;	// whether the current frame is marked as LTR
+	pLtr->bLTRMarkEnable = FALSE;	// set once LTR is confirmed and the interval is no smaller than the marking period
+	pLtr->iCurLtrIdx = 0;
+	pLtr->iLastLtrIdx = 0;
+	pLtr->uiLtrMarkInterval = 0;
+
+	// marking feedback
+	pLtr->uiLtrMarkState = NO_LTR_MARKING_FEEDBACK ;
+	pLtr->iLtrMarkFbFrameNum = -1;
+}
+
+/*
+ *	empty the short and long reference lists of the current dependency layer,
+ *	mark every reference buffer as unreferenced and make pRef[0] the next buffer
+ */
+void WelsResetRefList( sWelsEncCtx *pCtx )
+{
+	SRefList *pList = pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+	int32_t iIdx = 0;
+
+	while ( iIdx<MAX_SHORT_REF_COUNT+1 )
+	{
+		pList->pShortRefList[iIdx] = NULL;
+		++iIdx;
+	}
+
+	iIdx = 0;
+	while ( iIdx<MAX_LONG_REF_COUNT+1 )
+	{
+		pList->pLongRefList[iIdx] = NULL;
+		++iIdx;
+	}
+
+	iIdx = 0;
+	while ( iIdx<pCtx->pSvcParam->iNumRefFrame+1 )
+	{
+		SetUnref( pList->pRef[iIdx] );
+		++iIdx;
+	}
+
+	pList->uiShortRefCount = 0;
+	pList->uiLongRefCount = 0;
+	pList->pNextBuffer = pList->pRef[0];
+}
+
+/*
+ *	remove entry iIdx from the long-term list of the current dependency layer:
+ *	later entries move one slot towards the front and the count is decremented
+ */
+static inline void DeleteLTRFromLongList(sWelsEncCtx*pCtx, int32_t iIdx)
+{
+	SRefList* pList = pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+	int32_t iPos = iIdx;
+
+	while ( iPos<pList->uiLongRefCount-1 )
+	{
+		pList->pLongRefList[iPos] = pList->pLongRefList[iPos+1];
+		++iPos;
+	}
+
+	pList->pLongRefList[iPos] = NULL;	// slot left behind by the shift
+	pList->uiLongRefCount--;
+}
+/*
+ *	remove entry iIdx from the short-term list of the current dependency layer:
+ *	later entries move one slot towards the front and the count is decremented
+ */
+static inline void DeleteSTRFromShortList(sWelsEncCtx*pCtx, int32_t iIdx)
+{
+	SRefList* pList = pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+	int32_t iPos = iIdx;
+
+	while ( iPos<pList->uiShortRefCount-1 )
+	{
+		pList->pShortRefList[iPos] = pList->pShortRefList[iPos+1];
+		++iPos;
+	}
+
+	pList->pShortRefList[iPos] = NULL;	// slot left behind by the shift
+	pList->uiShortRefCount--;
+}
+/*
+ *	compare two frame numbers, taking one wrap-around of iMaxFrameNumPlus1 into account
+ *
+ *	returns	-2					when either frame number is greater than iMaxFrameNumPlus1
+ *			FRAME_NUM_EQUAL		when A == B, or they differ by exactly iMaxFrameNumPlus1
+ *			FRAME_NUM_BIGGER	when |A + max - B| < |A - B|
+ *			FRAME_NUM_SMALLER	when |B + max - A| < |A - B|
+ *			otherwise the result of the plain comparison of A and B
+ *
+ *	all distances are computed in 64 bit so that adding the wrap offset cannot overflow
+ */
+static inline int32_t CompareFrameNum(int32_t iFrameNumA,int32_t iFrameNumB,int32_t iMaxFrameNumPlus1)
+{
+	if ( iFrameNumA>iMaxFrameNumPlus1 || iFrameNumB>iMaxFrameNumPlus1 ){	return -2;	}
+
+	const int64_t kiA = iFrameNumA;
+	const int64_t kiB = iFrameNumB;
+	const int64_t kiWrap = iMaxFrameNumPlus1;
+
+	// direct distance between the two numbers
+	const int64_t kiDiffMin = ( kiA > kiB )?( kiA - kiB ):( kiB - kiA );
+	if (kiDiffMin == 0){	return FRAME_NUM_EQUAL;	}
+
+	// distance with A moved one wrap period ahead
+	const int64_t kiDiffA = ( (kiA + kiWrap) > kiB )?( (kiA + kiWrap) - kiB ):( kiB - (kiA + kiWrap) );
+	if (kiDiffA == 0){ return FRAME_NUM_EQUAL; }
+	if (kiDiffMin > kiDiffA)	{	return FRAME_NUM_BIGGER;	}
+
+	// distance with B moved one wrap period ahead
+	const int64_t kiDiffB = ( (kiB + kiWrap) > kiA )?( (kiB + kiWrap) - kiA ):( kiA - (kiB + kiWrap) );
+	if (kiDiffB == 0){ return FRAME_NUM_EQUAL; }
+	if (kiDiffMin > kiDiffB)	{	return FRAME_NUM_SMALLER;	}
+
+	return (iFrameNumA > iFrameNumB)?(FRAME_NUM_BIGGER):(FRAME_NUM_SMALLER);
+}
+/*
+*	drop long-term references that an LTR recovery request has invalidated
+*
+*	An entry is removed when its iFrameNum compares as FRAME_NUM_BIGGER against
+*	pLtr->iLastCorFrameNumDec and as equal/smaller against pLtr->iCurFrameNumInDec;
+*	in LTR_DELAY_MARK mode the same test is also applied to its iMarkFrameNum.
+*	Every removal re-enables LTR marking, and an IDR is requested as soon as the
+*	long-term list becomes empty.
+*/
+static inline void DeleteInvalidLTR(sWelsEncCtx *pCtx)
+{
+	SRefList * pRefList		= pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+	SPicture** pLongRefList = pRefList->pLongRefList;
+	SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
+	int32_t iMaxFrameNumPlus1 = (1<<pCtx->pSps->uiLog2MaxFrameNum);
+	int32_t i;
+
+	// NOTE(review): DeleteLTRFromLongList() shifts the following entries down, so the
+	// entry that moves into slot i is not examined again in this pass - confirm intended
+	for( i = 0;i<LONG_TERM_REF_NUM;i++){
+		if ( pLongRefList[i]!=NULL  )	{
+			// the '&' tests presumably rely on FRAME_NUM_* being distinct bit flags - TODO confirm
+			if ( CompareFrameNum( pLongRefList[i]->iFrameNum , pLtr->iLastCorFrameNumDec,iMaxFrameNumPlus1 ) == FRAME_NUM_BIGGER
+				&&( CompareFrameNum( pLongRefList[i]->iFrameNum , pLtr->iCurFrameNumInDec,iMaxFrameNumPlus1)& (FRAME_NUM_EQUAL|FRAME_NUM_SMALLER) )){			
+				WelsLog(pCtx,WELS_LOG_WARNING,"LTR ,invalid LTR delete ,long_term_idx = %d , iFrameNum =%d \n",pLongRefList[i]->iLongTermPicNum,pLongRefList[i]->iFrameNum);
+				SetUnref(pLongRefList[i]);
+				DeleteLTRFromLongList(pCtx,i);
+				pLtr->bLTRMarkEnable = TRUE;
+				// no long-term reference left: force the current frame to be coded as IDR
+				if (pRefList->uiLongRefCount == 0) 	{	pCtx->bEncCurFrmAsIdrFlag = true; }
+			}else if ( CompareFrameNum(pLongRefList[i]->iMarkFrameNum , pLtr->iLastCorFrameNumDec ,iMaxFrameNumPlus1) == FRAME_NUM_BIGGER
+				&& (CompareFrameNum(pLongRefList[i]->iMarkFrameNum, pLtr->iCurFrameNumInDec ,iMaxFrameNumPlus1)&(FRAME_NUM_EQUAL|FRAME_NUM_SMALLER))
+				&& pLtr->iLTRMarkMode == LTR_DELAY_MARK )	{	
+				// delay-mark mode only: same test applied to the frame that carried the marking
+				WelsLog(pCtx,WELS_LOG_WARNING,"LTR ,iMarkFrameNum invalid LTR delete ,long_term_idx = %d , iFrameNum =%d \n",pLongRefList[i]->iLongTermPicNum,pLongRefList[i]->iFrameNum);
+				SetUnref(pLongRefList[i]);
+				DeleteLTRFromLongList(pCtx,i);
+				pLtr->bLTRMarkEnable = TRUE;
+				if (pRefList->uiLongRefCount == 0) 	{	pCtx->bEncCurFrmAsIdrFlag = true; }
+			}
+		}
+	}
+
+}
+/*
+*	handle LTR Mark feedback message
+*
+*	LTR_MARKING_SUCCESS: the long-term entry whose iFrameNum equals
+*	pLtr->iLtrMarkFbFrameNum (and is not yet confirmed) is flagged RECIEVE_SUCCESS,
+*	every long-term entry whose iLongTermPicNum differs from pLtr->iCurLtrIdx is
+*	released, and the marking state advances to the next LTR index.
+*	LTR_MARKING_FAILED: the matching long-term entry is released and marking is
+*	re-enabled; an IDR is requested when no mark has succeeded so far.
+*	In both cases the feedback state returns to NO_LTR_MARKING_FEEDBACK.
+*/
+static inline void HandleLTRMarkFeedback(sWelsEncCtx *pCtx)
+{
+	SRefList * pRefList		= pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+	SPicture** pLongRefList		= pRefList->pLongRefList;
+	SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
+	int32_t i,j;
+
+	if (pLtr->uiLtrMarkState == LTR_MARKING_SUCCESS){
+		WelsLog(pCtx,WELS_LOG_WARNING,"pLtr->uiLtrMarkState = %d, pLtr.iCurLtrIdx = %d , pLtr->iLtrMarkFbFrameNum = %d ,pCtx->iFrameNum = %d ",pLtr->uiLtrMarkState,pLtr->iCurLtrIdx, pLtr->iLtrMarkFbFrameNum,pCtx->iFrameNum);
+		for ( i = 0; i<pRefList->uiLongRefCount; i++)	{
+			if (pLongRefList[i]->iFrameNum == pLtr->iLtrMarkFbFrameNum && pLongRefList[i]->uiRecieveConfirmed != RECIEVE_SUCCESS){
+
+				pLongRefList[i]->uiRecieveConfirmed = RECIEVE_SUCCESS;
+				pCtx->pVaa->uiValidLongTermPicIdx = pLongRefList[i]->iLongTermPicNum;
+
+				pLtr->iCurFrameNumInDec  =
+				pLtr->iLastRecoverFrameNum =
+				pLtr->iLastCorFrameNumDec = pLtr->iLtrMarkFbFrameNum;
+
+				// walk backwards: a removal shifts the later entries down, so a forward
+				// walk would skip the entry that moves into the slot just vacated
+				for ( j = (int32_t)pRefList->uiLongRefCount-1; j>=0; j--)	{
+					if(pLongRefList[j]->iLongTermPicNum != pLtr->iCurLtrIdx)	{
+						SetUnref(pLongRefList[j]);
+						DeleteLTRFromLongList(pCtx,j);
+					}
+				}
+
+				pLtr->iLTRMarkSuccessNum++;
+				// advance to the next LTR index; written without '++' so the variable is modified only once
+				pLtr->iCurLtrIdx = (pLtr->iCurLtrIdx+1)%LONG_TERM_REF_NUM;
+				pLtr->iLTRMarkMode = ( pLtr->iLTRMarkSuccessNum >= (LONG_TERM_REF_NUM) )?( LTR_DELAY_MARK):(LTR_DIRECT_MARK);
+				WelsLog(pCtx,WELS_LOG_WARNING,"LTR mark mode =%d",pLtr->iLTRMarkMode);
+				pLtr->bLTRMarkEnable = TRUE;
+				break;	// the long list was modified above, i is no longer meaningful
+			}
+		}
+		pLtr->uiLtrMarkState = NO_LTR_MARKING_FEEDBACK;
+	}else if (pLtr->uiLtrMarkState == LTR_MARKING_FAILED){
+		for ( i =0; i < pRefList->uiLongRefCount; i++)	{
+			if (pLongRefList[i]->iFrameNum == pLtr->iLtrMarkFbFrameNum)	{
+				SetUnref(pLongRefList[i]);
+				DeleteLTRFromLongList(pCtx,i);
+				break;
+			}
+		}
+		pLtr->uiLtrMarkState = NO_LTR_MARKING_FEEDBACK;
+		pLtr->bLTRMarkEnable = TRUE;
+
+		if (pLtr->iLTRMarkSuccessNum == 0){pCtx->bEncCurFrmAsIdrFlag = true;} // no LTR , means IDR receive failed, force next frame IDR
+	}
+}
+/*
+ *	LTR mark process
+ *
+ *	Flags a short-term picture as long-term (I slice, or when pLtr->bLTRMarkingFlag
+ *	is set) and, depending on the marking mode, moves a flagged picture from the
+ *	short-term list to the front of the long-term list.
+ *	The index i is shared by all steps below: it is chosen in the first branch and
+ *	possibly overridden by the LTR_DIRECT_MARK search.
+ */
+static inline void LTRMarkProcess(sWelsEncCtx *pCtx)
+{
+	SRefList * pRefList		= pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+	SPicture** pLongRefList = pRefList->pLongRefList;
+	SPicture** pShortRefList = pRefList->pShortRefList;
+	SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
+	// half the GOP size, but at least 1
+	int32_t iGoPFrameNumInterval = ( (pCtx->pSvcParam->uiGopSize>>1)>1 )?( pCtx->pSvcParam->uiGopSize>>1 ):( 1 );
+	int32_t iMaxFrameNumPlus1 = (1<<pCtx->pSps->uiLog2MaxFrameNum);
+	int32_t i = 0;
+	int32_t j = 0;
+	bool_t bMoveLtrFromShortToLong = false;
+
+	if (pCtx->eSliceType == I_SLICE )	{
+		// the newest short-term entry (index 0) is treated as received
+		i=0;
+		pShortRefList[i]->uiRecieveConfirmed = RECIEVE_SUCCESS;
+	}else if ( pLtr->bLTRMarkingFlag){
+		pCtx->pVaa->uiMarkLongTermPicIdx = pLtr->iCurLtrIdx;
+
+		if (pLtr->iLTRMarkMode == LTR_DELAY_MARK)	{
+			// look for the short-term picture coded iGoPFrameNumInterval frame numbers before the current one
+			// NOTE(review): when nothing matches, i ends up equal to uiShortRefCount and is still used below - confirm a match is guaranteed
+			for (i = 0; i<pRefList->uiShortRefCount; i++)	{			
+				if( CompareFrameNum(pCtx->iFrameNum,pShortRefList[i]->iFrameNum+iGoPFrameNumInterval,iMaxFrameNumPlus1)==FRAME_NUM_EQUAL)
+				{	break;	}	
+			}	
+		}
+	}
+
+	if (pCtx->eSliceType == I_SLICE || pLtr->bLTRMarkingFlag){	
+		pShortRefList[i]->bIsLongRef = true;
+		pShortRefList[i]->iLongTermPicNum = pLtr->iCurLtrIdx;	
+		pShortRefList[i]->iMarkFrameNum = pCtx->iFrameNum;
+	}
+	
+	// delay one gop to move LTR from short list to long list
+	if (pLtr->iLTRMarkMode == LTR_DIRECT_MARK && pCtx->eSliceType != I_SLICE && !pLtr->bLTRMarkingFlag ){
+		// first short-term entry already flagged as long-term
+		for (j = 0; j<pRefList->uiShortRefCount;j++){
+			if ( pRefList->pShortRefList[j]->bIsLongRef)	{
+				i = j;
+				bMoveLtrFromShortToLong = true;
+				break;
+			}
+		}
+	}
+
+	if ( (pLtr->iLTRMarkMode == LTR_DELAY_MARK && pLtr->bLTRMarkingFlag) || ( (pLtr->iLTRMarkMode == LTR_DIRECT_MARK) && (bMoveLtrFromShortToLong) ) )
+	{
+		// make room at the front of the long-term list, then move entry i over
+		if (pRefList->uiLongRefCount>0)
+		{
+			memmove(&pRefList->pLongRefList[1],&pRefList->pLongRefList[0],pRefList->uiLongRefCount*sizeof(SPicture*));	// confirmed_safe_unsafe_usage
+		}
+		pLongRefList[0]	 = pShortRefList[i];
+		pRefList->uiLongRefCount++;
+		DeleteSTRFromShortList(pCtx,i);
+	}
+}
+/*
+ *	choose the buffer that receives the next reconstructed picture:
+ *	the first buffer not used as reference, otherwise the last short-term
+ *	reference (which is released); the choice is stored in pNextBuffer and pDecPic
+ */
+static inline void PrefetchNextBuffer(sWelsEncCtx *pCtx)
+{
+	SRefList * pList = pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+	const int32_t kiRefFrames = pCtx->pSvcParam->iNumRefFrame;
+	int32_t iIdx = 0;
+
+	pList->pNextBuffer = NULL;
+	while ( iIdx<kiRefFrames+1 )
+	{
+		if ( !pList->pRef[iIdx]->bUsedAsRef )
+		{
+			pList->pNextBuffer = pList->pRef[iIdx];
+			break;
+		}
+		++iIdx;
+	}
+
+	// every buffer is in use: recycle the last short-term reference
+	if ( NULL == pList->pNextBuffer && pList->uiShortRefCount>0 )
+	{
+		pList->pNextBuffer = pList->pShortRefList[pList->uiShortRefCount-1];
+		SetUnref(pList->pNextBuffer);
+	}
+
+	pCtx->pDecPic = pList->pNextBuffer;
+}
+
+/*
+ *	update reference picture list
+ *
+ *	Inserts the picture just reconstructed (pCtx->pDecPic) at the front of the
+ *	short-term list of the current dependency layer, runs the LTR bookkeeping when
+ *	long-term references are enabled, prunes the short-term list on temporal
+ *	layer 0 P frames and finally selects the buffer for the next picture.
+ *
+ *	returns FALSE when the layer, the reference list, the checked reference
+ *	buffers or pDecPic are missing; TRUE otherwise
+ */
+BOOL_T WelsUpdateRefList( sWelsEncCtx *pCtx )
+{
+	SRefList * pRefList		= pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+	SLTRState* pLtr			= &pCtx->pLtr[pCtx->uiDependencyId];
+	SDLayerParam *pParamD	= &pCtx->pSvcParam->sDependencyLayers[pCtx->uiDependencyId];
+	const int32_t kiNumRef	= pCtx->pSvcParam->iNumRefFrame;
+
+	int32_t iRefIdx			= 0;
+	const uint8_t kuiTid		= pCtx->uiTemporalId;
+	const uint8_t kuiDid		= pCtx->uiDependencyId;
+	const EWelsSliceType keSliceType		= pCtx->eSliceType;
+	// only used for the sanity check on pRef[] below
+	const int32_t kiSwapIdx = (pCtx->eSliceType == P_SLICE )?( kiNumRef-LONG_TERM_REF_NUM ):( (pCtx->pSvcParam->bEnableLongTermReference)?(kiNumRef - pLtr->iCurLtrIdx):(1) );
+	// Need update pRef list in case store base layer or target dependency layer construction
+	if ( NULL == pCtx->pCurDqLayer )
+		return FALSE;
+
+	if ( NULL == pRefList || NULL == pRefList->pRef[0] || NULL == pRefList->pRef[kiSwapIdx] )
+		return FALSE;
+
+	// pDecPic is dereferenced unconditionally below, so a missing picture is a failure
+	if ( NULL == pCtx->pDecPic )
+		return FALSE;
+
+#if !defined(ENABLE_FRAME_DUMP)	// to save complexity, 1/6/2009
+	if ( pParamD->iHighestTemporalId == 0 || kuiTid < pParamD->iHighestTemporalId )
+#endif// !ENABLE_FRAME_DUMP
+		// Expanding picture for future reference
+		ExpandReferencingPicture( pCtx->pDecPic, pCtx->pFuncList->pfExpandLumaPicture, pCtx->pFuncList->pfExpandChromaPicture );
+
+	// move picture in list
+	pCtx->pDecPic->uiTemporalId = kuiTid;
+	pCtx->pDecPic->uiSpatialId	= kuiDid;
+	pCtx->pDecPic->iFrameNum		= pCtx->iFrameNum;
+	pCtx->pDecPic->iFramePoc		= pCtx->iPOC;
+	pCtx->pDecPic->uiRecieveConfirmed = RECIEVE_UNKOWN;
+	pCtx->pDecPic->bUsedAsRef	= true;
+
+	// shift the short-term list by one and put the new picture at index 0
+	for (iRefIdx = pRefList->uiShortRefCount-1;iRefIdx>=0;--iRefIdx)	{
+		pRefList->pShortRefList[iRefIdx+1] = pRefList->pShortRefList[iRefIdx];
+	}
+	pRefList->pShortRefList[0] = pCtx->pDecPic;
+	pRefList->uiShortRefCount++;
+
+	if ( keSliceType == P_SLICE ){
+		if (pCtx->uiTemporalId == 0)
+		{
+			if (pCtx->pSvcParam->bEnableLongTermReference)	{
+				LTRMarkProcess(pCtx);
+				DeleteInvalidLTR(pCtx);
+				HandleLTRMarkFeedback(pCtx);
+
+				pLtr->bReceivedT0LostFlag = FALSE; // reset to false due to the recovery is finished
+				pLtr->bLTRMarkingFlag = FALSE;
+				++pLtr->uiLtrMarkInterval;
+			}
+
+			// release every short-term entry except index 0, last entry first;
+			// written on the count itself so an empty list cannot wrap an unsigned index
+			while (pRefList->uiShortRefCount>1){
+				const int32_t kiLast = pRefList->uiShortRefCount-1;
+				SetUnref(pRefList->pShortRefList[kiLast]);
+				DeleteSTRFromShortList(pCtx,kiLast);
+			}
+			// the remaining entry is kept only if it is the current temporal layer 0 frame
+			if (pRefList->uiShortRefCount>0 && (pRefList->pShortRefList[0]->uiTemporalId>0 || pRefList->pShortRefList[0]->iFrameNum != pCtx->iFrameNum))
+			{
+				SetUnref(pRefList->pShortRefList[0]);
+				DeleteSTRFromShortList(pCtx,0);
+			}
+		}
+	}else{	// in case IDR currently coding
+		if (pCtx->pSvcParam->bEnableLongTermReference)	{
+			LTRMarkProcess(pCtx);
+
+			// advance to the next LTR index; written without '++' so the variable is modified only once
+			pLtr->iCurLtrIdx = (pLtr->iCurLtrIdx+1)%LONG_TERM_REF_NUM;
+			pLtr->iLTRMarkSuccessNum = 1; //IDR default success
+			pLtr->bLTRMarkEnable =  TRUE;
+			pLtr->uiLtrMarkInterval = 0;
+
+			pCtx->pVaa->uiValidLongTermPicIdx = 0;
+			pCtx->pVaa->uiMarkLongTermPicIdx = 0;
+		}
+	}
+	PrefetchNextBuffer(pCtx);
+	return TRUE;
+}
+
+/*
+ *	check whether the frame number that would be marked now collides with a
+ *	picture already in the long-term list
+ *
+ *	returns FALSE on a collision (same iFrameNum in LTR_DIRECT_MARK mode, or
+ *	current frame number + half GOP interval in LTR_DELAY_MARK mode), TRUE otherwise
+ */
+bool_t CheckCurMarkFrameNumUsed(sWelsEncCtx *pCtx)
+{
+	SLTRState* pLtrState = &pCtx->pLtr[pCtx->uiDependencyId];
+	SRefList *pList	= pCtx->ppRefPicListExt[pCtx->uiDependencyId];
+	SPicture** ppLongRefs = pList->pLongRefList;
+	int32_t iHalfGopInterval = ( (pCtx->pSvcParam->uiGopSize>>1)>1 )?( pCtx->pSvcParam->uiGopSize>>1 ):( 1 );
+	int32_t iFrameNumPeriod = (1<<pCtx->pSps->uiLog2MaxFrameNum);
+	int32_t iIdx = 0;
+
+	while ( iIdx<pList->uiLongRefCount )
+	{
+		if ( pLtrState->iLTRMarkMode == LTR_DIRECT_MARK
+			&& pCtx->iFrameNum == ppLongRefs[iIdx]->iFrameNum )
+		{
+			return FALSE;
+		}
+		if ( pLtrState->iLTRMarkMode == LTR_DELAY_MARK
+			&& CompareFrameNum(pCtx->iFrameNum + iHalfGopInterval,ppLongRefs[iIdx]->iFrameNum,iFrameNumPeriod) == FRAME_NUM_EQUAL )
+		{
+			return FALSE;
+		}
+		++iIdx;
+	}
+
+	return TRUE;
+}
+/*
+ *	decide whether the current frame is marked as long-term reference and fill
+ *	the reference picture marking syntax of the slice headers
+ *
+ *	The marking structure of every slice is cleared; MMCO entries are written
+ *	into the last slice only.
+ */
+void WelsMarkPic( sWelsEncCtx *pCtx)
+{
+	SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
+	const int32_t kiCountSliceNum			= GetCurrentSliceNum( pCtx->pCurDqLayer->pSliceEncCtx );
+	// half the GOP size, but at least 1
+	int32_t iGoPFrameNumInterval = ((pCtx->pSvcParam->uiGopSize>>1)>1)?(pCtx->pSvcParam->uiGopSize>>1):(1);
+	int32_t iSliceIdx = 0;
+
+	// marking is considered on temporal layer 0 only, while marking is enabled
+	if (pCtx->pSvcParam->bEnableLongTermReference && pLtr->bLTRMarkEnable && pCtx->uiTemporalId == 0){ 
+		// mark when no T0 loss is pending, the mark period has elapsed and the frame number is free
+		if (  !pLtr->bReceivedT0LostFlag && pLtr->uiLtrMarkInterval > pCtx->pSvcParam->uiLtrMarkPeriod 
+			&& CheckCurMarkFrameNumUsed(pCtx)){
+				pLtr->bLTRMarkingFlag = TRUE;
+				pLtr->bLTRMarkEnable = FALSE;
+				pLtr->uiLtrMarkInterval = 0;
+				pLtr->iLastLtrIdx = pLtr->iCurLtrIdx;
+		}else{
+			pLtr->bLTRMarkingFlag = FALSE;
+		}
+	}
+
+	for (iSliceIdx = 0; iSliceIdx<kiCountSliceNum;iSliceIdx++)	{	
+		SSliceHeaderExt	*pSliceHdrExt		= &pCtx->pCurDqLayer->sLayerInfo.pSliceInLayer[iSliceIdx].sSliceHeaderExt;
+		SSliceHeader		*pSliceHdr			= &pSliceHdrExt->sSliceHeader;
+		SRefPicMarking		*pRefPicMark		= &pSliceHdr->sRefMarking;	
+
+		memset( pRefPicMark, 0, sizeof(SRefPicMarking) );
+
+		if (iSliceIdx != kiCountSliceNum-1)	{ //marking syntax only exist in last slice head
+			continue; 
+		}
+		if (pCtx->pSvcParam->bEnableLongTermReference && pLtr->bLTRMarkingFlag){	
+			if (pLtr->iLTRMarkMode == LTR_DIRECT_MARK)	{
+				// three commands: MMCO_SET_MAX_LONG, MMCO_SHORT2UNUSED, MMCO_LONG
+				// (each uiMmcoCount++ completes the entry whose fields were filled just before)
+				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount].iMaxLongTermFrameIdx = LONG_TERM_REF_NUM-1;
+				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount++].iMmcoType = MMCO_SET_MAX_LONG;	
+
+				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount].iDiffOfPicNum = iGoPFrameNumInterval;
+				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount++].iMmcoType = MMCO_SHORT2UNUSED;
+			
+				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount].iLongTermFrameIdx = pLtr->iCurLtrIdx;
+				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount++].iMmcoType = MMCO_LONG;
+			}else if (pLtr->iLTRMarkMode == LTR_DELAY_MARK )	{
+				// one command: MMCO_SHORT2LONG with index pLtr->iCurLtrIdx
+				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount].iDiffOfPicNum = iGoPFrameNumInterval;
+				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount].iLongTermFrameIdx = pLtr->iCurLtrIdx;
+				pRefPicMark->SMmcoRef[pRefPicMark->uiMmcoCount++].iMmcoType = MMCO_SHORT2LONG;
+			}
+		}
+	}
+}
+
+/*
+ *	evaluate an LTR recovery request
+ *
+ *	With long-term references enabled, a request of type LTR_RECOVERY_REQUEST for
+ *	the current IDR either forces an IDR (iLastCorrectFrameNum == -1), flags a
+ *	temporal layer 0 loss (iCurrentFrameNum == -1), or - when the frame number
+ *	comparisons below accept it - flags the loss and records both frame numbers.
+ *	With long-term references disabled, an IDR is always requested.
+ *
+ *	returns TRUE on every path
+ */
+int32_t FilterLTRRecoveryRequest(sWelsEncCtx *pCtx,SLTRRecoverRequest* pLTRRecoverRequest)
+{
+	SLTRRecoverRequest* pRequest = pLTRRecoverRequest;
+	SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
+	int32_t iMaxFrameNumPlus1 = (1<<pCtx->pSps->uiLog2MaxFrameNum);
+	if ( pCtx->pSvcParam->bEnableLongTermReference )
+	{
+		 // requests that belong to another IDR period or carry another feedback type are ignored
+		 if( pRequest->uiFeedbackType == LTR_RECOVERY_REQUEST &&  pRequest->uiIDRPicId == pCtx->sPSOVector.uiIdrPicId)
+		 {
+			if(pRequest->iLastCorrectFrameNum == -1){
+				pCtx->bEncCurFrmAsIdrFlag = true;
+				return TRUE;
+			}else if (pRequest->iCurrentFrameNum == -1){
+				pLtr->bReceivedT0LostFlag = true;
+				return TRUE;
+			}else if( ( CompareFrameNum( pLtr->iLastRecoverFrameNum , pRequest->iLastCorrectFrameNum,iMaxFrameNumPlus1) & (FRAME_NUM_EQUAL|FRAME_NUM_SMALLER) )// t0 lost
+				||( ( CompareFrameNum(pLtr->iLastRecoverFrameNum , pRequest->iCurrentFrameNum,iMaxFrameNumPlus1) & ( FRAME_NUM_EQUAL|FRAME_NUM_SMALLER ) )&&
+				CompareFrameNum(pLtr->iLastRecoverFrameNum , pRequest->iLastCorrectFrameNum,iMaxFrameNumPlus1) == FRAME_NUM_BIGGER ) ){// recovery failed
+					
+				pLtr->bReceivedT0LostFlag = true;
+				pLtr->iLastCorFrameNumDec = pRequest->iLastCorrectFrameNum;
+				pLtr->iCurFrameNumInDec = pRequest->iCurrentFrameNum;
+				WelsLog(pCtx,WELS_LOG_INFO,"Receive valid LTR recovery pRequest,feedback_type = %d ,uiIdrPicId = %d , current_frame_num = %d , last correct frame num = %d"
+					,pRequest->uiFeedbackType,pRequest->uiIDRPicId,pRequest->iCurrentFrameNum,pRequest->iLastCorrectFrameNum);
+			}
+
+			// logged for every request that reaches this point, accepted or not
+			WelsLog(pCtx,WELS_LOG_INFO,"Receive LTR recovery pRequest,feedback_type = %d ,uiIdrPicId = %d , current_frame_num = %d , last correct frame num = %d"
+					,pRequest->uiFeedbackType,pRequest->uiIDRPicId,pRequest->iCurrentFrameNum,pRequest->iLastCorrectFrameNum);	
+		 }
+	}else if (!pCtx->pSvcParam->bEnableLongTermReference){
+		// without LTR the only way to recover is an IDR
+		pCtx->bEncCurFrmAsIdrFlag = TRUE;
+	}
+	return TRUE;
+}
+/*
+ *	store an LTR marking feedback for later handling
+ *
+ *	ignored unless long-term references are enabled; a feedback is accepted when
+ *	it refers to the current IDR and its type is LTR_MARKING_SUCCESS or
+ *	LTR_MARKING_FAILED, otherwise it is only logged
+ */
+void FilterLTRMarkingFeedback(sWelsEncCtx *pCtx,SLTRMarkingFeedback* pLTRMarkingFeedback)
+{
+	SLTRState* pLtrState = &pCtx->pLtr[pCtx->uiDependencyId];
+	assert(pLTRMarkingFeedback);
+
+	if ( !pCtx->pSvcParam->bEnableLongTermReference )
+		return;
+
+	// avoid error data: wrong IDR period or unknown feedback type
+	if ( pLTRMarkingFeedback->uiIDRPicId != pCtx->sPSOVector.uiIdrPicId
+		|| !( pLTRMarkingFeedback->uiFeedbackType == LTR_MARKING_SUCCESS || pLTRMarkingFeedback->uiFeedbackType == LTR_MARKING_FAILED ) )
+	{
+		WelsLog(pCtx,WELS_LOG_INFO,"Receive LTR marking feedback, feedback_type = %d , uiIdrPicId = %d , LTR_frame_num = %d , cur_idr_pic_id = %d",pLTRMarkingFeedback->uiFeedbackType,pLTRMarkingFeedback->uiIDRPicId,pLTRMarkingFeedback->iLTRFrameNum , pCtx->sPSOVector.uiIdrPicId);
+		return;
+	}
+
+	pLtrState->uiLtrMarkState = pLTRMarkingFeedback->uiFeedbackType;
+	pLtrState->iLtrMarkFbFrameNum =  pLTRMarkingFeedback->iLTRFrameNum ;
+	WelsLog(pCtx,WELS_LOG_INFO,"Receive valid LTR marking feedback, feedback_type = %d , uiIdrPicId = %d , LTR_frame_num = %d , cur_idr_pic_id = %d",pLTRMarkingFeedback->uiFeedbackType,pLTRMarkingFeedback->uiIDRPicId,pLTRMarkingFeedback->iLTRFrameNum , pCtx->sPSOVector.uiIdrPicId);
+}
+
+/*
+ *	build reference picture list
+ *
+ *	Non-I slices: selects at most one reference into pCtx->pRefList0 - the first
+ *	confirmed long-term picture while a temporal layer 0 loss is being recovered,
+ *	otherwise the first usable short-term picture whose temporal id does not
+ *	exceed the current one.
+ *	I slices: resets the reference list and the LTR state.
+ *	The iPOC argument is not used in this function.
+ *
+ *	returns TRUE for I slices or when a reference was found, FALSE otherwise
+ */
+BOOL_T WelsBuildRefList( sWelsEncCtx *pCtx, const int32_t iPOC )
+{	
+	SRefList *pRefList		=  pCtx->ppRefPicListExt[pCtx->uiDependencyId];	
+	SLTRState* pLtr			= &pCtx->pLtr[pCtx->uiDependencyId];
+	const int32_t kiNumRef	= pCtx->pSvcParam->iNumRefFrame;	
+	const uint8_t kuiTid		= pCtx->uiTemporalId;	
+	uint32_t i				= 0;
+
+	// to support any type of cur_dq->mgs_control
+	//	[ 0:	using current layer to do ME/MC;
+	//	  -1:	using store base layer to do ME/MC;
+	//	  2:	using highest layer to do ME/MC; ]
+
+	// build reference list 0/1 if applicable
+
+	pCtx->iNumRef0	= 0;
+	
+	if ( pCtx->eSliceType != I_SLICE )
+	{
+		if (pCtx->pSvcParam->bEnableLongTermReference && pLtr->bReceivedT0LostFlag && pCtx->uiTemporalId == 0){
+			// recovery: reference the first long-term picture confirmed as received
+			for ( i = 0;i <pRefList->uiLongRefCount;i++)	{
+				if (pRefList->pLongRefList[i]->uiRecieveConfirmed == RECIEVE_SUCCESS)	{
+					pCtx->pRefList0[pCtx->iNumRef0++] = pRefList->pLongRefList[i];
+					pLtr->iLastRecoverFrameNum = pCtx->iFrameNum;
+					WelsLog(pCtx,WELS_LOG_INFO,"pRef is int32_t !iLastRecoverFrameNum = %d, pRef iFrameNum = %d,LTR number = %d,",pLtr->iLastRecoverFrameNum,pCtx->pRefList0[0]->iFrameNum,pRefList->uiLongRefCount);
+					break;
+				}
+			}
+		}else{
+			// normal case: first short-term picture that is in use, has a valid POC
+			// and belongs to the same or a lower temporal layer
+			for ( i = 0; i < pRefList->uiShortRefCount; ++ i )
+			{
+				SPicture *pRef = pRefList->pShortRefList[i];
+				if ( pRef != NULL && pRef->bUsedAsRef && pRef->iFramePoc >= 0 && pRef->uiTemporalId <= kuiTid)
+				{		
+					pCtx->pRefList0[pCtx->iNumRef0++]	= pRef;
+					break;	
+				}
+			}
+		}	
+	}
+	else	// safe for IDR
+	{
+		WelsResetRefList( pCtx ); //for IDR, SHOULD reset pRef list. 
+		ResetLtrState(&pCtx->pLtr[pCtx->uiDependencyId]); //SHOULD update it when IDR.
+		pCtx->pRefList0[0]	= NULL;
+	}
+
+	// never report more references than configured
+	if ( pCtx->iNumRef0 > kiNumRef )
+		pCtx->iNumRef0 = kiNumRef;
+	return ( pCtx->iNumRef0>0 || pCtx->eSliceType == I_SLICE) ? ( TRUE ): ( FALSE );
+}
+
+/*
+ *	update syntax for reference base related
+ *
+ *	Fills, for every slice of the current layer, the reference count, the
+ *	reference list reordering syntax for pRefList0[0] (short-term or long-term
+ *	form) and the reference picture marking flags.
+ *	The iPOC argument is not used in this function; uiFrameType is only compared
+ *	against WELS_FRAME_TYPE_IDR.
+ */
+void WelsUpdateRefSyntax( sWelsEncCtx *pCtx, const int32_t iPOC, const int32_t uiFrameType )
+{
+	SLTRState* pLtr = &pCtx->pLtr[pCtx->uiDependencyId];
+	int32_t iIdx								= 0;
+	const int32_t kiCountSliceNum			= GetCurrentSliceNum( pCtx->pCurDqLayer->pSliceEncCtx );
+	int32_t	iAbsDiffPicNumMinus1			= -1;
+
+	assert( kiCountSliceNum > 0 );
+	
+	/*syntax for ref_pic_list_reordering()*/
+	if( pCtx->iNumRef0 > 0 )
+		iAbsDiffPicNumMinus1 = pCtx->iFrameNum - (pCtx->pRefList0[0]->iFrameNum) -1;
+	
+	for (iIdx = 0;iIdx < kiCountSliceNum;iIdx++) {
+		SSliceHeaderExt	*pSliceHdrExt		= &pCtx->pCurDqLayer->sLayerInfo.pSliceInLayer[iIdx].sSliceHeaderExt;	
+		SSliceHeader		*pSliceHdr			= &pSliceHdrExt->sSliceHeader;
+		SRefPicListReorderSyntax *pRefReorder	= &pSliceHdr->sRefReordering;
+		SRefPicMarking *pRefPicMark			= &pSliceHdr->sRefMarking;	
+	
+		/*syntax for num_ref_idx_l0_active_minus1*/
+		pSliceHdr->uiRefCount = pCtx->iNumRef0;
+		if( pCtx->iNumRef0 > 0 )
+		{	
+			if( !pCtx->pRefList0[0]->bIsLongRef )	
+			{
+ 				// a negative difference means frame_num wrapped: add one period
+ 				// (the variable is shared by all slices, so this happens at most once)
+ 				if ( iAbsDiffPicNumMinus1 < 0 )
+				{
+ 					WelsLog( pCtx, WELS_LOG_INFO, "WelsUpdateRefSyntax():::uiAbsDiffPicNumMinus1:%d\n", iAbsDiffPicNumMinus1 );
+ 					iAbsDiffPicNumMinus1 += (1 << (pCtx->pSps->uiLog2MaxFrameNum));
+ 					WelsLog( pCtx, WELS_LOG_INFO, "WelsUpdateRefSyntax():::uiAbsDiffPicNumMinus1< 0, update as:%d\n", iAbsDiffPicNumMinus1 );
+ 				}
+			
+ 				// short-term reference: idc 0 carries the difference, idc 3 ends the list
+ 				pRefReorder->SReorderingSyntax[0].uiReorderingOfPicNumsIdc = 0;
+ 				pRefReorder->SReorderingSyntax[0].uiAbsDiffPicNumMinus1    = iAbsDiffPicNumMinus1;
+ 				pRefReorder->SReorderingSyntax[1].uiReorderingOfPicNumsIdc = 3;	
+			}
+			else
+			{
+				// long-term reference: idc 2 carries the long term picture number, idc 3 ends the list
+				pRefReorder->SReorderingSyntax[0].uiReorderingOfPicNumsIdc = 2;
+				pRefReorder->SReorderingSyntax[0].iLongTermPicNum = pCtx->pRefList0[0]->iLongTermPicNum;
+				pRefReorder->SReorderingSyntax[1].uiReorderingOfPicNumsIdc = 3;
+			}
+		}
+		
+		/*syntax for dec_ref_pic_marking()*/
+		if( WELS_FRAME_TYPE_IDR == uiFrameType )		{
+			pRefPicMark->bNoOutputOfPriorPicsFlag = false;
+			pRefPicMark->bLongTermRefFlag = pCtx->pSvcParam->bEnableLongTermReference;
+		}else{
+ 			// adaptive marking is signalled only when this frame carries an LTR mark
+ 			pRefPicMark->bAdaptiveRefPicMarkingModeFlag = (pCtx->pSvcParam->bEnableLongTermReference && pLtr->bLTRMarkingFlag)?(true):(false);
+		}		
+	}
+}
+
+} // namespace WelsSVCEnc
--- /dev/null
+++ b/codec/encoder/core/src/sample.cpp
@@ -1,0 +1,531 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	sample.cpp
+ *
+ * \brief	compute SAD and SATD
+ *
+ * \date	2009.06.02 Created
+ *
+ *************************************************************************************
+ */
+
+#include "sample.h"
+#include "macros.h"
+
+#include "mc.h"
+#include "cpu_core.h"
+#include "array_stack_align.h"
+
+namespace WelsSVCEnc {
+/*!
+ * \brief	Sum of absolute differences (SAD) between two 4x4 blocks of 8-bit samples.
+ *
+ * \param	pSample1	first sample of the first block
+ * \param	iStride1	offset added to the first block's pointer to reach its next row
+ * \param	pSample2	first sample of the second block
+ * \param	iStride2	offset added to the second block's pointer to reach its next row
+ *
+ * \return	sum of WELS_ABS(difference) over the 16 sample pairs
+ */
+int32_t WelsSampleSad4x4_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
+{
+	int32_t iSad = 0;
+	uint8_t* pRowA = pSample1;
+	uint8_t* pRowB = pSample2;
+
+	for ( int32_t iRow = 0; iRow < 4; ++iRow, pRowA += iStride1, pRowB += iStride2 )
+	{
+		for ( int32_t iCol = 0; iCol < 4; ++iCol )
+		{
+			iSad += WELS_ABS( ( pRowA[iCol] - pRowB[iCol] ) );
+		}
+	}
+
+	return iSad;
+}
+
+/*!
+ * \brief	Sum of absolute differences (SAD) between two 8x8 blocks of 8-bit samples.
+ *
+ * \param	pSample1	first sample of the first block
+ * \param	iStride1	offset added to the first block's pointer to reach its next row
+ * \param	pSample2	first sample of the second block
+ * \param	iStride2	offset added to the second block's pointer to reach its next row
+ *
+ * \return	sum of WELS_ABS(difference) over the 64 sample pairs
+ */
+int32_t WelsSampleSad8x8_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
+{
+	int32_t iSad = 0;
+	uint8_t* pRowA = pSample1;
+	uint8_t* pRowB = pSample2;
+
+	for ( int32_t iRow = 0; iRow < 8; ++iRow, pRowA += iStride1, pRowB += iStride2 )
+	{
+		for ( int32_t iCol = 0; iCol < 8; ++iCol )
+		{
+			iSad += WELS_ABS( ( pRowA[iCol] - pRowB[iCol] ) );
+		}
+	}
+
+	return iSad;
+}
+/*!
+ * \brief	SAD of a block 16 samples wide and 8 rows high, computed as two side-by-side 8x8 SADs.
+ *
+ * \return	SAD of columns 0..7 plus SAD of columns 8..15
+ */
+int32_t WelsSampleSad16x8_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
+{
+	const int32_t kiLeft	= WelsSampleSad8x8_c( pSample1, iStride1, pSample2, iStride2 );
+	const int32_t kiRight	= WelsSampleSad8x8_c( pSample1 + 8, iStride1, pSample2 + 8, iStride2 );
+
+	return kiLeft + kiRight;
+}
+/*!
+ * \brief	SAD of a block 8 samples wide and 16 rows high, computed as two stacked 8x8 SADs.
+ *
+ * \return	SAD of rows 0..7 plus SAD of rows 8..15
+ */
+int32_t WelsSampleSad8x16_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
+{
+	// offsets of the lower 8x8 half inside each block (8 rows down)
+	const int32_t kiLower1 = iStride1 << 3;
+	const int32_t kiLower2 = iStride2 << 3;
+
+	const int32_t kiTop		= WelsSampleSad8x8_c( pSample1, iStride1, pSample2, iStride2 );
+	const int32_t kiBottom	= WelsSampleSad8x8_c( pSample1 + kiLower1, iStride1, pSample2 + kiLower2, iStride2 );
+
+	return kiTop + kiBottom;
+}
+/*!
+ * \brief	SAD of a 16x16 block, computed as the sum of its four 8x8 quadrant SADs.
+ *
+ * \return	top-left + top-right + bottom-left + bottom-right 8x8 SADs
+ */
+int32_t WelsSampleSad16x16_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
+{
+	// offsets of the lower quadrants inside each block (8 rows down)
+	const int32_t kiLower1 = iStride1 << 3;
+	const int32_t kiLower2 = iStride2 << 3;
+	int32_t iSad;
+
+	iSad  = WelsSampleSad8x8_c( pSample1, iStride1, pSample2, iStride2 );
+	iSad += WelsSampleSad8x8_c( pSample1 + 8, iStride1, pSample2 + 8, iStride2 );
+	iSad += WelsSampleSad8x8_c( pSample1 + kiLower1, iStride1, pSample2 + kiLower2, iStride2 );
+	iSad += WelsSampleSad8x8_c( pSample1 + kiLower1 + 8, iStride1, pSample2 + kiLower2 + 8, iStride2 );
+
+	return iSad;
+}
+
+/*!
+ * \brief	SATD of a 4x4 block: the sample differences go through a 4-point add/subtract
+ *			butterfly along rows, then along columns, and the magnitudes of the 16 results are summed.
+ *
+ * \param	pSample1	first sample of the first block
+ * \param	iStride1	offset added to the first block's pointer to reach its next row
+ * \param	pSample2	first sample of the second block
+ * \param	iStride2	offset added to the second block's pointer to reach its next row
+ *
+ * \return	(sum of magnitudes + 1) >> 1
+ */
+int32_t WelsSampleSatd4x4_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
+{
+	int32_t iCoef[4][4];
+	int32_t iAccum = 0;
+	uint8_t* pRowA = pSample1;
+	uint8_t* pRowB = pSample2;
+
+	// pass 1: residual of each row, immediately followed by the horizontal butterfly
+	for ( int32_t iRow = 0; iRow < 4; ++iRow )
+	{
+		const int32_t kiD0 = pRowA[0] - pRowB[0];
+		const int32_t kiD1 = pRowA[1] - pRowB[1];
+		const int32_t kiD2 = pRowA[2] - pRowB[2];
+		const int32_t kiD3 = pRowA[3] - pRowB[3];
+
+		const int32_t kiSum02 = kiD0 + kiD2;
+		const int32_t kiSum13 = kiD1 + kiD3;
+		const int32_t kiSub02 = kiD0 - kiD2;
+		const int32_t kiSub13 = kiD1 - kiD3;
+
+		iCoef[iRow][0] = kiSum02 + kiSum13;
+		iCoef[iRow][1] = kiSub02 + kiSub13;
+		iCoef[iRow][2] = kiSub02 - kiSub13;
+		iCoef[iRow][3] = kiSum02 - kiSum13;
+
+		pRowA += iStride1;
+		pRowB += iStride2;
+	}
+
+	// pass 2: vertical butterfly on each column, accumulating the magnitudes
+	for ( int32_t iCol = 0; iCol < 4; ++iCol )
+	{
+		const int32_t kiSum02 = iCoef[0][iCol] + iCoef[2][iCol];
+		const int32_t kiSum13 = iCoef[1][iCol] + iCoef[3][iCol];
+		const int32_t kiSub02 = iCoef[0][iCol] - iCoef[2][iCol];
+		const int32_t kiSub13 = iCoef[1][iCol] - iCoef[3][iCol];
+
+		const int32_t kiT0 = kiSum02 + kiSum13;
+		const int32_t kiT1 = kiSub02 + kiSub13;
+		const int32_t kiT2 = kiSub02 - kiSub13;
+		const int32_t kiT3 = kiSum02 - kiSum13;
+
+		iAccum += ( WELS_ABS( kiT0 ) + WELS_ABS( kiT1 ) + WELS_ABS( kiT2 ) + WELS_ABS( kiT3 ) );
+	}
+
+	return ( ( iAccum + 1 ) >> 1 );
+}
+/*!
+ * \brief	SATD of an 8x8 block, computed as the sum of its four 4x4 quadrant SATDs.
+ *
+ * \return	top-left + top-right + bottom-left + bottom-right 4x4 SATDs
+ */
+int32_t WelsSampleSatd8x8_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
+{
+	// offsets of the lower quadrants inside each block (4 rows down)
+	const int32_t kiLower1 = iStride1 << 2;
+	const int32_t kiLower2 = iStride2 << 2;
+	int32_t iSatd;
+
+	iSatd  = WelsSampleSatd4x4_c( pSample1, iStride1, pSample2, iStride2 );
+	iSatd += WelsSampleSatd4x4_c( pSample1 + 4, iStride1, pSample2 + 4, iStride2 );
+	iSatd += WelsSampleSatd4x4_c( pSample1 + kiLower1, iStride1, pSample2 + kiLower2, iStride2 );
+	iSatd += WelsSampleSatd4x4_c( pSample1 + kiLower1 + 4, iStride1, pSample2 + kiLower2 + 4, iStride2 );
+
+	return iSatd;
+}
+/*!
+ * \brief	SATD of a block 16 samples wide and 8 rows high, computed as two side-by-side 8x8 SATDs.
+ *
+ * \return	SATD of columns 0..7 plus SATD of columns 8..15
+ */
+int32_t WelsSampleSatd16x8_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
+{
+	const int32_t kiLeft	= WelsSampleSatd8x8_c( pSample1, iStride1, pSample2, iStride2 );
+	const int32_t kiRight	= WelsSampleSatd8x8_c( pSample1 + 8, iStride1, pSample2 + 8, iStride2 );
+
+	return kiLeft + kiRight;
+}
+/*!
+ * \brief	SATD of a block 8 samples wide and 16 rows high, computed as two stacked 8x8 SATDs.
+ *
+ * \return	SATD of rows 0..7 plus SATD of rows 8..15
+ */
+int32_t WelsSampleSatd8x16_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
+{
+	// offsets of the lower 8x8 half inside each block (8 rows down)
+	const int32_t kiLower1 = iStride1 << 3;
+	const int32_t kiLower2 = iStride2 << 3;
+
+	const int32_t kiTop		= WelsSampleSatd8x8_c( pSample1, iStride1, pSample2, iStride2 );
+	const int32_t kiBottom	= WelsSampleSatd8x8_c( pSample1 + kiLower1, iStride1, pSample2 + kiLower2, iStride2 );
+
+	return kiTop + kiBottom;
+}
+/*!
+ * \brief	SATD of a 16x16 block, computed as the sum of its four 8x8 quadrant SATDs.
+ *
+ * \return	top-left + top-right + bottom-left + bottom-right 8x8 SATDs
+ */
+int32_t WelsSampleSatd16x16_c( uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2 )
+{
+	// offsets of the lower quadrants inside each block (8 rows down)
+	const int32_t kiLower1 = iStride1 << 3;
+	const int32_t kiLower2 = iStride2 << 3;
+	int32_t iSatd;
+
+	iSatd  = WelsSampleSatd8x8_c( pSample1, iStride1, pSample2, iStride2 );
+	iSatd += WelsSampleSatd8x8_c( pSample1 + 8, iStride1, pSample2 + 8, iStride2 );
+	iSatd += WelsSampleSatd8x8_c( pSample1 + kiLower1, iStride1, pSample2 + kiLower2, iStride2 );
+	iSatd += WelsSampleSatd8x8_c( pSample1 + kiLower1 + 8, iStride1, pSample2 + kiLower2 + 8, iStride2 );
+
+	return iSatd;
+}
+
+
+/*!
+ * \brief	16x16 SAD of iSample1 against four blocks displaced from iSample2.
+ *
+ * \param	pSad	receives four results:
+ *					[0] at iSample2 - iStride2, [1] at iSample2 + iStride2,
+ *					[2] at iSample2 - 1,        [3] at iSample2 + 1
+ */
+void WelsSampleSadFour16x16_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad)
+{
+	uint8_t* const pRowBefore	= iSample2 - iStride2;
+	uint8_t* const pRowAfter	= iSample2 + iStride2;
+	uint8_t* const pColBefore	= iSample2 - 1;
+	uint8_t* const pColAfter	= iSample2 + 1;
+
+	pSad[0] = WelsSampleSad16x16_c( iSample1, iStride1, pRowBefore, iStride2 );
+	pSad[1] = WelsSampleSad16x16_c( iSample1, iStride1, pRowAfter, iStride2 );
+	pSad[2] = WelsSampleSad16x16_c( iSample1, iStride1, pColBefore, iStride2 );
+	pSad[3] = WelsSampleSad16x16_c( iSample1, iStride1, pColAfter, iStride2 );
+}
+/*!
+ * \brief	16x8 SAD of iSample1 against four blocks displaced from iSample2.
+ *
+ * \param	pSad	receives four results:
+ *					[0] at iSample2 - iStride2, [1] at iSample2 + iStride2,
+ *					[2] at iSample2 - 1,        [3] at iSample2 + 1
+ */
+void WelsSampleSadFour16x8_c(uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad)
+{
+	uint8_t* const pRowBefore	= iSample2 - iStride2;
+	uint8_t* const pRowAfter	= iSample2 + iStride2;
+	uint8_t* const pColBefore	= iSample2 - 1;
+	uint8_t* const pColAfter	= iSample2 + 1;
+
+	pSad[0] = WelsSampleSad16x8_c( iSample1, iStride1, pRowBefore, iStride2 );
+	pSad[1] = WelsSampleSad16x8_c( iSample1, iStride1, pRowAfter, iStride2 );
+	pSad[2] = WelsSampleSad16x8_c( iSample1, iStride1, pColBefore, iStride2 );
+	pSad[3] = WelsSampleSad16x8_c( iSample1, iStride1, pColAfter, iStride2 );
+}
+/*!
+ * \brief	8x16 SAD of iSample1 against four blocks displaced from iSample2.
+ *
+ * \param	pSad	receives four results:
+ *					[0] at iSample2 - iStride2, [1] at iSample2 + iStride2,
+ *					[2] at iSample2 - 1,        [3] at iSample2 + 1
+ */
+void WelsSampleSadFour8x16_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad)
+{
+	uint8_t* const pRowBefore	= iSample2 - iStride2;
+	uint8_t* const pRowAfter	= iSample2 + iStride2;
+	uint8_t* const pColBefore	= iSample2 - 1;
+	uint8_t* const pColAfter	= iSample2 + 1;
+
+	pSad[0] = WelsSampleSad8x16_c( iSample1, iStride1, pRowBefore, iStride2 );
+	pSad[1] = WelsSampleSad8x16_c( iSample1, iStride1, pRowAfter, iStride2 );
+	pSad[2] = WelsSampleSad8x16_c( iSample1, iStride1, pColBefore, iStride2 );
+	pSad[3] = WelsSampleSad8x16_c( iSample1, iStride1, pColAfter, iStride2 );
+}
+/*!
+ * \brief	8x8 SAD of iSample1 against four blocks displaced from iSample2.
+ *
+ * \param	pSad	receives four results:
+ *					[0] at iSample2 - iStride2, [1] at iSample2 + iStride2,
+ *					[2] at iSample2 - 1,        [3] at iSample2 + 1
+ */
+void WelsSampleSadFour8x8_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad)
+{
+	uint8_t* const pRowBefore	= iSample2 - iStride2;
+	uint8_t* const pRowAfter	= iSample2 + iStride2;
+	uint8_t* const pColBefore	= iSample2 - 1;
+	uint8_t* const pColAfter	= iSample2 + 1;
+
+	pSad[0] = WelsSampleSad8x8_c( iSample1, iStride1, pRowBefore, iStride2 );
+	pSad[1] = WelsSampleSad8x8_c( iSample1, iStride1, pRowAfter, iStride2 );
+	pSad[2] = WelsSampleSad8x8_c( iSample1, iStride1, pColBefore, iStride2 );
+	pSad[3] = WelsSampleSad8x8_c( iSample1, iStride1, pColAfter, iStride2 );
+}
+/*!
+ * \brief	4x4 SAD of iSample1 against four blocks displaced from iSample2.
+ *
+ * \param	pSad	receives four results:
+ *					[0] at iSample2 - iStride2, [1] at iSample2 + iStride2,
+ *					[2] at iSample2 - 1,        [3] at iSample2 + 1
+ */
+void WelsSampleSadFour4x4_c( uint8_t *iSample1, int32_t iStride1, uint8_t *iSample2, int32_t iStride2, int32_t* pSad)
+{
+	uint8_t* const pRowBefore	= iSample2 - iStride2;
+	uint8_t* const pRowAfter	= iSample2 + iStride2;
+	uint8_t* const pColBefore	= iSample2 - 1;
+	uint8_t* const pColAfter	= iSample2 + 1;
+
+	pSad[0] = WelsSampleSad4x4_c( iSample1, iStride1, pRowBefore, iStride2 );
+	pSad[1] = WelsSampleSad4x4_c( iSample1, iStride1, pRowAfter, iStride2 );
+	pSad[2] = WelsSampleSad4x4_c( iSample1, iStride1, pColBefore, iStride2 );
+	pSad[3] = WelsSampleSad4x4_c( iSample1, iStride1, pColAfter, iStride2 );
+}
+
+extern void WelsI4x4LumaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
+extern void WelsI4x4LumaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
+extern void WelsI4x4LumaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
+
+/*!
+ * \brief	Evaluate three 4x4 luma intra predictors and keep the cheapest one.
+ *
+ * Each candidate is built in its own 16-byte slot of a local buffer and costed as
+ * SATD(prediction, pEnc) plus its lambda. Candidates are tried in the order
+ * Dc (mode 2, iLambda2), H (mode 1, iLambda1), V (mode 0, iLambda0); a later
+ * candidate only replaces the current best when it is strictly cheaper.
+ *
+ * \param	pDec		reference samples handed to the predictors
+ * \param	iDecStride	stride handed to the predictors
+ * \param	pEnc		source block the predictions are compared against
+ * \param	iEncStride	row stride of pEnc
+ * \param	pDst		receives the 16 bytes of the winning prediction
+ * \param	pBestMode	receives the winning mode index (0, 1 or 2)
+ * \param	iLambda2	cost added to the Dc candidate
+ * \param	iLambda1	cost added to the H candidate
+ * \param	iLambda0	cost added to the V candidate
+ *
+ * \return	cost of the winning candidate
+ */
+int32_t WelsSampleSatdIntra4x4Combined3_c(uint8_t *pDec, int32_t iDecStride, uint8_t *pEnc, int32_t iEncStride, uint8_t *pDst, 
+						  int32_t *pBestMode, int32_t iLambda2, int32_t iLambda1, int32_t iLambda0)
+{
+	int32_t iChosen = -1;
+	int32_t iLowest = INT_MAX;
+	int32_t iCost;
+	ENFORCE_STACK_ALIGN_2D(uint8_t, uiLocalBuffer, 3, 16, 16)
+
+	// mode 2
+	WelsI4x4LumaPredDc_c(uiLocalBuffer[2], pDec, iDecStride);
+	iCost = WelsSampleSatd4x4_c(uiLocalBuffer[2], 4, pEnc, iEncStride) + iLambda2;
+	if (iCost < iLowest) {
+		iLowest = iCost;
+		iChosen = 2;
+	}
+
+	// mode 1
+	WelsI4x4LumaPredH_c(uiLocalBuffer[1], pDec, iDecStride);
+	iCost = WelsSampleSatd4x4_c(uiLocalBuffer[1], 4, pEnc, iEncStride) + iLambda1;
+	if (iCost < iLowest) {
+		iLowest = iCost;
+		iChosen = 1;
+	}
+
+	// mode 0
+	WelsI4x4LumaPredV_c(uiLocalBuffer[0], pDec, iDecStride);
+	iCost = WelsSampleSatd4x4_c(uiLocalBuffer[0], 4, pEnc, iEncStride) + iLambda0;
+	if (iCost < iLowest) {
+		iLowest = iCost;
+		iChosen = 0;
+	}
+
+	// hand back the winning prediction and its mode
+	memcpy(pDst, uiLocalBuffer[iChosen], 16*sizeof(uint8_t));	// confirmed_safe_unsafe_usage
+	*pBestMode = iChosen;
+
+	return iLowest;
+}
+extern void WelsIChormaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
+extern void WelsIChormaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
+extern void WelsIChormaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
+
+/*!
+ * \brief	Evaluate three 8x8 chroma intra predictors (Cb and Cr together) by SATD.
+ *
+ * For every candidate the Cb prediction is generated at pDstChroma and the Cr
+ * prediction at pDstChroma + 64, both read back with a row stride of 8. The cost is
+ * SATD(Cb) + SATD(Cr), plus iLambda * 2 for the V and H candidates (nothing is added
+ * for Dc). Candidates are tried in the order V (mode 2), H (mode 1), Dc (mode 0); a
+ * later candidate only wins when strictly cheaper.
+ *
+ * The same pDstChroma area is passed to every predictor in turn, Dc last; it is not
+ * restored to the winning prediction.
+ *
+ * \param	pBestMode	receives the winning mode index (0, 1 or 2)
+ *
+ * \return	cost of the winning candidate
+ */
+int32_t WelsSampleSatdIntra8x8Combined3_c(uint8_t *pDecCb, int32_t iDecStride, uint8_t *pEncCb, int32_t iEncStride, 
+							int32_t *pBestMode, int32_t iLambda, uint8_t *pDstChroma,uint8_t *pDecCr,uint8_t *pEncCr)
+{
+	uint8_t* const pPredCb = pDstChroma;
+	uint8_t* const pPredCr = pDstChroma + 64;
+	int32_t iChosen = -1;
+	int32_t iLowest = INT_MAX;
+	int32_t iCost;
+
+	// mode 2
+	WelsIChormaPredV_c(pPredCb, pDecCb, iDecStride);
+	WelsIChormaPredV_c(pPredCr, pDecCr, iDecStride);
+	iCost  = WelsSampleSatd8x8_c(pPredCb, 8, pEncCb, iEncStride);
+	iCost += WelsSampleSatd8x8_c(pPredCr, 8, pEncCr, iEncStride) + iLambda * 2;
+	if (iCost < iLowest) {
+		iLowest = iCost;
+		iChosen = 2;
+	}
+
+	// mode 1
+	WelsIChormaPredH_c(pPredCb, pDecCb, iDecStride);
+	WelsIChormaPredH_c(pPredCr, pDecCr, iDecStride);
+	iCost  = WelsSampleSatd8x8_c(pPredCb, 8, pEncCb, iEncStride);
+	iCost += WelsSampleSatd8x8_c(pPredCr, 8, pEncCr, iEncStride) + iLambda * 2;
+	if (iCost < iLowest) {
+		iLowest = iCost;
+		iChosen = 1;
+	}
+
+	// mode 0
+	WelsIChormaPredDc_c(pPredCb, pDecCb, iDecStride);
+	WelsIChormaPredDc_c(pPredCr, pDecCr, iDecStride);
+	iCost  = WelsSampleSatd8x8_c(pPredCb, 8, pEncCb, iEncStride);
+	iCost += WelsSampleSatd8x8_c(pPredCr, 8, pEncCr, iEncStride);
+	if (iCost < iLowest) {
+		iLowest = iCost;
+		iChosen = 0;
+	}
+
+	*pBestMode = iChosen;
+
+	return iLowest;
+}
+/*!
+ * \brief	Evaluate three 8x8 chroma intra predictors (Cb and Cr together) by SAD.
+ *
+ * For every candidate the Cb prediction is generated at pDstChroma and the Cr
+ * prediction at pDstChroma + 64, both read back with a row stride of 8. The cost is
+ * SAD(Cb) + SAD(Cr), plus iLambda * 2 for the V and H candidates (nothing is added
+ * for Dc). Candidates are tried in the order V (mode 2), H (mode 1), Dc (mode 0); a
+ * later candidate only wins when strictly cheaper.
+ *
+ * The same pDstChroma area is passed to every predictor in turn, Dc last; it is not
+ * restored to the winning prediction.
+ *
+ * \param	pBestMode	receives the winning mode index (0, 1 or 2)
+ *
+ * \return	cost of the winning candidate
+ */
+int32_t WelsSampleSadIntra8x8Combined3_c(uint8_t *pDecCb, int32_t iDecStride, uint8_t *pEncCb, int32_t iEncStride, 
+							int32_t *pBestMode, int32_t iLambda, uint8_t *pDstChroma,uint8_t *pDecCr,uint8_t *pEncCr)
+{
+	uint8_t* const pPredCb = pDstChroma;
+	uint8_t* const pPredCr = pDstChroma + 64;
+	int32_t iChosen = -1;
+	int32_t iLowest = INT_MAX;
+	int32_t iCost;
+
+	// mode 2
+	WelsIChormaPredV_c(pPredCb, pDecCb, iDecStride);
+	WelsIChormaPredV_c(pPredCr, pDecCr, iDecStride);
+	iCost  = WelsSampleSad8x8_c(pPredCb, 8, pEncCb, iEncStride);
+	iCost += WelsSampleSad8x8_c(pPredCr, 8, pEncCr, iEncStride) + iLambda * 2;
+	if (iCost < iLowest) {
+		iLowest = iCost;
+		iChosen = 2;
+	}
+
+	// mode 1
+	WelsIChormaPredH_c(pPredCb, pDecCb, iDecStride);
+	WelsIChormaPredH_c(pPredCr, pDecCr, iDecStride);
+	iCost  = WelsSampleSad8x8_c(pPredCb, 8, pEncCb, iEncStride);
+	iCost += WelsSampleSad8x8_c(pPredCr, 8, pEncCr, iEncStride) + iLambda * 2;
+	if (iCost < iLowest) {
+		iLowest = iCost;
+		iChosen = 1;
+	}
+
+	// mode 0
+	WelsIChormaPredDc_c(pPredCb, pDecCb, iDecStride);
+	WelsIChormaPredDc_c(pPredCr, pDecCr, iDecStride);
+	iCost  = WelsSampleSad8x8_c(pPredCb, 8, pEncCb, iEncStride);
+	iCost += WelsSampleSad8x8_c(pPredCr, 8, pEncCr, iEncStride);
+	if (iCost < iLowest) {
+		iLowest = iCost;
+		iChosen = 0;
+	}
+
+	*pBestMode = iChosen;
+
+	return iLowest;
+}
+
+extern void WelsI16x16LumaPredDc_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
+extern void WelsI16x16LumaPredH_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
+extern void WelsI16x16LumaPredV_c(uint8_t *pPred, uint8_t *pRef, const int32_t iStride);
+
+/*!
+ * \brief	Evaluate three 16x16 luma intra predictors by SATD.
+ *
+ * Each candidate is generated into pDst (read back with a row stride of 16) and
+ * costed as SATD(prediction, pEnc); the H and Dc candidates additionally pay
+ * iLambda * 2. Candidates are tried in the order V (mode 0), H (mode 1), Dc (mode 2);
+ * a later candidate only wins when strictly cheaper.
+ *
+ * The same pDst area is passed to every predictor in turn, Dc last; it is not
+ * restored to the winning prediction.
+ *
+ * \param	pBestMode	receives the winning mode index (0, 1 or 2)
+ *
+ * \return	cost of the winning candidate
+ */
+int32_t WelsSampleSatdIntra16x16Combined3_c(uint8_t *pDec, int32_t iDecStride, uint8_t *pEnc, int32_t iEncStride, 
+							  int32_t *pBestMode, int32_t iLambda, uint8_t *pDst)
+{
+	int32_t iChosen = -1;
+	int32_t iLowest = INT_MAX;
+	int32_t iCost;
+
+	// mode 0
+	WelsI16x16LumaPredV_c(pDst, pDec, iDecStride);
+	iCost = WelsSampleSatd16x16_c(pDst, 16, pEnc, iEncStride);
+	if (iCost < iLowest) {
+		iLowest = iCost;
+		iChosen = 0;
+	}
+
+	// mode 1
+	WelsI16x16LumaPredH_c(pDst, pDec, iDecStride);
+	iCost = WelsSampleSatd16x16_c(pDst, 16, pEnc, iEncStride) + iLambda * 2;
+	if (iCost < iLowest) {
+		iLowest = iCost;
+		iChosen = 1;
+	}
+
+	// mode 2
+	WelsI16x16LumaPredDc_c(pDst, pDec, iDecStride);
+	iCost = WelsSampleSatd16x16_c(pDst, 16, pEnc, iEncStride) + iLambda * 2;
+	if (iCost < iLowest) {
+		iLowest = iCost;
+		iChosen = 2;
+	}
+
+	*pBestMode = iChosen;
+
+	return iLowest;
+}
+/*!
+ * \brief	Evaluate three 16x16 luma intra predictors by SAD.
+ *
+ * Each candidate is generated into pDst (read back with a row stride of 16) and
+ * costed as SAD(prediction, pEnc); the H and Dc candidates additionally pay
+ * iLambda * 2. Candidates are tried in the order V (mode 0), H (mode 1), Dc (mode 2);
+ * a later candidate only wins when strictly cheaper.
+ *
+ * The same pDst area is passed to every predictor in turn, Dc last; it is not
+ * restored to the winning prediction.
+ *
+ * \param	pBestMode	receives the winning mode index (0, 1 or 2)
+ *
+ * \return	cost of the winning candidate
+ */
+int32_t WelsSampleSadIntra16x16Combined3_c(uint8_t *pDec, int32_t iDecStride, uint8_t *pEnc, int32_t iEncStride, 
+							  int32_t *pBestMode, int32_t iLambda, uint8_t *pDst)
+{
+	int32_t iChosen = -1;
+	int32_t iLowest = INT_MAX;
+	int32_t iCost;
+
+	// mode 0
+	WelsI16x16LumaPredV_c(pDst, pDec, iDecStride);
+	iCost = WelsSampleSad16x16_c(pDst, 16, pEnc, iEncStride);
+	if (iCost < iLowest) {
+		iLowest = iCost;
+		iChosen = 0;
+	}
+
+	// mode 1
+	WelsI16x16LumaPredH_c(pDst, pDec, iDecStride);
+	iCost = WelsSampleSad16x16_c(pDst, 16, pEnc, iEncStride) + iLambda * 2;
+	if (iCost < iLowest) {
+		iLowest = iCost;
+		iChosen = 1;
+	}
+
+	// mode 2
+	WelsI16x16LumaPredDc_c(pDst, pDec, iDecStride);
+	iCost = WelsSampleSad16x16_c(pDst, 16, pEnc, iEncStride) + iLambda * 2;
+	if (iCost < iLowest) {
+		iLowest = iCost;
+		iChosen = 2;
+	}
+
+	*pBestMode = iChosen;
+
+	return iLowest;
+}
+
+/*!
+ * \brief	Fill the sample-cost entries of pFuncList->sSampleDealingFuncs.
+ *
+ * The plain C implementations are installed first. When X86_ASM is defined, entries
+ * are then overwritten according to the bits set in uiCpuFlag, tested in the order
+ * MMXEXT, SSE2, SSSE3, SSE4.1 -- so when several bits are set the last matching
+ * assignment to an entry is the one that remains.
+ *
+ * \param	pFuncList	function pointer list to fill
+ * \param	uiCpuFlag	bit mask of WELS_CPU_* capabilities (only read when X86_ASM is defined)
+ */
+void WelsInitSampleSadFunc( SWelsFuncPtrList *pFuncList, uint32_t uiCpuFlag)
+{
+	//pfSampleSad init
+	pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_c;
+	pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_c;
+	pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16 ] = WelsSampleSad8x16_c;
+	pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8  ] = WelsSampleSad8x8_c;
+	pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4  ] = WelsSampleSad4x4_c;
+
+	//pfSampleSatd init
+	pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_c;
+	pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_c;
+	pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_c;
+	pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8  ] = WelsSampleSatd8x8_c;
+	pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4  ] = WelsSampleSatd4x4_c;
+
+	//pfSample4Sad init: SAD against the four neighbouring positions
+	pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_c;
+	pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_c;
+	pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_c;
+	pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_c;
+	pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_c;
+
+	// The combined three-mode intra entries default to NULL; the *_Combined3_c functions
+	// of this file are not installed here. pfIntra8x8Combined3Sad is never assigned a
+	// non-NULL value anywhere in this function.
+	// NOTE(review): callers presumably test these pointers for NULL before use -- confirm.
+	pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd   = NULL;
+	pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd   = NULL;
+	pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Sad    = NULL;
+	pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = NULL;
+	pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad  = NULL;
+
+#if defined (X86_ASM)
+	// MMXEXT: only the 4x4 SAD is replaced
+	if ( uiCpuFlag & WELS_CPU_MMXEXT )
+	{
+		pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4  ] = WelsSampleSad4x4_mmx;
+	}	
+	
+	// SSE2: SAD (all sizes except 4x4), all four-neighbour SADs, all SATDs, 4x4 combined SATD
+	if ( uiCpuFlag & WELS_CPU_SSE2 )
+	{
+		pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16] = WelsSampleSad16x16_sse2;
+		pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x8 ] = WelsSampleSad16x8_sse2;
+		pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16] = WelsSampleSad8x16_sse2;
+		pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8] = WelsSampleSad8x8_sse21;
+
+		pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_sse2;
+	    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_sse2;
+	    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_sse2;
+	    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_sse2;
+	    pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_sse2;
+		
+		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4  ] = WelsSampleSatd4x4_sse2;
+		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8  ] = WelsSampleSatd8x8_sse2;
+		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_sse2;
+		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_sse2;
+		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_sse2;		
+       	pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd =  WelsSmpleSatdThree4x4_sse2;
+	}	
+
+	// SSSE3: 16x16 combined SAD
+	if (uiCpuFlag & WELS_CPU_SSSE3)
+	{
+		pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_ssse3;
+	}
+
+	// SSE4.1: tested last, so these SATD entries replace the SSE2 ones set above
+	if( uiCpuFlag & WELS_CPU_SSE41 )
+	{	   
+	    pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_sse41;
+		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8] = WelsSampleSatd16x8_sse41;
+		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16] = WelsSampleSatd8x16_sse41;
+		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8] = WelsSampleSatd8x8_sse41;
+		pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4] = WelsSampleSatd4x4_sse41;
+		pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_sse41;
+		pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntraChroma8x8Combined3Satd_sse41;
+	}
+	
+#endif //(X86_ASM)
+
+}
+
+} // namespace WelsSVCEnc
--- /dev/null
+++ b/codec/encoder/core/src/set_mb_syn_cavlc.cpp
@@ -1,0 +1,229 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	set_mb_syn_cavlc.cpp
+ *
+ * \brief	Setting all syntax elements of mb and encoding residual with cavlc
+ *
+ * \date	05/19/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "set_mb_syn_cavlc.h"
+#include "svc_enc_golomb.h"
+#include "vlc_encoder.h"
+#include "cpu_core.h"
+#include "array_stack_align.h"
+
+namespace WelsSVCEnc {
+// Dispatch table for coefficient analysis; its pfCavlcParamCal entry is set by InitCoeffFunc()
+// and called from WriteBlockResidualCavlc().
+SCoeffFunc    sCoeffFunc;
+
+// Maps a zeros-left count (0..15) to min(count, 7). WriteBlockResidualCavlc() uses the
+// mapped value as the first index into g_kuiVlcRunBefore.
+const  ALIGNED_DECLARE(uint8_t, g_kuiZeroLeftMap[16], 16) = 
+{
+	0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
+};
+
+// For indices 1..7 each entry equals the number of trailing zero bits of the index;
+// entry 0 is 3.
+// NOTE(review): not referenced by the C code of this file; presumably consumed by an
+// assembly implementation -- confirm.
+const ALIGNED_DECLARE(uint8_t, g_kuiTrailingOneIndex[8], 16) = 
+{
+	3, 0, 1, 0, 2, 0, 1, 0
+};
+
+/*!
+ * \brief	Split a coefficient array into (level, run) pairs, walking from iLastIndex down to 0.
+ *
+ * Zeros above the highest-indexed non-zero coefficient are skipped and not counted.
+ * Every non-zero coefficient found is stored to pLevel, and the number of zeros lying
+ * directly below it (before the next non-zero coefficient or the start of the array)
+ * is stored to the matching pRun slot.
+ *
+ * \param	pCoffLevel	coefficients, indices 0..iLastIndex are examined
+ * \param	pRun		output: zero-run length for each stored level
+ * \param	pLevel		output: non-zero coefficients, highest index first
+ * \param	pTotalCoeff	output: number of (level, run) pairs written
+ * \param	iLastIndex	highest index to examine; a negative value yields no pairs
+ *
+ * \return	sum of all values written to pRun
+ */
+int32_t CavlcParamCal_c(int16_t *pCoffLevel, uint8_t *pRun, int16_t *pLevel, int32_t *pTotalCoeff , int32_t iLastIndex)
+{
+	int32_t iZeroSum = 0;
+	int32_t iPairCnt = 0;
+	int32_t iPos = iLastIndex;
+
+	// drop the zeros that follow the last non-zero coefficient
+	for ( ; iPos >= 0 && 0 == pCoffLevel[iPos]; --iPos )
+	{
+	}
+
+	while ( iPos >= 0 )
+	{
+		int32_t iGap = 0;
+
+		pLevel[iPairCnt] = pCoffLevel[iPos];
+		--iPos;
+
+		// count the zeros between this coefficient and the next non-zero one below it
+		for ( ; iPos >= 0 && 0 == pCoffLevel[iPos]; --iPos )
+		{
+			++iGap;
+		}
+
+		pRun[iPairCnt] = iGap;
+		++iPairCnt;
+		iZeroSum += iGap;
+	}
+
+	*pTotalCoeff = iPairCnt;
+	return iZeroSum;
+}
+
+/*!
+ * \brief	Write one residual block to the bit string with CAVLC.
+ *
+ * Output order: coeff token (with the trailing-one sign bits appended), the remaining
+ * levels, total zeros, then the run-before codes. All bits go through the CAVLC_BS_*
+ * macros operating on pBs.
+ *
+ * \param	pCoffLevel			coefficient array handed to sCoeffFunc.pfCavlcParamCal together with iEndIdx
+ * \param	iEndIdx				index of the last coefficient position of the block
+ * \param	iCalRunLevelFlag	non-zero: analyse pCoffLevel; zero: no analysis is done and the block is
+ *								written as having no coefficients
+ * \param	iResidualProperty	block category; only CHROMA_DC is distinguished (it selects the
+ *								total-zeros table)
+ * \param	iNC					index into g_kuiEncNcMapTable, which selects the coeff-token table
+ * \param	pBs					bit string being written
+ */
+void  WriteBlockResidualCavlc( int16_t *pCoffLevel, int32_t iEndIdx, int32_t iCalRunLevelFlag, int32_t iResidualProperty, int8_t iNC, SBitStringAux *pBs )
+{
+	ENFORCE_STACK_ALIGN_1D(int16_t, iLevel, 16, 16)
+	ENFORCE_STACK_ALIGN_1D(uint8_t, uiRun, 16, 16)
+
+	int32_t iTotalCoeffs = 0;
+	int32_t iTrailingOnes = 0;
+	int32_t iTotalZeros = 0, iZerosLeft = 0;
+	uint32_t uiSign = 0;
+	int32_t iLevelCode = 0, iLevelPrefix = 0, iLevelSuffix = 0, uiSuffixLength = 0, iLevelSuffixSize = 0;
+	int32_t iValue = 0, iThreshold, iZeroLeft;
+	int32_t n = 0;
+	int32_t i = 0;
+
+	CAVLC_BS_INIT(pBs);
+
+	/* Step 1: calculate iLevel, uiRun, total coefficients and total zeros;
+	 *         count up to three leading +/-1 levels as trailing ones and collect their signs */
+	if( iCalRunLevelFlag ){
+		int32_t iCount = 0;
+		iTotalZeros = sCoeffFunc.pfCavlcParamCal(pCoffLevel, uiRun, iLevel, &iTotalCoeffs, iEndIdx);
+		iCount = (iTotalCoeffs>3)?3:iTotalCoeffs;
+		for(i = 0;i <iCount ;i++)
+		{
+			if(WELS_ABS(iLevel[i]) == 1)
+			{
+				iTrailingOnes ++;
+				uiSign <<=1;
+				if(iLevel[i]<0)
+					uiSign|=1;
+			}
+			else
+			{
+				break;
+			}
+		}
+	}
+
+	/* Step 2: coeff token; entry [0] is the code value, entry [1] its bit length */
+	const uint8_t *upCoeffToken = &g_kuiVlcCoeffToken[g_kuiEncNcMapTable[iNC]][iTotalCoeffs][iTrailingOnes][0];
+	iValue = upCoeffToken[0];
+	n = upCoeffToken[1];
+
+	if( iTotalCoeffs == 0 )
+	{
+		// nothing but the token to write
+		CAVLC_BS_WRITE(n, iValue);
+
+		CAVLC_BS_UNINIT(pBs);
+		return;
+	}
+
+	/* Step 3: trailing-one sign bits are appended to the token and written in one go */
+	n += iTrailingOnes;
+	iValue = (iValue << iTrailingOnes) + uiSign;
+	CAVLC_BS_WRITE(n, iValue);
+
+	/* Step 4: remaining levels */
+	uiSuffixLength = ( iTotalCoeffs > 10 && iTrailingOnes < 3 ) ? 1 : 0;
+
+	for( i=iTrailingOnes; i<iTotalCoeffs; i++ ){
+		int32_t iVal = iLevel[i];
+
+		// Multiply instead of shifting: iVal - 1 is negative for every iVal <= 0 and
+		// left-shifting a negative signed value is undefined behaviour.
+		iLevelCode = (iVal - 1) * 2;
+		// uiSign is all ones for a negative iLevelCode, zero otherwise; the xor/add below
+		// then folds the value into its non-negative level code.
+		uiSign = (iLevelCode>>31);
+		iLevelCode = (iLevelCode ^ uiSign) + (uiSign<<1);
+		// the first level after fewer than three trailing ones is coded reduced by 2
+		iLevelCode -= ((i == iTrailingOnes) && (iTrailingOnes < 3)) << 1;
+
+		iLevelPrefix = iLevelCode >> uiSuffixLength;
+		iLevelSuffixSize = uiSuffixLength;
+		iLevelSuffix = iLevelCode - (iLevelPrefix<<uiSuffixLength);
+
+		if (iLevelPrefix >= 14 && iLevelPrefix < 30 && uiSuffixLength == 0) {
+			// prefix 14 with a 4-bit suffix
+			iLevelPrefix = 14;
+			iLevelSuffix = iLevelCode - iLevelPrefix;
+			iLevelSuffixSize = 4;
+		}
+		else if (iLevelPrefix >= 15) {
+			// prefix 15 with a 12-bit suffix
+			iLevelPrefix = 15;
+			iLevelSuffix = iLevelCode - (iLevelPrefix << uiSuffixLength);
+
+			if (uiSuffixLength == 0) {
+				iLevelSuffix -= 15;
+			}
+			iLevelSuffixSize = 12;
+		}
+
+		// iLevelPrefix zero bits, a one bit, then the suffix
+		n = iLevelPrefix + 1 + iLevelSuffixSize;
+		iValue = ((1<< iLevelSuffixSize) | iLevelSuffix);
+		CAVLC_BS_WRITE(n, iValue);
+
+		// suffix length becomes at least 1, then grows (up to 6) when |iVal| exceeds the threshold
+		uiSuffixLength += !uiSuffixLength;
+		iThreshold = 3 << ( uiSuffixLength - 1 );
+		uiSuffixLength += ((iVal > iThreshold) || (iVal < -iThreshold)) && (uiSuffixLength < 6);
+	}
+
+	/* Step 5: total zeros, written only when the block is not completely filled */
+	if( iTotalCoeffs < iEndIdx + 1 )
+	{
+		if ( CHROMA_DC != iResidualProperty )
+		{
+			const uint8_t *upTotalZeros = &g_kuiVlcTotalZeros[iTotalCoeffs][iTotalZeros][0];
+			n = upTotalZeros[1];
+			iValue = upTotalZeros[0];
+			CAVLC_BS_WRITE( n, iValue );
+		}
+		else
+		{
+			const uint8_t *upTotalZeros = &g_kuiVlcTotalZerosChromaDc[iTotalCoeffs][iTotalZeros][0];
+			n = upTotalZeros[1];
+			iValue = upTotalZeros[0];
+			CAVLC_BS_WRITE( n, iValue );
+		}
+	}
+
+	/* Step 6: run before, for every coefficient except the last, while zeros remain */
+	iZerosLeft = iTotalZeros;
+	for( i = 0; i+1 < iTotalCoeffs && iZerosLeft > 0; ++ i )
+	{
+		const uint8_t uirun = uiRun[i];
+		iZeroLeft = g_kuiZeroLeftMap[iZerosLeft];
+		n = g_kuiVlcRunBefore[iZeroLeft][uirun][1];
+		iValue = g_kuiVlcRunBefore[iZeroLeft][uirun][0];
+		CAVLC_BS_WRITE(n, iValue);
+		iZerosLeft -= uirun;
+	}
+
+	CAVLC_BS_UNINIT(pBs);
+}
+
+
+/*!
+ * \brief	Select the implementation stored in sCoeffFunc.pfCavlcParamCal.
+ *
+ * \param	uiCpuFlag	bit mask of WELS_CPU_* capabilities; with X86_ASM defined and
+ *						WELS_CPU_SSE2 set the SSE2 routine is chosen, otherwise the C routine
+ */
+void InitCoeffFunc( const uint32_t uiCpuFlag)
+{
+#if defined(X86_ASM)
+	if( uiCpuFlag & WELS_CPU_SSE2 ){
+		sCoeffFunc.pfCavlcParamCal = CavlcParamCal_sse2;
+		return;
+	}
+#endif
+	// portable fallback
+	sCoeffFunc.pfCavlcParamCal = CavlcParamCal_c;
+}
+
+} // namespace WelsSVCEnc
--- /dev/null
+++ b/codec/encoder/core/src/slice_multi_threading.cpp
@@ -1,0 +1,1592 @@
+/*!
+ * \copy
+ *     Copyright (c)  2010-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	slice_multi_threading.cpp
+ *
+ * \brief	slice based multiple threading
+ *
+ * \date	04/16/2010 Created
+ *
+ *************************************************************************************
+ */
+
+#if defined(MT_ENABLED)
+
+#include <assert.h>
+#ifdef __GNUC__
+#include <semaphore.h>
+#ifndef SEM_NAME_MAX
+// length of semaphore name should be system constrained at least on mac 10.7
+#define  SEM_NAME_MAX 32
+#endif//SEM_NAME_MAX
+#endif//__GNUC__
+#include "slice_multi_threading.h"
+#include "mt_defs.h"
+#include "nal_encap.h"
+#include "utils.h"
+#include "encoder.h"
+#include "svc_encode_slice.h"
+#include "deblocking.h"
+#include "svc_enc_golomb.h"
+#include "crt_util_safe_x.h"	// for safe crt like calls
+#include "rc.h"
+
+#if defined(X86_ASM)
+#include "cpu.h"
+#endif//X86_ASM
+
+#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
+#include "measure_time.h"
+#endif//DYNAMIC_SLICE_ASSIGN
+namespace WelsSVCEnc {
+/*!
+ * \brief	Refresh neighbor availability and slice id for every MB of one slice.
+ *
+ * Visits the MBs of pMbList starting at pFirstMbInSlice[uiSliceIdc] and covering
+ * pCountMbNumInSlice[uiSliceIdc] entries (the first MB is always visited).
+ * A left / top / top-left / top-right neighbor is flagged available only when it
+ * lies inside the picture and pOverallMbMap maps it to uiSliceIdc as well.
+ */
+void UpdateMbListNeighborParallel(	SSliceCtx *pSliceCtx,
+									  SMB *pMbList,
+									  const int32_t uiSliceIdc	)
+{
+	const uint8_t *kpSliceMap	= pSliceCtx->pOverallMbMap;
+	const int32_t kiWidthInMb	= pSliceCtx->iMbWidth;
+	const int32_t kiFirstMb		= pSliceCtx->pFirstMbInSlice[uiSliceIdc];
+	const int32_t kiLastMb		= kiFirstMb + pSliceCtx->pCountMbNumInSlice[uiSliceIdc] - 1;
+	int32_t iMbIdx				= kiFirstMb;
+
+	do {
+		SMB *pCurMb				= &pMbList[iMbIdx];
+		const int32_t kiCurXY	= pCurMb->iMbXY;
+		const int32_t kiCurX	= pCurMb->iMbX;
+		const int32_t kiCurY	= pCurMb->iMbY;
+		const int32_t kiAboveXY	= kiCurXY - kiWidthInMb;
+		const bool kbHasLeftCol	= (kiCurX > 0);
+		const bool kbHasTopRow	= (kiCurY > 0);
+		const bool kbHasRightCol= (kiCurX < (kiWidthInMb-1));
+		uint32_t uiAvail		= 0;
+
+		// the map is only consulted once the position test passed, so no out-of-picture read happens
+		if ( kbHasLeftCol && (uiSliceIdc == kpSliceMap[kiCurXY - 1]) )
+			uiAvail |= LEFT_MB_POS;
+		if ( kbHasTopRow && (uiSliceIdc == kpSliceMap[kiAboveXY]) )
+			uiAvail |= TOP_MB_POS;
+		if ( kbHasLeftCol && kbHasTopRow && (uiSliceIdc == kpSliceMap[kiAboveXY - 1]) )
+			uiAvail |= TOPLEFT_MB_POS;
+		if ( kbHasRightCol && kbHasTopRow && (uiSliceIdc == kpSliceMap[kiAboveXY + 1]) )
+			uiAvail |= TOPRIGHT_MB_POS;
+
+		pCurMb->uiNeighborAvail	= (uint8_t)uiAvail;
+		pCurMb->uiSliceIdc		= uiSliceIdc;
+
+		++ iMbIdx;
+	} while(iMbIdx <= kiLastMb);
+}
+
+/*!
+ * \brief	Derive a normalized per-slice ratio from MB counts and consumed time.
+ *
+ * For every slice the value (MB count / consumed time) is computed, then each value is
+ * divided by the sum over all slices and stored into pRatio (treated as float[]).
+ *
+ * \param	pRatio			output, float array with iSliceNumInFrame entries
+ * \param	pSliceCtx		provides pCountMbNumInSlice[] and iSliceNumInFrame
+ * \param	pSliceConsume	consumed time of each slice
+ *
+ * NOTE(review): a zero entry in pSliceConsume or more than MAX_SLICES_NUM slices is not
+ * handled here - presumably guaranteed by the caller, to be confirmed.
+ */
+void CalcSliceComplexRatio( void *pRatio, SSliceCtx *pSliceCtx, uint32_t *pSliceConsume )
+{
+	float *pOutRatio			= (float *)pRatio;
+	const int32_t *kpMbCount	= (const int32_t *)pSliceCtx->pCountMbNumInSlice;
+	const int32_t kiSliceNum	= pSliceCtx->iSliceNumInFrame;
+	float fSpeed[MAX_SLICES_NUM];
+	float fSpeedSum				= .0f;
+	int32_t i;
+
+#if defined(X86_ASM)
+	WelsEmms();
+#endif //X86_ASM
+
+	// pass 1: per slice MBs per time unit, accumulated in ascending slice order
+	for ( i = 0; i < kiSliceNum; ++ i )
+	{
+		fSpeed[i]	= 1.0f * kpMbCount[i] / pSliceConsume[i];
+#if defined(ENABLE_TRACE_MT)
+		WelsLog(NULL, WELS_LOG_DEBUG, "[MT] CalcSliceComplexRatio(), pSliceConsumeTime[%d]= %d us, slice_run= %d\n", i, pSliceConsume[i], kpMbCount[i]);
+#endif//ENABLE_TRACE_MT
+		fSpeedSum += fSpeed[i];
+	}
+
+	// pass 2: normalize against the sum
+	for ( i = kiSliceNum - 1; i >= 0; -- i )
+	{
+		pOutRatio[i] = fSpeed[i] / fSpeedSum;
+	}
+}
+
+#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN) && defined(NOT_ABSOLUTE_BALANCING)
+/*!
+ * \brief	Decide whether the slice partition needs to be re-balanced.
+ *
+ * \param	pConsumeTime	consumed time of each slice, read as uint32_t[iSliceNum]
+ * \param	iSliceNum		number of entries in pConsumeTime
+ *
+ * \return	true (non-zero) when an adjustment is wanted, false otherwise; false is also
+ *			returned when the total consumed time is zero.
+ *
+ * With USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING the root mean square error of the
+ * per-slice time ratios against 1/iSliceNum is compared with a threshold chosen by the
+ * slice count; otherwise every ratio is checked against the tolerant lower/upper bounds.
+ */
+int32_t NeedDynamicAdjust( void *pConsumeTime, const int32_t iSliceNum )
+{
+#if !defined(USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING)
+	const float fRatioLower	= TOLERANT_BALANCING_RATIO_LOWER( iSliceNum );
+	const float fRatioUpper	= TOLERANT_BALANCING_RATIO_UPPER( iSliceNum );
+#endif//USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING
+	uint32_t *pSliceConsume	= (uint32_t *)pConsumeTime;
+	uint32_t uiTotalConsume	= 0;
+	int32_t iSliceIdx		= 0;
+	int32_t iNeedAdj		= false;
+
+#if defined(X86_ASM)
+	WelsEmms();
+#endif //X86_ASM
+
+	// accumulate one entry per step, so an odd iSliceNum never reads past the end of the array
+	while( iSliceIdx < iSliceNum )
+	{
+		uiTotalConsume += pSliceConsume[iSliceIdx];
+		++ iSliceIdx;
+	}
+	if (uiTotalConsume == 0)
+	{
+#if defined(ENABLE_TRACE_MT)
+		WelsLog( NULL, WELS_LOG_DEBUG, "[MT] NeedDynamicAdjust(), herein do no adjust due first picture, iCountSliceNum= %d\n", iSliceNum );
+#endif//ENABLE_TRACE_MT
+		return false;
+	}
+
+	iSliceIdx = 0;
+#if defined(USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING)
+	float fThr				= EPSN;	// threshold for various cores cases
+	float fRmse				= .0f;	// root mean square error of slice consume ratios
+	const float kfMeanRatio	= 1.0f / iSliceNum;
+	// NOTE(review): the last slice is not accumulated (loop stops at iSliceNum-2) while the
+	// divisor below is iSliceNum - kept as is, confirm this is intended.
+	do{
+		const float fRatio = 1.0f * pSliceConsume[iSliceIdx] / uiTotalConsume;
+		const float fDiffRatio = fRatio - kfMeanRatio;
+		fRmse += (fDiffRatio * fDiffRatio);
+		++ iSliceIdx;
+	} while ( iSliceIdx+1 < iSliceNum );
+	fRmse = sqrtf(fRmse/iSliceNum);
+	if ( iSliceNum >= 8 )
+	{
+		fThr += THRESHOLD_RMSE_CORE8;
+	}
+	else if ( iSliceNum >= 4 )
+	{
+		fThr += THRESHOLD_RMSE_CORE4;
+	}
+	else if ( iSliceNum >= 2 )
+	{
+		fThr += THRESHOLD_RMSE_CORE2;
+	}
+	else
+		fThr = 1.0f;
+	if ( fRmse > fThr )
+		iNeedAdj	= true;
+#if defined(ENABLE_TRACE_MT)
+	WelsLog(NULL, WELS_LOG_DEBUG, "[MT] NeedDynamicAdjust(), herein adjustment decision is made (iNeedAdj= %d) by: fRmse of pSlice complexity ratios %.6f, the corresponding threshold %.6f, iCountSliceNum %d\n",
+		iNeedAdj, fRmse, fThr, iSliceNum);
+#endif//ENABLE_TRACE_MT
+#else
+	// this branch used the undeclared names uiSliceIdx / uiSliceNum / ratio_upper before
+	do{
+		const float kfRatio = 1.0f * pSliceConsume[iSliceIdx] / uiTotalConsume;
+		if ( kfRatio+EPSN < fRatioLower || kfRatio > fRatioUpper+EPSN )
+		{
+#if defined(ENABLE_TRACE_MT)
+			WelsLog(NULL, WELS_LOG_DEBUG, "[MT] NeedDynamicAdjust(), herein adjustment decision is made by pSlice consume time not balanced at all, uiSliceIdx= %d, comp_ratio= %.6f, pSliceConsumeTime= %d, total_consume_time= %d, iCountSliceNum= %d\n",
+				iSliceIdx, kfRatio, pSliceConsume[iSliceIdx], uiTotalConsume, iSliceNum);
+#endif//ENABLE_TRACE_MT
+			iNeedAdj = true;
+			break;
+		}
+		++ iSliceIdx;
+	} while ( iSliceIdx+1 < iSliceNum );
+#endif//USE_RMSE_SLICE_COMPLEXITY_RATIO_FOR_BALANCING
+
+	return iNeedAdj;
+}
+#endif//..
+
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+/*!
+ * \brief	Re-partition the MBs of the current layer among its slices according to pComplexRatio.
+ *
+ * \param	pCtx			encoder context (rate control switch, thread count, events)
+ * \param	pCurDqLayer		layer whose pSliceEncCtx gets re-partitioned
+ * \param	pComplexRatio	float array, one ratio per slice
+ * \param	iCurDid			index into pCtx->pWelsSvcRc
+ *
+ * Returns without changing anything when the slice count is odd or below 2, when rate control
+ * reports a non-positive GOM size, or when there are not enough MBs to give every slice one GOM.
+ */
+void DynamicAdjustSlicing(	sWelsEncCtx *pCtx,
+								SDqLayer *pCurDqLayer,
+								void *pComplexRatio,
+								int32_t iCurDid )
+{	
+	SSliceCtx *pSliceCtx	= pCurDqLayer->pSliceEncCtx;
+	const int32_t kiCountSliceNum	= pSliceCtx->iSliceNumInFrame;	
+	const int32_t kiCountNumMb		= pSliceCtx->iMbNumInFrame;
+	int32_t iMinimalMbNum			= pSliceCtx->iMbWidth;	// in theory we need only 1 SMB, here let it as one SMB row required
+	int32_t iMaximalMbNum			= 0;	// dynamically assign later
+	float *pSliceComplexRatio	= (float *)pComplexRatio;	
+	int32_t iMbNumLeft					= kiCountNumMb;
+	int32_t iRunLen[MAX_THREADS_NUM]	= {0};
+	int32_t iSliceIdx					= 0;
+
+	// only assigned and only read when rate control (bEnableRc) is on
+	int32_t iNumMbInEachGom;
+	SWelsSvcRc *pWelsSvcRc = &pCtx->pWelsSvcRc[iCurDid];
+	if(pCtx->pSvcParam->bEnableRc)
+	{
+		iNumMbInEachGom = pWelsSvcRc->iNumberMbGom;
+
+		if ( iNumMbInEachGom <= 0 )
+		{
+			WelsLog(pCtx, WELS_LOG_ERROR, "[MT] DynamicAdjustSlicing(), invalid iNumMbInEachGom= %d from RC, iDid= %d, iCountNumMb= %d\n", iNumMbInEachGom, iCurDid, kiCountNumMb);
+			return;
+		}
+
+		// do not adjust in case no extra iNumMbInEachGom based left for slicing adjustment,
+		// extra MB of non integrated GOM assigned at the last pSlice in default, keep up on early initial result.
+		if ( iNumMbInEachGom * kiCountSliceNum >= kiCountNumMb )
+		{
+			return;
+		}		
+		iMinimalMbNum	= iNumMbInEachGom;		
+	}
+	
+	if ( kiCountSliceNum < 2 || (kiCountSliceNum & 0x01) )	// we need suppose uiSliceNum is even for multiple threading
+		return;
+
+	// upper bound for the first slice: every other slice keeps at least iMinimalMbNum MBs
+	iMaximalMbNum	= kiCountNumMb - (kiCountSliceNum - 1) * iMinimalMbNum;
+
+#if defined(X86_ASM)
+	WelsEmms();
+#endif //X86_ASM
+	
+#if defined(ENABLE_TRACE_MT)
+	WelsLog(pCtx, WELS_LOG_DEBUG, "[MT] DynamicAdjustSlicing(), iDid= %d, iCountNumMb= %d\n", iCurDid, kiCountNumMb);
+#endif//ENABLE_TRACE_MT
+
+	// assign run lengths to all slices but the last one; the last slice takes what is left
+	iSliceIdx	= 0;
+	while (iSliceIdx+1 < kiCountSliceNum) {
+		int32_t iNumMbAssigning = (int32_t)(kiCountNumMb * pSliceComplexRatio[iSliceIdx] + EPSN);			
+
+		// GOM boundary aligned
+		if(pCtx->pSvcParam->bEnableRc)
+		{
+			iNumMbAssigning=(int32_t)(1.0f * iNumMbAssigning / iNumMbInEachGom + 0.5f + EPSN) * iNumMbInEachGom;			
+		}
+
+		// make sure one GOM at least in each pSlice for safe
+		if ( iNumMbAssigning < iMinimalMbNum )
+			iNumMbAssigning	= iMinimalMbNum;
+		else if ( iNumMbAssigning > iMaximalMbNum )
+			iNumMbAssigning	= iMaximalMbNum;
+
+		assert( iNumMbAssigning > 0 );
+
+		iMbNumLeft -= iNumMbAssigning;
+		if ( iMbNumLeft <= 0 )	// error due to we can not support slice_skip now yet, do not adjust this time
+		{
+			assert( 0 );
+			return;
+		}
+		iRunLen[iSliceIdx]	= iNumMbAssigning;
+#if defined(ENABLE_TRACE_MT)
+		WelsLog(pCtx, WELS_LOG_DEBUG, "[MT] DynamicAdjustSlicing(), uiSliceIdx= %d, pSliceComplexRatio= %.2f, slice_run_org= %d, slice_run_adj= %d\n", 
+			iSliceIdx, pSliceComplexRatio[iSliceIdx], pSliceCtx->pCountMbNumInSlice[iSliceIdx], iNumMbAssigning);
+#endif//ENABLE_TRACE_MT
+		++ iSliceIdx;
+		iMaximalMbNum	= iMbNumLeft - (kiCountSliceNum - iSliceIdx - 1) * iMinimalMbNum;	// get maximal num_mb in left parts
+	}
+	iRunLen[iSliceIdx] = iMbNumLeft;
+#if defined(ENABLE_TRACE_MT)
+	WelsLog(pCtx, WELS_LOG_DEBUG, "[MT] DynamicAdjustSlicing(), iSliceIdx= %d, pSliceComplexRatio= %.2f, slice_run_org= %d, slice_run_adj= %d\n", 
+		iSliceIdx, pSliceComplexRatio[iSliceIdx], pSliceCtx->pCountMbNumInSlice[iSliceIdx], iMbNumLeft);
+#endif//ENABLE_TRACE_MT
+
+
+	// when the new run lengths were accepted (return 0): signal every thread's pUpdateMbListEvent
+	// and block until all pFinUpdateMbListEvent are signaled
+	if ( DynamicAdjustSlicePEncCtxAll( pSliceCtx, iRunLen ) == 0 )
+	{
+		const int32_t kiThreadNum	= pCtx->pSvcParam->iCountThreadsNum;
+		int32_t iThreadIdx			= 0;
+		do {
+#ifdef WIN32
+			WelsEventSignal( &pCtx->pSliceThreading->pUpdateMbListEvent[iThreadIdx] );
+#else
+			WelsEventSignal( pCtx->pSliceThreading->pUpdateMbListEvent[iThreadIdx] );
+#endif//WIN32
+			++ iThreadIdx;
+		} while(iThreadIdx < kiThreadNum);
+
+		WelsMultipleEventsWaitAllBlocking( kiThreadNum, &pCtx->pSliceThreading->pFinUpdateMbListEvent[0] );
+	}
+}
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+
+#ifdef PACKING_ONE_SLICE_PER_LAYER
+/*
+ * Rewinds the write position (uiBsPos) of every slice bitstream in pCtx->pSliceBs,
+ * for the first pCtx->iMaxSliceCount entries.
+ */
+void ResetEnvMt( sWelsEncCtx *pCtx)
+{
+	const int16_t kiSliceCount = pCtx->iMaxSliceCount;
+
+	for ( int32_t iSlice = 0; iSlice < kiSliceCount; ++ iSlice )
+	{
+		pCtx->pSliceBs[iSlice].uiBsPos	= 0;
+	}
+}
+#endif//PACKING_ONE_SLICE_PER_LAYER
+
+/*!
+ * \brief	Allocate and initialize all resources needed by slice based multi-threading.
+ *
+ * \param	ppCtx					encoder context; its pSliceThreading and pSliceBs members are set here
+ * \param	pCodingParam			coding parameters (spatial layer count, thread count, slice options)
+ * \param	iCountBsLen				offset in (*ppCtx)->pFrameBs where the per-slice output areas start
+ * \param	iTargetSpatialBsSize	size of every per-slice bitstream buffer / output area
+ *
+ * \return	0 on success; 1 on invalid arguments. Every allocation is checked through
+ *			WELS_VERIFY_RETURN_PROC_IF( 1, ..., FreeMemorySvc(ppCtx) ) - presumably this frees
+ *			the context and returns 1 on failure, confirm against the macro definition.
+ */
+int32_t RequestMtResource( sWelsEncCtx **ppCtx, SWelsSvcCodingParam *pCodingParam, const int32_t iCountBsLen, const int32_t iTargetSpatialBsSize )
+{
+	CMemoryAlign *pMa			= NULL;
+	SWelsSvcCodingParam *pPara= NULL;
+	SSliceThreading *pSmt		= NULL;
+	SWelsSliceBs *pSliceB		= NULL;
+	uint8_t *pBsBase			= NULL;
+	int32_t iNumSpatialLayers	= 0;
+	int32_t iThreadNum			= 0;
+	int32_t iIdx					= 0;
+	int32_t iSliceBsBufferSize= 0;
+	int16_t iMaxSliceNum		= 1;
+
+	if ( NULL == ppCtx || NULL == pCodingParam || NULL == *ppCtx || iCountBsLen <= 0 )
+		return 1;
+
+	pMa	= (*ppCtx)->pMemAlign;
+	pPara= pCodingParam;
+	iNumSpatialLayers	= pPara->iNumDependencyLayer;
+	iThreadNum	= pPara->iCountThreadsNum;
+	iMaxSliceNum = (*ppCtx)->iMaxSliceCount;
+
+	// threading context plus the per-thread arrays
+	pSmt	= (SSliceThreading *)pMa->WelsMalloc(sizeof(SSliceThreading), "SSliceThreading");
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt), FreeMemorySvc(ppCtx) )
+	(*ppCtx)->pSliceThreading	= pSmt;
+	pSmt->pThreadPEncCtx	= (SSliceThreadPrivateData *)pMa->WelsMalloc( sizeof(SSliceThreadPrivateData) * iThreadNum, "pThreadPEncCtx" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pThreadPEncCtx), FreeMemorySvc(ppCtx) )
+	pSmt->pThreadHandles	= (WELS_THREAD_HANDLE *)pMa->WelsMalloc( sizeof(WELS_THREAD_HANDLE) * iThreadNum, "pThreadHandles" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pThreadHandles), FreeMemorySvc(ppCtx) )
+
+#ifdef WIN32
+	pSmt->pSliceCodedEvent	= (WELS_EVENT *)pMa->WelsMalloc( sizeof(WELS_EVENT) * iThreadNum, "pSliceCodedEvent" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pSliceCodedEvent), FreeMemorySvc(ppCtx) )
+	pSmt->pReadySliceCodingEvent	= (WELS_EVENT *)pMa->WelsMalloc( sizeof(WELS_EVENT) * iThreadNum, "pReadySliceCodingEvent" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pReadySliceCodingEvent), FreeMemorySvc(ppCtx) )
+	pSmt->pFinSliceCodingEvent	= (WELS_EVENT *)pMa->WelsMalloc( sizeof(WELS_EVENT) * iThreadNum, "pFinSliceCodingEvent" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pFinSliceCodingEvent), FreeMemorySvc(ppCtx) )
+#endif//WIN32
+
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+#if defined(__GNUC__)
+	pSmt->pUpdateMbListThrdHandles	= (WELS_THREAD_HANDLE *)pMa->WelsMalloc( sizeof(WELS_THREAD_HANDLE) * iThreadNum, "pUpdateMbListThrdHandles" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pUpdateMbListThrdHandles), FreeMemorySvc(ppCtx) )
+#endif//__GNUC__
+#ifdef WIN32
+	pSmt->pUpdateMbListEvent	= (WELS_EVENT *)pMa->WelsMalloc( sizeof(WELS_EVENT) * iThreadNum, "pUpdateMbListEvent" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pUpdateMbListEvent), FreeMemorySvc(ppCtx) )
+	pSmt->pFinUpdateMbListEvent	= (WELS_EVENT *)pMa->WelsMalloc( sizeof(WELS_EVENT) * iThreadNum, "pFinUpdateMbListEvent" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pFinUpdateMbListEvent), FreeMemorySvc(ppCtx) )
+#endif//WIN32
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+
+#ifdef WIN32
+	pSmt->pExitEncodeEvent	= (WELS_EVENT *)pMa->WelsMalloc( sizeof(WELS_EVENT) * iThreadNum, "pExitEncodeEvent" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pExitEncodeEvent), FreeMemorySvc(ppCtx) )
+#endif//WIN32
+
+#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
+	// per layer statistics are only allocated for fixed-slice-number layers that have at
+	// most as many slices as iMultipleThreadIdc; all other layers get NULL
+	iIdx = 0;
+	while ( iIdx < iNumSpatialLayers )
+	{
+		SMulSliceOption *pMso	= &pPara->sDependencyLayers[iIdx].sMso;
+		const int32_t kiSliceNum= pMso->sSliceArgument.iSliceNum;
+		if (pMso->uiSliceMode == SM_FIXEDSLCNUM_SLICE && pPara->iMultipleThreadIdc > 1 && pPara->iMultipleThreadIdc >= kiSliceNum )
+		{
+			pSmt->pSliceConsumeTime[iIdx]	= (uint32_t *)pMa->WelsMallocz( kiSliceNum * sizeof(uint32_t), "pSliceConsumeTime[]" );
+			WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pSliceConsumeTime[iIdx]), FreeMemorySvc(ppCtx) )
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+			pSmt->pSliceComplexRatio[iIdx]	= (float *)pMa->WelsMalloc( kiSliceNum * sizeof(float), "pSliceComplexRatio[]" );
+			WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pSliceComplexRatio[iIdx]), FreeMemorySvc(ppCtx) )
+#endif//TRY_SLICING_BALANCE
+		}
+		else
+		{
+			pSmt->pSliceConsumeTime[iIdx]	= NULL;
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+			pSmt->pSliceComplexRatio[iIdx]	= NULL;
+#endif//TRY_SLICING_BALANCE
+		}
+		++ iIdx;
+	}
+	// NULL for pSliceConsumeTime[iIdx]: iIdx from iNumSpatialLayers to MAX_DEPENDENCY_LAYERS
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
+
+#ifdef MT_DEBUG
+	// file handle for MT debug
+	pSmt->pFSliceDiff = NULL;
+
+	if ( pSmt->pFSliceDiff )
+	{
+		fclose( pSmt->pFSliceDiff );
+		pSmt->pFSliceDiff = NULL;
+	}
+#ifdef WIN32
+	pSmt->pFSliceDiff	= fopen(".\\slice_time.txt", "wt+" );
+#else
+	pSmt->pFSliceDiff	= fopen("/tmp/slice_time.txt", "wt+" );
+#endif//WIN32
+#endif//MT_DEBUG
+
+#if defined(ENABLE_TRACE_MT)
+	WelsLog((*ppCtx), WELS_LOG_INFO, "encpEncCtx= 0x%p\n", (void *)(*ppCtx));
+#endif//ENABLE_TRACE_MT
+
+	// per thread private data and events; non-WIN32 events are opened by name, the name is
+	// built from a two letter tag, the thread index and the context address
+	iIdx = 0;
+	while ( iIdx < iThreadNum )
+	{
+#ifdef __GNUC__	// for posix threading
+		str_t name[SEM_NAME_MAX] = {0};
+		int32_t used_len = 0;
+		WELS_THREAD_ERROR_CODE err = 0;
+#endif//__GNUC__
+		pSmt->pThreadPEncCtx[iIdx].pWelsPEncCtx	= (void *)(*ppCtx);
+		pSmt->pThreadPEncCtx[iIdx].iSliceIndex	= iIdx;
+		pSmt->pThreadPEncCtx[iIdx].iThreadIndex	= iIdx;
+		pSmt->pThreadHandles[iIdx]				= 0;
+
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+#ifdef WIN32
+		WelsEventInit( &pSmt->pUpdateMbListEvent[iIdx] );
+		WelsEventInit( &pSmt->pFinUpdateMbListEvent[iIdx] );
+#else
+		// length of semaphore name should be system constrained at least on mac 10.7
+		SNPRINTF( name, SEM_NAME_MAX, "ud%d%p", iIdx, (void *)(*ppCtx) );
+		err = WelsEventOpen( &pSmt->pUpdateMbListEvent[iIdx], name );
+#if defined(ENABLE_TRACE_MT)
+		WelsLog((*ppCtx), WELS_LOG_INFO, "[MT] Open pUpdateMbListEvent%d named(%s) ret%d err%d\n", iIdx, name, err, errno);
+#endif
+		used_len = SNPRINTF( name, SEM_NAME_MAX, "fu%d%p", iIdx, (void *)(*ppCtx) );
+		name[used_len] = '\0';
+		err = WelsEventOpen( &pSmt->pFinUpdateMbListEvent[iIdx], name );
+#if defined(ENABLE_TRACE_MT)
+		WelsLog((*ppCtx), WELS_LOG_INFO, "[MT] Open pFinUpdateMbListEvent%d named(%s) ret%d err%d\n", iIdx, name, err, errno);
+#endif
+#endif//WIN32
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+
+#ifdef WIN32
+		WelsEventInit( &pSmt->pSliceCodedEvent[iIdx] );
+		WelsEventInit( &pSmt->pReadySliceCodingEvent[iIdx] );
+		WelsEventInit( &pSmt->pFinSliceCodingEvent[iIdx] );
+		WelsEventInit( &pSmt->pExitEncodeEvent[iIdx] );
+#else
+		used_len = SNPRINTF( name, SEM_NAME_MAX, "sc%d%p", iIdx, (void *)(*ppCtx) );
+		name[used_len] = '\0';
+		err = WelsEventOpen( &pSmt->pSliceCodedEvent[iIdx], name );
+#if defined(ENABLE_TRACE_MT)
+		WelsLog((*ppCtx), WELS_LOG_INFO, "[MT] Open pSliceCodedEvent%d named(%s) ret%d err%d\n", iIdx, name, err, errno);
+#endif
+		used_len = SNPRINTF( name, SEM_NAME_MAX, "rc%d%p", iIdx, (void *)(*ppCtx) );
+		name[used_len] = '\0';
+		err = WelsEventOpen( &pSmt->pReadySliceCodingEvent[iIdx], name );
+#if defined(ENABLE_TRACE_MT)
+		// arguments follow the format: index, event pointer (%p), event name (%s), return code, errno
+		WelsLog((*ppCtx), WELS_LOG_INFO, "[MT] Open pReadySliceCodingEvent%d = 0x%p named(%s) ret%d err%d\n", iIdx, (void *)pSmt->pReadySliceCodingEvent[iIdx], name, err, errno);
+#endif
+#endif//WIN32
+
+		++ iIdx;
+	}
+
+#ifdef PACKING_ONE_SLICE_PER_LAYER
+	pSmt->pCountBsSizeInPartition	= (uint32_t *)pMa->WelsMalloc( sizeof(uint32_t) * iThreadNum, "pCountBsSizeInPartition" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSmt->pCountBsSizeInPartition), FreeMemorySvc(ppCtx) )
+#endif//PACKING_ONE_SLICE_PER_LAYER
+
+	WelsMutexInit( &pSmt->mutexSliceNumUpdate );
+
+	(*ppCtx)->pSliceBs	= (SWelsSliceBs *)pMa->WelsMalloc( sizeof(SWelsSliceBs) * iMaxSliceNum, "pSliceBs" );
+	WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == (*ppCtx)->pSliceBs), FreeMemorySvc(ppCtx) )
+
+	// every slice owns a raw buffer (pBsBuffer); slices 1..N-1 additionally get an output
+	// area (pBs) located behind iCountBsLen in pFrameBs, slice 0 has no such area (pBs = NULL)
+	pBsBase		= (*ppCtx)->pFrameBs + iCountBsLen;
+	pSliceB	= (*ppCtx)->pSliceBs;
+	iSliceBsBufferSize	= iTargetSpatialBsSize;
+	iIdx = 0;
+	while ( iIdx < iMaxSliceNum )
+	{
+		pSliceB->pBsBuffer	= (uint8_t *)pMa->WelsMalloc( iSliceBsBufferSize, "pSliceB->pBsBuffer" );
+
+		WELS_VERIFY_RETURN_PROC_IF( 1, (NULL == pSliceB->pBsBuffer), FreeMemorySvc(ppCtx) )
+		pSliceB->uiSize	= iSliceBsBufferSize;
+
+		if ( iIdx > 0 )
+		{
+			pSliceB->pBs		= pBsBase;
+			pSliceB->uiBsPos	= 0;
+			pBsBase				+= iSliceBsBufferSize;
+		}
+		else
+		{
+			pSliceB->pBs		= NULL;
+			pSliceB->uiBsPos	= 0;
+		}
+		++ pSliceB;
+		++ iIdx;
+	}
+
+#if defined(ENABLE_TRACE_MT)
+	WelsLog((*ppCtx), WELS_LOG_INFO, "RequestMtResource(), iThreadNum=%d, iCountSliceNum= %d\n", pPara->iCountThreadsNum, iMaxSliceNum);
+#endif
+
+	return 0;
+}
+
+/*!
+ * \brief	Release everything that hangs off (*ppCtx)->pSliceThreading and (*ppCtx)->pSliceBs.
+ *
+ * Does nothing when ppCtx, *ppCtx or the threading context is NULL. Events are destroyed
+ * (WIN32) or closed by name (otherwise) per thread first, then the arrays, the slice
+ * bitstream buffers and finally the SSliceThreading object itself are freed and the
+ * pointers are reset to NULL.
+ */
+void ReleaseMtResource( sWelsEncCtx **ppCtx )
+{
+	SWelsSliceBs *pSliceB			= NULL;
+	SWelsSvcCodingParam *pCodingParam	= NULL;
+	SSliceThreading *pSmt			= NULL;
+	CMemoryAlign *pMa				= NULL;	
+	int32_t iIdx						= 0;
+	int32_t iThreadNum				= 0;
+	int16_t uiSliceNum				= 0;
+
+	if ( NULL == ppCtx || NULL == *ppCtx )
+		return;
+
+	pMa			= (*ppCtx)->pMemAlign;
+	pCodingParam		= (*ppCtx)->pSvcParam;
+	uiSliceNum	= (*ppCtx)->iMaxSliceCount;
+	iThreadNum	= (*ppCtx)->pSvcParam->iCountThreadsNum;
+	pSmt		= (*ppCtx)->pSliceThreading;
+
+	if ( NULL == pSmt )
+		return;
+
+	// per thread teardown of handles and events
+	while ( iIdx < iThreadNum) {
+#ifdef WIN32
+		if ( pSmt->pThreadHandles != NULL && pSmt->pThreadHandles[iIdx] != NULL )
+			WelsThreadDestroy( &pSmt->pThreadHandles[iIdx] );
+
+		if ( pSmt->pSliceCodedEvent != NULL )
+			WelsEventDestroy( &pSmt->pSliceCodedEvent[iIdx] );
+		if ( pSmt->pReadySliceCodingEvent != NULL )
+			WelsEventDestroy( &pSmt->pReadySliceCodingEvent[iIdx] );
+		if ( pSmt->pFinSliceCodingEvent != NULL )
+			WelsEventDestroy( &pSmt->pFinSliceCodingEvent[iIdx] );
+		if ( pSmt->pExitEncodeEvent != NULL )
+			WelsEventDestroy( &pSmt->pExitEncodeEvent[iIdx] );
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+		if ( pSmt->pUpdateMbListEvent != NULL )
+			WelsEventDestroy( &pSmt->pUpdateMbListEvent[iIdx] );
+		if ( pSmt->pFinUpdateMbListEvent != NULL )
+			WelsEventDestroy( &pSmt->pFinUpdateMbListEvent[iIdx] );
+#endif//DYNAMIC_SLICE_ASSIGN && TRY_SLICING_BALANCE
+#else	
+		// event names are rebuilt with the same tag + index + context address pattern
+		// ("sc", "rc", "ud", "fu") before each close
+		str_t ename[SEM_NAME_MAX] = {0};
+		int32_t used_len = 0;
+		// length of semaphore name should be system constrained at least on mac 10.7
+		SNPRINTF( ename, SEM_NAME_MAX, "sc%d%p", iIdx, (void *)(*ppCtx) );
+		WelsEventClose( pSmt->pSliceCodedEvent[iIdx], ename );
+		used_len = SNPRINTF( ename, SEM_NAME_MAX, "rc%d%p", iIdx, (void *)(*ppCtx) );
+		ename[used_len] = '\0';
+		WelsEventClose( pSmt->pReadySliceCodingEvent[iIdx], ename );
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+		used_len = SNPRINTF( ename, SEM_NAME_MAX, "ud%d%p", iIdx, (void *)(*ppCtx) );
+		ename[used_len] = '\0';
+		WelsEventClose( pSmt->pUpdateMbListEvent[iIdx], ename );
+		used_len = SNPRINTF( ename, SEM_NAME_MAX, "fu%d%p", iIdx, (void *)(*ppCtx) );
+		ename[used_len] = '\0';
+		WelsEventClose( pSmt->pFinUpdateMbListEvent[iIdx], ename );
+#endif//DYNAMIC_SLICE_ASSIGN && TRY_SLICING_BALANCE
+#endif//WIN32		
+
+		++ iIdx;
+	}
+
+#ifdef WIN32
+	// the event arrays themselves are only heap allocated in the WIN32 build
+	if ( pSmt->pExitEncodeEvent != NULL )
+	{
+		pMa->WelsFree( pSmt->pExitEncodeEvent, "pExitEncodeEvent" );
+		pSmt->pExitEncodeEvent = NULL;
+	}
+	if ( pSmt->pSliceCodedEvent != NULL )
+	{
+		pMa->WelsFree( pSmt->pSliceCodedEvent, "pSliceCodedEvent" );
+		pSmt->pSliceCodedEvent = NULL;
+	}
+	if ( pSmt->pReadySliceCodingEvent != NULL )
+	{
+		pMa->WelsFree( pSmt->pReadySliceCodingEvent, "pReadySliceCodingEvent" );
+		pSmt->pReadySliceCodingEvent = NULL;
+	}
+	if ( pSmt->pFinSliceCodingEvent != NULL )
+	{
+		pMa->WelsFree( pSmt->pFinSliceCodingEvent, "pFinSliceCodingEvent" );
+		pSmt->pFinSliceCodingEvent = NULL;
+	}
+#endif//WIN32
+
+#ifdef PACKING_ONE_SLICE_PER_LAYER
+	if ( NULL != pSmt->pCountBsSizeInPartition )
+	{
+		pMa->WelsFree( pSmt->pCountBsSizeInPartition, "pCountBsSizeInPartition" );
+		pSmt->pCountBsSizeInPartition = NULL;
+	}
+#endif//PACKING_ONE_SLICE_PER_LAYER
+	WelsMutexDestroy( &pSmt->mutexSliceNumUpdate );
+
+	if ( pSmt->pThreadPEncCtx != NULL )
+	{
+		pMa->WelsFree( pSmt->pThreadPEncCtx, "pThreadPEncCtx" );
+		pSmt->pThreadPEncCtx = NULL;
+	}
+	if ( pSmt->pThreadHandles != NULL )
+	{
+		pMa->WelsFree( pSmt->pThreadHandles, "pThreadHandles" );
+		pSmt->pThreadHandles = NULL;
+	}
+	
+	// per slice raw buffers first, then the SWelsSliceBs array that holds them
+	pSliceB = (*ppCtx)->pSliceBs;
+	iIdx = 0;
+	while ( pSliceB != NULL && iIdx < uiSliceNum )
+	{		
+		if ( pSliceB->pBsBuffer )
+		{
+			pMa->WelsFree( pSliceB->pBsBuffer, "pSliceB->pBsBuffer" );
+			pSliceB->pBsBuffer = NULL;
+			pSliceB->uiSize = 0;
+		}		
+		++ iIdx;
+		++ pSliceB;
+	}
+	if ( (*ppCtx)->pSliceBs != NULL )
+	{
+		pMa->WelsFree( (*ppCtx)->pSliceBs, "pSliceBs" );
+		(*ppCtx)->pSliceBs = NULL;
+	}
+#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
+	// per layer statistics, one entry per dependency layer
+	if ( pSmt->pSliceConsumeTime != NULL )
+	{
+		iIdx = 0;
+		while (iIdx < pCodingParam->iNumDependencyLayer)
+		{
+			if ( pSmt->pSliceConsumeTime[iIdx] )
+			{
+				pMa->WelsFree( pSmt->pSliceConsumeTime[iIdx], "pSliceConsumeTime[]" );
+				pSmt->pSliceConsumeTime[iIdx] = NULL;
+			}
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+			if ( pSmt->pSliceComplexRatio[iIdx] != NULL )
+			{
+				pMa->WelsFree( pSmt->pSliceComplexRatio[iIdx], "pSliceComplexRatio[]" );
+				pSmt->pSliceComplexRatio[iIdx] = NULL;
+			}
+#endif//TRY_SLICING_BALANCE
+			++ iIdx;
+		}		
+	}
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
+
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)	
+
+#ifdef WIN32
+	if ( pSmt->pUpdateMbListEvent != NULL )
+	{
+		pMa->WelsFree( pSmt->pUpdateMbListEvent, "pUpdateMbListEvent" );
+		pSmt->pUpdateMbListEvent = NULL;
+	}
+	if ( pSmt->pFinUpdateMbListEvent != NULL )
+	{
+		pMa->WelsFree( pSmt->pFinUpdateMbListEvent, "pFinUpdateMbListEvent" );
+		pSmt->pFinUpdateMbListEvent = NULL;
+	}
+#else
+	if ( pSmt->pUpdateMbListThrdHandles )
+	{
+		pMa->WelsFree( pSmt->pUpdateMbListThrdHandles, "pUpdateMbListThrdHandles" );
+		pSmt->pUpdateMbListThrdHandles = NULL;
+	}
+#endif//WIN32
+
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+
+#ifdef MT_DEBUG
+	// file handle for debug
+	if ( pSmt->pFSliceDiff )
+	{
+		fclose( pSmt->pFSliceDiff );
+		pSmt->pFSliceDiff = NULL;
+	}
+#endif//MT_DEBUG
+	// finally the threading context itself
+	pMa->WelsFree((*ppCtx)->pSliceThreading, "SSliceThreading");
+	(*ppCtx)->pSliceThreading = NULL;
+}
+
+// Copies the coded bytes of one slice to the current end of pCtx->pFrameBs and appends the
+// slice's NAL lengths to pLbi, starting at *pNalIdxBase (advanced by the number of NALs).
+static inline void AppendSliceBsToFrame( sWelsEncCtx *pCtx, SLayerBSInfo *pLbi, SWelsSliceBs *pSliceBs, int32_t *pNalIdxBase )
+{
+	const int32_t kiNalNum	= pSliceBs->iNalIndex;
+
+	memmove(pCtx->pFrameBs + pCtx->iPosBsBuffer, pSliceBs->pBs, pSliceBs->uiBsPos);	// confirmed_safe_unsafe_usage
+	pCtx->iPosBsBuffer += pSliceBs->uiBsPos;
+
+	for ( int32_t iNal = 0; iNal < kiNalNum; ++ iNal )
+	{
+		pLbi->iNalLengthInByte[*pNalIdxBase + iNal]	= pSliceBs->iNalLen[iNal];
+	}
+	pLbi->iNalCount	+= kiNalNum;
+	*pNalIdxBase	+= kiNalNum;
+}
+
+/*!
+ * \brief	Gather the bitstreams of the coded slices into the frame bitstream buffer.
+ *
+ * Slice 0 (or partition 0 in SM_DYN_SLICE mode) is expected to be in pFrameBs already, so
+ * only its size is counted; all other non-empty slices are copied behind iPosBsBuffer and
+ * their NAL lengths are appended to pLbi.
+ *
+ * \param	pCtx		encoder context owning pSliceBs, pFrameBs and iPosBsBuffer
+ * \param	pLbi		layer bitstream info receiving NAL lengths and NAL count
+ * \param	iSliceCount	number of slices (number of partitions in SM_DYN_SLICE mode)
+ *
+ * \return	accumulated size in bytes of all counted slices
+ */
+int32_t AppendSliceToFrameBs( sWelsEncCtx *pCtx, SLayerBSInfo *pLbi, const int32_t iSliceCount )
+{
+	const SDLayerParam *kpDlp	= &pCtx->pSvcParam->sDependencyLayers[pCtx->uiDependencyId];
+	int32_t iNalIdxBase			= pLbi->iNalCount;
+	int32_t iLayerSize			= 0;
+
+	if ( kpDlp->sMso.uiSliceMode != SM_DYN_SLICE )
+	{
+		SWelsSliceBs *pSliceBs	= &pCtx->pSliceBs[0];
+
+		iLayerSize	= pSliceBs->uiBsPos;	// slice 0 is in pFrameBs already, only its size counts
+		for ( int32_t iSliceIdx = 1; iSliceIdx < iSliceCount; ++ iSliceIdx )
+		{
+			++ pSliceBs;
+			if ( pSliceBs != NULL && pSliceBs->uiBsPos > 0 )
+			{
+#if MT_DEBUG_BS_WR
+				assert(pSliceBs->bSliceCodedFlag);
+#endif//MT_DEBUG_BS_WR
+				AppendSliceBsToFrame( pCtx, pLbi, pSliceBs, &iNalIdxBase );
+				iLayerSize += pSliceBs->uiBsPos;
+			}
+		}
+		return iLayerSize;
+	}
+
+	// SM_DYN_SLICE: slices of partition p are stored at p, p + count, p + 2 * count, ...
+	for ( int32_t iPartitionIdx = 0; iPartitionIdx < iSliceCount; ++ iPartitionIdx )
+	{
+		const int32_t kiCodedInPartition = pCtx->pCurDqLayer->pNumSliceCodedOfPartition[iPartitionIdx];
+		int32_t iSliceIdx	= iPartitionIdx;
+
+		for ( int32_t iIdx = 0; iIdx < kiCodedInPartition; ++ iIdx, iSliceIdx += iSliceCount )
+		{
+			SWelsSliceBs *pSliceBs	= &pCtx->pSliceBs[iSliceIdx];
+
+			if ( pSliceBs != NULL && pSliceBs->uiBsPos > 0 )
+			{
+				// partition 0 has been written to pFrameBs before, just count its size
+				if ( iPartitionIdx > 0 )
+					AppendSliceBsToFrame( pCtx, pLbi, pSliceBs, &iNalIdxBase );
+				iLayerSize += pSliceBs->uiBsPos;
+			}
+		}
+	}
+
+	return iLayerSize;
+}
+
+/*!
+ * \brief	Encapsulate the NALs of one slice directly into the frame bitstream buffer.
+ *
+ * \param	pCtx			encoder context owning pSliceBs and the current layer
+ * \param	pLbi			layer bitstream info; NAL lengths / count (and the layer ids for the
+ *							first slice, or always with PACKING_ONE_SLICE_PER_LAYER) are written
+ * \param	pFrameBsBuffer	destination the encapsulated NALs are written to
+ * \param	iSliceIdx		index of the slice in pCtx->pSliceBs
+ *
+ * \return	total size in bytes of the written NALs; 0 when the slice carries more than two
+ *			NALs (nothing is written in that case)
+ */
+int32_t WriteSliceToFrameBs( sWelsEncCtx *pCtx, SLayerBSInfo *pLbi, uint8_t *pFrameBsBuffer, const int32_t iSliceIdx )
+{
+	SWelsSliceBs *pSliceBs			= &pCtx->pSliceBs[iSliceIdx];
+	SNalUnitHeaderExt *pNalHdrExt= &pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt;
+	uint8_t *pDst					= pFrameBsBuffer;
+	int32_t pNalLen[2];
+	int32_t iSliceSize				= 0;
+	const int32_t kiNalCnt			= pSliceBs->iNalIndex;
+	int32_t iNalIdx					= 0;
+#if !defined(PACKING_ONE_SLICE_PER_LAYER)
+	const int32_t iFirstSlice		= (iSliceIdx == 0);
+	int32_t iNalBase				= iFirstSlice ? 0 : pLbi->iNalCount;
+#else
+	int32_t iNalBase				= 0;
+#endif//!PACKING_ONE_SLICE_PER_LAYER
+
+	// pNalLen[] has room for two NALs only; same guard as in WriteSliceBs()
+	assert( kiNalCnt <= 2 );
+	if ( kiNalCnt > 2 )
+		return 0;
+
+	while ( iNalIdx < kiNalCnt ) {
+		iSliceSize += WelsEncodeNalExt( &pSliceBs->sNalList[iNalIdx], pNalHdrExt, pDst, &pNalLen[iNalIdx] );
+		pDst += pNalLen[iNalIdx];
+		pLbi->iNalLengthInByte[iNalBase+iNalIdx]	= pNalLen[iNalIdx];
+
+		++ iNalIdx;
+	}
+
+#if !defined(PACKING_ONE_SLICE_PER_LAYER)
+	pSliceBs->uiBsPos	= iSliceSize;
+	if ( iFirstSlice )
+	{
+		// pBsBuffer has been updated at coding_slice_0_in_encoder_mother_thread()
+		pLbi->uiLayerType		= VIDEO_CODING_LAYER;
+		pLbi->uiSpatialId		= pNalHdrExt->uiDependencyId;
+		pLbi->uiTemporalId	= pNalHdrExt->uiTemporalId;
+		pLbi->uiQualityId		= 0;
+		pLbi->uiPriorityId	= 0;
+		pLbi->iNalCount		= kiNalCnt;
+	}
+	else
+	{
+		// following slices only extend the NAL list of the layer
+		pLbi->iNalCount		+= kiNalCnt;
+	}
+#else
+	pLbi->uiLayerType		= VIDEO_CODING_LAYER;
+	pLbi->uiSpatialId		= pNalHdrExt->uiDependencyId;
+	pLbi->uiTemporalId	= pNalHdrExt->uiTemporalId;
+	pLbi->uiQualityId		= 0;
+	pLbi->uiPriorityId	= 0;
+	pLbi->iNalCount		= kiNalCnt;
+#endif//PACKING_ONE_SLICE_PER_LAYER
+
+	return iSliceSize;
+}
+
+/*!
+ * \brief	Encapsulate the NALs of one slice into a slice private buffer.
+ *
+ * The length of every NAL is stored in the slice's iNalLen[] and the total size in uiBsPos.
+ *
+ * \param	pCtx		encoder context owning pSliceBs and the current layer
+ * \param	pSliceBsBuf	destination buffer
+ * \param	iSliceIdx	index of the slice in pCtx->pSliceBs
+ *
+ * \return	total size in bytes written; 0 when the slice carries more than two NALs
+ */
+int32_t WriteSliceBs( sWelsEncCtx *pCtx, uint8_t *pSliceBsBuf, const int32_t iSliceIdx )
+{
+	SWelsSliceBs *pCurSliceBs		= &pCtx->pSliceBs[iSliceIdx];
+	SNalUnitHeaderExt *pHdrExt		= &pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt;
+	const int32_t kiNalCnt			= pCurSliceBs->iNalIndex;
+	uint8_t *pWrite					= pSliceBsBuf;
+	int32_t iWritten				= 0;
+
+	assert( kiNalCnt <= 2 );
+	if ( kiNalCnt > 2 )
+		return 0;
+
+	for ( int32_t iNal = 0; iNal < kiNalCnt; ++ iNal )
+	{
+		int32_t *pLen	= &pCurSliceBs->iNalLen[iNal];
+
+		iWritten += WelsEncodeNalExt( &pCurSliceBs->sNalList[iNal], pHdrExt, pWrite, pLen );
+		pWrite += *pLen;	// next NAL starts right behind this one
+	}
+	pCurSliceBs->uiBsPos	= iWritten;
+
+	return iWritten;
+}
+
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+#if defined(__GNUC__)
+/*!
+ * \brief	Thread routine that refreshes the MB neighbor info of one slice on request.
+ *
+ * Loops forever: waits on pUpdateMbListEvent[thread index], calls
+ * UpdateMbListNeighborParallel() for the slice index stored in the private data and then
+ * signals pFinUpdateMbListEvent[thread index]. The loop (and the thread routine) ends with
+ * return value 1 as soon as a wait does not report WELS_THREAD_ERROR_WAIT_OBJECT_0.
+ *
+ * \param	arg	SSliceThreadPrivateData of this thread; NULL makes the routine return 1 at once
+ */
+WELS_THREAD_ROUTINE_TYPE UpdateMbListThreadProc( void *arg )
+{
+	SSliceThreadPrivateData *pThreadData	= (SSliceThreadPrivateData *)arg;
+	sWelsEncCtx *pEncCtx					= NULL;
+	int32_t iSliceIdx						= -1;
+	int32_t iEventIdx						= -1;
+	uint32_t uiThrdRet						= 0;
+
+	if ( NULL == pThreadData )
+		WELS_THREAD_ROUTINE_RETURN(1);
+
+	pEncCtx		= (sWelsEncCtx *)pThreadData->pWelsPEncCtx;
+	iSliceIdx	= pThreadData->iSliceIndex;
+	iEventIdx	= pThreadData->iThreadIndex;
+
+	for ( ;; )
+	{
+		WELS_THREAD_ERROR_CODE iWaitRet	= WELS_THREAD_ERROR_GENERIAL;
+
+#if defined(ENABLE_TRACE_MT)
+		WelsLog(pEncCtx, WELS_LOG_INFO, "[MT] UpdateMbListThreadProc(), try to wait (pUpdateMbListEvent[%d])!\n", iEventIdx);
+#endif
+		iWaitRet = WelsEventWait( pEncCtx->pSliceThreading->pUpdateMbListEvent[iEventIdx] );
+		if ( WELS_THREAD_ERROR_WAIT_OBJECT_0 != iWaitRet )
+		{
+			WelsLog(pEncCtx, WELS_LOG_WARNING, "[MT] UpdateMbListThreadProc(), waiting pUpdateMbListEvent[%d] failed(%d) and thread%d terminated!\n", iEventIdx, iWaitRet, iEventIdx);
+			uiThrdRet = 1;
+			break;
+		}
+
+		// the current layer is looked up again on every request
+		SDqLayer *pLayer	= pEncCtx->pCurDqLayer;
+		UpdateMbListNeighborParallel( pLayer->pSliceEncCtx, pLayer->sMbDataP, iSliceIdx );
+		WelsEventSignal( pEncCtx->pSliceThreading->pFinUpdateMbListEvent[iEventIdx] );	// tell the requester this slice is done
+	}
+
+	WELS_THREAD_ROUTINE_RETURN(uiThrdRet);
+}
+#endif//__GNUC__
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+
+// CodingSliceThreadProc(): slice worker thread; on every "ready" event it codes one slice (or, in SM_DYN_SLICE mode, all slices of its partition), writes the bitstream and signals pSliceCodedEvent.
+WELS_THREAD_ROUTINE_TYPE CodingSliceThreadProc( void *arg )
+{
+	SSliceThreadPrivateData *pPrivateData	= (SSliceThreadPrivateData *)arg;
+	sWelsEncCtx *pEncPEncCtx			= NULL;
+	SDqLayer *pCurDq							= NULL;
+	SSlice *pSlice								= NULL;
+	SWelsSliceBs *pSliceBs						= NULL;
+#ifdef WIN32
+	WELS_EVENT pEventsList[3];	// ready-to-code, exit and (optionally) update-MB-list events
+	int32_t iEventCount						= 0;
+#endif
+	WELS_THREAD_ERROR_CODE iWaitRet				= WELS_THREAD_ERROR_GENERIAL;
+	uint32_t uiThrdRet							= 0;	// thread exit code: 0 normal, 1 on error
+	int32_t iSliceSize							= 0;
+	int32_t iSliceIdx							= -1;
+	int32_t iThreadIdx							= -1;
+	int32_t iEventIdx							= -1;
+	bool_t bNeedPrefix							= false;
+	EWelsNalUnitType eNalType						= NAL_UNIT_UNSPEC_0;
+	EWelsNalRefIdc eNalRefIdc						= NRI_PRI_LOWEST;
+
+	if ( NULL == pPrivateData )
+		WELS_THREAD_ROUTINE_RETURN(1);
+
+	WelsSetThreadCancelable();
+
+	pEncPEncCtx	= (sWelsEncCtx *)pPrivateData->pWelsPEncCtx;
+
+	iThreadIdx		= pPrivateData->iThreadIndex;
+	iEventIdx		= iThreadIdx;	// events are indexed by thread
+
+#ifdef WIN32
+	pEventsList[iEventCount++]	= pEncPEncCtx->pSliceThreading->pReadySliceCodingEvent[iEventIdx];
+	pEventsList[iEventCount++]	= pEncPEncCtx->pSliceThreading->pExitEncodeEvent[iEventIdx];
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+	pEventsList[iEventCount++] = pEncPEncCtx->pSliceThreading->pUpdateMbListEvent[iEventIdx];
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+#endif//WIN32
+
+	do {
+#ifdef WIN32
+		iWaitRet = WelsMultipleEventsWaitSingleBlocking(	iEventCount,
+															&pEventsList[0],
+															(uint32_t)-1	);	// blocking until at least one event is signaled
+#else
+#if defined(ENABLE_TRACE_MT)
+		WelsLog(pEncPEncCtx, WELS_LOG_INFO, "[MT] CodingSliceThreadProc(), try to call WelsEventWait(pReadySliceCodingEvent[%d]= 0x%p), pEncPEncCtx= 0x%p!\n", iEventIdx, (void *)(pEncPEncCtx->pSliceThreading->pReadySliceCodingEvent[iEventIdx]), (void *)pEncPEncCtx );
+#endif
+		iWaitRet = WelsEventWait( pEncPEncCtx->pSliceThreading->pReadySliceCodingEvent[iEventIdx] );
+#endif//WIN32
+		if ( WELS_THREAD_ERROR_WAIT_OBJECT_0 == iWaitRet )	// start pSlice coding signal waited
+		{
+			SLayerBSInfo *pLbi = pPrivateData->pLayerBs;
+			const int32_t kiCurDid			= pEncPEncCtx->uiDependencyId;
+			const int32_t kiCurTid			= pEncPEncCtx->uiTemporalId;
+			SWelsSvcCodingParam *pCodingParam	= pEncPEncCtx->pSvcParam;
+			SDLayerParam *pParamD			= &pCodingParam->sDependencyLayers[kiCurDid];
+
+			pCurDq			= pEncPEncCtx->pCurDqLayer;
+			eNalType		= pEncPEncCtx->eNalType;
+			eNalRefIdc		= pEncPEncCtx->eNalPriority;
+			bNeedPrefix		= pEncPEncCtx->bNeedPrefixNalFlag;
+			// every mode but SM_DYN_SLICE: code exactly the slice given in pPrivateData->iSliceIndex
+			if ( pParamD->sMso.uiSliceMode != SM_DYN_SLICE )
+			{
+				int64_t iSliceStart	= 0;
+				bool_t bDsaFlag = false;
+				iSliceIdx		= pPrivateData->iSliceIndex;
+				pSlice			= &pCurDq->sLayerInfo.pSliceInLayer[iSliceIdx];
+				pSliceBs		= &pEncPEncCtx->pSliceBs[iSliceIdx];
+
+#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
+				bDsaFlag	= (pParamD->sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE && 
+							   pCodingParam->iMultipleThreadIdc > 1 &&
+							   pCodingParam->iMultipleThreadIdc >= pParamD->sMso.sSliceArgument.iSliceNum);
+				if ( bDsaFlag )
+					iSliceStart = WelsTime();
+#endif//DYNAMIC_SLICE_ASSIGN || MT_DEBUG
+
+#if !defined(PACKING_ONE_SLICE_PER_LAYER)
+				pSliceBs->uiBsPos	= 0;
+#endif//!PACKING_ONE_SLICE_PER_LAYER
+				pSliceBs->iNalIndex	= 0;
+				assert( (void*)(&pSliceBs->sBsWrite) == (void*)pSlice->pSliceBsa );
+				InitBits( &pSliceBs->sBsWrite, pSliceBs->pBsBuffer, pSliceBs->uiSize );
+
+#if MT_DEBUG_BS_WR
+				pSliceBs->bSliceCodedFlag	= FALSE;
+#endif//MT_DEBUG_BS_WR
+
+				if ( bNeedPrefix )
+				{
+					if ( eNalRefIdc != NRI_PRI_LOWEST )
+					{
+						WelsLoadNalForSlice( pSliceBs, NAL_UNIT_PREFIX, eNalRefIdc );
+						WelsWriteSVCPrefixNal( &pSliceBs->sBsWrite, eNalRefIdc, (NAL_UNIT_CODED_SLICE_IDR == eNalType) );
+						WelsUnloadNalForSlice( pSliceBs );
+					}
+					else // No Prefix NAL Unit RBSP syntax here, but need add NAL Unit Header extension
+					{
+						WelsLoadNalForSlice( pSliceBs, NAL_UNIT_PREFIX, eNalRefIdc );
+						// No need write any syntax of prefix NAL Unit RBSP here
+						WelsUnloadNalForSlice( pSliceBs );
+					}
+				}
+
+				WelsLoadNalForSlice( pSliceBs, eNalType, eNalRefIdc );
+
+				WelsCodeOneSlice( pEncPEncCtx, iSliceIdx, eNalType );
+
+				WelsUnloadNalForSlice( pSliceBs );
+
+#if !defined(PACKING_ONE_SLICE_PER_LAYER)
+				if ( 0 == iSliceIdx )	// slice 0 goes straight into the frame buffer, other slices into their own buffer
+				{
+					pLbi->pBsBuf	= pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer;
+					iSliceSize = WriteSliceToFrameBs( pEncPEncCtx, pLbi, pLbi->pBsBuf, iSliceIdx );
+					pEncPEncCtx->iPosBsBuffer += iSliceSize;
+				}
+				else
+					iSliceSize = WriteSliceBs( pEncPEncCtx, pSliceBs->pBs, iSliceIdx );
+#else// PACKING_ONE_SLICE_PER_LAYER
+				if ( 0 == iSliceIdx )
+				{
+					pLbi->pBsBuf	= pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer;
+					iSliceSize = WriteSliceToFrameBs( pEncPEncCtx, pLbi, pLbi->pBsBuf, iSliceIdx );
+					pEncPEncCtx->iPosBsBuffer += iSliceSize;
+				}
+				else
+				{
+					pLbi->pBsBuf	= pSliceBs->pBs + pSliceBs->uiBsPos;	// same per-slice buffer (pBs) the non-packing branch writes to
+					iSliceSize = WriteSliceToFrameBs( pEncPEncCtx, pLbi, pLbi->pBsBuf, iSliceIdx );
+					pSliceBs->uiBsPos += iSliceSize;
+				}
+#endif//!PACKING_ONE_SLICE_PER_LAYER
+
+				if ( pCurDq->bDeblockingParallelFlag && pSlice->sSliceHeaderExt.sSliceHeader.uiDisableDeblockingFilterIdc != 1
+#if !defined(ENABLE_FRAME_DUMP)
+					&& ( eNalRefIdc != NRI_PRI_LOWEST ) && 
+					( pParamD->iHighestTemporalId == 0 || kiCurTid < pParamD->iHighestTemporalId )
+#endif// !ENABLE_FRAME_DUMP
+					)
+				{
+					DeblockingFilterSliceAvcbase( pCurDq, pEncPEncCtx->pFuncList, iSliceIdx );
+				}
+
+#if defined(DYNAMIC_SLICE_ASSIGN) || defined(MT_DEBUG)
+				if ( bDsaFlag )
+				{
+					pEncPEncCtx->pSliceThreading->pSliceConsumeTime[pEncPEncCtx->uiDependencyId][iSliceIdx] = (uint32_t)(WelsTime() - iSliceStart);
+#if defined(ENABLE_TRACE_MT)
+					WelsLog(pEncPEncCtx, WELS_LOG_INFO, "[MT] CodingSliceThreadProc(), coding_idx %d, uiSliceIdx %d, pSliceConsumeTime %d, iSliceSize %d, pFirstMbInSlice %d, count_num_mb_in_slice %d\n",
+						pEncPEncCtx->iCodingIndex, iSliceIdx, pEncPEncCtx->pSliceThreading->pSliceConsumeTime[pEncPEncCtx->uiDependencyId][iSliceIdx], iSliceSize, pCurDq->pSliceEncCtx->pFirstMbInSlice[iSliceIdx], pCurDq->pSliceEncCtx->pCountMbNumInSlice[iSliceIdx]);
+#endif//ENABLE_TRACE_MT
+				}
+#endif//DYNAMIC_SLICE_ASSIGN || MT_DEBUG
+
+#if defined(SLICE_INFO_OUTPUT)
+				fprintf(	stderr,
+							"@pSlice=%-6d sliceType:%c idc:%d size:%-6d\n",
+							iSliceIdx,
+							(pEncPEncCtx->eSliceType == P_SLICE ? 'P' : 'I'),
+							eNalRefIdc,
+							iSliceSize
+						);
+#endif//SLICE_INFO_OUTPUT
+
+#if MT_DEBUG_BS_WR
+				pSliceBs->bSliceCodedFlag	= TRUE;
+#endif//MT_DEBUG_BS_WR
+
+#ifdef WIN32
+				WelsEventSignal( &pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx] );	// mean finished coding current pSlice
+#else
+				WelsEventSignal( pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx] );	// mean finished coding current pSlice
+#endif//WIN32
+			}
+			else	// for SM_DYN_SLICE parallelization
+			{
+#ifdef PACKING_ONE_SLICE_PER_LAYER
+				SLayerBSInfo *pLbiPacking			= NULL;
+#endif//PACKING_ONE_SLICE_PER_LAYER
+				SSliceCtx *pSliceCtx			= pCurDq->pSliceEncCtx;
+				const int32_t kiPartitionId			= iThreadIdx;
+				const int32_t kiSliceIdxStep		= pEncPEncCtx->iActiveThreadsNum;
+				const int32_t kiFirstMbInPartition	= pPrivateData->iStartMbIndex;	// inclusive
+				const int32_t kiEndMbInPartition	= pPrivateData->iEndMbIndex;		// exclusive
+				int32_t iAnyMbLeftInPartition	= kiEndMbInPartition - kiFirstMbInPartition;
+				// slices of this partition use indices iSliceIndex, iSliceIndex + kiSliceIdxStep, ...
+				iSliceIdx		= pPrivateData->iSliceIndex;
+
+				pSliceCtx->pFirstMbInSlice[iSliceIdx]				= kiFirstMbInPartition;
+				pCurDq->pNumSliceCodedOfPartition[kiPartitionId]		= 1;	// one pSlice per partition initialized, dynamic slicing inside
+				pCurDq->pLastMbIdxOfPartition[kiPartitionId]			= kiEndMbInPartition-1;
+
+				pCurDq->pLastCodedMbIdxOfPartition[kiPartitionId]		= 0;
+
+				while( iAnyMbLeftInPartition > 0 )
+				{
+					if ( iSliceIdx >= pSliceCtx->iMaxSliceNumConstraint )
+					{
+						// TODO: need an exception handler for the case where MAX_SLICES_NUM related memory is not large enough
+						// No solution yet because MAX_SLICES_NUM is a fixed length in the relevant data structures
+						uiThrdRet	= 1;
+						break;
+					}
+
+					pSlice			= &pCurDq->sLayerInfo.pSliceInLayer[iSliceIdx];
+					pSliceBs		= &pEncPEncCtx->pSliceBs[iSliceIdx];
+
+#if !defined(PACKING_ONE_SLICE_PER_LAYER)
+					pSliceBs->uiBsPos	= 0;
+#endif//!PACKING_ONE_SLICE_PER_LAYER
+					pSliceBs->iNalIndex	= 0;
+					InitBits( &pSliceBs->sBsWrite, pSliceBs->pBsBuffer, pSliceBs->uiSize );
+
+					if ( bNeedPrefix )
+					{
+						if ( eNalRefIdc != NRI_PRI_LOWEST )
+						{
+							WelsLoadNalForSlice( pSliceBs, NAL_UNIT_PREFIX, eNalRefIdc );
+							WelsWriteSVCPrefixNal( &pSliceBs->sBsWrite, eNalRefIdc, (NAL_UNIT_CODED_SLICE_IDR == eNalType) );
+							WelsUnloadNalForSlice( pSliceBs );
+						}
+						else // No Prefix NAL Unit RBSP syntax here, but need add NAL Unit Header extension
+						{
+							WelsLoadNalForSlice( pSliceBs, NAL_UNIT_PREFIX, eNalRefIdc );
+							// No need write any syntax of prefix NAL Unit RBSP here
+							WelsUnloadNalForSlice( pSliceBs );
+						}
+					}
+
+					WelsLoadNalForSlice( pSliceBs, eNalType, eNalRefIdc );
+
+					WelsCodeOneSlice( pEncPEncCtx, iSliceIdx, eNalType );
+
+					WelsUnloadNalForSlice( pSliceBs );
+
+#if !defined(PACKING_ONE_SLICE_PER_LAYER)
+					if ( 0 == kiPartitionId )	// partition 0 appends to the frame buffer, other partitions use per-slice buffers
+					{
+						if ( 0 == iSliceIdx )
+							pLbi->pBsBuf	= pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer;
+						iSliceSize = WriteSliceToFrameBs( pEncPEncCtx, pLbi, pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer, iSliceIdx );
+						pEncPEncCtx->iPosBsBuffer += iSliceSize;
+					}
+					else
+						iSliceSize = WriteSliceBs( pEncPEncCtx, pSliceBs->pBs, iSliceIdx );
+#else// PACKING_ONE_SLICE_PER_LAYER
+					pLbiPacking	= pLbi + (iSliceIdx - kiPartitionId);
+
+					if ( 0 == kiPartitionId )
+					{
+						pLbiPacking->pBsBuf	= pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer;
+						iSliceSize = WriteSliceToFrameBs( pEncPEncCtx, pLbiPacking, pLbiPacking->pBsBuf, iSliceIdx );
+						pEncPEncCtx->iPosBsBuffer += iSliceSize;
+					}
+					else
+					{
+						pLbiPacking->pBsBuf	= pSliceBs->pBs + pSliceBs->uiBsPos;	// same per-slice buffer (pBs) the non-packing branch writes to
+						iSliceSize = WriteSliceToFrameBs( pEncPEncCtx, pLbiPacking, pLbiPacking->pBsBuf, iSliceIdx );
+						pSliceBs->uiBsPos += iSliceSize;
+					}
+					pEncPEncCtx->pSliceThreading->pCountBsSizeInPartition[kiPartitionId] += iSliceSize;
+#endif//!PACKING_ONE_SLICE_PER_LAYER
+
+					if ( pCurDq->bDeblockingParallelFlag && pSlice->sSliceHeaderExt.sSliceHeader.uiDisableDeblockingFilterIdc != 1
+#if !defined(ENABLE_FRAME_DUMP)
+						&& ( eNalRefIdc != NRI_PRI_LOWEST ) && 
+						( pParamD->iHighestTemporalId == 0 || kiCurTid < pParamD->iHighestTemporalId )
+#endif// !ENABLE_FRAME_DUMP
+						)
+					{
+						DeblockingFilterSliceAvcbase( pCurDq, pEncPEncCtx->pFuncList, iSliceIdx );
+					}
+
+#if defined(SLICE_INFO_OUTPUT)
+					fprintf(	stderr,
+								"@pSlice=%-6d sliceType:%c idc:%d size:%-6d\n",
+								iSliceIdx,
+								(pEncPEncCtx->eSliceType == P_SLICE ? 'P' : 'I'),
+								eNalRefIdc,
+								iSliceSize
+							);
+#endif//SLICE_INFO_OUTPUT
+
+#if defined(ENABLE_TRACE_MT)
+					WelsLog(pEncPEncCtx, WELS_LOG_INFO, "[MT] CodingSliceThreadProc(), coding_idx %d, iPartitionId %d, uiSliceIdx %d, iSliceSize %d, count_mb_slice %d, iEndMbInPartition %d, pCurDq->pLastCodedMbIdxOfPartition[%d] %d\n",
+						pEncPEncCtx->iCodingIndex, kiPartitionId, iSliceIdx, iSliceSize, pCurDq->pSliceEncCtx->pCountMbNumInSlice[iSliceIdx], kiEndMbInPartition, kiPartitionId, pCurDq->pLastCodedMbIdxOfPartition[kiPartitionId]);
+#endif//ENABLE_TRACE_MT
+					// MBs left = exclusive partition end - (last coded MB index + 1); the index is presumably advanced while the slice is coded
+					iAnyMbLeftInPartition = kiEndMbInPartition - (1 + pCurDq->pLastCodedMbIdxOfPartition[kiPartitionId]);
+					iSliceIdx += kiSliceIdxStep;
+				}
+				// NOTE(review): the break below leaves the thread loop without signaling pSliceCodedEvent[iEventIdx] - confirm the waiting side cannot block forever
+				if ( uiThrdRet )	// any exception??
+					break;
+
+#ifdef WIN32
+				WelsEventSignal( &pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx] );	// mean finished coding current pSlice
+#else
+				WelsEventSignal( pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx] );	// mean finished coding current pSlice
+#endif//WIN32
+			}
+		}
+#ifdef WIN32
+		else if ( WELS_THREAD_ERROR_WAIT_OBJECT_0+1 == iWaitRet )	// exit thread signal
+		{
+			uiThrdRet	= 0;
+			break;
+		}
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+		else if ( WELS_THREAD_ERROR_WAIT_OBJECT_0+2 == iWaitRet )	// update pMb list signal
+		{
+			iSliceIdx		= iEventIdx;	// not pPrivateData->iSliceIndex: old threads can not be terminated, so pPrivateData is not reliable here
+			pCurDq			= pEncPEncCtx->pCurDqLayer;
+			UpdateMbListNeighborParallel( pCurDq->pSliceEncCtx, pCurDq->sMbDataP, iSliceIdx );
+			WelsEventSignal( &pEncPEncCtx->pSliceThreading->pFinUpdateMbListEvent[iEventIdx] );	// mean finished update pMb list for this pSlice
+		}
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+#endif//WIN32
+		else // WELS_THREAD_ERROR_WAIT_TIMEOUT, or WELS_THREAD_ERROR_WAIT_FAILED
+		{
+			WelsLog(pEncPEncCtx, WELS_LOG_WARNING, "[MT] CodingSliceThreadProc(), waiting pReadySliceCodingEvent[%d] failed(%d) and thread%d terminated!\n", iEventIdx, iWaitRet, iThreadIdx);
+			uiThrdRet	= 1;
+			break;
+		}
+	} while( 1 );
+
+#ifdef WIN32
+	WelsEventSignal( &pEncPEncCtx->pSliceThreading->pFinSliceCodingEvent[iEventIdx] );	// notify to mother encoding threading
+#endif//WIN32
+
+	WELS_THREAD_ROUTINE_RETURN(uiThrdRet);
+}
+// CreateSliceThreads(): starts one CodingSliceThreadProc thread per configured thread (plus one UpdateMbListThreadProc helper each on GNUC slicing-balance builds); always returns 0.
+int32_t CreateSliceThreads( sWelsEncCtx *pCtx )
+{
+	const int32_t kiThreadCount = pCtx->pSvcParam->iCountThreadsNum;
+	int32_t iIdx = 0;
+#if defined(WIN32) && defined(BIND_CPU_CORES_TO_THREADS)
+	DWORD_PTR  dwProcessAffinity = 0;	// affinity masks are DWORD_PTR in the Win32 API, not 32-bit DWORD
+	DWORD_PTR  dwSystemAffinity = 0;	// zero-initialized so a failed query below simply disables core binding
+	GetProcessAffinityMask(GetCurrentProcess(), &dwProcessAffinity, &dwSystemAffinity);
+#endif//WIN32 && BIND_CPU_CORES_TO_THREADS
+	// NOTE(review): the results of the WelsThreadCreate() calls below are not checked and 0 is returned unconditionally
+	while ( iIdx < kiThreadCount ) {
+		WelsThreadCreate( &pCtx->pSliceThreading->pThreadHandles[iIdx], CodingSliceThreadProc, &pCtx->pSliceThreading->pThreadPEncCtx[iIdx], 0);
+#if defined(WIN32) && defined(BIND_CPU_CORES_TO_THREADS)
+		if ( dwProcessAffinity > 1 && pCtx->pSliceThreading->pThreadHandles[iIdx] != NULL	// multiple cores and thread created successfully
+			&& iIdx < (int32_t)(8 * sizeof(DWORD_PTR)) )	// keep the shift below inside the width of the mask
+		{
+			const DWORD_PTR  dwAffinityMask = ((DWORD_PTR)1) << iIdx;
+			if (dwAffinityMask & dwProcessAffinity) // check if cpu is available
+			{
+				// SetThreadAffinityMask() returns 0 on failure
+				if ( 0 == SetThreadAffinityMask( pCtx->pSliceThreading->pThreadHandles[iIdx], dwAffinityMask ) )
+				{
+					// binding is best effort: report the failure and leave the thread unbound
+					WelsLog(pCtx, WELS_LOG_WARNING, "CreateSliceThreads(), SetThreadAffinityMask failed, iIdx:%d\n", iIdx);
+				}
+			}
+		}
+#endif//WIN32 && BIND_CPU_CORES_TO_THREADS
+		// We need extra threads for update_mb_list_proc on __GNUC__ like OS (mac/linux)
+		// because the WelsMultipleEventsWaitSingleBlocking implementation can not work well
+		// when waiting for pUpdateMbListEvent and pReadySliceCodingEvent events at the same time
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+#if defined(__GNUC__)
+		WelsThreadCreate( &pCtx->pSliceThreading->pUpdateMbListThrdHandles[iIdx], UpdateMbListThreadProc, &pCtx->pSliceThreading->pThreadPEncCtx[iIdx], 0);
+#endif//__GNUC__
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE)
+		++ iIdx;
+	}
+#if defined(ENABLE_TRACE_MT)
+	WelsLog(pCtx, WELS_LOG_INFO, "CreateSliceThreads() exit..\n");
+#endif
+	return 0;
+}
+
+#ifdef PACKING_ONE_SLICE_PER_LAYER
+void ResetCountBsSizeInPartitions( uint32_t *pCountBsSizeList, const int32_t iPartitionCnt )
+{
+	// zero the first iPartitionCnt counters; a NULL list or a non-positive count is a no-op
+	if ( NULL == pCountBsSizeList || iPartitionCnt <= 0 )
+		return;
+	memset( pCountBsSizeList, 0, sizeof(*pCountBsSizeList) * iPartitionCnt );
+}
+#endif//PACKING_ONE_SLICE_PER_LAYER
+/*!
+ * \brief	Publish the per-thread job data and signal every thread's event so the slice threads start coding.
+ *
+ * \param	pPriData				private data array, one entry per thread; entry i receives pLayerBs and iSliceIndex (= i)
+ *									and, in dynamic slicing mode, its MB range [iStartMbIndex, iEndMbIndex)
+ * \param	pEventsList				events to signal, one per thread, in index order
+ *									(WIN32 builds skip entries that evaluate to false)
+ * \param	pLbi					layer bitstream info handed to the threads
+ * \param	uiNumThreads			number of entries of pPriData / pEventsList to process
+ * \param	pSliceCtx				slice context; only read in dynamic slicing mode
+ * \param	bIsDynamicSlicingMode	when set, the MB partitions are derived from pSliceCtx->pFirstMbInSlice
+ *
+ * \return	0 on success; 1 if an argument is invalid, in which case no event is signaled
+ *
+ * \note	With PACKING_ONE_SLICE_PER_LAYER thread i is given pLbi + i; otherwise all threads share pLbi.
+ *
+ * \note	The event list type differs per platform: WELS_EVENT* on WIN32, WELS_EVENT** elsewhere.
+ */
+#ifdef WIN32
+int32_t FiredSliceThreads( SSliceThreadPrivateData *pPriData, WELS_EVENT *pEventsList, SLayerBSInfo *pLbi, const uint32_t uiNumThreads, SSliceCtx *pSliceCtx, const BOOL_T bIsDynamicSlicingMode )
+#else
+int32_t FiredSliceThreads( SSliceThreadPrivateData *pPriData, WELS_EVENT **pEventsList, SLayerBSInfo *pLbi, const uint32_t uiNumThreads, SSliceCtx *pSliceCtx, const BOOL_T bIsDynamicSlicingMode )
+#endif//WIN32
+{
+	int32_t iEndMbIdx	= 0;
+	int32_t iIdx		= 0;
+	const int32_t kiEventCnt = uiNumThreads;
+
+	// reject unusable arguments before any private data is written or any event is signaled
+	if ( pPriData == NULL || pLbi == NULL || kiEventCnt <= 0 || pEventsList == NULL )
+	{
+		WelsLog( NULL, WELS_LOG_ERROR, "FiredSliceThreads(), fail due pPriData == %p || pLbi == %p || iEventCnt(%d) <= 0 || pEventsList == %p!!\n", (void *)pPriData, (void *)pLbi, kiEventCnt,  (void *)pEventsList);
+		return 1;
+	}
+
+	// dynamic slicing reads the partition layout from pSliceCtx, so it has to be valid in that mode;
+	// in the other modes pSliceCtx is never dereferenced here
+	if ( bIsDynamicSlicingMode && pSliceCtx == NULL )
+	{
+		WelsLog( NULL, WELS_LOG_ERROR, "FiredSliceThreads(), fail due pSliceCtx == NULL in dynamic slicing mode!!\n" );
+		return 1;
+	}
+
+	if ( bIsDynamicSlicingMode )
+	{
+		// walk the partitions backwards: partition i starts at pFirstMbInSlice[i] and ends (exclusive)
+		// where partition i+1 starts; the last partition ends at iMbNumInFrame
+		iEndMbIdx	= pSliceCtx->iMbNumInFrame;
+		for (iIdx = kiEventCnt-1; iIdx >= 0; --iIdx)
+		{
+			const int32_t kiFirstMbIdx		= pSliceCtx->pFirstMbInSlice[iIdx];
+			pPriData[iIdx].iStartMbIndex	= kiFirstMbIdx;
+			pPriData[iIdx].iEndMbIndex		= iEndMbIdx;
+			iEndMbIdx						= kiFirstMbIdx;
+		}
+	}
+
+	// the job data of a thread is written before its event is signaled
+	for (iIdx = 0; iIdx < kiEventCnt; ++iIdx)
+	{
+		pPriData[iIdx].pLayerBs		= pLbi;
+		pPriData[iIdx].iSliceIndex	= iIdx;
+#ifdef WIN32
+		if ( pEventsList[iIdx] )
+			WelsEventSignal( &pEventsList[iIdx] );
+#else
+		WelsEventSignal( pEventsList[iIdx] );
+#endif//WIN32
+#if defined(PACKING_ONE_SLICE_PER_LAYER)
+		++ pLbi;	// one layer bitstream info per slice in this packing mode
+#endif//PACKING_ONE_SLICE_PER_LAYER
+	}
+
+	// all events have been signaled
+	return 0;
+}
+// DynamicDetectCpuCores(): returns the ProcessorCount filled in by WelsQueryLogicalProcessInfo().
+int32_t DynamicDetectCpuCores()
+{
+	WelsLogicalProcessInfo  sCpuInfo;
+	WelsQueryLogicalProcessInfo(&sCpuInfo);	// NOTE(review): the query result is not checked - confirm sCpuInfo is always filled
+	return sCpuInfo.ProcessorCount;
+}
+
+#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN)
+// AdjustBaseLayer(): with TRY_SLICING_BALANCE, re-balances the slicing of spatial layer 0 (ppDqLayerList[0]) via DynamicAdjustSlicing(); returns iNeedAdj, which is 1 unless NeedDynamicAdjust() overrides it.
+int32_t AdjustBaseLayer( sWelsEncCtx *pCtx )
+{
+	SDqLayer *pCurDq	= pCtx->ppDqLayerList[0];		
+	int32_t iNeedAdj	= 1;
+#ifdef MT_DEBUG
+	int64_t iT0 = WelsTime();
+#endif//MT_DEBUG
+#ifdef TRY_SLICING_BALANCE
+	// side effect: the base layer becomes the context's current DQ layer
+	pCtx->pCurDqLayer	= pCurDq;
+	// with NOT_ABSOLUTE_BALANCING the `if` below guards the DynamicAdjustSlicing() call that follows the #endif
+#ifdef NOT_ABSOLUTE_BALANCING
+	// NeedDynamicAdjust() decides from the per-slice consumed times whether re-balancing is needed (original note: not needed when the slices' times do not differ)
+	iNeedAdj	= NeedDynamicAdjust( pCtx->pSliceThreading->pSliceConsumeTime[0], pCurDq->pSliceEncCtx->iSliceNumInFrame );
+	if ( iNeedAdj )
+#endif//NOT_ABSOLUTE_BALANCING
+	DynamicAdjustSlicing(	pCtx,
+							pCurDq,
+							pCtx->pSliceThreading->pSliceComplexRatio[0],
+							0 );
+#endif//TRY_SLICING_BALANCE
+#ifdef MT_DEBUG
+	iT0 = WelsTime() - iT0;	// elapsed WelsTime() ticks spent in this function so far
+	if ( pCtx->pSliceThreading->pFSliceDiff )	// the trace is only written when the debug file is set
+	{
+		fprintf( pCtx->pSliceThreading->pFSliceDiff, 
+#ifdef WIN32
+				"%6I64d us adjust time at base spatial layer, iNeedAdj %d, DynamicAdjustSlicing()\n",
+#else
+				"%6lld us adjust time at base spatial layer, iNeedAdj %d, DynamicAdjustSlicing()\n",
+#endif//WIN32
+				iT0, iNeedAdj );
+	}
+#endif//MT_DEBUG
+
+	return iNeedAdj;
+}
+// AdjustEnhanceLayer(): with TRY_SLICING_BALANCE, re-balances the slicing of the current DQ layer for spatial layer iCurDid, using the slice statistics of layer iCurDid-1 when that layer qualifies and those of iCurDid otherwise; returns iNeedAdj.
+int32_t AdjustEnhanceLayer( sWelsEncCtx *pCtx, int32_t iCurDid )
+{
+#ifdef MT_DEBUG
+	int64_t iT1 = WelsTime();
+#endif//MT_DEBUG
+	int32_t iNeedAdj = 1;	// stays 1 unless NeedDynamicAdjust() overrides it (NOT_ABSOLUTE_BALANCING builds)
+	// uiSliceMode of referencing spatial should be SM_FIXEDSLCNUM_SLICE
+	// if using spatial base layer for complexity estimation
+	const BOOL_T kbModelingFromSpatial =	(pCtx->pCurDqLayer->pRefLayer != NULL && iCurDid > 0) 
+										&& (pCtx->pSvcParam->sDependencyLayers[iCurDid-1].sMso.uiSliceMode == SM_FIXEDSLCNUM_SLICE && pCtx->pSvcParam->iMultipleThreadIdc >= pCtx->pSvcParam->sDependencyLayers[iCurDid-1].sMso.sSliceArgument.iSliceNum);
+	// the [iCurDid-1] accesses above are only evaluated when iCurDid > 0 (short-circuit &&)
+	if ( kbModelingFromSpatial )	// using spatial base layer for complexity estimation
+	{	
+#ifdef TRY_SLICING_BALANCE
+#ifdef NOT_ABSOLUTE_BALANCING
+		// NeedDynamicAdjust() decides from the consumed times of layer iCurDid-1 whether re-balancing is needed; the `if` guards the call after the #endif
+		iNeedAdj = NeedDynamicAdjust( pCtx->pSliceThreading->pSliceConsumeTime[iCurDid-1], pCtx->pCurDqLayer->pSliceEncCtx->iSliceNumInFrame );
+		if ( iNeedAdj )
+#endif//NOT_ABSOLUTE_BALANCING
+		DynamicAdjustSlicing(	pCtx,
+								pCtx->pCurDqLayer,
+								pCtx->pSliceThreading->pSliceComplexRatio[iCurDid-1],
+								iCurDid
+							  );
+#endif//TRY_SLICING_BALANCE
+	}
+	else	// use temporal layer for complexity estimation
+	{	
+#ifdef TRY_SLICING_BALANCE
+#ifdef NOT_ABSOLUTE_BALANCING
+		// same decision as above, but based on the consumed times recorded for layer iCurDid itself
+		iNeedAdj = NeedDynamicAdjust( pCtx->pSliceThreading->pSliceConsumeTime[iCurDid], pCtx->pCurDqLayer->pSliceEncCtx->iSliceNumInFrame );
+		if ( iNeedAdj )
+#endif//NOT_ABSOLUTE_BALANCING
+		DynamicAdjustSlicing(	pCtx,
+								pCtx->pCurDqLayer,
+								pCtx->pSliceThreading->pSliceComplexRatio[iCurDid],
+								iCurDid
+							  );
+#endif//TRY_SLICING_BALANCE
+	}
+
+#ifdef MT_DEBUG
+	iT1 = WelsTime() - iT1;	// elapsed WelsTime() ticks spent in this function so far
+	if ( pCtx->pSliceThreading->pFSliceDiff )	// the trace is only written when the debug file is set
+	{
+		fprintf( pCtx->pSliceThreading->pFSliceDiff, 
+#ifdef WIN32
+				"%6I64d us adjust time at spatial layer %d, iNeedAdj %d, DynamicAdjustSlicing()\n",
+#else
+				"%6lld us adjust time at spatial layer %d, iNeedAdj %d, DynamicAdjustSlicing()\n",
+#endif//WIN32
+				iT1, iCurDid, iNeedAdj );
+	}
+#endif//MT_DEBUG
+
+	return iNeedAdj;
+}
+
+#endif//#if defined(MT_ENABLED) && defined(DYNAMIC_SLICE_ASSIGN)
+
+#if defined(MT_ENABLED)
+
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE) && defined(MT_DEBUG)
+// TrackSliceComplexities(): debug trace; writes pSliceComplexRatio[iCurDid][i] for every slice i of the current DQ layer to pFSliceDiff.
+void TrackSliceComplexities( sWelsEncCtx *pCtx, const int32_t iCurDid )
+{
+	const int32_t kiCountSliceNum = pCtx->pCurDqLayer->pSliceEncCtx->iSliceNumInFrame;
+	// other users of pFSliceDiff in this file test it before writing; never hand a NULL stream to fprintf()
+	if ( NULL == pCtx->pSliceThreading || NULL == pCtx->pSliceThreading->pFSliceDiff )
+		return;
+	for ( int32_t iSliceIdx = 0; iSliceIdx < kiCountSliceNum; ++ iSliceIdx )
+	{
+		fprintf( pCtx->pSliceThreading->pFSliceDiff, "%6.3f complexity pRatio at iDid %d pSlice %d\n", pCtx->pSliceThreading->pSliceComplexRatio[iCurDid][iSliceIdx], iCurDid, iSliceIdx );
+	}
+}
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(TRY_SLICING_BALANCE) && defined(MT_DEBUG)
+
+#if defined(DYNAMIC_SLICE_ASSIGN) && defined(MT_DEBUG)
+// TrackSliceConsumeTime(): debug trace; for each listed spatial layer coded with SM_FIXEDSLCNUM_SLICE on multiple threads, writes every slice's consume time and the largest one to pFSliceDiff.
+void TrackSliceConsumeTime( sWelsEncCtx *pCtx, int32_t *pDidList, const int32_t iSpatialNum )
+{
+	SWelsSvcCodingParam *pPara = NULL;
+	int32_t iSpatialIdx = 0;
+	if ( NULL == pDidList || iSpatialNum > MAX_DEPENDENCY_LAYER )
+		return;
+
+	pPara	= pCtx->pSvcParam;
+	while ( iSpatialIdx < iSpatialNum )
+	{
+		const int32_t kiDid		= pDidList[iSpatialIdx];
+		SDLayerParam *pDlp		= &pPara->sDependencyLayers[kiDid];
+		SMulSliceOption *pMso	= &pDlp->sMso;
+		SDqLayer *pCurDq		= pCtx->ppDqLayerList[kiDid];
+		SSliceCtx *pSliceCtx= pCurDq->pSliceEncCtx;
+		const uint32_t kuiCountSliceNum = pSliceCtx->iSliceNumInFrame;
+		if(pCtx->pSliceThreading)
+		{
+			if ( pCtx->pSliceThreading->pFSliceDiff && pMso->uiSliceMode == SM_FIXEDSLCNUM_SLICE && pPara->iMultipleThreadIdc > 1 && pPara->iMultipleThreadIdc >= kuiCountSliceNum  )
+			{
+				uint32_t i = 0;
+				uint32_t uiMaxT = 0;
+				int32_t iMaxI = 0;
+				// the time table is read for the maximum as well as for printing, so a missing table skips the whole per-slice loop
+				while (pCtx->pSliceThreading->pSliceConsumeTime[kiDid] != NULL && i < kuiCountSliceNum) {
+					fprintf( pCtx->pSliceThreading->pFSliceDiff, "%6d us consume_time coding_idx %d iDid %d pSlice %d\n",
+						pCtx->pSliceThreading->pSliceConsumeTime[kiDid][i], pCtx->iCodingIndex, kiDid, i /*/ 1000*/);
+					if (pCtx->pSliceThreading->pSliceConsumeTime[kiDid][i] > uiMaxT)
+					{
+						uiMaxT = pCtx->pSliceThreading->pSliceConsumeTime[kiDid][i];
+						iMaxI = i;
+					}
+					++ i;
+				}
+				fprintf( pCtx->pSliceThreading->pFSliceDiff, "%6d us consume_time_max coding_idx %d iDid %d pSlice %d\n", uiMaxT, pCtx->iCodingIndex, kiDid, iMaxI /*/ 1000*/);
+			}
+		}
+		++ iSpatialIdx;
+	}
+}
+#endif//#if defined(DYNAMIC_SLICE_ASSIGN) && defined(MT_DEBUG)
+
+#endif//MT_ENABLED
+}
+#endif//MT_ENABLED
+
--- /dev/null
+++ b/codec/encoder/core/src/svc_base_layer_md.cpp
@@ -1,0 +1,1985 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_base_layer_md.cpp
+ *
+ * \brief	mode decision
+ *
+ * \date	2009.08.10 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include <assert.h>
+#include "ls_defines.h"
+#include "encoder_context.h"
+#include "svc_enc_slice_segment.h"
+#include "md.h"
+#include "mc.h"
+#include "mv_pred.h"
+#include "cpu_core.h"
+#include "svc_enc_golomb.h"
+#include "svc_base_layer_md.h"
+#include "sample.h"
+#include "encoder.h"
+#include "svc_encode_mb.h"
+#include "svc_encode_slice.h"
+#include "svc_motion_estimate.h"
+#include "as264_common.h"
+#include "encode_mb_aux.h"
+#include "utils.h"
+namespace WelsSVCEnc {
+static const ALIGNED_DECLARE(int8_t, g_kiIntra16AvaliMode[8][5], 16) = {	// row = uiNeighborIntra & 0x07 (as indexed in WelsMdI16x16); columns 0..3 = candidate I16x16 modes, column 4 = number of valid candidates
+    	{ I16_PRED_DC_128, I16_PRED_INVALID, I16_PRED_INVALID, I16_PRED_INVALID, 1 },
+		{ I16_PRED_DC_L,   I16_PRED_H,       I16_PRED_INVALID, I16_PRED_INVALID, 2 },
+		{ I16_PRED_DC_T,   I16_PRED_V,		 I16_PRED_INVALID, I16_PRED_INVALID, 2 },
+   		{ I16_PRED_V,      I16_PRED_H,		 I16_PRED_DC,	   I16_PRED_INVALID, 3 },
+   		{ I16_PRED_DC_128, I16_PRED_INVALID, I16_PRED_INVALID, I16_PRED_INVALID, 1 },
+   		{ I16_PRED_DC_L,   I16_PRED_H,		 I16_PRED_INVALID, I16_PRED_INVALID, 2 },
+   		{ I16_PRED_DC_T,   I16_PRED_V,		 I16_PRED_INVALID, I16_PRED_INVALID, 2 },
+   		{ I16_PRED_V,      I16_PRED_H,		 I16_PRED_DC,	   I16_PRED_P,       4 }	// rows 4..6 repeat rows 0..2; only row 7 offers I16_PRED_P (presumably bit0 = left, bit1 = top, bit2 = top-left available - TODO confirm)
+};
+
+static const ALIGNED_DECLARE(uint8_t, g_kiIntra4AvailCount[16], 16) = {	// number of valid (non I4_PRED_INVALID) leading entries of the g_kiIntra4AvailMode row with the same index
+#ifndef  I4_PRED_MODE_EXTEND
+	1,3,2,4,1,3,2,7,1,3,4,6,1,3,4,9	// rows 2 and 6 (0010 / 0110) offer only DC_T and V
+#else
+	1,3,4,4,1,3,4,7,1,3,4,6,1,3,4,9	// rows 2 and 6 additionally offer I4_PRED_DDL_TOP and I4_PRED_VL_TOP
+#endif  //I4_PRED_MODE_EXTEND
+};
+
+// row index = left_avail | (top_avail<<1) | (left_top_avail<<2) | (right_top_avail<<3); each row lists its valid candidate modes first (g_kiIntra4AvailCount[row] of them) and is padded with I4_PRED_INVALID
+static const ALIGNED_DECLARE(uint8_t, g_kiIntra4AvailMode[16][16], 16) = {
+	{
+	I4_PRED_DC_128,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},  //  0000
+
+	{ 
+	I4_PRED_DC_L,    I4_PRED_H,       I4_PRED_HU,      I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},  //  0001
+
+#ifndef  I4_PRED_MODE_EXTEND
+	{ 
+	I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_INVALID, I4_PRED_INVALID,
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID}, //  0010
+#else  
+	{ 
+	I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_DDL_TOP, I4_PRED_VL_TOP, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID}, //  0010
+#endif //I4_PRED_MODE_EXTEND
+
+	{ 
+	I4_PRED_DC,      I4_PRED_H,       I4_PRED_V,       I4_PRED_HU, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID}, //  0011
+
+	{ 
+	I4_PRED_DC_128,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,	
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},  //  0100
+
+	{ 
+	I4_PRED_DC_L,    I4_PRED_H,       I4_PRED_HU,      I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},    //  0101
+
+#ifndef  I4_PRED_MODE_EXTEND
+	{ 
+	I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},     //  0110
+#else 
+	{ I4_PRED_DC_T,  I4_PRED_V,       I4_PRED_DDL_TOP, I4_PRED_VL_TOP, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},     //  0110
+#endif //I4_PRED_MODE_EXTEND
+
+	{ 
+	I4_PRED_DC,      I4_PRED_H,       I4_PRED_V,       I4_PRED_HU,
+	I4_PRED_DDR,     I4_PRED_VR,      I4_PRED_HD,      I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},               //  0111
+
+	{ 
+	I4_PRED_DC_128,   I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+	I4_PRED_INVALID,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+	I4_PRED_INVALID,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},  //  1000
+
+	{ 
+	I4_PRED_DC_L,    I4_PRED_H,       I4_PRED_HU,      I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},    //  1001
+
+	{ 
+	I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_DDL,     I4_PRED_VL, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},     //  1010
+
+	{ 
+	I4_PRED_DC,      I4_PRED_H,       I4_PRED_V,       I4_PRED_HU, 
+	I4_PRED_DDL,     I4_PRED_VL,      I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},          //  1011 
+
+	{ 
+	I4_PRED_DC_128,  I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID,
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},  //  1100
+
+	{ 
+	I4_PRED_DC_L,    I4_PRED_H,       I4_PRED_HU,      I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},    //  1101
+
+	{ 
+	I4_PRED_DC_T,    I4_PRED_V,       I4_PRED_DDL,     I4_PRED_VL, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID},     //  1110
+
+	{ 
+	I4_PRED_DC,      I4_PRED_H,       I4_PRED_V,       I4_PRED_HU, 
+	I4_PRED_DDL,     I4_PRED_VL,      I4_PRED_DDR,     I4_PRED_VR, 
+	I4_PRED_HD,      I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, 
+	I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID, I4_PRED_INVALID}                          //  1111 
+
+};
+static const ALIGNED_DECLARE(int8_t, g_kiIntraChromaAvailMode[8][5], 16) = {	// per row: columns 0..3 = candidate chroma modes, column 4 = count of the non C_PRED_INVALID ones; presumably indexed like g_kiIntra16AvaliMode (its use is outside this chunk - TODO confirm)
+		{ C_PRED_DC_128, C_PRED_INVALID, C_PRED_INVALID, C_PRED_INVALID, 1 },
+		{ C_PRED_DC_L,   C_PRED_H,       C_PRED_INVALID, C_PRED_INVALID, 2 },
+		{ C_PRED_DC_T,   C_PRED_V,       C_PRED_INVALID, C_PRED_INVALID, 2 },
+		{ C_PRED_V,      C_PRED_H,       C_PRED_DC,      C_PRED_INVALID, 3 },
+		{ C_PRED_DC_128, C_PRED_INVALID, C_PRED_INVALID, C_PRED_INVALID, 1 },
+		{ C_PRED_DC_L,   C_PRED_H,       C_PRED_INVALID, C_PRED_INVALID, 2 },
+		{ C_PRED_DC_T,   C_PRED_V,       C_PRED_INVALID, C_PRED_INVALID, 2 },
+		{ C_PRED_V,      C_PRED_H,       C_PRED_DC,      C_PRED_P,       4 }		// rows 4..6 repeat rows 0..2; only row 7 offers C_PRED_P
+};
+
+// x and y offsets (in samples) of the 16 luma 4x4 blocks inside a macroblock, indexed by the 4x4 block number used in WelsMdI4x4; int8_t entries keep both tables small (16 bytes each) for cache hits
+const int8_t g_kiCoordinateIdx4x4X[16] = { 0, 4, 0, 4,
+										  8, 12, 8, 12,
+										  0, 4, 0, 4,
+										  8, 12, 8, 12};
+
+const int8_t g_kiCoordinateIdx4x4Y[16] = { 0, 0, 4, 4,
+										  0, 0, 4, 4,
+										  8, 8, 12, 12,
+										  8, 8, 12, 12};
+static const ALIGNED_DECLARE(int8_t, g_kiNeighborIntraToI4x4[16][16], 16) = 	// [pMbCache->uiNeighborIntra][4x4 block number] -> row index into g_kiIntra4AvailCount / g_kiIntra4AvailMode
+{
+	{	0,	1,	10,	7,	1,	1,	15,	7,	10,	15,	10,	7,	15,	7,	15,	7},
+	{	1,	1,	15,	7,	1,	1,	15,	7,	15,	15,	15,	7,	15,	7,	15,	7},
+	{	10,	15,	10,	7,	15,	7,	15,	7,	10,	15,	10,	7,	15,	7,	15,	7},
+	{	11,	15,	15,	7,	15,	7,	15,	7,	15,	15,	15,	7,	15,	7,	15,	7},
+	{	4,	1,	10,	7,	1,	1,	15,	7,	10,	15,	10,	7,	15,	7,	15,	7},
+	{	5,	1,	15,	7,	1,	1,	15,	7,	15,	15,	15,	7,	15,	7,	15,	7},
+	{	14,	15,	10,	7,	15,	7,	15,	7,	10,	15,	10,	7,	15,	7,	15,	7},
+	{	15,	15,	15,	7,	15,	7,	15,	7,	15,	15,	15,	7,	15,	7,	15,	7},
+	{	0,	1,	10,	7,	1,	9,	15,	7,	10,	15,	10,	7,	15,	7,	15,	7},
+	{	1,	1,	15,	7,	1,	9,	15,	7,	15,	15,	15,	7,	15,	7,	15,	7},
+	{	10,	15,	10,	7,	15,	15,	15,	7,	10,	15,	10,	7,	15,	7,	15,	7},
+	{	11,	15,	15,	7,	15,	15,	15,	7,	15,	15,	15,	7,	15,	7,	15,	7},
+	{	4,	1,	10,	7,	1,	9,	15,	7,	10,	15,	10,	7,	15,	7,	15,	7},
+	{	5,	1,	15,	7,	1,	9,	15,	7,	15,	15,	15,	7,	15,	7,	15,	7},
+	{	14,	15,	10,	7,	15,	15,	15,	7,	10,	15,	10,	7,	15,	7,	15,	7},
+	{	15,	15,	15,	7,	15,	15,	15,	7,	15,	15,	15,	7,	15,	7,	15,	7},
+};
+	
+__align16(const int8_t,g_kiMapModeI4x4[14]) =	// folds the 14 internal I4x4 mode ids onto the values 0..8 that are compared with the predicted mode (ids 9..13 map to 2, 2, 2, 3, 7)
+{
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2, 3, 7
+};
+	
+/*!
+ * \brief	predict the Intra4x4 mode of a block from the modes cached for its top and left neighbours
+ * \return	2 when either neighbour mode is -1, otherwise the smaller of the two neighbour modes
+ */
+int32_t PredIntra4x4Mode(int8_t* pIntraPredMode, int32_t iIdx4)
+{
+	// the top neighbour is read at iIdx4 - 8, the left neighbour at iIdx4 - 1
+	const int8_t kiModeAbove	= pIntraPredMode[iIdx4 - 8];
+	const int8_t kiModeLeft		= pIntraPredMode[iIdx4 - 1];
+
+	if (kiModeLeft != -1 && kiModeAbove != -1)
+	{
+		return (int8_t)WELS_MIN(kiModeLeft, kiModeAbove);
+	}
+	return 2;
+}
+
+/*!
+ * \brief	set up pCurMb and pMbCache for intra mode decision of one macroblock
+ *
+ *	Moves the pEncMb / pCsMb / pDecMb plane pointers onto the current MB, clears the CBP,
+ *	loads the intra neighbour cache and restores the default prediction buffers.
+ *
+ * \param	pEncCtx			encoder context; its pCurDqLayer supplies strides and plane data
+ * \param	pCurMb			macroblock being initialised (iMbX, iMbY, iMbXY are read, uiCbp is cleared)
+ * \param	pMbCache		MB cache to update
+ * \param	iSliceFirstMbXY	iMbXY of the first MB of the current slice
+ */
+void WelsMdIntraInit(sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache *pMbCache, const int32_t iSliceFirstMbXY)
+{
+	SDqLayer* pLayer	= pEncCtx->pCurDqLayer;
+	const int32_t kiCol	= pCurMb->iMbX;
+	const int32_t kiRow	= pCurMb->iMbY;
+
+	if( kiCol != 0 && pCurMb->iMbXY != iSliceFirstMbXY )
+	{
+		// not the first MB of a row or of the slice: advance every plane pointer by one MB width
+		for ( int32_t iPlane = 0; iPlane < 3; ++ iPlane )
+		{
+			const int32_t kiMbWidth = ( 0 == iPlane ) ? MB_WIDTH_LUMA : MB_WIDTH_CHROMA;
+
+			pMbCache->SPicData.pEncMb[iPlane]	+= kiMbWidth;
+			pMbCache->SPicData.pDecMb[iPlane]	+= kiMbWidth;
+			pMbCache->SPicData.pCsMb[iPlane]	+= kiMbWidth;
+		}
+	}
+	else
+	{
+		// first MB of a row or of the slice: rebuild the pointers from the plane origins;
+		// luma offsets are scaled by 16 (<< 4), chroma offsets by 8 (<< 3)
+		const int32_t kiEncStrideY	= pLayer->iEncStride[0];
+		const int32_t kiEncStrideUV	= pLayer->iEncStride[1];
+		const int32_t kiCsStrideY	= pLayer->iCsStride[0];
+		const int32_t kiCsStrideUV	= pLayer->iCsStride[1];
+		const int32_t kiDecStrideY	= pLayer->pDecPic->iLineSize[0];
+		const int32_t kiDecStrideUV	= pLayer->pDecPic->iLineSize[1];
+		const int32_t kiEncOffsetY	= (kiCol + kiRow * kiEncStrideY) << 4;
+		const int32_t kiEncOffsetUV	= (kiCol + kiRow * kiEncStrideUV) << 3;
+		const int32_t kiCsOffsetY	= (kiCol + kiRow * kiCsStrideY) << 4;
+		const int32_t kiCsOffsetUV	= (kiCol + kiRow * kiCsStrideUV) << 3;
+		const int32_t kiDecOffsetY	= (kiCol + kiRow * kiDecStrideY) << 4;
+		const int32_t kiDecOffsetUV	= (kiCol + kiRow * kiDecStrideUV) << 3;
+
+		pMbCache->SPicData.pEncMb[0]	= pLayer->pEncData[0] + kiEncOffsetY;
+		pMbCache->SPicData.pEncMb[1]	= pLayer->pEncData[1] + kiEncOffsetUV;
+		pMbCache->SPicData.pEncMb[2]	= pLayer->pEncData[2] + kiEncOffsetUV;
+		pMbCache->SPicData.pCsMb[0]		= pLayer->pCsData[0] + kiCsOffsetY;
+		pMbCache->SPicData.pCsMb[1]		= pLayer->pCsData[1] + kiCsOffsetUV;
+		pMbCache->SPicData.pCsMb[2]		= pLayer->pCsData[2] + kiCsOffsetUV;
+		pMbCache->SPicData.pDecMb[0]	= pLayer->pDecPic->pData[0] + kiDecOffsetY;
+		pMbCache->SPicData.pDecMb[1]	= pLayer->pDecPic->pData[1] + kiDecOffsetUV;
+		pMbCache->SPicData.pDecMb[2]	= pLayer->pDecPic->pData[2] + kiDecOffsetUV;
+	}
+
+	pCurMb->uiCbp = 0;
+
+	FillNeighborCacheIntra(pMbCache, pCurMb, pLayer->iMbWidth);
+	// WelsMdI16x16() may repoint these two buffers, so the defaults are restored for every MB
+	pMbCache->pMemPredLuma		= pMbCache->pMemPredMb;
+	pMbCache->pMemPredChroma	= pMbCache->pMemPredMb + 256;
+}
+
+/*!
+ * \brief	set up the slice's MB cache and motion vector limits for inter mode decision of one macroblock
+ * \param	pEncCtx			encoder context; the current layer, function list and pVaa are read
+ * \param	pSlice			slice owning the MB cache (sMbCacheInfo); sMvMin and sMvMax are rewritten
+ * \param	pCurMb			macroblock being initialised; its sP16x16Mv is zeroed
+ * \param	iSliceFirstMbXY	iMbXY of the first MB of the current slice
+ */
+void WelsMdInterInit( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb, const int32_t iSliceFirstMbXY )
+{
+	SDqLayer* pLayer			= pEncCtx->pCurDqLayer;
+	SMbCache *pCache			= &pSlice->sMbCacheInfo;
+	const int32_t kiCol			= pCurMb->iMbX;
+	const int32_t kiRow			= pCurMb->iMbY;
+	const int32_t kiIdx			= pCurMb->iMbXY;
+	const int32_t kiWidthInMb	= pLayer->iMbWidth;
+	const int32_t kiHeightInMb	= pLayer->iMbHeight;
+
+	pCache->pEncSad = &pLayer->pDecPic->pMbSkipSad[kiIdx];
+
+	// neighbour cache first; the filler is also handed the address of this MB's background flag
+	pEncCtx->pFuncList->pfFillInterNeighborCache(pCache, pCurMb, kiWidthInMb, pEncCtx->pVaa->pVaaBackgroundMbFlag+kiIdx);
+
+	// reference MB pointers: advance inside a row, recompute at the start of a row or of the slice
+	if( kiCol != 0 && kiIdx != iSliceFirstMbXY )
+	{
+		pCache->SPicData.pRefMb[0]	+= MB_WIDTH_LUMA;
+		pCache->SPicData.pRefMb[1]	+= MB_WIDTH_CHROMA;
+		pCache->SPicData.pRefMb[2]	+= MB_WIDTH_CHROMA;
+	}
+	else
+	{
+		const int32_t kiStrideY		= pLayer->pRefPic->iLineSize[0];
+		const int32_t kiStrideUV	= pLayer->pRefPic->iLineSize[1];
+		const int32_t kiOffsetY		= (kiCol + kiRow * kiStrideY) << 4;
+		const int32_t kiOffsetUV	= (kiCol + kiRow * kiStrideUV) << 3;
+		pCache->SPicData.pRefMb[0]	= pLayer->pRefPic->pData[0] + kiOffsetY;
+		pCache->SPicData.pRefMb[1]	= pLayer->pRefPic->pData[1] + kiOffsetUV;
+		pCache->SPicData.pRefMb[2]	= pLayer->pRefPic->pData[2] + kiOffsetUV;
+	}
+	pCache->uiRefMbType			= pLayer->pRefPic->uiRefMbType[kiIdx];
+	pCache->bCollocatedPredFlag	= false;
+
+	// mode decision may skip the md_p16x16 and md_pskip functions, so both MVs are zeroed here
+	ST32(&pCurMb->sP16x16Mv, 0);
+	ST32(&pLayer->pDecPic->sMvList[kiIdx], 0);
+
+	// motion vector limits for this MB: horizontal pair first, then vertical, each clamped to +/- MV_RANGE
+	pSlice->sMvMin.iMvX = -16*( kiCol + 1 ) + INTPEL_NEEDED_MARGIN;
+	if ( pSlice->sMvMin.iMvX < -MV_RANGE )
+		pSlice->sMvMin.iMvX = -MV_RANGE;
+	pSlice->sMvMax.iMvX = 16*( kiWidthInMb - kiCol ) - INTPEL_NEEDED_MARGIN;
+	if ( pSlice->sMvMax.iMvX > MV_RANGE )
+		pSlice->sMvMax.iMvX = MV_RANGE;
+	pSlice->sMvMin.iMvY = -16*( kiRow + 1 ) + INTPEL_NEEDED_MARGIN;
+	if ( pSlice->sMvMin.iMvY < -MV_RANGE )
+		pSlice->sMvMin.iMvY = -MV_RANGE;
+	pSlice->sMvMax.iMvY = 16*( kiHeightInMb - kiRow ) - INTPEL_NEEDED_MARGIN;
+	if ( pSlice->sMvMax.iMvY > MV_RANGE )
+		pSlice->sMvMax.iMvY = MV_RANGE;
+}
+
+/*!
+ * \brief	pick the cheapest Intra16x16 luma prediction mode among those listed for the MB's neighbour availability
+ * \param	pFunc		function table providing the predictors and cost functions
+ * \param	pCurDqLayer	current layer; iCsStride[0] and iEncStride[0] are read
+ * \param	pMbCache	MB cache; uiLumaI16x16Mode, pMemPredLuma and pMemPredChroma are written
+ * \param	iLambda		weight applied to the mode signalling cost
+ * \return	cost of the selected mode
+ */
+int32_t WelsMdI16x16(SWelsFuncPtrList *pFunc, SDqLayer *pCurDqLayer, SMbCache *pMbCache, int32_t iLambda)
+{
+	uint8_t* pPredBuf[2]		= {pMbCache->pMemPredMb, pMbCache->pMemPredMb + 256};
+	uint8_t *pPred				= pPredBuf[0];
+	uint8_t *pRec				= pMbCache->SPicData.pCsMb[0];
+	uint8_t *pSrc				= pMbCache->SPicData.pEncMb[0];
+	const int32_t kiRecStride	= pCurDqLayer->iCsStride[0];
+	const int32_t kiSrcStride	= pCurDqLayer->iEncStride[0];
+	const int8_t *kpModes		= g_kiIntra16AvaliMode[pMbCache->uiNeighborIntra&0x07];
+	const int32_t kiModeNum		= kpModes[4];
+	int32_t iSpare				= 0;	// pPredBuf[iSpare] is the scratch half, pPredBuf[iSpare ^ 1] keeps the best prediction
+	int32_t iCost, iMode, iBestMode, iBestCost = INT_MAX;
+
+	if (kiModeNum <= 3 || !pFunc->sSampleDealingFuncs.pfIntra16x16Combined3)
+	{
+		// generic path: rate every candidate; a new best keeps its buffer and the next try uses the other one
+		iBestMode = kpModes[0];
+		for (int32_t k = 0; k < kiModeNum; ++ k)
+		{
+			iMode = kpModes[k];
+			assert( iMode >= 0 && iMode < 7 );
+			pFunc->pfGetLumaI16x16Pred[iMode](pPred, pRec, kiRecStride);
+			iCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_16x16](pPred, 16, pSrc, kiSrcStride);
+			iCost += iLambda * (BsSizeUE(g_kiMapModeI16x16[iMode]));
+			if (iCost >= iBestCost)
+				continue;
+			iBestMode	= iMode;
+			iBestCost	= iCost;
+			iSpare		^= 0x01;
+			pPred		= pPredBuf[iSpare];
+		}
+	}
+	else
+	{
+		// four candidates and a combined routine: pfIntra16x16Combined3 returns a best cost/mode, then the fourth entry is rated on its own
+		iBestCost = pFunc->sSampleDealingFuncs.pfIntra16x16Combined3(pRec, kiRecStride, pSrc,kiSrcStride,&iBestMode, iLambda, pPred/*temp*/);
+		pFunc->pfGetLumaI16x16Pred[kpModes[3]](pPred, pRec, kiRecStride);
+		iCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_16x16](pPred, 16, pSrc, kiSrcStride) + iLambda * 4 ;
+		if (iCost < iBestCost)
+		{
+			iBestMode = kpModes[3];
+			iBestCost = iCost;
+		}
+		else	// pPred now holds the fourth predictor: regenerate the winning prediction into it
+			pFunc->pfGetLumaI16x16Pred[iBestMode](pPred, pRec, kiRecStride);
+		iSpare = 1;
+		iBestCost += iLambda;
+	}
+	pMbCache->pMemPredChroma	= pPredBuf[iSpare];
+	pMbCache->pMemPredLuma		= pPredBuf[iSpare ^ 0x01];
+	pMbCache->uiLumaI16x16Mode	= iBestMode;
+	return iBestCost;
+}
+/*!
+ * \brief	Intra4x4 mode decision for the 16 luma 4x4 blocks of one macroblock
+ *
+ *	For each block the mode predicted from the neighbours is fetched, every candidate mode
+ *	listed for the block's neighbour availability is rated with SATD plus a signalling cost,
+ *	and the cheapest one is recorded before the block is handed to WelsEncRecI4x4Y().
+ *	The loop stops early once the accumulated cost reaches pWelsMd->iCostLuma.
+ *
+ * \param	pEnc		encoder context, cast to sWelsEncCtx*
+ * \param	pMd			mode decision data, cast to SWelsMD*; iLambda and iCostLuma are read
+ * \param	pCurMb		current macroblock; pIntra4x4PredMode[0..6] is written on exit
+ * \param	pMbCache	MB cache; iIntraPredMode, the prev/rem mode flags and pBestPredI4x4Blk4 are updated
+ *
+ * \return	sum of the best cost of every block processed (partial after an early stop) plus 24 * iLambda
+ *
+ * \note	The combined three-mode search is called through pfIntra4x4Combined3Satd, i.e. through the
+ *			very pointer whose availability is tested, so an unchecked pointer is never invoked.
+ */
+int32_t WelsMdI4x4(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache)
+{
+	sWelsEncCtx *pEncCtx				= (sWelsEncCtx*)pEnc;
+	SWelsFuncPtrList *pFunc				= pEncCtx->pFuncList;
+	SWelsMD* pWelsMd					= (SWelsMD*)pMd;
+	SDqLayer *pCurDqLayer				= pEncCtx->pCurDqLayer;
+	const int32_t kiLambda				= pWelsMd->iLambda;
+	const int32_t kiBestCostLuma		= pWelsMd->iCostLuma;
+	uint8_t *pEncMb						= pMbCache->SPicData.pEncMb[0];
+	uint8_t *pDecMb						= pMbCache->SPicData.pCsMb[0];
+	const int32_t kiLineSizeEnc			= pCurDqLayer->iEncStride[0];
+	const int32_t kiLineSizeDec			= pCurDqLayer->iCsStride[0];
+
+	bool_t *pPrevIntra4x4PredModeFlag	= pMbCache->pPrevIntra4x4PredModeFlag;
+	int8_t *pRemIntra4x4PredModeFlag	= pMbCache->pRemIntra4x4PredModeFlag;
+	// per-block lookup tables
+	const uint8_t* kpIntra4x4AvailCount	= &g_kiIntra4AvailCount[0];
+	const uint8_t* kpCache48CountScan4	= &g_kuiCache48CountScan4Idx[0];
+	const int8_t* kpNeighborIntraToI4x4	= g_kiNeighborIntraToI4x4[pMbCache->uiNeighborIntra];
+	const int8_t* kpCoordinateIdxX		= &g_kiCoordinateIdx4x4X[0];
+	const int8_t* kpCoordinateIdxY		= &g_kiCoordinateIdx4x4Y[0];
+	// signalling cost of a candidate: [1] (= lambda) if it equals the predicted mode, [0] (= 4 * lambda) otherwise
+	int32_t lambda[2]					= {kiLambda<<2, kiLambda};
+	// selects which 16-byte half of pMemPredBlk4 currently holds the best prediction
+	int32_t iBestPredBufferNum			= 0;
+	int32_t iCosti4x4					= 0;
+
+#if defined(X86_ASM)
+	WelsPrefetchZero_mmx(g_kiMapModeI4x4);
+	WelsPrefetchZero_mmx((int8_t *)&pFunc->pfGetLumaI4x4Pred);
+#endif//X86_ASM
+
+	for (int32_t i = 0; i < 16; i++)
+	{
+		const int32_t kiOffset		= kpNeighborIntraToI4x4[i];
+
+		//step 1: locating current 4x4 block position in pEnc and pDecMb
+		const int32_t kiCoordinateX	= kpCoordinateIdxX[i];
+		const int32_t kiCoordinateY	= kpCoordinateIdxY[i];
+		uint8_t *pCurEnc			= pEncMb + ((kiCoordinateY * kiLineSizeEnc) + kiCoordinateX);
+		uint8_t *pCurDec			= pDecMb + ((kiCoordinateY * kiLineSizeDec) + kiCoordinateX);
+
+		//step 2: get predicted mode from neighbor
+		const int32_t kiPredMode	= PredIntra4x4Mode(pMbCache->iIntraPredMode, kpCache48CountScan4[i]);
+
+		//step 3: collect candidates of iPredMode
+		const int32_t kiAvailCount	= kpIntra4x4AvailCount[kiOffset];
+		const uint8_t *kpAvailMode	= g_kiIntra4AvailMode[kiOffset];
+
+		//step 4: gain the best pred mode
+		int32_t iBestCost			= INT_MAX;
+		int32_t iBestMode			= kpAvailMode[0];
+		int32_t iFirstCandidate		= 0;
+
+		if (pFunc->sSampleDealingFuncs.pfIntra4x4Combined3Satd && (kiAvailCount >= 6))
+		{
+			// Rate the first three candidates with one combined call and continue with the fourth.
+			// The routine invoked is the one that was just NULL-tested (it used to be the untested
+			// pfIntra4x4Combined3); it is SATD based like the per-mode costs compared against below.
+			uint8_t *pDst = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
+
+			iBestCost = pFunc->sSampleDealingFuncs.pfIntra4x4Combined3Satd(pCurDec, kiLineSizeDec, pCurEnc, kiLineSizeEnc, pDst, &iBestMode,
+				lambda[kiPredMode == 2], lambda[kiPredMode == 1], lambda[kiPredMode == 0]);
+			iFirstCandidate = 3;
+		}
+
+		// remaining candidates, one at a time: each is predicted into the half of pMemPredBlk4 that does
+		// not hold the current best, and the halves swap roles whenever a candidate wins
+		for (int32_t j = iFirstCandidate; j < kiAvailCount; ++ j)
+		{
+			const int32_t kiCurMode	= kpAvailMode[j];
+			uint8_t *pDst			= &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
+
+			assert( kiCurMode >= 0 && kiCurMode < 14 );
+
+			pFunc->pfGetLumaI4x4Pred[kiCurMode](pDst, pCurDec, kiLineSizeDec);
+			const int32_t kiCurCost	= pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
+				lambda[kiPredMode == g_kiMapModeI4x4[kiCurMode]];
+
+			if (kiCurCost < iBestCost)
+			{
+				iBestMode			= kiCurMode;
+				iBestCost			= kiCurCost;
+				iBestPredBufferNum	= 1 - iBestPredBufferNum;
+			}
+		}
+
+		// remember where the winning prediction lives and accumulate its cost
+		pMbCache->pBestPredI4x4Blk4 = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
+		iCosti4x4 += iBestCost;
+
+		if (iCosti4x4 >= kiBestCostLuma)
+		{
+			break;	// already no cheaper than pWelsMd->iCostLuma: stop rating further blocks
+		}
+
+		//step 5: update pred mode and sample avail cache
+		const int32_t kiFinalMode = g_kiMapModeI4x4[iBestMode];
+		if (kiPredMode == kiFinalMode)
+		{
+			*pPrevIntra4x4PredModeFlag++ = true;
+		}
+		else
+		{
+			*pPrevIntra4x4PredModeFlag++ = false;
+			// the stored value leaves out the predicted mode: modes above it are shifted down by one
+			*pRemIntra4x4PredModeFlag  = (kiFinalMode < kiPredMode ? kiFinalMode: (kiFinalMode-1));
+		}
+		pRemIntra4x4PredModeFlag++;
+		pMbCache->iIntraPredMode[kpCache48CountScan4[i]] = kiFinalMode;
+
+		//step 6: encoding I_4x4
+		WelsEncRecI4x4Y(pEncCtx, pCurMb, pMbCache, i);
+	}
+
+	// export seven cached modes to the MB: cache entries 33..36 in one 32-bit store,
+	// then entries 12, 20 and 28
+	ST32(pCurMb->pIntra4x4PredMode, LD32(&pMbCache->iIntraPredMode[33]));
+	pCurMb->pIntra4x4PredMode[4] = pMbCache->iIntraPredMode[12];
+	pCurMb->pIntra4x4PredMode[5] = pMbCache->iIntraPredMode[20];
+	pCurMb->pIntra4x4PredMode[6] = pMbCache->iIntraPredMode[28];
+
+	iCosti4x4 += (kiLambda << 4) + (kiLambda << 3); //4*6*lambda from JVT SATD0
+	return iCosti4x4;
+}
+
+int32_t WelsMdI4x4Fast(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache)
+{
+	sWelsEncCtx* pEncCtx	= (sWelsEncCtx*)pEnc;	
+	SWelsFuncPtrList *pFunc		= pEncCtx->pFuncList;
+	SWelsMD *pWelsMd					= (SWelsMD*)pMd;
+	SDqLayer *pCurDqLayer			= pEncCtx->pCurDqLayer;	
+	int32_t iLambda				= pWelsMd->iLambda;
+	int32_t iBestCostLuma				= pWelsMd->iCostLuma;
+	uint8_t *pEncMb					= pMbCache->SPicData.pEncMb[0];
+	uint8_t *pDecMb					= pMbCache->SPicData.pCsMb[0];
+	const int32_t kiLineSizeEnc		= pCurDqLayer->iEncStride[0];
+	const int32_t kiLineSizeDec		= pCurDqLayer->iCsStride[0];
+
+	uint8_t* pCurEnc, *pCurDec, *pDst;
+	int8_t iPredMode, iCurMode, iBestMode, iFinalMode;
+	int32_t iCurCost, iBestCost;
+	int32_t iAvailCount;
+	const uint8_t * kpAvailMode;
+	int32_t i, j, iCoordinateX, iCoordinateY, iIdxStrideEnc, iIdxStrideDec;
+	int32_t iCostH, iCostV, iCostVR, iCostHD, iCostVL, iCostHU, iBestModeFake;
+	int32_t lambda[2]						= {iLambda<<2, iLambda};
+	bool_t* pPrevIntra4x4PredModeFlag	= pMbCache->pPrevIntra4x4PredModeFlag;
+	int8_t* pRemIntra4x4PredModeFlag		= pMbCache->pRemIntra4x4PredModeFlag;	
+	const uint8_t* kpIntra4x4AvailCount		= &g_kiIntra4AvailCount[0];
+	const uint8_t* kpCache48CountScan4		= &g_kuiCache48CountScan4Idx[0];
+	const int8_t* kpNeighborIntraToI4x4	= g_kiNeighborIntraToI4x4[pMbCache->uiNeighborIntra];	
+	const int8_t* kpCoordinateIdxX			= &g_kiCoordinateIdx4x4X[0];
+	const int8_t* kpCoordinateIdxY			= &g_kiCoordinateIdx4x4Y[0];
+	int32_t iBestPredBufferNum			= 0;
+	int32_t iCosti4x4						= 0;
+#if defined(X86_ASM)
+	WelsPrefetchZero_mmx(g_kiMapModeI4x4);
+	WelsPrefetchZero_mmx((int8_t *)&pFunc->pfGetLumaI4x4Pred);
+#endif//X86_ASM
+
+	for (i = 0; i < 16; i++) 
+	{
+		const int32_t kiOffset	= kpNeighborIntraToI4x4[i];
+//		const int32_t i_next	= (1+i) & 15;												// next loop
+//		const uint8_t dummy_byte= pIntra4x4AvailCount[pNeighborIntraToI4x4[i_next]];	// prefetch pIntra4x4AvailCount of next loop to avoid cache missed
+		
+		//step 1: locating current 4x4 block position in pEnc and pDecMb
+		iCoordinateX = kpCoordinateIdxX[i];
+		iCoordinateY = kpCoordinateIdxY[i];		
+
+		iIdxStrideEnc = (iCoordinateY * kiLineSizeEnc) + iCoordinateX;
+		pCurEnc = pEncMb + iIdxStrideEnc;
+		iIdxStrideDec = (iCoordinateY * kiLineSizeDec) + iCoordinateX;
+		pCurDec = pDecMb + iIdxStrideDec;
+
+		//step 2: get predicted mode from neighbor
+		iPredMode = PredIntra4x4Mode(pMbCache->iIntraPredMode, kpCache48CountScan4[i]);
+		//step 3: collect candidates of iPredMode		
+		iAvailCount = kpIntra4x4AvailCount[kiOffset];
+		kpAvailMode = g_kiIntra4AvailMode[kiOffset];
+
+		if (iAvailCount == 9 || iAvailCount == 7)
+		{
+			//I4_PRED_DC(2)
+
+			iBestMode = I4_PRED_DC;
+
+			pDst = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
+
+			pFunc->pfGetLumaI4x4Pred[I4_PRED_DC](pDst, pCurDec, kiLineSizeDec);
+			iBestCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
+				lambda[iPredMode == g_kiMapModeI4x4[iBestMode]];	
+
+			//I4_PRED_H(1)			
+			iCurMode = I4_PRED_H; 
+
+			pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
+
+			pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
+			iCostH = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
+				lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+			if (iCurCost < iBestCost) 
+			{ 
+				iBestMode = iCurMode; 
+				iBestCost = iCurCost; 
+				iBestPredBufferNum = 1 - iBestPredBufferNum;
+			}
+
+			//I4_PRED_V(0)
+			iCurMode = I4_PRED_V; 
+
+			pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
+
+			pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
+			iCostV = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
+				lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+			if (iCurCost < iBestCost) 
+			{ 
+				iBestMode = iCurMode; 
+				iBestCost = iCurCost; 
+				iBestPredBufferNum = 1 - iBestPredBufferNum;
+			}
+			if ( iCostV < iCostH )
+			{
+				if (iAvailCount == 9)
+				{
+					iBestModeFake = true; //indicating whether V is the best fake mode
+
+					//I4_PRED_VR(5) and I4_PRED_VL(7)
+					iCurMode = I4_PRED_VR; 
+
+					pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
+
+					pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
+					iCostVR = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
+						lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+					if (iCurCost < iBestCost) 
+					{ 
+						iBestMode = iCurMode; 
+						iBestCost = iCurCost; 
+						iBestPredBufferNum = 1 - iBestPredBufferNum;
+					}
+
+					if (iCurCost < iCostV) 
+						iBestModeFake = false;
+
+					iCurMode = I4_PRED_VL; 
+
+					pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
+
+					pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
+					iCostVL = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
+						lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+					if (iCurCost < iBestCost) 
+					{ 
+						iBestMode = iCurMode; 
+						iBestCost = iCurCost; 
+						iBestPredBufferNum = 1 - iBestPredBufferNum;
+					}
+
+					if (iCurCost < iCostV) 
+						iBestModeFake = false;	
+
+					//Vertical Early Determination
+					if ( !iBestModeFake ) //Vertical is not the best, go on checking...
+					{
+						//select the best one from VL and VR
+						if (iCostVR < iCostVL)
+						{
+							//I4_PRED_DDR(4)
+							iCurMode = I4_PRED_DDR; 
+
+							pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
+
+							pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
+
+							iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
+								lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+							if (iCurCost < iBestCost) 
+							{ 
+								iBestMode = iCurMode; 
+								iBestCost = iCurCost; 
+								iBestPredBufferNum = 1 - iBestPredBufferNum;
+							}
+						}
+						else
+						{
+							//I4_PRED_DDL(3)
+							iCurMode = I4_PRED_DDL;
+
+							pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
+
+							pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
+
+							iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
+								lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+							if (iCurCost < iBestCost) 
+							{ 
+								iBestMode = iCurMode; 
+								iBestCost = iCurCost; 
+								iBestPredBufferNum = 1 - iBestPredBufferNum;
+							}
+						}
+					}
+				}
+				else if(iAvailCount == 7)
+				{
+					iCurMode = I4_PRED_DDR; 
+
+					pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
+
+					pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 				
+					iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
+						lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+					if (iCurCost < iBestCost) 
+					{ 
+						iBestMode = iCurMode; 
+						iBestCost = iCurCost; 
+						iBestPredBufferNum = 1 - iBestPredBufferNum;
+					}			
+
+					iCurMode = I4_PRED_VR; 
+
+					pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
+
+					pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
+
+					iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
+						lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+					if (iCurCost < iBestCost) 
+					{ 
+						iBestMode = iCurMode; 
+						iBestCost = iCurCost; 
+						iBestPredBufferNum = 1 - iBestPredBufferNum;
+					}			
+				}
+			}
+			else
+			{
+				iBestModeFake = true; //indicating whether H is the best fake mode
+				//I4_PRED_HD(6) and I4_PRED_HU(8)
+				iCurMode = I4_PRED_HD; 
+
+				pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
+
+				pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
+				iCostHD = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
+					lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+				if (iCurCost < iBestCost) 
+				{ 
+					iBestMode = iCurMode; 
+					iBestCost = iCurCost; 
+					iBestPredBufferNum = 1 - iBestPredBufferNum;
+				}			
+
+				if (iCurCost < iCostH) 
+					iBestModeFake = false;
+
+				iCurMode = I4_PRED_HU; 
+
+				pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
+
+				pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
+				iCostHU = iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
+					lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+				if (iCurCost < iBestCost) 
+				{ 
+					iBestMode = iCurMode; 
+					iBestCost = iCurCost; 
+					iBestPredBufferNum = 1 - iBestPredBufferNum;
+				}			
+
+				if (iCurCost < iCostH) 
+					iBestModeFake = false;	
+
+				if ( !iBestModeFake ) //Horizontal is not the best, go on checking...
+				{
+					//HD cheaper than HU: additionally try DDR; otherwise try DDL (only when iAvailCount == 9)
+					if (iCostHD < iCostHU)
+					{
+						//I4_PRED_DDR(4)
+						iCurMode = I4_PRED_DDR; 
+
+						pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
+
+						pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
+						iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
+							lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+						if (iCurCost < iBestCost) 
+						{ 
+							iBestMode = iCurMode; 
+							iBestCost = iCurCost; 
+							iBestPredBufferNum = 1 - iBestPredBufferNum;
+						}			
+					}
+					else if(iAvailCount == 9)
+					{
+						//I4_PRED_DDL(3)
+						iCurMode = I4_PRED_DDL; 
+
+						pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
+						pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
+
+						iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
+							lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+						if (iCurCost < iBestCost) 
+						{ 
+							iBestMode = iCurMode; 
+							iBestCost = iCurCost; 
+							iBestPredBufferNum = 1 - iBestPredBufferNum;
+						}
+
+					}
+				}
+			}
+		}
+		else
+		{
+			iBestCost = INT_MAX;
+       		iBestMode = I4_PRED_INVALID;
+			for (j = 0; j < iAvailCount; j++)
+			{
+				// I4x4_MODE_CHECK(pAvailMode[j], iCurCost);
+				iCurMode = kpAvailMode[j]; 
+
+				pDst = &pMbCache->pMemPredBlk4[(1 - iBestPredBufferNum) << 4];
+
+				pFunc->pfGetLumaI4x4Pred[iCurMode](pDst, pCurDec, kiLineSizeDec); 
+				iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_4x4](pDst, 4, pCurEnc, kiLineSizeEnc) +
+					lambda[iPredMode == g_kiMapModeI4x4[iCurMode]];
+
+				if (iCurCost < iBestCost) 
+				{ 
+					iBestMode = iCurMode; 
+					iBestCost = iCurCost; 
+					iBestPredBufferNum = 1 - iBestPredBufferNum;
+				}			
+			}
+		}
+		pMbCache->pBestPredI4x4Blk4 = &pMbCache->pMemPredBlk4[iBestPredBufferNum << 4];
+		iCosti4x4 += iBestCost;
+		if (iCosti4x4 >= iBestCostLuma)
+		{
+			break;
+		}
+
+		//step 5: update pred mode and sample avail cache
+		iFinalMode = g_kiMapModeI4x4[iBestMode];
+		if (iPredMode == iFinalMode)
+		{
+			*pPrevIntra4x4PredModeFlag++ = true;
+		}
+		else
+		{
+			*pPrevIntra4x4PredModeFlag++ = false;
+			*pRemIntra4x4PredModeFlag  = (iFinalMode < iPredMode ? iFinalMode: (iFinalMode-1));						
+		}
+		pRemIntra4x4PredModeFlag++;
+		//	pCurMb->pIntra4x4PredMode[scan4[i]] = iFinalMode;
+	    pMbCache->iIntraPredMode[kpCache48CountScan4[i]] = iFinalMode;
+		//step 6: encoding I_4x4 
+		WelsEncRecI4x4Y(pEncCtx, pCurMb, pMbCache, i);
+	}
+	ST32(pCurMb->pIntra4x4PredMode, LD32(&pMbCache->iIntraPredMode[33]));
+	pCurMb->pIntra4x4PredMode[4] = pMbCache->iIntraPredMode[12];
+	pCurMb->pIntra4x4PredMode[5] =	pMbCache->iIntraPredMode[20];
+	pCurMb->pIntra4x4PredMode[6] = pMbCache->iIntraPredMode[28];
+	iCosti4x4 += (iLambda << 4) + (iLambda << 3); //4*6*lambda from JVT SATD0	
+	return iCosti4x4;
+}
+
+// Picks the intra chroma prediction mode with the lowest cost for the current macroblock.
+//
+// The candidate modes come from g_kiIntraChromaAvailMode[uiNeighborIntra & 0x07]; entry [4] of
+// that row is the number of candidates. For each candidate the Cb prediction is written to
+// pDstChma and the Cr prediction to pDstChma+64, and both are compared against the source
+// samples with pfMdCost[BLOCK_8x8].
+//
+// pFunc        - function pointer table (prediction, cost and optional combined routines)
+// pCurDqLayer  - supplies the chroma strides (iEncStride[1], iCsStride[1])
+// pMbCache     - supplies source/neighbour sample pointers and the prediction scratch memory;
+//                receives pBestPredIntraChroma and uiChmaI8x8Mode
+// iLambda      - weight applied to the mode signalling cost
+//
+// Returns the cost of the selected mode.
+int32_t WelsMdIntraChroma(SWelsFuncPtrList *pFunc, SDqLayer *pCurDqLayer, SMbCache *pMbCache, int32_t iLambda)
+{
+	const int8_t *kpAvailMode;
+	int32_t iAvailCount				= 0;
+	int32_t iChmaIdx = 0;
+	// two prediction buffers, 128 bytes apart, used alternately (each holds Cb at +0 and Cr at +64)
+	uint8_t *pPredIntraChma[2]	= {pMbCache->pMemPredChroma, pMbCache->pMemPredChroma + 128};
+	uint8_t *pDstChma				= pPredIntraChma[0];
+	uint8_t *pEncCb				= pMbCache->SPicData.pEncMb[1];
+	uint8_t *pEncCr				= pMbCache->SPicData.pEncMb[2];
+	uint8_t *pDecCb				= pMbCache->SPicData.pCsMb[1];//pMbCache->SPicData.pDecMb[1];
+	uint8_t *pDecCr				= pMbCache->SPicData.pCsMb[2];//pMbCache->SPicData.pDecMb[2];
+	const int32_t kiLineSizeEnc		= pCurDqLayer->iEncStride[1];
+	const int32_t kiLineSizeDec		= pCurDqLayer->iCsStride[1];//pMbCache->SPicData.i_stride_dec[1];
+
+	int32_t i, iCurMode, iCurCost, iBestMode, iBestCost = INT_MAX;
+
+	// low three bits of uiNeighborIntra select the row of the available-mode table
+	int32_t iOffset = pMbCache->uiNeighborIntra&0x07;
+	iAvailCount = g_kiIntraChromaAvailMode[iOffset][4];
+	kpAvailMode = g_kiIntraChromaAvailMode[iOffset];
+	// Fast path: more than three candidates and a combined routine is installed.
+	if (iAvailCount > 3 && pFunc->sSampleDealingFuncs.pfIntra8x8Combined3 )
+	{
+		// the combined routine returns a cost and reports its chosen mode through &iBestMode
+		// NOTE(review): presumably it evaluates the first three candidates at once - confirm
+		iBestCost = pFunc->sSampleDealingFuncs.pfIntra8x8Combined3(pDecCb,kiLineSizeDec,pEncCb,kiLineSizeEnc,&iBestMode,
+			iLambda, pDstChma,pDecCr,pEncCr);
+		// evaluate the fourth candidate separately
+		iCurMode = kpAvailMode[3];
+		pFunc->pfGetChromaPred[iCurMode](pDstChma, pDecCb, kiLineSizeDec);//Cb
+		pFunc->pfGetChromaPred[iCurMode](pDstChma+64, pDecCr, kiLineSizeDec);//Cr
+
+		iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8](pDstChma, 8, pEncCb, kiLineSizeEnc) +
+			pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8](pDstChma+64, 8, pEncCr, kiLineSizeEnc) +
+			iLambda * 4;
+		if (iCurCost < iBestCost)
+		{
+			iBestMode = iCurMode;
+			iBestCost = iCurCost;
+		}
+		else
+		{
+			// the fourth candidate lost: regenerate the winning prediction, since pDstChma
+			// was just overwritten by the fourth candidate's samples
+			pFunc->pfGetChromaPred[iBestMode](pDstChma, pDecCb, kiLineSizeDec);//Cb
+			pFunc->pfGetChromaPred[iBestMode](pDstChma+64, pDecCr, kiLineSizeDec);//Cr
+		}
+		iBestCost += iLambda;
+		// best prediction lives in pPredIntraChma[0]; index 1 makes the "^ 0x01" below select it
+		iChmaIdx = 1;
+	}
+	else{
+		// General path: test every candidate, alternating between the two buffers so the
+		// best prediction so far is never overwritten.
+		iBestMode = kpAvailMode[0];
+		for ( i = 0; i < iAvailCount; ++ i )
+		{
+			iCurMode = kpAvailMode[i];
+
+			assert( iCurMode >= 0 && iCurMode < 7 );
+
+			//		pDstCb	= &pMbCache->mem_pred_intra_cb[iCurMode<<6];
+			//		pDstCr	= &pMbCache->mem_pred_intra_cr[iCurMode<<6];
+			pFunc->pfGetChromaPred[iCurMode](pDstChma, pDecCb, kiLineSizeDec);//Cb
+			iCurCost = pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8](pDstChma, 8, pEncCb, kiLineSizeEnc);
+
+			pFunc->pfGetChromaPred[iCurMode](pDstChma+64, pDecCr, kiLineSizeDec);//Cr
+			iCurCost += pFunc->sSampleDealingFuncs.pfMdCost[BLOCK_8x8](pDstChma+64, 8, pEncCr, kiLineSizeEnc) +
+				iLambda * BsSizeUE( g_kiMapModeIntraChroma[iCurMode] );
+			if (iCurCost < iBestCost)
+			{
+				iBestMode = iCurMode;
+				iBestCost = iCurCost;
+				// keep the buffer just written; the next candidate goes to the other one
+				iChmaIdx= iChmaIdx ^ 0x01;
+				pDstChma	= pPredIntraChma[iChmaIdx];
+			}
+		}
+	}
+
+	// iChmaIdx names the buffer that would be written next, so the best one is its complement
+	pMbCache->pBestPredIntraChroma	= pPredIntraChma[iChmaIdx ^ 0x01];
+	pMbCache->uiChmaI8x8Mode = iBestMode;
+	return iBestCost;
+}
+int32_t WelsMdIntraFinePartition(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache)
+{
+	// Runs the Intra4x4 mode decision and switches the macroblock to MB_TYPE_INTRA4x4 when
+	// that is cheaper than the luma cost recorded so far. Returns the resulting luma cost.
+	SWelsMD *pModeDec = (SWelsMD*)pMd;
+	sWelsEncCtx *pCtx = (sWelsEncCtx*)pEnc;
+
+	const int32_t kiCostI4x4 = WelsMdI4x4( pCtx, pModeDec, pCurMb, pMbCache);
+
+	if (kiCostI4x4 >= pModeDec->iCostLuma)
+		return pModeDec->iCostLuma;	// Intra4x4 did not win: keep the previous decision
+
+	pCurMb->uiMbType = MB_TYPE_INTRA4x4;
+	pModeDec->iCostLuma = kiCostI4x4;
+	return pModeDec->iCostLuma;
+}
+
+int32_t WelsMdIntraFinePartitionVaa(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache)
+{
+	// Same as the plain fine-partition check, except that the Intra4x4 search (fast variant)
+	// only runs when MdIntraAnalysisVaaInfo() reports a non-zero result for the luma source
+	// block. Returns the luma cost after the (possible) update.
+	sWelsEncCtx *pCtx = (sWelsEncCtx*)pEnc;
+	SWelsMD *pModeDec = (SWelsMD*)pMd;
+
+	if ( !MdIntraAnalysisVaaInfo( pCtx, pMbCache->SPicData.pEncMb[0] ) )
+		return pModeDec->iCostLuma;
+
+	const int32_t kiCostI4x4 = WelsMdI4x4Fast( pCtx, pModeDec, pCurMb, pMbCache );
+
+	if (kiCostI4x4 < pModeDec->iCostLuma)
+	{
+		pModeDec->iCostLuma = kiCostI4x4;
+		pCurMb->uiMbType = MB_TYPE_INTRA4x4;
+	}
+
+	return pModeDec->iCostLuma;
+}
+
+void WelsMdIntraMb(sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache *pMbCache)
+{
+	// Intra mode decision for one macroblock: start from Intra16x16 as the baseline,
+	// then let the secondary-mode stage try to improve on it.
+	const int32_t kiCostI16x16 = WelsMdI16x16(pEncCtx->pFuncList, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda);
+
+	pWelsMd->iCostLuma = kiCostI16x16;
+	pCurMb->uiMbType = MB_TYPE_INTRA16x16;
+
+	WelsMdIntraSecondaryModesEnc( pEncCtx, pWelsMd, pCurMb, pMbCache );
+}
+
+int32_t WelsMdP16x16(SWelsFuncPtrList *pFunc, SDqLayer* pCurLayer, SWelsMD* pWelsMd, SSlice *pSlice, SMB* pCurMb)
+{
+	// Inter 16x16 motion estimation for the current macroblock.
+	// Fills the slice's candidate list (sMvc) with the base MV, the left/top neighbours' 16x16
+	// MVs and, when the reference picture is a P picture, scaled MVs taken from its MV list;
+	// then runs the motion search. The resulting MV is stored in the MB and in the decoded
+	// picture's MV list. Returns the SATD cost reported by the search.
+	SMbCache *pCache = &pSlice->sMbCacheInfo;
+	SWelsME *pMe = &pWelsMd->sMe.sMe16x16;
+	const uint32_t kuiAvail = pCurMb->uiNeighborAvail;
+	const int32_t kiWidthInMb = pCurLayer->iMbWidth;
+	const int32_t kiHeightInMb = pCurLayer->iMbHeight;
+
+	// search unit setup
+	pMe->uiPixel = BLOCK_16x16;
+	pMe->pMvdCost = pWelsMd->pMvdCost;
+	pMe->pEncMb = pCache->SPicData.pEncMb[0];
+	pMe->pRefMb = pCache->SPicData.pRefMb[0];
+	pMe->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb;
+
+	// candidate 0 is always the base MV
+	pSlice->sMvc[0] = pMe->sMvBase;
+	pSlice->uiMvcNum = 1;
+
+	// spatial candidates
+	if (kuiAvail & LEFT_MB_POS)
+	{
+		pSlice->sMvc[pSlice->uiMvcNum] = (pCurMb-1)->sP16x16Mv;
+		++ pSlice->uiMvcNum;
+	}
+	if (kuiAvail & TOP_MB_POS)
+	{
+		pSlice->sMvc[pSlice->uiMvcNum] = (pCurMb-kiWidthInMb)->sP16x16Mv;
+		++ pSlice->uiMvcNum;
+	}
+
+	// temporal candidates: right and bottom neighbours in the reference picture's MV list
+	if (pCurLayer->pRefPic->iPictureType == P_SLICE)
+	{
+		if (pCurMb->iMbX < kiWidthInMb - 1)
+		{
+			const SMVUnitXY sRightMv = pCurLayer->pRefPic->sMvList[pCurMb->iMbXY+1];
+			pSlice->sMvc[pSlice->uiMvcNum].iMvX = sRightMv.iMvX >> pSlice->sScaleShift;
+			pSlice->sMvc[pSlice->uiMvcNum].iMvY = sRightMv.iMvY >> pSlice->sScaleShift;
+			++ pSlice->uiMvcNum;
+		}
+		if (pCurMb->iMbY < kiHeightInMb - 1)
+		{
+			const SMVUnitXY sBelowMv = pCurLayer->pRefPic->sMvList[pCurMb->iMbXY+kiWidthInMb];
+			pSlice->sMvc[pSlice->uiMvcNum].iMvX = sBelowMv.iMvX >> pSlice->sScaleShift;
+			pSlice->sMvc[pSlice->uiMvcNum].iMvY = sBelowMv.iMvY >> pSlice->sScaleShift;
+			++ pSlice->uiMvcNum;
+		}
+	}
+
+	PredMv(&pCache->sMvComponents, 0, 4, 0, &(pMe->sMvp));
+	pFunc->pfMotionSearch(pFunc, pCurLayer, pMe, pSlice);
+
+	// publish the result
+	pCurMb->sP16x16Mv = pMe->sMv;
+	pCurLayer->pDecPic->sMvList[pCurMb->iMbXY] = pMe->sMv;
+
+	return pMe->uiSatdCost;
+}
+int32_t WelsMdP16x8(SWelsFuncPtrList *pFunc, SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice *pSlice)
+{
+	// Motion search for the two 16x8 partitions (upper, then lower) of the current macroblock.
+	// Each partition's MV is written back to the MB cache before the next one is predicted.
+	// Returns the sum of the two SATD costs.
+	SMbCache *pCache = &pSlice->sMbCacheInfo;
+	const int32_t kiEncStride = pCurDqLayer->iEncStride[0];
+	const int32_t kiRefStride = pCurDqLayer->pRefPic->iLineSize[0];
+	int32_t iCostSum = 0;
+
+	for (int32_t iPart = 0; iPart < 2; ++iPart)
+	{
+		SWelsME *pMe = &pWelsMd->sMe.sMe16x8[iPart];
+
+		pMe->uiPixel = BLOCK_16x8;
+		pMe->pMvdCost = pWelsMd->pMvdCost;
+
+		// the lower partition starts 8 rows below the upper one
+		pMe->pEncMb = pCache->SPicData.pEncMb[0] + ((iPart << 3) * kiEncStride);
+		pMe->pRefMb = pCache->SPicData.pRefMb[0] + ((iPart << 3) * kiRefStride);
+		pMe->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 1;
+
+		// only the base MV is offered as a search candidate
+		pSlice->sMvc[0] = pMe->sMvBase;
+		pSlice->uiMvcNum = 1;
+
+		PredInter16x8Mv(pCache, iPart<<3, 0, &(pMe->sMvp));
+		pFunc->pfMotionSearch(pFunc, pCurDqLayer, pMe, pSlice);
+		UpdateP16x8Motion2Cache(pCache, iPart<<3, pWelsMd->uiRef, &(pMe->sMv));
+
+		iCostSum += pMe->uiSatdCost;
+	}
+	return iCostSum;
+}
+int32_t WelsMdP8x16(SWelsFuncPtrList *pFunc, SDqLayer* pCurLayer, SWelsMD* pWelsMd, SSlice *pSlice)
+{
+	// Motion search for the two 8x16 partitions (left, then right) of the current macroblock.
+	// Each partition's MV is written back to the MB cache before the next one is predicted.
+	// Returns the sum of the two SATD costs.
+	SMbCache *pCache = &pSlice->sMbCacheInfo;
+	int32_t iCostSum = 0;
+
+	for (int32_t iPart = 0; iPart < 2; ++iPart)
+	{
+		SWelsME *pMe = &pWelsMd->sMe.sMe8x16[iPart];
+
+		pMe->uiPixel = BLOCK_8x16;
+		pMe->pMvdCost = pWelsMd->pMvdCost;
+
+		// the right partition starts 8 samples to the right of the left one
+		pMe->pEncMb = pCache->SPicData.pEncMb[0] + (iPart << 3);
+		pMe->pRefMb = pCache->SPicData.pRefMb[0] + (iPart << 3);
+		pMe->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 1;
+
+		// only the base MV is offered as a search candidate
+		pSlice->sMvc[0] = pMe->sMvBase;
+		pSlice->uiMvcNum = 1;
+
+		PredInter8x16Mv(pCache, iPart<<2, 0, &(pMe->sMvp));
+		pFunc->pfMotionSearch(pFunc, pCurLayer, pMe, pSlice);
+		UpdateP8x16Motion2Cache(pCache, iPart<<2, pWelsMd->uiRef, &(pMe->sMv));
+
+		iCostSum += pMe->uiSatdCost;
+	}
+	return iCostSum;
+}
+int32_t WelsMdP8x8(SWelsFuncPtrList *pFunc,SDqLayer* pCurDqLayer, SWelsMD* pWelsMd, SSlice *pSlice)
+{
+	// Motion search for the four 8x8 partitions of the current macroblock, visited in the
+	// order top-left, top-right, bottom-left, bottom-right. Each partition's MV is written
+	// back to the MB cache before the next one is predicted.
+	// Returns the sum of the four SATD costs.
+	SMbCache *pCache = &pSlice->sMbCacheInfo;
+	const int32_t kiEncStride = pCurDqLayer->iEncStride[0];
+	const int32_t kiRefStride = pCurDqLayer->pRefPic->iLineSize[0];
+	int32_t iCostSum = 0;
+
+	for (int32_t iBlk = 0; iBlk < 4; ++iBlk)
+	{
+		const int32_t kiCol = iBlk & 1;		// 0: left column, 1: right column
+		const int32_t kiRow = iBlk >> 1;	// 0: upper row,   1: lower row
+		const int32_t kiEncOffset = (kiCol << 3) + ((kiRow << 3) * kiEncStride);
+		const int32_t kiRefOffset = (kiCol << 3) + ((kiRow << 3) * kiRefStride);
+		SWelsME *pMe = &pWelsMd->sMe.sMe8x8[iBlk];
+
+		pMe->uiPixel = BLOCK_8x8;
+		pMe->pMvdCost = pWelsMd->pMvdCost;
+
+		pMe->pEncMb = pCache->SPicData.pEncMb[0] + kiEncOffset;
+		pMe->pRefMb = pCache->SPicData.pRefMb[0] + kiRefOffset;
+		pMe->uSadPredISatd.uiSadPred = pWelsMd->iSadPredMb >> 2;
+
+		// only the base MV is offered as a search candidate
+		pSlice->sMvc[0] = pMe->sMvBase;
+		pSlice->uiMvcNum = 1;
+
+		PredMv(&pCache->sMvComponents, iBlk<<2, 2, pWelsMd->uiRef, &(pMe->sMvp));
+		pFunc->pfMotionSearch(pFunc, pCurDqLayer, pMe, pSlice);
+		UpdateP8x8Motion2Cache(pCache, iBlk<<2, pWelsMd->uiRef, &(pMe->sMv));
+
+		iCostSum += pMe->uiSatdCost;
+	}
+	return iCostSum;
+}
+
+void WelsMdInterFinePartition(void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb, int32_t iBestCost)
+{
+	// Tries the sub-16x16 inter partitions. 8x8 is searched first; only if it beats
+	// iBestCost are 16x8 and 8x16 searched as well, each replacing the current choice when
+	// its cost is less than or equal to the best found so far. Only pCurMb->uiMbType is
+	// updated here; the winning cost is not written back to the mode-decision state.
+	sWelsEncCtx *pCtx = (sWelsEncCtx*)pEnc;
+	SWelsMD *pModeDec = (SWelsMD*)pMd;
+	SDqLayer *pLayer = pCtx->pCurDqLayer;
+
+	int32_t iCostBest = WelsMdP8x8(pCtx->pFuncList, pLayer, pModeDec, pSlice);
+
+	if (iCostBest >= iBestCost)
+		return;		// 8x8 does not improve on the incoming cost: leave the MB type untouched
+
+	pCurMb->uiMbType = MB_TYPE_8x8;
+
+	const int32_t kiCost16x8 = WelsMdP16x8(pCtx->pFuncList, pLayer, pModeDec, pSlice);
+	if (kiCost16x8 <= iCostBest)
+	{
+		pCurMb->uiMbType = MB_TYPE_16x8;
+		iCostBest = kiCost16x8;
+	}
+
+	const int32_t kiCost8x16 = WelsMdP8x16(pCtx->pFuncList, pLayer, pModeDec, pSlice);
+	if (kiCost8x16 <= iCostBest)
+	{
+		pCurMb->uiMbType = MB_TYPE_8x16;
+		iCostBest = kiCost8x16;
+	}
+}
+
+void WelsMdInterFinePartitionVaa( void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb, int32_t iBestCost )
+{
+	// Sub-16x16 inter partition decision steered by the "MB sign" that
+	// pfGetMbSignFromInterVaa derives from the macroblock's 8x8 SAD entries:
+	//   15        -> nothing is tried, function returns without touching any state
+	//   3, 12     -> only 16x8 is tried
+	//   5, 10     -> only 8x16 is tried
+	//   6, 9      -> only 8x8 is tried
+	//   otherwise -> 8x8 is tried; if it wins, 16x8 and 8x16 are tried as well
+	// The best cost found (or the incoming iBestCost) is stored in pWelsMd->iCostLuma.
+	sWelsEncCtx *pCtx = (sWelsEncCtx*)pEnc;
+	SWelsMD *pModeDec = (SWelsMD*)pMd;
+	SDqLayer *pLayer = pCtx->pCurDqLayer;
+	const uint8_t kuiMbSign = pCtx->pFuncList->pfGetMbSignFromInterVaa( &pCtx->pVaa->sVaaCalcInfo.pSad8x8[pCurMb->iMbXY][0] );
+	int32_t iCost;
+
+	if ( kuiMbSign == 15 )
+		return;
+
+	if ( kuiMbSign == 3 || kuiMbSign == 12 )
+	{
+		iCost = WelsMdP16x8(pCtx->pFuncList, pLayer, pModeDec, pSlice );
+		if ( iCost < iBestCost )
+		{
+			pCurMb->uiMbType = MB_TYPE_16x8;
+			iBestCost = iCost;
+		}
+	}
+	else if ( kuiMbSign == 5 || kuiMbSign == 10 )
+	{
+		iCost = WelsMdP8x16(pCtx->pFuncList, pLayer, pModeDec, pSlice );
+		if ( iCost < iBestCost )
+		{
+			pCurMb->uiMbType = MB_TYPE_8x16;
+			iBestCost = iCost;
+		}
+	}
+	else
+	{
+		// signs 6 and 9 stop after 8x8; every other sign goes on to 16x8 and 8x16
+		const bool_t kbOnly8x8 = ( kuiMbSign == 6 || kuiMbSign == 9 );
+
+		iCost = WelsMdP8x8(pCtx->pFuncList, pLayer, pModeDec, pSlice );
+		if ( iCost < iBestCost )
+		{
+			pCurMb->uiMbType = MB_TYPE_8x8;
+			iBestCost = iCost;
+
+			if ( !kbOnly8x8 )
+			{
+				iCost = WelsMdP16x8(pCtx->pFuncList, pLayer, pModeDec, pSlice);
+				if ( iCost <= iBestCost )
+				{
+					pCurMb->uiMbType = MB_TYPE_16x8;
+					iBestCost = iCost;
+				}
+
+				iCost = WelsMdP8x16(pCtx->pFuncList, pLayer, pModeDec, pSlice);
+				if ( iCost <= iBestCost )
+				{
+					pCurMb->uiMbType = MB_TYPE_8x16;
+					iBestCost = iCost;
+				}
+			}
+		}
+	}
+
+	pModeDec->iCostLuma = iBestCost;
+}
+
+
+inline void VaaBackgroundMbDataUpdate(  SWelsFuncPtrList *pFunc, SVAAFrameInfo *pVaaInfo, SMB* pCurMb )
+{
+	// Copies one macroblock worth of samples (16x16 luma, 8x8 for each chroma plane) between
+	// the VAA "cur" and "ref" pictures at the position of pCurMb, using the aligned copy
+	// routines from the function table.
+	const int32_t kiStrideLuma = pVaaInfo->iPicStride;
+	const int32_t kiStrideChroma = pVaaInfo->iPicStrideUV;
+
+	// sample offset of the macroblock inside each plane
+	const int32_t kiLumaOffset = (pCurMb->iMbY*kiStrideLuma+pCurMb->iMbX)<<4;
+	const int32_t kiChromaOffset = (pCurMb->iMbY*kiStrideChroma+pCurMb->iMbX)<<3;
+
+	pFunc->pfCopy16x16Aligned(pVaaInfo->pCurY+kiLumaOffset, kiStrideLuma,
+		pVaaInfo->pRefY+kiLumaOffset, kiStrideLuma);
+	pFunc->pfCopy8x8Aligned(pVaaInfo->pCurU+kiChromaOffset, kiStrideChroma,
+		pVaaInfo->pRefU+kiChromaOffset, kiStrideChroma);
+	pFunc->pfCopy8x8Aligned(pVaaInfo->pCurV+kiChromaOffset, kiStrideChroma,
+		pVaaInfo->pRefV+kiChromaOffset, kiStrideChroma);
+}
+
+// Encodes the current macroblock as a "background" block, i.e. predicted from the co-located
+// reference block with a zero motion vector.
+//
+// bSkipMbFlag == true : the MB becomes MB_TYPE_BACKGROUND; the prediction is built in
+//                       pMbCache->pSkipMb, the MB is reconstructed through WelsRecPskip and the
+//                       VAA picture data for this MB is updated. No residual is coded.
+// bSkipMbFlag == false: the MB becomes MB_TYPE_16x16 with a zero MV; the prediction is built in
+//                       pMemPredLuma/pMemPredChroma, the residual is coded through
+//                       WelsInterMbEncode/WelsPMbChromaEncode and the prediction buffers are
+//                       copied to the pCsMb planes.
+//
+// pEnc / pMd are opaque handles cast to sWelsEncCtx* / SWelsMD*.
+void WelsMdBackgroundMbEnc(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache, SSlice *pSlice, bool_t bSkipMbFlag) 
+{
+	sWelsEncCtx * pEncCtx	= (sWelsEncCtx *)pEnc;
+	SDqLayer* pCurDqLayer	= pEncCtx->pCurDqLayer;
+	SWelsMD * pWelsMd		= (SWelsMD *)pMd;
+	SWelsFuncPtrList *pFunc	= pEncCtx->pFuncList;
+	SMVUnitXY sMvp				= { 0 };
+	uint8_t* pRefLuma			= pMbCache->SPicData.pRefMb[0];
+	uint8_t* pRefCb				= pMbCache->SPicData.pRefMb[1];
+	uint8_t* pRefCr				= pMbCache->SPicData.pRefMb[2];
+	int32_t iLineSizeY			= pCurDqLayer->pRefPic->iLineSize[0];
+	int32_t iLineSizeUV			= pCurDqLayer->pRefPic->iLineSize[1];
+	// default destination: the skip buffer (luma at +0, Cb at +256, Cr at +320)
+	uint8_t* pDstLuma			= pMbCache->pSkipMb;
+	uint8_t* pDstCb				= pMbCache->pSkipMb+256;
+	uint8_t* pDstCr				= pMbCache->pSkipMb+256+64;
+
+	// a coded (non-skip) MB predicts into the regular prediction buffers instead
+	if (!bSkipMbFlag)
+	{
+		pDstLuma	= pMbCache->pMemPredLuma;
+		pDstCb	= pMbCache->pMemPredChroma;
+		pDstCr	= pMbCache->pMemPredChroma+64;
+	}
+	//MC
+	// luma MC entry 0 is used together with the zero MV; NOTE(review): presumably the
+	// integer-pel (plain copy) variant - confirm against the MC function table
+	pFunc->sMcFuncs.pfLumaQuarpelMc[0](pRefLuma, iLineSizeY, pDstLuma, 16,16);
+	pFunc->sMcFuncs.pfChromaMc(pRefCb, iLineSizeUV, pDstCb, 8, sMvp, 8, 8);//Cb
+	pFunc->sMcFuncs.pfChromaMc(pRefCr, iLineSizeUV, pDstCr, 8, sMvp, 8, 8);//Cr
+
+	pCurMb->uiCbp = 0;
+	pMbCache->bCollocatedPredFlag = true;
+	pWelsMd->iCostLuma = 0;//BGD&RC integration
+	// SAD between the source luma and the co-located reference luma
+	pCurMb->pSadCost[0] = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16](pMbCache->SPicData.pEncMb[0], pCurDqLayer->iEncStride[0], pRefLuma, iLineSizeY );	
+	// zero the MB's 16x16 MV and its entry in the decoded picture's MV list with a 32-bit store
+	ST32(&pCurMb->sP16x16Mv, 0);
+	ST32(&pCurDqLayer->pDecPic->sMvList[pCurMb->iMbXY], 0);
+
+	if (bSkipMbFlag)
+	{
+		pCurMb->uiMbType = MB_TYPE_BACKGROUND;
+
+		//update motion info to current MB
+		ST32(pCurMb->pRefIndex, 0);
+		pFunc->pfUpdateMbMv(pCurMb->sMv, sMvp);		
+
+		// no residual is coded, so the MB inherits the QP of the previous MB in the slice
+		pCurMb->uiLumaQp   = pSlice->uiLastMbQp;
+		pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + pCurDqLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset)];
+
+		WelsRecPskip(pCurDqLayer, pEncCtx->pFuncList, pCurMb, pMbCache);
+		VaaBackgroundMbDataUpdate(pEncCtx->pFuncList, pEncCtx->pVaa, pCurMb);
+		return;
+	}
+
+	// coded path: inter 16x16 with a zero motion vector
+	pCurMb->uiMbType = MB_TYPE_16x16;
+
+	pWelsMd->sMe.sMe16x16.sMv.iMvX = 0;
+	pWelsMd->sMe.sMe16x16.sMv.iMvY = 0;
+	PredMv( &pMbCache->sMvComponents, 0, 4, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x16.sMvp );
+	pMbCache->sMbMvp[0] = pWelsMd->sMe.sMe16x16.sMvp;
+
+	UpdateP16x16MotionInfo(pMbCache, pCurMb, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x16.sMv);
+
+	// luma cost is either the SAD computed above or a freshly computed SATD
+	if(pWelsMd->bMdUsingSad)
+		pWelsMd->iCostLuma = pCurMb->pSadCost[0];
+	else
+		pWelsMd->iCostLuma = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16](pMbCache->SPicData.pEncMb[0], pCurDqLayer->iEncStride[0], pRefLuma, iLineSizeY );	
+
+	WelsInterMbEncode( pEncCtx, pSlice, pCurMb );
+	WelsPMbChromaEncode( pEncCtx, pSlice, pCurMb );
+
+	// NOTE(review): argument order suggests (dst, dstStride, src, srcStride), i.e. the
+	// prediction buffers are copied into the pCsMb planes - confirm against pfCopy* definitions
+	pFunc->pfCopy16x16Aligned( pMbCache->SPicData.pCsMb[0], pCurDqLayer->iCsStride[0], pMbCache->pMemPredLuma,     16 );
+	pFunc->pfCopy8x8Aligned(   pMbCache->SPicData.pCsMb[1], pCurDqLayer->iCsStride[1], pMbCache->pMemPredChroma,    8 );
+	pFunc->pfCopy8x8Aligned(   pMbCache->SPicData.pCsMb[2], pCurDqLayer->iCsStride[1], pMbCache->pMemPredChroma+64, 8 );
+}
+
+// Decides whether the current macroblock can be coded as P-skip.
+//
+// Steps:
+//   1. derive the skip motion vector with PredSkipMv and reject it if the integer-pel target
+//      position lies outside the range checked below;
+//   2. build the motion-compensated prediction (luma, Cb, Cr) in pMbCache->pSkipMb and compute
+//      its SAD against the source macroblock;
+//   3. accept immediately when the SAD passes one of the early thresholds;
+//   4. otherwise transform the luma / Cb / Cr residual and accept only if WelsTryPYskip and
+//      both WelsTryPUVskip calls succeed.
+//
+// On acceptance the MB's reference index, motion vectors, SAD cost, pWelsMd->iCostLuma,
+// pWelsMd->iCostSkipMb and the decoded picture's MV list entry are updated.
+//
+// pEnc / pMd are opaque handles cast to sWelsEncCtx* / SWelsMD*.
+// Returns TRUE when skip is selected, FALSE otherwise.
+BOOL_T WelsMdPSkipEnc(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache) 
+{
+	sWelsEncCtx *pEncCtx	= (sWelsEncCtx *)pEnc;
+	SDqLayer *pCurLayer				= pEncCtx->pCurDqLayer;
+	SWelsMD *pWelsMd					= (SWelsMD *)pMd;
+	SWelsFuncPtrList *pFunc		= pEncCtx->pFuncList;
+
+	uint8_t* pRefLuma = pMbCache->SPicData.pRefMb[0];
+	uint8_t* pRefCb   = pMbCache->SPicData.pRefMb[1];
+	uint8_t* pRefCr   = pMbCache->SPicData.pRefMb[2];
+	int32_t iLineSizeY  = pCurLayer->pRefPic->iLineSize[0];
+	int32_t iLineSizeUV = pCurLayer->pRefPic->iLineSize[1];
+
+	// skip prediction buffer layout: luma at +0, Cb at +256, Cr at +320
+	uint8_t* pDstLuma = pMbCache->pSkipMb;
+	uint8_t* pDstCb   = pMbCache->pSkipMb+256;
+	uint8_t* pDstCr   = pMbCache->pSkipMb+256+64;
+
+	SMVUnitXY sMvp = { 0 };
+    uint8_t uiMvpIdx;
+	int32_t n;
+
+	int32_t iEncStride		= pCurLayer->iEncStride[0];
+	uint8_t* pEncMb			= pMbCache->SPicData.pEncMb[0];
+	int32_t *pStrideEncBlockOffset= pEncCtx->pStrideTab->pStrideEncBlockOffset[pEncCtx->uiDependencyId]; 
+	int32_t *pEncBlockOffset;
+
+	int32_t iSadCostLuma = 0;
+	int32_t iSadCostChroma = 0;
+	int32_t iSadCostMb = 0;
+
+	PredSkipMv(pMbCache, &sMvp);
+	
+	// Special case, need to clip the vector //
+	// sQpelMvp holds sMvp with the two low bits dropped (used below as a whole-sample offset)
+	SMVUnitXY sQpelMvp = { sMvp.iMvX>>2, sMvp.iMvY>>2 };
+	// horizontal target position in luma samples; skip is refused outside [-29, width*16+12]
+    n = (pCurMb->iMbX<<4) + sQpelMvp.iMvX;
+    if( n < -29 )
+        return FALSE;
+    else if( n > (int32_t)((pCurLayer->iMbWidth<<4) + 12) )
+        return FALSE;
+
+	// same check vertically
+    n = (pCurMb->iMbY<<4) + sQpelMvp.iMvY;
+    if( n < -29 )
+        return FALSE;
+    else if( n > (int32_t)((pCurLayer->iMbHeight<<4) + 12) )
+        return FALSE;
+
+	//luma	
+	// move the reference pointer by the whole-sample part, pick the MC routine by the low two bits
+	pRefLuma += sQpelMvp.iMvY * iLineSizeY + sQpelMvp.iMvX;
+    uiMvpIdx = ((sMvp.iMvY&0x03)<<2)+(sMvp.iMvX&0x03);
+	pFunc->sMcFuncs.pfLumaQuarpelMc[uiMvpIdx](pRefLuma, iLineSizeY, pDstLuma, 16,16);
+	iSadCostLuma    = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16](pMbCache->SPicData.pEncMb[0], pCurLayer->iEncStride[0], pDstLuma, 16 );	
+
+	// chroma: whole-sample offset is half the luma whole-sample offset; pfChromaMc gets the full MV
+	const int32_t iStrideUV = (sQpelMvp.iMvY>>1) * iLineSizeUV + (sQpelMvp.iMvX>>1);
+	pRefCb += iStrideUV;
+	pFunc->sMcFuncs.pfChromaMc(pRefCb, iLineSizeUV, pDstCb, 8, sMvp, 8, 8);//Cb	
+	iSadCostChroma  = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8](pMbCache->SPicData.pEncMb[1], pCurLayer->iEncStride[1], pDstCb, 8 );	
+	
+	pRefCr += iStrideUV;
+	pFunc->sMcFuncs.pfChromaMc(pRefCr, iLineSizeUV, pDstCr, 8, sMvp, 8, 8);//Cr
+	iSadCostChroma += pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8](pMbCache->SPicData.pEncMb[2], pCurLayer->iEncStride[2], pDstCr, 8 );	
+
+	iSadCostMb = iSadCostLuma + iSadCostChroma;
+
+	// Early acceptance: perfect match, SAD below the predicted skip SAD, or (reference picture
+	// is a P picture, the reference MB type is skip, and SAD is below that MB's stored skip SAD).
+	if ( iSadCostMb == 0                             ||
+		 iSadCostMb < pWelsMd->iSadPredSkip   || 
+	   ( pCurLayer->pRefPic->iPictureType == P_SLICE     && 
+	     pMbCache->uiRefMbType == MB_TYPE_SKIP    &&
+		 iSadCostMb < pCurLayer->pRefPic->pMbSkipSad[pCurMb->iMbXY]) )
+	{
+		//update motion info to current MB
+		ST32(pCurMb->pRefIndex, 0);
+		pFunc->pfUpdateMbMv(pCurMb->sMv, sMvp);		
+
+		// note: this SAD is taken against pRefLuma (reference offset by whole samples only),
+		// not against the interpolated prediction in pDstLuma
+		pCurMb->pSadCost[0] = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16](pMbCache->SPicData.pEncMb[0], pCurLayer->iEncStride[0], pRefLuma, iLineSizeY );
+		
+		if (pWelsMd->bMdUsingSad)
+			pWelsMd->iCostLuma = pCurMb->pSadCost[0];
+		else
+			pWelsMd->iCostLuma = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16](pMbCache->SPicData.pEncMb[0], pCurLayer->iEncStride[0], pRefLuma, iLineSizeY );	
+	
+		pWelsMd->iCostSkipMb = iSadCostMb;	
+
+		pCurMb->sP16x16Mv = sMvp;
+		pCurLayer->pDecPic->sMvList[pCurMb->iMbXY] = sMvp;
+		
+		return TRUE;
+	}	
+
+	// Slow path: transform the luma residual against the skip prediction and test it.
+	WelsDctMb(pMbCache->pCoeffLevel,  pEncMb, iEncStride, pDstLuma, pEncCtx->pFuncList->pfDctFourT4 );
+
+	if ( WelsTryPYskip( pEncCtx, pCurMb, pMbCache ) )
+	{
+		// Cb: coefficients at pCoeffLevel+256, block offset table entry 16
+		iEncStride = pEncCtx->pCurDqLayer->iEncStride[1];
+		pEncMb = pMbCache->SPicData.pEncMb[1];
+		pEncBlockOffset = pStrideEncBlockOffset + 16;
+		pFunc->pfDctFourT4( pMbCache->pCoeffLevel + 256, &(pEncMb[*pEncBlockOffset]), iEncStride,	pMbCache->pSkipMb + 256, 8);
+		if ( WelsTryPUVskip( pEncCtx, pCurMb, pMbCache, 1 ) )
+		{
+			// Cr: coefficients at pCoeffLevel+320, block offset table entry 20
+			pEncMb = pMbCache->SPicData.pEncMb[2];
+			pEncBlockOffset = pStrideEncBlockOffset + 20;
+			pFunc->pfDctFourT4( pMbCache->pCoeffLevel + 320, &(pEncMb[*pEncBlockOffset]), iEncStride,	pMbCache->pSkipMb + 320, 8);
+			if ( WelsTryPUVskip( pEncCtx, pCurMb, pMbCache, 2 ) )
+			{
+				//update motion info to current MB
+				ST32(pCurMb->pRefIndex, 0);
+				pFunc->pfUpdateMbMv(pCurMb->sMv, sMvp);				
+
+				pCurMb->pSadCost[0] = pFunc->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16](pMbCache->SPicData.pEncMb[0], pCurLayer->iEncStride[0], pRefLuma, iLineSizeY );
+
+                if (pWelsMd->bMdUsingSad)
+					pWelsMd->iCostLuma = pCurMb->pSadCost[0];
+				else
+					pWelsMd->iCostLuma = pFunc->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16](pMbCache->SPicData.pEncMb[0], pCurLayer->iEncStride[0], pRefLuma, iLineSizeY );	
+				
+				pWelsMd->iCostSkipMb = iSadCostMb;
+
+				pCurMb->sP16x16Mv = sMvp;
+				pCurLayer->pDecPic->sMvList[pCurMb->iMbXY] = sMvp;
+
+				return TRUE;
+			}
+		}
+	}
+	// luma or one of the chroma planes failed its skip test
+	return FALSE;
+}
+
+// Offsets handed to InitMeRefinePointer for the four 8x8 partitions of a macroblock:
+// entry 1 adds one block width, entry 2 adds one block stride, entry 3 adds both.
+// NOTE(review): the entries appear to follow the order top-left, top-right, bottom-left,
+// bottom-right - confirm against the ME_REFINE_BUF_* definitions.
+const int32_t g_kiPixStrideIdx8x8[4] = {  0,                                             ME_REFINE_BUF_WIDTH_BLK8,
+									ME_REFINE_BUF_STRIDE_BLK8, ME_REFINE_BUF_STRIDE_BLK8 + ME_REFINE_BUF_WIDTH_BLK8};
+
+// Refines the motion vectors of the inter partition type already chosen for pCurMb and builds
+// the chroma prediction that goes with them.
+//
+// For every partition of the selected type (16x16, 16x8, 8x16 or 8x8) the function:
+//   - runs MeRefineFracPixel on the partition's motion-estimation unit, writing the luma
+//     prediction into pMbCache->pMemPredLuma;
+//   - stores the refined MV through the matching Update*MotionInfo routine and saves the
+//     MV predictor in pMbCache->sMbMvp[];
+//   - accumulates the partition's SAD and SATD costs;
+//   - motion-compensates Cb and Cr into pMbCache->pMemPredChroma (Cb at +0, Cr at +64).
+//
+// On exit pCurMb->pSadCost[0] holds the summed SAD and pWelsMd->iCostLuma holds the summed
+// SAD or SATD depending on pWelsMd->bMdUsingSad. For MB_TYPE_16x16 only, pWelsMd->iCostSkipMb
+// is also set to the SAD of the luma + Cb + Cr predictions. Any other MB type leaves both
+// sums at 0.
+void WelsMdInterMbRefinement(sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache *pMbCache)
+{
+	SDqLayer* pCurDqLayer = pEncCtx->pCurDqLayer;
+	uint8_t *pTmpRefCb, *pTmpRefCr, *pTmpDstCb, *pTmpDstCr;
+	int32_t iMvStride, iRefBlk4Stride, iDstBlk4Stride; 	
+	SMVUnitXY* pMv;
+	int32_t iBestSadCost = 0, iBestSatdCost = 0;	
+	SMeRefinePointer sMeRefine;
+
+	int32_t i, iIdx, iPixStride;
+
+	uint8_t* pRefCb = pMbCache->SPicData.pRefMb[1];
+	uint8_t* pRefCr = pMbCache->SPicData.pRefMb[2]; 
+	uint8_t* pDstCb = pMbCache->pMemPredChroma;
+	uint8_t* pDstCr = pMbCache->pMemPredChroma+64;
+	uint8_t* pDstLuma = pMbCache->pMemPredLuma;
+
+	int32_t iLineSizeRefUV = pCurDqLayer->pRefPic->iLineSize[1];
+	
+	switch(pCurMb->uiMbType)
+	{
+	case MB_TYPE_16x16:			
+		//luma
+		InitMeRefinePointer(&sMeRefine, pMbCache, 0);
+		MeRefineFracPixel(pEncCtx, pDstLuma, &pWelsMd->sMe.sMe16x16, &sMeRefine, 16, 16);			
+		UpdateP16x16MotionInfo(pMbCache, pCurMb, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x16.sMv);
+		
+		pMbCache->sMbMvp[0] = pWelsMd->sMe.sMe16x16.sMvp;
+		//save the best cost of final mode
+		iBestSadCost  = pWelsMd->sMe.sMe16x16.uiSadCost;
+		iBestSatdCost = pWelsMd->sMe.sMe16x16.uiSatdCost;
+		
+		//chroma
+		pMv = &pWelsMd->sMe.sMe16x16.sMv;
+		// whole-sample chroma displacement: MV components shifted right by 3
+		// (matches the ">>2 then >>1" used for the skip MV in WelsMdPSkipEnc)
+		iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
+		pTmpRefCb = pRefCb + iMvStride;
+		pTmpRefCr = pRefCr + iMvStride;
+		pEncCtx->pFuncList->sMcFuncs.pfChromaMc(pTmpRefCb, iLineSizeRefUV, pDstCb, 8, *pMv, 8, 8);//Cb
+		pEncCtx->pFuncList->sMcFuncs.pfChromaMc(pTmpRefCr, iLineSizeRefUV, pDstCr, 8, *pMv, 8, 8);//Cr
+
+		// SAD of the full prediction (luma + both chroma planes), kept in iCostSkipMb
+		pWelsMd->iCostSkipMb = pEncCtx->pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_16x16](pMbCache->SPicData.pEncMb[0], pCurDqLayer->iEncStride[0], pDstLuma, 16 );
+		pWelsMd->iCostSkipMb += pEncCtx->pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8](pMbCache->SPicData.pEncMb[1], pCurDqLayer->iEncStride[1], pDstCb, 8 );	
+		pWelsMd->iCostSkipMb += pEncCtx->pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8](pMbCache->SPicData.pEncMb[2], pCurDqLayer->iEncStride[2], pDstCr, 8 );	
+		break;
+		
+	case MB_TYPE_16x8:
+		iPixStride = 0;
+		for (i = 0; i < 2; i++)
+		{
+			//luma
+			iIdx = i<<3;	// 0 for the first partition, 8 for the second
+			InitMeRefinePointer(&sMeRefine, pMbCache, iPixStride);
+			iPixStride += ME_REFINE_BUF_STRIDE_BLK8;
+			// re-predict the MV predictor: the first partition's refined MV may have changed it
+			PredInter16x8Mv( pMbCache, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x8[i].sMvp );
+			MeRefineFracPixel(pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iIdx], &pWelsMd->sMe.sMe16x8[i], &sMeRefine, 16, 8);				
+			UpdateP16x8MotionInfo(pMbCache, pCurMb, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe16x8[i].sMv);
+			pMbCache->sMbMvp[i] = pWelsMd->sMe.sMe16x8[i].sMvp;
+			//save the best cost of final mode
+			iBestSadCost += pWelsMd->sMe.sMe16x8[i].uiSadCost;
+			iBestSatdCost += pWelsMd->sMe.sMe16x8[i].uiSatdCost;		
+			
+			//chroma
+			// each 16x8 luma partition maps to an 8x4 chroma block: 4 rows down per partition
+			iRefBlk4Stride = (i << 2) * iLineSizeRefUV;
+			iDstBlk4Stride = i << 5; // 4*8
+			pMv = &pWelsMd->sMe.sMe16x8[i].sMv;
+			iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
+			pTmpRefCb = pRefCb + iRefBlk4Stride + iMvStride;
+			pTmpRefCr = pRefCr + iRefBlk4Stride + iMvStride;
+			pTmpDstCb = pDstCb + iDstBlk4Stride;
+			pTmpDstCr = pDstCr + iDstBlk4Stride;
+			pEncCtx->pFuncList->sMcFuncs.pfChromaMc(pTmpRefCb, iLineSizeRefUV, pTmpDstCb, 8, *pMv, 8, 4);//Cb
+			pEncCtx->pFuncList->sMcFuncs.pfChromaMc(pTmpRefCr, iLineSizeRefUV, pTmpDstCr, 8, *pMv, 8, 4);//Cr	
+		}	
+		break;
+		
+	case MB_TYPE_8x16:		
+		iPixStride = 0;
+		for (i = 0; i < 2; i++)
+		{
+			//luma
+			iIdx = i<<2;	// 0 for the first partition, 4 for the second
+			InitMeRefinePointer(&sMeRefine, pMbCache, iPixStride);
+			iPixStride += ME_REFINE_BUF_WIDTH_BLK8;
+			PredInter8x16Mv( pMbCache, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x16[i].sMvp );
+			MeRefineFracPixel(pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iIdx], &pWelsMd->sMe.sMe8x16[i], &sMeRefine, 8, 16);				
+			update_P8x16_motion_info(pMbCache, pCurMb, iIdx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x16[i].sMv);
+			pMbCache->sMbMvp[i] = pWelsMd->sMe.sMe8x16[i].sMvp;
+			//save the best cost of final mode
+			iBestSadCost += pWelsMd->sMe.sMe8x16[i].uiSadCost;
+			iBestSatdCost += pWelsMd->sMe.sMe8x16[i].uiSatdCost;
+			
+			//chroma
+			// each 8x16 luma partition maps to a 4x8 chroma block: 4 columns right per partition;
+			// the same column offset is used for the reference and the destination
+			iRefBlk4Stride = iIdx; //4
+			pMv = &pWelsMd->sMe.sMe8x16[i].sMv;
+			iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
+			pTmpRefCb = pRefCb + iRefBlk4Stride + iMvStride;
+			pTmpRefCr = pRefCr + iRefBlk4Stride + iMvStride;
+			pTmpDstCb = pDstCb + iRefBlk4Stride;
+			pTmpDstCr = pDstCr + iRefBlk4Stride;
+			pEncCtx->pFuncList->sMcFuncs.pfChromaMc(pTmpRefCb, iLineSizeRefUV, pTmpDstCb, 8, *pMv, 4, 8);//Cb
+			pEncCtx->pFuncList->sMcFuncs.pfChromaMc(pTmpRefCr, iLineSizeRefUV, pTmpDstCr, 8, *pMv, 4, 8);//Cr
+		}
+		break;
+		
+	case MB_TYPE_8x8:
+		for (i = 0; i < 4; i++)
+		{
+			int32_t iBlk8Idx = i<<2; //0, 4, 8, 12
+			int32_t	iBlk4X, iBlk4Y;
+			
+			pCurMb->pRefIndex[i] = pWelsMd->uiRef;
+			
+			//luma
+			InitMeRefinePointer(&sMeRefine, pMbCache, g_kiPixStrideIdx8x8[i]);
+			PredMv( &pMbCache->sMvComponents, iBlk8Idx, 2, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x8[i].sMvp );
+			MeRefineFracPixel(pEncCtx, pDstLuma + g_kuiSmb4AddrIn256[iBlk8Idx], &pWelsMd->sMe.sMe8x8[i], &sMeRefine, 8, 8);					
+			UpdateP8x8MotionInfo(pMbCache, pCurMb, iBlk8Idx, pWelsMd->uiRef, &pWelsMd->sMe.sMe8x8[i].sMv);
+			pMbCache->sMbMvp[i] = pWelsMd->sMe.sMe8x8[i].sMvp;
+			iBestSadCost += pWelsMd->sMe.sMe8x8[i].uiSadCost;
+			iBestSatdCost += pWelsMd->sMe.sMe8x8[i].uiSatdCost; 
+			
+			//chroma
+			pMv = &pWelsMd->sMe.sMe8x8[i].sMv;
+			iMvStride = (pMv->iMvY >> 3) * iLineSizeRefUV + (pMv->iMvX >> 3);
+			
+			// position of the matching 4x4 chroma block inside the 8x8 chroma plane
+			iBlk4X = (i & 1) << 2;
+			iBlk4Y = (i >> 1) << 2;
+			iRefBlk4Stride = iBlk4Y * iLineSizeRefUV + iBlk4X;
+			iDstBlk4Stride = (iBlk4Y << 3) + iBlk4X;
+			
+			pTmpRefCb = pRefCb + iRefBlk4Stride;
+			pTmpDstCb = pDstCb + iDstBlk4Stride;
+			pTmpRefCr = pRefCr + iRefBlk4Stride;
+			pTmpDstCr = pDstCr + iDstBlk4Stride;
+			pEncCtx->pFuncList->sMcFuncs.pfChromaMc(pTmpRefCb+iMvStride, iLineSizeRefUV, pTmpDstCb, 8, *pMv, 4, 4);//Cb
+			pEncCtx->pFuncList->sMcFuncs.pfChromaMc(pTmpRefCr+iMvStride, iLineSizeRefUV, pTmpDstCr, 8, *pMv, 4, 4);//Cr		
+		
+		}		
+		break;
+	default:
+		break;
+	}
+	// publish the accumulated costs of the final mode
+	pCurMb->pSadCost[0] = iBestSadCost;
+    if(pWelsMd->bMdUsingSad)
+		pWelsMd->iCostLuma = iBestSadCost;
+    else
+		pWelsMd->iCostLuma = iBestSatdCost;
+
+}
+/*
+ * First intra check of an inter MB: evaluates Intra16x16 and compares its cost with the
+ * luma cost currently stored in pWelsMd->iCostLuma.
+ *
+ * pEnc / pMd are type-erased sWelsEncCtx / SWelsMD pointers.
+ * Returns TRUE when Intra16x16 is strictly cheaper; in that case the MB type and the luma
+ * cost are switched to intra, the fine intra mode decision runs and luma (only if the MB is
+ * still Intra16x16 afterwards) plus chroma are encoded.
+ * Returns FALSE otherwise, leaving the MB type and the stored cost untouched.
+ */
+BOOL_T WelsMdFirstIntraMode(void* pEnc, void* pMd, SMB* pCurMb, SMbCache *pMbCache)
+{
+	sWelsEncCtx *pCtx			= (sWelsEncCtx*)pEnc;
+	SWelsFuncPtrList *pFuncList	= pCtx->pFuncList;
+	SWelsMD *pMode				= (SWelsMD*)pMd;
+
+	const int32_t kiIntra16x16Cost = WelsMdI16x16(pFuncList, pCtx->pCurDqLayer, pMbCache, pMode->iLambda);
+
+	// the current best mode is at least as cheap as Intra16x16: keep it
+	if ( kiIntra16x16Cost >= pMode->iCostLuma )
+		return FALSE;
+
+	pCurMb->uiMbType	= MB_TYPE_INTRA16x16;
+	pMode->iCostLuma	= kiIntra16x16Cost;
+
+	// fine intra decision; presumably it may change uiMbType, hence the re-check below
+	pFuncList->pfIntraFineMd( pCtx, pMode, pCurMb, pMbCache );
+
+	// luma encode/reconstruct is done here only while the MB is still Intra16x16
+	if ( IS_INTRA16x16(pCurMb->uiMbType) )
+	{
+		pCurMb->uiCbp = 0;
+		WelsEncRecI16x16Y( pCtx, pCurMb, pMbCache );
+	}
+
+	// chroma mode decision, then chroma encode
+	pMode->iCostChroma = WelsMdIntraChroma(pFuncList, pCtx->pCurDqLayer, pMbCache, pMode->iLambda);
+	WelsIMbChromaEncode( pCtx, pCurMb, pMbCache );
+
+	pCurMb->pSadCost[0] = 0;
+	return TRUE;	// an intra type is the best mode
+}
+
+/*
+ * Mode decision for one macroblock of an inter slice.
+ *
+ * Flow: background decision hook -> Pskip trial -> (P16x16 when skip was rejected) ->
+ * WelsMdInterSecondaryModesEnc. The MB is finalised as Pskip right away only when the
+ * skip trial succeeds AND bKeepSkip is still set (left, top and top-right neighbours are
+ * all available and skipped, and the background hook did not clear the flag).
+ *
+ * pEnc / pMd are type-erased sWelsEncCtx / SWelsMD pointers.
+ */
+void WelsMdInterMb(void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb )
+{
+	sWelsEncCtx *pCtx		= (sWelsEncCtx*)pEnc;
+	SWelsMD *pMode			= (SWelsMD*)pMd;
+	SDqLayer *pLayer		= pCtx->pCurDqLayer;
+	SMbCache *pMbCache		= &pSlice->sMbCacheInfo;
+
+	// neighbour MBs are only dereferenced when uiNeighborAvail marks them as available
+	const uint32_t kuiAvail		= pCurMb->uiNeighborAvail;
+	const int32_t kiMbWidth		= pLayer->iMbWidth;
+	const SMB *kpTopMb			= pCurMb - kiMbWidth;
+	const bool_t kbLeftSkip		= ( (kuiAvail & LEFT_MB_POS) ? IS_SKIP((pCurMb - 1)->uiMbType) : false );
+	const bool_t kbTopSkip		= ( (kuiAvail & TOP_MB_POS) ? IS_SKIP(kpTopMb->uiMbType) : false );
+	const bool_t kbTopLeftSkip	= ( (kuiAvail & TOPLEFT_MB_POS) ? IS_SKIP((kpTopMb - 1)->uiMbType) : false );
+	const bool_t kbTopRightSkip	= ( (kuiAvail & TOPRIGHT_MB_POS) ? IS_SKIP((kpTopMb + 1)->uiMbType) : false );
+
+	// any skipped neighbour triggers the skip trial; note that top-left does not take part in bKeepSkip
+	BOOL_T bTrySkip		= kbLeftSkip || kbTopSkip || kbTopLeftSkip || kbTopRightSkip;
+	BOOL_T bKeepSkip	= kbLeftSkip && kbTopSkip && kbTopRightSkip;
+	BOOL_T bSkip		= FALSE;
+
+	// the hook may clear bKeepSkip; a true return means the MB has been fully handled already
+	if ( pCtx->pFuncList->pfInterMdBackgroundDecision( pCtx, pMode, pSlice, pCurMb, pMbCache, &bKeepSkip ) )
+		return;
+
+	//step 1: try SKIP
+	bSkip = WelsMdInterJudgePskip( pCtx, pMode, pSlice, pCurMb, pMbCache, bTrySkip );
+
+	if ( bSkip && bKeepSkip )
+	{
+		WelsMdInterDecidedPskip( pCtx, pSlice, pCurMb, pMbCache );
+		return;
+	}
+
+	if ( !bSkip )
+	{
+		PredictSad( pMbCache->sMvComponents.iRefIndexCache, pMbCache->iSadCost, 0, &pMode->iSadPredMb );
+
+		//step 2: P_16x16
+		pMode->iCostLuma	= WelsMdP16x16( pCtx->pFuncList, pLayer, pMode, pSlice, pCurMb );
+		pCurMb->uiMbType	= MB_TYPE_16x16;
+	}
+
+	WelsMdInterSecondaryModesEnc( pCtx, pMode, pSlice, pCurMb, pMbCache, bSkip );
+}
+
+
+
+//////
+//  try the ordinary Pskip
+//  The trial runs when the caller asks for it (bTrySkip) or when the reference picture is
+//  a P picture whose co-located MB type is SKIP or BACKGROUND.
+//  Returns the outcome of WelsMdPSkipEnc normalised to true/false; false when no trial ran.
+//////
+bool_t WelsMdInterJudgePskip( sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache, BOOL_T bTrySkip )
+{
+	const bool_t kbRefMbWasStatic =
+		( pEncCtx->pRefPic->iPictureType == P_SLICE ) &&
+		( pMbCache->uiRefMbType == MB_TYPE_SKIP || pMbCache->uiRefMbType == MB_TYPE_BACKGROUND );
+
+	if ( !kbRefMbWasStatic && !bTrySkip )
+		return false;
+
+	PredictSadSkip( pMbCache->sMvComponents.iRefIndexCache, pMbCache->bMbTypeSkip, pMbCache->iSadCostSkip, 0, &(pWelsMd->iSadPredSkip) );
+
+	if ( WelsMdPSkipEnc(pEncCtx, pWelsMd, pCurMb, pMbCache) )
+		return true;
+	return false;
+}
+
+//////
+//  update QP / cbp / collocated flag of a MB that is coded as Pskip
+//  luma QP is inherited from pSlice->uiLastMbQp, chroma QP is looked up from it
+//////
+void WelsMdInterUpdatePskip( SDqLayer* pCurDqLayer, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache )
+{
+	// QP: luma first, the chroma lookup reads the value just stored
+	pCurMb->uiLumaQp	= pSlice->uiLastMbQp;
+	pCurMb->uiChromaQp	= g_kuiChromaQpTable[
+		CLIP3_QP_0_51(pCurMb->uiLumaQp + pCurDqLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset) ];
+
+	// no coded residual
+	pCurMb->uiCbp = 0;
+
+	// flag is set when the 32-bit load of the first MV is zero
+	pMbCache->bCollocatedPredFlag = ( LD32(&pCurMb->sMv[0]) == 0 );
+}
+
+
+//////
+//  doublecheck if current MBTYPE is Pskip
+//  Only a P16x16 MB with cbp == 0 is examined: with reference index 0 and a motion vector
+//  equal to the predicted skip MV its type is rewritten to MB_TYPE_SKIP. The collocated
+//  flag is refreshed for every examined MB, whether or not the type changed.
+//////
+void WelsMdInterDoubleCheckPskip( SMB* pCurMb, SMbCache *pMbCache )
+{
+	if ( MB_TYPE_16x16 != pCurMb->uiMbType || 0 != pCurMb->uiCbp )
+		return;
+
+	if ( 0 == pCurMb->pRefIndex[0] )
+	{
+		SMVUnitXY sSkipMvp = { 0 };
+
+		PredSkipMv( pMbCache, &sSkipMvp );
+		if ( LD32(&sSkipMvp) == LD32(&pCurMb->sMv[0]) )
+			pCurMb->uiMbType = MB_TYPE_SKIP;
+	}
+
+	pMbCache->bCollocatedPredFlag = ( LD32(&pCurMb->sMv[0]) == 0 );
+}
+
+//////
+//  Pskip mb encode
+//  marks the MB as MB_TYPE_SKIP, then calls WelsRecPskip and WelsMdInterUpdatePskip
+//////
+void WelsMdInterDecidedPskip( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache )
+{
+	SDqLayer *pLayer = pEncCtx->pCurDqLayer;
+
+	pCurMb->uiMbType = MB_TYPE_SKIP;
+
+	WelsRecPskip( pLayer, pEncCtx->pFuncList, pCurMb, pMbCache );
+	WelsMdInterUpdatePskip( pLayer, pSlice, pCurMb, pMbCache );
+}
+
+//////
+//  inter mb encode
+//  clears cbp, runs the luma and chroma inter encoders, then copies the 16x16 luma and the
+//  two 8x8 chroma blocks from pMemPredLuma / pMemPredChroma into the pCsMb planes
+//////
+void WelsMdInterEncode( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache )
+{
+	SWelsFuncPtrList *pFuncList	= pEncCtx->pFuncList;
+	SDqLayer *pLayer			= pEncCtx->pCurDqLayer;
+
+	// destination strides: index 0 is used for luma, index 1 for both chroma planes
+	const int32_t kiLumaStride		= pLayer->iCsStride[0];
+	const int32_t kiChromaStride	= pLayer->iCsStride[1];
+
+	pCurMb->uiCbp = 0;
+	WelsInterMbEncode( pEncCtx, pSlice, pCurMb );
+	WelsPMbChromaEncode( pEncCtx, pSlice, pCurMb );
+
+	// second chroma block starts 64 bytes (one 8x8 block) into pMemPredChroma
+	pFuncList->pfCopy16x16Aligned( pMbCache->SPicData.pCsMb[0], kiLumaStride, pMbCache->pMemPredLuma, 16 );
+	pFuncList->pfCopy8x8Aligned( pMbCache->SPicData.pCsMb[1], kiChromaStride, pMbCache->pMemPredChroma, 8 );
+	pFuncList->pfCopy8x8Aligned( pMbCache->SPicData.pCsMb[2], kiChromaStride, pMbCache->pMemPredChroma + 64, 8 );
+}
+
+
+
+
+//////
+//  try the BGD Pskip
+//  1) *bKeepSkip survives only if none of the left / top / top-right MBs is flagged as
+//     background; those flags are read only while *bKeepSkip is still set.
+//  2) The MB is coded through WelsMdBackgroundMbEnc (returns true) when it is flagged as
+//     background, the co-located reference MB is not intra, and the reference QP is either
+//     <= 26 or exceeds the current QP by at most DELTA_QP_BGD_THD.
+//  Returns false when the MB was not handled here.
+//////
+bool_t WelsMdInterJudgeBGDPskip( void* pCtx, void* pMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache, BOOL_T* bKeepSkip )
+{
+	sWelsEncCtx *pEncCtx	= (sWelsEncCtx*)pCtx;
+	SWelsMD *pWelsMd		= (SWelsMD*)pMd;
+	SDqLayer *pLayer		= pEncCtx->pCurDqLayer;
+
+	const int32_t kiRefQp	= pLayer->pRefPic->pRefMbQp[pCurMb->iMbXY];
+	const int32_t kiCurQp	= pCurMb->uiLumaQp;	// unsigned -> signed
+	int8_t *pBgFlag			= pEncCtx->pVaa->pVaaBackgroundMbFlag + pCurMb->iMbXY;
+	const int32_t kiMbWidth	= pLayer->iMbWidth;
+
+	if ( *bKeepSkip )
+	{
+		*bKeepSkip = ( !pBgFlag[-1] ) &&
+					 ( !pBgFlag[-kiMbWidth] ) &&
+					 ( !pBgFlag[-kiMbWidth + 1] );
+	}
+
+	if ( !(*pBgFlag) )
+		return false;
+	if ( IS_INTRA(pMbCache->uiRefMbType) )
+		return false;
+	if ( !( kiRefQp - kiCurQp <= DELTA_QP_BGD_THD || kiRefQp <= 26 ) )
+		return false;
+
+	{
+		SMVUnitXY sBgSkipMv = { 0 };
+		PredSkipMv( pMbCache, &sBgSkipMv );
+		WelsMdBackgroundMbEnc( pEncCtx, pWelsMd, pCurMb, pMbCache, pSlice, (LD32(&sBgSkipMv) == 0) );
+	}
+	return true;
+}
+
+// Stub with the same signature as WelsMdInterJudgeBGDPskip: ignores every argument and
+// always reports "not handled" (presumably installed when background detection is off).
+bool_t WelsMdInterJudgeBGDPskipFalse(	void* pCtx,
+										void* pMd,
+										SSlice *pSlice,
+										SMB* pCurMb,
+										SMbCache *pMbCache,
+										BOOL_T* bKeepSkip )
+{
+	return false;
+}
+
+
+
+//////
+//  update BGD related info
+//  Stores the QP entry of this MB in pDecPic->pRefMbQp: the MB's own luma QP when it has
+//  coded residual, the reference picture type is I_SLICE or the collocated flag is clear;
+//  otherwise the entry is carried over from pRefPic->pRefMbQp.
+//  Finally a MB_TYPE_BACKGROUND type is rewritten to MB_TYPE_SKIP.
+//////
+void WelsMdInterUpdateBGDInfo( SDqLayer* pCurLayer,  SMB* pCurMb, const bool_t bCollocatedPredFlag, const int32_t iRefPictureType )
+{
+	uint8_t *pDstQpList		= pCurLayer->pDecPic->pRefMbQp;
+	const int32_t kiMbIdx	= pCurMb->iMbXY;
+	const bool_t kbUseOwnQp	= ( pCurMb->uiCbp || I_SLICE == iRefPictureType || 0 == bCollocatedPredFlag );
+
+	// the reference picture list is only touched on the carry-over path
+	pDstQpList[kiMbIdx] = kbUseOwnQp ? pCurMb->uiLumaQp : pCurLayer->pRefPic->pRefMbQp[kiMbIdx];
+
+	if ( MB_TYPE_BACKGROUND == pCurMb->uiMbType )
+		pCurMb->uiMbType = MB_TYPE_SKIP;
+}
+
+// No-op with the same signature as WelsMdInterUpdateBGDInfo
+// (presumably installed when background detection is off).
+void WelsMdInterUpdateBGDInfoNULL(	SDqLayer* pCurLayer,
+									SMB* pCurMb,
+									const bool_t bCollocatedPredFlag,
+									const int32_t iRefPictureType )
+{
+	// intentionally empty
+}
+
+//
+// Records per-MB results: pEncSad[0] receives the skip cost for a MB_TYPE_SKIP MB and 0
+// for every other type; the MB type itself is stored in pRefMbtypeList at the MB index.
+//
+void WelsMdInterSaveSadAndRefMbType(Mb_Type* pRefMbtypeList, SMbCache * pMbCache, const SMB*  pCurMb, const SWelsMD* pMd)
+{
+	const Mb_Type kmtType = pCurMb->uiMbType;
+
+	//sad
+	if ( MB_TYPE_SKIP == kmtType )
+		pMbCache->pEncSad[0] = pMd->iCostSkipMb;
+	else
+		pMbCache->pEncSad[0] = 0;
+
+	//uiMbType
+	pRefMbtypeList[pCurMb->iMbXY] = kmtType;
+}
+
+/*
+ * Second stage of inter mode decision.
+ * 1) pfFirstIntraMode: when it returns true the MB has been handled as intra; done.
+ * 2) bSkip set: finalise the MB as Pskip.
+ * 3) otherwise: fine inter mode decision, refinement, encoding and a last Pskip re-check.
+ */
+void WelsMdInterSecondaryModesEnc(sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache, const BOOL_T bSkip )
+{
+	// intra candidate
+	if ( pEncCtx->pFuncList->pfFirstIntraMode(pEncCtx, pWelsMd, pCurMb, pMbCache) )
+		return;
+
+	if ( bSkip )
+	{
+		WelsMdInterDecidedPskip( pEncCtx, pSlice, pCurMb, pMbCache );
+		return;
+	}
+
+	// fine mode decision in P
+	pEncCtx->pFuncList->pfInterFineMd( pEncCtx, pWelsMd, pSlice, pCurMb, pWelsMd->iCostLuma );
+
+	// refinement for inter type
+	WelsMdInterMbRefinement( pEncCtx, pWelsMd, pCurMb, pMbCache );
+
+	// invoke encoding
+	WelsMdInterEncode( pEncCtx, pSlice, pCurMb, pMbCache );
+
+	// double check Pskip
+	WelsMdInterDoubleCheckPskip( pCurMb, pMbCache );
+}
+
+
+/*
+ * Second stage of intra mode decision: fine intra decision, luma encode (only when the MB
+ * is Intra16x16 afterwards), chroma mode decision and chroma encode. The SAD cost of the
+ * MB is reset to 0 at the end.
+ */
+void WelsMdIntraSecondaryModesEnc( sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SMB* pCurMb, SMbCache *pMbCache )
+{
+	SWelsFuncPtrList *pFuncList = pEncCtx->pFuncList;
+
+	// fine intra mode decision
+	pFuncList->pfIntraFineMd( pEncCtx, pWelsMd, pCurMb, pMbCache );
+
+	// luma
+	if ( IS_INTRA16x16(pCurMb->uiMbType) )
+	{
+		pCurMb->uiCbp = 0;
+		WelsEncRecI16x16Y( pEncCtx, pCurMb, pMbCache );
+	}
+
+	// chroma
+	pWelsMd->iCostChroma = WelsMdIntraChroma( pFuncList, pEncCtx->pCurDqLayer, pMbCache, pWelsMd->iLambda );
+	WelsIMbChromaEncode( pEncCtx, pCurMb, pMbCache );
+
+	pCurMb->pSadCost[0] = 0;
+}
+
+} // namespace WelsSVCEnc
--- /dev/null
+++ b/codec/encoder/core/src/svc_enc_slice_segment.cpp
@@ -1,0 +1,768 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_enc_slice_segment.cpp
+ *
+ * \brief	SSlice segment routine (Single slice/multiple slice/fmo arrangement exclusive)
+ *
+ * \date	2/4/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include <assert.h>
+#include "svc_enc_slice_segment.h"
+#include "wels_const.h"
+#include "macros.h"
+#include "utils.h"
+#include "macros.h"
+#include "rc.h"
+
+namespace WelsSVCEnc {
+/*!
+ * \brief	Assign MB map for single slice segment (every MB is mapped to slice 0)
+ *
+ * \param	pMbMap			overall MB map; must hold kiCountMbNum entries of kiMapUnitSize bytes
+ * \param	kiCountMbNum	count number of MB, must be positive
+ * \param	kiMapUnitSize	size in bytes of one map entry, must not be negative
+ *
+ * \return	0 - successful; none 0 - failed (NULL map, non-positive count, negative unit size)
+ */
+int32_t AssignMbMapSingleSlice( void *pMbMap, const int32_t kiCountMbNum, const int32_t kiMapUnitSize )
+{
+	// a negative unit size would be converted into a huge memset length
+	if ( NULL == pMbMap || kiCountMbNum <= 0 || kiMapUnitSize < 0 )
+		return 1;
+
+	// multiply in size_t so the byte count can not overflow int32_t
+	memset( pMbMap, 0, (size_t)kiCountMbNum * (size_t)kiMapUnitSize );
+
+	return 0;
+}
+
+/*!
+ * \brief	Assign MB map for multiple slice(s) segment
+ *
+ * \param	pSliceSeg	slice context; uiSliceMode, iMbWidth, iSliceNumInFrame, iMbNumInFrame
+ *						and the three tables must already be set up
+ * \param	kpMso		multiple slice option, supplies the per-slice MB run lengths for
+ *						SM_RASTER_SLICE / SM_FIXEDSLCNUM_SLICE (not read in SM_ROWMB_SLICE)
+ *
+ * \return	0 - successful (row-MB, raster and fixed-slice-number modes);
+ *			none 0 - failed or nothing assigned (NULL argument, single slice mode,
+ *			SM_DYN_SLICE, unknown mode)
+ */
+int32_t AssignMbMapMultipleSlices( SSliceCtx *pSliceSeg, const SMulSliceOption *kpMso )
+{
+	if ( NULL == pSliceSeg || SM_SINGLE_SLICE == pSliceSeg->uiSliceMode )
+		return 1;
+
+	if ( SM_ROWMB_SLICE == pSliceSeg->uiSliceMode )
+	{
+		// one slice per MB row
+		const int32_t kiMbWidth	= pSliceSeg->iMbWidth;
+		int32_t iSliceNum = pSliceSeg->iSliceNumInFrame, uiSliceIdx = 0;
+
+		while ( uiSliceIdx < iSliceNum )
+		{
+			const int16_t kiFirstMb = uiSliceIdx * kiMbWidth;
+			pSliceSeg->pCountMbNumInSlice[uiSliceIdx]	= kiMbWidth;
+			pSliceSeg->pFirstMbInSlice[uiSliceIdx]		= kiFirstMb;
+			memset(pSliceSeg->pOverallMbMap+kiFirstMb, (uint8_t)uiSliceIdx, kiMbWidth*sizeof(uint8_t));
+			++ uiSliceIdx;
+		}
+
+		return 0;
+	}
+	else if (	SM_RASTER_SLICE  == pSliceSeg->uiSliceMode ||
+				SM_FIXEDSLCNUM_SLICE == pSliceSeg->uiSliceMode )
+	{
+		// the run lengths live in kpMso, so it is mandatory in these modes
+		if ( NULL == kpMso )
+			return 1;
+
+		const int32_t *kpSlicesAssignList		= (int32_t *)&(kpMso->sSliceArgument.uiSliceMbNum[0]);
+		const int32_t kiCountNumMbInFrame		= pSliceSeg->iMbNumInFrame;
+		const int32_t kiCountSliceNumInFrame	= pSliceSeg->iSliceNumInFrame;
+		int32_t iSliceIdx						= 0;
+		int16_t iMbIdx							= 0;
+
+		do {
+			const int32_t kiCurRunLength	= kpSlicesAssignList[iSliceIdx];
+			int32_t iRunIdx					= 0;
+
+			pSliceSeg->pFirstMbInSlice[iSliceIdx]			= iMbIdx;
+			pSliceSeg->pCountMbNumInSlice[iSliceIdx]		= kiCurRunLength;
+
+			// written MB by MB (not memset) so a run can never pass the end of the frame
+			do {
+				pSliceSeg->pOverallMbMap[iMbIdx+iRunIdx]	= iSliceIdx;
+				++ iRunIdx;
+			} while(iRunIdx < kiCurRunLength && iMbIdx + iRunIdx < kiCountNumMbInFrame);
+
+			iMbIdx += kiCurRunLength;
+			++ iSliceIdx;
+		} while(iSliceIdx < kiCountSliceNumInFrame && iMbIdx < kiCountNumMbInFrame);
+
+		// the map has been assigned: report success like the row-MB branch does
+		return 0;
+	}
+	else if ( SM_DYN_SLICE == pSliceSeg->uiSliceMode )
+	{
+		// nothing is assigned here for dynamic slicing
+	}
+	else	// any else uiSliceMode?
+	{
+		assert( 0 );
+	}
+
+	// extension for other multiple slice type in the future
+	return 1;
+}
+
+/*!
+ *  Check slices assignment settings on MST_INTERLEAVE type
+ */
+
+//slice parameter check for SM_FIXEDSLCNUM_SLICE
+// Spreads kiMbNumInFrame over pSliceArg->iSliceNum slices: every slice but the last gets
+// kiMbNumInFrame / iSliceNum MBs, the last one takes whatever is left.
+// Returns false (nothing written) for a NULL argument or a slice number of 0.
+bool_t CheckFixedSliceNumMultiSliceSetting( const int32_t kiMbNumInFrame, SSliceArgument * pSliceArg )
+{
+	// validate before the first dereference and before dividing by the slice number
+	if ( NULL == pSliceArg || 0 == pSliceArg->iSliceNum )
+		return false;
+
+	int32_t *pSlicesAssignList		= (int32_t *)&(pSliceArg->uiSliceMbNum[0]);
+	const uint32_t kuiSliceNum		= pSliceArg->iSliceNum;
+	uint32_t uiSliceIdx				= 0;
+	const int32_t kiMbNumPerSlice	= kiMbNumInFrame / kuiSliceNum;
+	int32_t iNumMbLeft				= kiMbNumInFrame;
+
+	for ( ; uiSliceIdx+1 < kuiSliceNum; ++ uiSliceIdx )
+	{
+		pSlicesAssignList[uiSliceIdx] = kiMbNumPerSlice;
+		iNumMbLeft	-= kiMbNumPerSlice;
+	}
+	// last slice absorbs the remainder of the division
+	pSlicesAssignList[uiSliceIdx] = iNumMbLeft;
+
+	return true;
+}
+
+//slice parameter check for SM_ROWMB_SLICE
+// Sets the MB count of each of the pSliceArg->iSliceNum slices to one MB row (kiMbWidth).
+// Returns false for a NULL argument, true otherwise.
+bool_t CheckRowMbMultiSliceSetting( const int32_t kiMbWidth, SSliceArgument * pSliceArg )
+{
+	// check the argument itself; the address of its array member can never be NULL
+	if ( NULL == pSliceArg )
+		return false;
+
+	int32_t *pSlicesAssignList	= (int32_t *)&(pSliceArg->uiSliceMbNum[0]);
+	const uint32_t kuiSliceNum	= pSliceArg->iSliceNum;
+	uint32_t uiSliceIdx			= 0;
+
+	while ( uiSliceIdx < kuiSliceNum )
+	{
+		pSlicesAssignList[uiSliceIdx]	= kiMbWidth;
+		++ uiSliceIdx;
+	}
+	return true;
+}
+
+//slice parameter check for SM_RASTER_SLICE
+// Walks the positive run lengths in pSliceArg->uiSliceMbNum (at most MAX_SLICES_NUM of them)
+// and repairs the list so that it covers exactly kiMbNumInFrame MBs:
+//  - too many MBs : the last counted run is shortened;
+//  - too few MBs  : one more slice holding the remainder is appended when a slot is free,
+//                   otherwise the setting is rejected.
+// On success pSliceArg->iSliceNum is overwritten with the resulting slice count.
+// Returns false for a NULL argument or an unrepairable setting.
+bool_t CheckRasterMultiSliceSetting( const int32_t kiMbNumInFrame, SSliceArgument * pSliceArg )
+{
+	// check the argument itself; the address of its array member can never be NULL
+	if ( NULL == pSliceArg )
+		return false;
+
+	int32_t			*pSlicesAssignList	= (int32_t *)&(pSliceArg->uiSliceMbNum[0]);
+	int32_t			iActualSliceCount	= 0;
+
+	//check mb_num setting
+	uint32_t uiSliceIdx			= 0;
+	int32_t iCountMb			= 0;
+
+	while ( ( uiSliceIdx < MAX_SLICES_NUM ) && ( 0 < pSlicesAssignList[uiSliceIdx] ) )
+	{
+		iCountMb			+= pSlicesAssignList[uiSliceIdx];
+		iActualSliceCount	=  uiSliceIdx + 1;
+
+		if ( iCountMb >= kiMbNumInFrame )
+		{
+			break;
+		}
+
+		++ uiSliceIdx;
+	}
+	//break condition above makes, after the while
+	// here must have (iActualSliceCount <= MAX_SLICES_NUM)
+
+	//correction if needed
+	if ( iCountMb == kiMbNumInFrame )
+	{
+		;	// exact fit, nothing to repair
+	}
+	else if ( iCountMb > kiMbNumInFrame )
+	{
+		//setting is more than kiMbNumInFrame: cut the last uiSliceMbNum
+		pSlicesAssignList[iActualSliceCount-1]	-=	( iCountMb - kiMbNumInFrame );
+	}
+	else if ( iActualSliceCount < MAX_SLICES_NUM )
+	{
+		//here ( iCountMb < kiMbNumInFrame ): append a slice holding the left num
+		pSlicesAssignList[iActualSliceCount] = kiMbNumInFrame - iCountMb;
+		iActualSliceCount += 1;
+	}
+	else
+	{
+		//here ( iCountMb < kiMbNumInFrame ) && ( iActualSliceCount == MAX_SLICES_NUM )
+		//no more slice can be added
+		return false;
+	}
+
+	pSliceArg->iSliceNum = iActualSliceCount;
+	return true;
+
+}
+
+
+// GOM based RC related for uiSliceNum decision, only used at SM_FIXEDSLCNUM_SLICE
+// Lowers *pSliceNum until every slice can hold one GOM (kiMbWidth times a resolution
+// dependent row count). Each step decrements the value and rounds it down to an even
+// number; the search stops once the value drops below 2, and a result of 0 is raised to 1.
+void GomValidCheckSliceNum( const int32_t kiMbWidth, const int32_t kiMbHeight, int32_t *pSliceNum )
+{
+	const int32_t kiTotalMbNum	= kiMbWidth * kiMbHeight;
+	int32_t iCandidate			= *pSliceNum;
+	int32_t iMbPerGom;
+
+	// MODE0 row counts are used as the GOM height (see the MODE0/MODE1 definitions in rc.h)
+	if ( kiMbWidth <= MB_WIDTH_THRESHOLD_90P )
+		iMbPerGom = kiMbWidth * GOM_ROW_MODE0_90P;
+	else if ( kiMbWidth <= MB_WIDTH_THRESHOLD_180P )
+		iMbPerGom = kiMbWidth * GOM_ROW_MODE0_180P;
+	else if ( kiMbWidth <= MB_WIDTH_THRESHOLD_360P )
+		iMbPerGom = kiMbWidth * GOM_ROW_MODE0_360P;
+	else
+		iMbPerGom = kiMbWidth * GOM_ROW_MODE0_720P;
+
+	while ( kiTotalMbNum < iMbPerGom * iCandidate )
+	{
+		-- iCandidate;
+		iCandidate -= (iCandidate & 0x01);	// keep an even num for multiple slices case
+		if ( iCandidate < 2 )	// for safe
+			break;
+	}
+
+	if ( 0 == iCandidate )
+		iCandidate = 1;
+
+	*pSliceNum	= iCandidate;
+}
+
+
+// GOM based RC related for uiSliceMbNum decision, only used at SM_FIXEDSLCNUM_SLICE
+// Fills pSliceArg->uiSliceMbNum for iSliceNum slices: every slice but the last gets the
+// average MB count rounded to a multiple of the GOM size, clamped between one GOM and the
+// amount that still leaves one GOM for each remaining slice; the last slice takes the rest.
+// Does nothing for a NULL argument or a slice number of 0.
+void GomValidCheckSliceMbNum( const int32_t kiMbWidth, const int32_t kiMbHeight, SSliceArgument * pSliceArg )
+{
+	// guard the dereference and the division by the slice number below
+	if ( NULL == pSliceArg || 0 == pSliceArg->iSliceNum )
+		return;
+
+	uint32_t *pSlicesAssignList		= &(pSliceArg->uiSliceMbNum[0]);
+	const uint32_t kuiSliceNum			= pSliceArg->iSliceNum;
+	const int32_t kiMbNumInFrame	= kiMbWidth * kiMbHeight;
+	const int32_t kiMbNumPerSlice	= kiMbNumInFrame / kuiSliceNum;
+	int32_t iNumMbLeft				= kiMbNumInFrame;
+
+	int32_t iMinimalMbNum			= kiMbWidth;	// in theory we need only 1 SMB, here let it as one SMB row required
+	int32_t iMaximalMbNum			= 0;	// dynamically assign later
+	int32_t iGomSize;
+
+	uint32_t uiSliceIdx	= 0;	// for test
+
+	// MODE0 row counts are used as the GOM height (see the MODE0/MODE1 definitions in rc.h)
+	if( kiMbWidth<=MB_WIDTH_THRESHOLD_90P )
+		iGomSize = kiMbWidth * GOM_ROW_MODE0_90P;
+	else if( kiMbWidth<=MB_WIDTH_THRESHOLD_180P )
+		iGomSize = kiMbWidth * GOM_ROW_MODE0_180P;
+	else if( kiMbWidth<=MB_WIDTH_THRESHOLD_360P )
+		iGomSize = kiMbWidth * GOM_ROW_MODE0_360P;
+	else
+		iGomSize = kiMbWidth * GOM_ROW_MODE0_720P;
+
+	iMinimalMbNum	= iGomSize;
+	iMaximalMbNum	= kiMbNumInFrame - (kuiSliceNum - 1) * iMinimalMbNum;
+
+	while ( uiSliceIdx+1 < kuiSliceNum )
+	{
+		// GOM boundary aligned
+		int32_t iNumMbAssigning = (int32_t)(1.0f * kiMbNumPerSlice / iGomSize + 0.5f + EPSN) * iGomSize;
+
+		// make sure one GOM at least in each slice for safe
+		if ( iNumMbAssigning < iMinimalMbNum )
+			iNumMbAssigning	= iMinimalMbNum;
+		else if ( iNumMbAssigning > iMaximalMbNum )
+			iNumMbAssigning	= iMaximalMbNum;
+
+		assert( iNumMbAssigning > 0 );
+
+		iNumMbLeft -= iNumMbAssigning;
+		assert( iNumMbLeft > 0 );
+		pSlicesAssignList[uiSliceIdx]	= iNumMbAssigning;
+
+		++ uiSliceIdx;
+		iMaximalMbNum	= iNumMbLeft - (kuiSliceNum - uiSliceIdx - 1) * iMinimalMbNum;	// get maximal num_mb in left parts
+	}
+	pSlicesAssignList[uiSliceIdx] = iNumMbLeft;
+}
+
+
+/*!
+ *	Get slice count for multiple slice segment
+ *
+ *	Returns AVERSLICENUM_CONSTRAINT for SM_DYN_SLICE, the configured
+ *	sSliceArgument.iSliceNum for the single / fixed-number / raster / row-MB modes,
+ *	and -1 for a NULL option or any other mode (SM_RESERVED included).
+ *	kiMbWidth and kiMbHeight are not used.
+ */
+int32_t GetInitialSliceNum( const int32_t kiMbWidth, const int32_t kiMbHeight, SMulSliceOption* pMso )
+{
+	if ( NULL == pMso )
+		return -1;
+
+	const SliceMode keMode = pMso->uiSliceMode;
+
+	// initial slice count at the beginning of dynamic slicing
+	if ( SM_DYN_SLICE == keMode )
+		return AVERSLICENUM_CONSTRAINT;
+
+	if (	SM_SINGLE_SLICE == keMode ||
+			SM_FIXEDSLCNUM_SLICE == keMode ||
+			SM_RASTER_SLICE == keMode ||
+			SM_ROWMB_SLICE == keMode )
+		return pMso->sSliceArgument.iSliceNum;
+
+	// SM_RESERVED and anything unknown
+	return -1;
+}
+
+/*!
+ * \brief	Initialize slice segment (Single/multiple slices)
+ *
+ * \param	pSliceSeg	SSlice segment to be initialized
+ * \param	pMa			memory allocator used for the three per-segment tables
+ * \param	pMso		multiple slice option (slice mode and its arguments)
+ * \param	kiMbWidth	MB width
+ * \param	kiMbHeight	MB height
+ *
+ * \return	0 - successful; none 0 - failed;
+ *
+ * \note	A segment that is already set up for the same geometry and slice mode is left
+ *			untouched. In every other case the previously owned tables are released before
+ *			new ones are allocated, so a re-initialisation never leaks them.
+ * \note	The multiple-slice path returns whatever AssignMbMapMultipleSlices returns.
+ */
+int32_t InitSliceSegment(	SSliceCtx *pSliceSeg,
+						    CMemoryAlign *pMa,
+							SMulSliceOption *pMso,
+							const int32_t kiMbWidth,
+							const int32_t kiMbHeight )
+{
+	const int32_t kiCountMbNum = kiMbWidth * kiMbHeight;
+	 SliceMode uiSliceMode = SM_SINGLE_SLICE;
+
+	if ( NULL == pSliceSeg || NULL == pMso || kiMbWidth == 0 || kiMbHeight == 0 )
+		return 1;
+
+	uiSliceMode = pMso->uiSliceMode;
+	if ( pSliceSeg->iMbNumInFrame == kiCountMbNum && pSliceSeg->iMbWidth == kiMbWidth
+			&& pSliceSeg->iMbHeight == kiMbHeight && pSliceSeg->uiSliceMode == uiSliceMode && pSliceSeg->pOverallMbMap != NULL )
+			return 0;
+
+	// the allocator is required from here on
+	if ( NULL == pMa )
+		return 1;
+
+	// Something differs (MB count, geometry or slice mode): drop the old tables in every
+	// case. Releasing them only when the MB count changed would leak all three buffers
+	// when just the slice mode changes, because they are re-allocated below.
+	if ( NULL != pSliceSeg->pOverallMbMap )
+	{
+		pMa->WelsFree( pSliceSeg->pOverallMbMap, "pSliceSeg->pOverallMbMap" );
+
+		pSliceSeg->pOverallMbMap = NULL;
+	}
+	if ( NULL != pSliceSeg->pFirstMbInSlice )
+	{
+		pMa->WelsFree( pSliceSeg->pFirstMbInSlice, "pSliceSeg->pFirstMbInSlice" );
+
+		pSliceSeg->pFirstMbInSlice = NULL;
+	}
+	if ( NULL != pSliceSeg->pCountMbNumInSlice )
+	{
+		pMa->WelsFree( pSliceSeg->pCountMbNumInSlice, "pSliceSeg->pCountMbNumInSlice" );
+
+		pSliceSeg->pCountMbNumInSlice	= NULL;
+	}
+	// just for safe
+	pSliceSeg->iSliceNumInFrame	= 0;
+	pSliceSeg->iMbNumInFrame		= 0;
+	pSliceSeg->iMbWidth				= 0;
+	pSliceSeg->iMbHeight			= 0;
+	pSliceSeg->uiSliceMode			= SM_SINGLE_SLICE;	// single in default
+
+	if ( SM_SINGLE_SLICE == uiSliceMode )
+	{
+		pSliceSeg->pOverallMbMap	= (uint8_t *)pMa->WelsMalloc(kiCountMbNum * sizeof(uint8_t), "pSliceSeg->pOverallMbMap" );
+
+		WELS_VERIFY_RETURN_IF( 1, NULL == pSliceSeg->pOverallMbMap )
+		pSliceSeg->iSliceNumInFrame	= 1;
+
+		pSliceSeg->pFirstMbInSlice	= (int16_t *)pMa->WelsMalloc( pSliceSeg->iSliceNumInFrame * sizeof(int16_t), "pSliceSeg->pFirstMbInSlice" );
+
+		WELS_VERIFY_RETURN_IF( 1, NULL == pSliceSeg->pFirstMbInSlice )
+
+		pSliceSeg->pCountMbNumInSlice= (int32_t *)pMa->WelsMalloc( pSliceSeg->iSliceNumInFrame * sizeof(int32_t), "pSliceSeg->pCountMbNumInSlice" );
+
+		WELS_VERIFY_RETURN_IF( 1, NULL == pSliceSeg->pCountMbNumInSlice )
+		pSliceSeg->uiSliceMode			= uiSliceMode;
+		pSliceSeg->iMbWidth				= kiMbWidth;
+		pSliceSeg->iMbHeight			= kiMbHeight;
+		pSliceSeg->iMbNumInFrame		= kiCountMbNum;
+		pSliceSeg->pCountMbNumInSlice[0]	= kiCountMbNum;
+		pSliceSeg->pFirstMbInSlice[0]		= 0;
+
+		return AssignMbMapSingleSlice( pSliceSeg->pOverallMbMap, kiCountMbNum, sizeof(pSliceSeg->pOverallMbMap[0]) );
+	}
+	else //if ( SM_MULTIPLE_SLICE == uiSliceMode )
+	{
+		if ( uiSliceMode != SM_FIXEDSLCNUM_SLICE && uiSliceMode != SM_ROWMB_SLICE && uiSliceMode != SM_RASTER_SLICE && uiSliceMode != SM_DYN_SLICE )
+			return 1;
+
+		pSliceSeg->pOverallMbMap	= (uint8_t *)pMa->WelsMalloc( kiCountMbNum * sizeof(uint8_t), "pSliceSeg->pOverallMbMap" );
+
+		WELS_VERIFY_RETURN_IF( 1, NULL == pSliceSeg->pOverallMbMap )
+
+		//SM_DYN_SLICE: starts from the count GetInitialSliceNum reports for that mode
+		pSliceSeg->iSliceNumInFrame = GetInitialSliceNum( kiMbWidth, kiMbHeight, pMso );
+
+		if ( -1 == pSliceSeg->iSliceNumInFrame )
+			return 1;
+
+		pSliceSeg->pCountMbNumInSlice	= (int32_t *)pMa->WelsMalloc( pSliceSeg->iSliceNumInFrame * sizeof(int32_t), "pSliceSeg->pCountMbNumInSlice" );
+
+		WELS_VERIFY_RETURN_IF( 1, NULL == pSliceSeg->pCountMbNumInSlice )
+
+		pSliceSeg->pFirstMbInSlice		= (int16_t *)pMa->WelsMalloc( pSliceSeg->iSliceNumInFrame * sizeof(int16_t), "pSliceSeg->pFirstMbInSlice" );
+
+		WELS_VERIFY_RETURN_IF( 1, NULL == pSliceSeg->pFirstMbInSlice )
+		pSliceSeg->uiSliceMode			= pMso->uiSliceMode;
+		pSliceSeg->iMbWidth				= kiMbWidth;
+		pSliceSeg->iMbHeight			= kiMbHeight;
+		pSliceSeg->iMbNumInFrame		= kiCountMbNum;
+		if ( SM_DYN_SLICE == pMso->uiSliceMode )
+		{
+			// dynamic slicing needs a positive slice size constraint
+			if ( 0 < pMso->sSliceArgument.uiSliceSizeConstraint )
+			{
+				pSliceSeg->uiSliceSizeConstraint= pMso->sSliceArgument.uiSliceSizeConstraint;
+			}
+			else
+			{
+				return 1;
+			}
+		}
+		else
+		{
+			pSliceSeg->uiSliceSizeConstraint = DEFAULT_MAXPACKETSIZE_CONSTRAINT;
+		}
+		// about "iMaxSliceNumConstraint"
+		//only used in SM_DYN_SLICE mode so far,
+		//now follows NAL_UNIT_CONSTRAINT, (see definition)
+		//will be adjusted under MT if there is limitation on iLayerNum
+		pSliceSeg->iMaxSliceNumConstraint = MAX_SLICES_NUM;
+
+
+		return AssignMbMapMultipleSlices( pSliceSeg, pMso );
+	}
+	return 0;
+}
+
+/*!
+ * \brief	Uninitialize slice segment (Single/multiple slices)
+ *
+ * \param	pSliceSeg			SSlice segment to be uninitialized (NULL is accepted)
+ * \param	pMa					allocator the segment tables are returned to
+ *
+ * \return	none;
+ *
+ * Frees the MB map, the first-MB table and the MB-count table (each only when set),
+ * clears the pointers and resets geometry, slice number and slice mode.
+ */
+void UninitSliceSegment( SSliceCtx *pSliceSeg, CMemoryAlign *pMa )
+{
+	if ( NULL == pSliceSeg )
+		return;
+
+	if ( NULL != pSliceSeg->pOverallMbMap )
+	{
+		pMa->WelsFree( pSliceSeg->pOverallMbMap, "pSliceSeg->pOverallMbMap" );
+		pSliceSeg->pOverallMbMap = NULL;
+	}
+
+	if ( NULL != pSliceSeg->pFirstMbInSlice )
+	{
+		pMa->WelsFree( pSliceSeg->pFirstMbInSlice, "pSliceSeg->pFirstMbInSlice" );
+		pSliceSeg->pFirstMbInSlice = NULL;
+	}
+
+	if ( NULL != pSliceSeg->pCountMbNumInSlice )
+	{
+		pMa->WelsFree( pSliceSeg->pCountMbNumInSlice, "pSliceSeg->pCountMbNumInSlice" );
+		pSliceSeg->pCountMbNumInSlice = NULL;
+	}
+
+	// back to the "empty, single slice" state
+	pSliceSeg->uiSliceMode		= SM_SINGLE_SLICE;
+	pSliceSeg->iSliceNumInFrame	= 0;
+	pSliceSeg->iMbNumInFrame	= 0;
+	pSliceSeg->iMbWidth			= 0;
+	pSliceSeg->iMbHeight		= 0;
+}
+
+
+/*!
+ * \brief	Initialize Wels SSlice context (Single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context to be initialized
+ * \param	pMa				memory allocator handed on to InitSliceSegment
+ * \param	bFmoUseFlag		flag of using fmo (not used here)
+ * \param	iMbWidth		MB width
+ * \param	iMbHeight		MB height
+ * \param	pMso			slice mode and multiple slice argument
+ * \param	pPpsArg			argument for pPps parameter (not used here)
+ *
+ * \return	0 when pSliceCtx is not NULL, 1 otherwise.
+ *
+ * \note	The result of InitSliceSegment is discarded, so a failure in there is not
+ *			reported to the caller.
+ */
+int32_t InitSlicePEncCtx( SSliceCtx *pSliceCtx,
+						    CMemoryAlign *pMa,
+						    bool_t bFmoUseFlag,
+							int32_t iMbWidth,
+							int32_t iMbHeight,
+							SMulSliceOption *pMso,
+							void *pPpsArg )
+{
+	if ( NULL != pSliceCtx )
+	{
+		(void)InitSliceSegment( pSliceCtx, pMa, pMso, iMbWidth, iMbHeight );
+		return 0;
+	}
+
+	return 1;
+}
+
+/*!
+ * \brief	Uninitialize Wels SSlice context (Single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context to be uninitialized (NULL is ignored)
+ * \param	pMa				allocator the context tables are returned to
+ *
+ * \return	NONE;
+ */
+void UninitSlicePEncCtx( SSliceCtx *pSliceCtx, CMemoryAlign *pMa )
+{
+	if ( NULL == pSliceCtx )
+		return;
+
+	UninitSliceSegment( pSliceCtx, pMa );
+}
+
+/*!
+ * \brief	Get slice idc for given iMbXY (apply in Single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context
+ * \param	kiMbXY			MB xy index
+ *
+ * \return	uiSliceIdc - successful; (uint8_t)(-1), i.e. 255, for a NULL context or an
+ *			index outside [0, iMbNumInFrame)
+ */
+uint8_t WelsMbToSliceIdc( SSliceCtx *pSliceCtx, const int16_t kiMbXY )
+{
+	if ( NULL == pSliceCtx || kiMbXY < 0 || kiMbXY >= pSliceCtx->iMbNumInFrame )
+		return (uint8_t)(-1);
+
+	return pSliceCtx->pOverallMbMap[ kiMbXY ];
+}
+
+/*!
+ * \brief	Get first mb in slice/slice_group: uiSliceIdc (apply in Single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context
+ * \param	kuiSliceIdc		slice idc
+ *
+ * \return	iFirstMb - successful; -1 - failed (NULL context, table not allocated or
+ *			negative slice idc)
+ *
+ * \note	No upper bound is applied to kuiSliceIdc: the capacity of pFirstMbInSlice is not
+ *			stored in the context, so the caller has to stay within the allocated table.
+ */
+int32_t WelsGetFirstMbOfSlice( SSliceCtx *pSliceCtx, const int32_t kuiSliceIdc )
+{
+	// honour the documented "-1 - failed" contract instead of dereferencing blindly
+	if ( NULL == pSliceCtx || NULL == pSliceCtx->pFirstMbInSlice || kuiSliceIdc < 0 )
+		return -1;
+
+	return pSliceCtx->pFirstMbInSlice[ kuiSliceIdc ];
+}
+
+/*!
+ * \brief	Get successive mb to be processed in slice/slice_group: uiSliceIdc (apply in Single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context
+ * \param	kiMbXY			MB xy index
+ *
+ * \return	next_mb - successful; -1 - failed (NULL context, index out of range, end of the
+ *			frame, next MB mapped to another slice, missing map, or SM_RESERVED mode)
+ */
+int32_t WelsGetNextMbOfSlice( SSliceCtx *pSliceCtx, const int16_t kiMbXY )
+{
+	if ( NULL == pSliceCtx || kiMbXY < 0 || kiMbXY >= pSliceCtx->iMbNumInFrame )
+		return -1;
+
+	const int32_t kiNextMbIdx	= kiMbXY + 1;
+	const bool kbInsideFrame	= ( kiNextMbIdx < pSliceCtx->iMbNumInFrame );
+
+	// single slice: plain raster order, the map is not consulted
+	if ( SM_SINGLE_SLICE == pSliceCtx->uiSliceMode )
+		return kbInsideFrame ? kiNextMbIdx : -1;
+
+	// reserved here for other multiple slice type
+	if ( SM_RESERVED == pSliceCtx->uiSliceMode )
+		return -1;
+
+	// multiple slices: the successor must carry the same slice idc in the map
+	if ( kbInsideFrame && NULL != pSliceCtx->pOverallMbMap
+		&& pSliceCtx->pOverallMbMap[ kiNextMbIdx ] == pSliceCtx->pOverallMbMap[ kiMbXY ] )
+		return kiNextMbIdx;
+
+	return -1;
+}
+
+/*!
+ * \brief	Get previous mb to be processed in slice/slice_group: uiSliceIdc (apply in Single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context
+ * \param	kiMbXY			MB xy index
+ *
+ * \return	prev_mb - successful; -1 - failed (NULL context, index out of range, first MB
+ *			of the frame, previous MB mapped to another slice, missing map, or SM_RESERVED)
+ */
+int32_t WelsGetPrevMbOfSlice( SSliceCtx *pSliceCtx, const int16_t kiMbXY )
+{
+	if ( NULL == pSliceCtx || kiMbXY < 0 || kiMbXY >= pSliceCtx->iMbNumInFrame )
+		return -1;
+
+	// single slice: plain raster order; yields -1 for the first MB
+	if ( SM_SINGLE_SLICE == pSliceCtx->uiSliceMode )
+		return (-1+kiMbXY);
+
+	// Only SM_RESERVED is unsupported, as in WelsGetNextMbOfSlice. The test used to be
+	// inverted here, which made every real multiple-slice mode return -1.
+	if ( SM_RESERVED == pSliceCtx->uiSliceMode )
+		return -1;
+
+	{
+		const int32_t kiPrevMbIdx = kiMbXY - 1;
+
+		// the predecessor must exist and carry the same slice idc in the map
+		if ( kiPrevMbIdx >= 0 && NULL != pSliceCtx->pOverallMbMap
+			&& pSliceCtx->pOverallMbMap[ kiMbXY ] == pSliceCtx->pOverallMbMap[ kiPrevMbIdx ] )
+			return kiPrevMbIdx;
+	}
+	return -1;
+}
+
+/*!
+ * \brief	Get number of mb in slice/slice_group: uiSliceIdc (apply in Single/multiple slices and FMO)
+ *
+ * \param	pSliceCtx		SSlice context
+ * \param	kuiSliceIdc		slice/slice_group idc
+ *
+ * \return	count_num_of_mb - successful; -1 - failed (NULL context, table not allocated,
+ *			or idc outside the valid range: only 0 in single slice mode,
+ *			[0, iSliceNumInFrame) otherwise)
+ */
+int32_t WelsGetNumMbInSlice( SSliceCtx *pSliceCtx, const int32_t kuiSliceIdc )
+{
+	if ( NULL == pSliceCtx || kuiSliceIdc < 0 || NULL == pSliceCtx->pCountMbNumInSlice )
+		return -1;
+
+	// number of valid slice idc values for the current mode
+	const int32_t kiSliceLimit = ( SM_SINGLE_SLICE == pSliceCtx->uiSliceMode ) ? 1 : pSliceCtx->iSliceNumInFrame;
+
+	if ( kuiSliceIdc >= kiSliceLimit )
+		return -1;
+
+	return pSliceCtx->pCountMbNumInSlice[ kuiSliceIdc ];
+}
+
+// Returns the slice number currently stored in the context, or -1 for a NULL context.
+int32_t GetCurrentSliceNum( const SSliceCtx *kpSliceCtx )
+{
+	if ( NULL == kpSliceCtx )
+		return -1;
+
+	return kpSliceCtx->iSliceNumInFrame;
+}
+/*
+ * Re-partitions the frame according to pRunLength (one MB run length per slice).
+ *
+ * Returns 1 without touching anything when every run length equals the stored per-slice
+ * MB count; otherwise rewrites pFirstMbInSlice, pCountMbNumInSlice and pOverallMbMap and
+ * returns 0. Map writes are limited to the MBs that remain in the frame, so run lengths
+ * that add up to more than iMbNumInFrame can not write past the end of the map.
+ */
+int32_t DynamicAdjustSlicePEncCtxAll(	SSliceCtx *pSliceCtx,
+											int32_t *pRunLength	)
+{
+	const int32_t iCountNumMbInFrame		= pSliceCtx->iMbNumInFrame;
+	const int32_t iCountSliceNumInFrame	= pSliceCtx->iSliceNumInFrame;
+	int32_t iSameRunLenFlag				= 1;
+	int32_t iFirstMbIdx					= 0;
+	int32_t iSliceIdx						= 0;
+
+	assert( iCountSliceNumInFrame <= MAX_THREADS_NUM );
+
+	while( iSliceIdx < iCountSliceNumInFrame )
+	{
+		if (pRunLength[iSliceIdx] != pSliceCtx->pCountMbNumInSlice[iSliceIdx])
+		{
+			iSameRunLenFlag = 0;
+			break;
+		}
+		++ iSliceIdx;
+	}
+	if ( iSameRunLenFlag )
+	{
+		return 1;	// do not need adjust it due to same running length as before to save complexity
+	}
+
+	iSliceIdx = 0;
+	do {
+		const int32_t kiSliceRun	= pRunLength[iSliceIdx];
+		// never fill more than what is left of the map from iFirstMbIdx on
+		const int32_t kiMbLeft		= iCountNumMbInFrame - iFirstMbIdx;
+		const int32_t kiFillLen		= (kiSliceRun < kiMbLeft) ? kiSliceRun : kiMbLeft;
+
+		pSliceCtx->pFirstMbInSlice[iSliceIdx]			= iFirstMbIdx;
+		pSliceCtx->pCountMbNumInSlice[iSliceIdx]		= kiSliceRun;
+
+		if ( iFirstMbIdx >= 0 && kiFillLen > 0 )
+			memset(pSliceCtx->pOverallMbMap+iFirstMbIdx, (uint8_t)iSliceIdx, kiFillLen*sizeof(uint8_t));
+
+		iFirstMbIdx += kiSliceRun;
+
+		++ iSliceIdx;
+	} while(iSliceIdx < iCountSliceNumInFrame && iFirstMbIdx < iCountNumMbInFrame);
+
+	return 0;
+}
+
+// Slice budget: (uiMaximumNum - iConsumedNum - 1) / iDulplicateTimes, evaluated in
+// unsigned 32-bit arithmetic and converted to int32_t on return.
+// iDulplicateTimes must not be 0.
+int32_t DynamicMaxSliceNumConstraint( uint32_t uiMaximumNum, int32_t iConsumedNum, uint32_t iDulplicateTimes  )
+{
+	const uint32_t kuiRemaining = uiMaximumNum - iConsumedNum - 1;
+
+	return kuiRemaining / iDulplicateTimes;
+}
+
+} // namespace WelsSVCEnc
--- /dev/null
+++ b/codec/encoder/core/src/svc_encode_mb.cpp
@@ -1,0 +1,413 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file		encode_mb.c
+ *
+ * \brief		Implementation for pCurMb encoding
+ *
+ * \date		05/19/2009 Created
+ *************************************************************************************
+ */
+
+#include <stdio.h>	//test use for file operation
+#include <string.h>
+
+#include "svc_encode_mb.h"
+#include "encode_mb_aux.h"
+#include "decode_mb_aux.h"
+#include "ls_defines.h"
+#include "cpu_core.h"
+#include "as264_common.h"
+#include "mb_cache.h"
+#include "array_stack_align.h"
+
+namespace WelsSVCEnc {
+/*!
+ * \brief	Run pfDctFourT4 on the four quadrants of a macroblock
+ *
+ * The quadrants are addressed at offsets 0, +8 columns, +8 rows and +8 rows +8 columns
+ * of both the source and the prediction; each call receives its own run of 64
+ * coefficients in pRes.
+ *
+ * \param	pRes			output coefficients, written at pRes, pRes + 64, pRes + 128 and pRes + 192
+ * \param	pEncMb			source samples, addressed with stride iEncStride
+ * \param	iEncStride		stride of pEncMb
+ * \param	pBestPred		prediction samples, addressed with a fixed stride of 16
+ * \param	pfDctFourT4		transform routine invoked once per quadrant
+ */
+void WelsDctMb(int16_t* pRes, uint8_t* pEncMb, int32_t iEncStride, uint8_t* pBestPred, PDctFunc pfDctFourT4)
+{
+	const int32_t kiPredStride	= 16;
+	uint8_t* pEncLower			= pEncMb + 8 * iEncStride;	// 8 rows down in the source
+	uint8_t* pPredLower			= pBestPred + 128;			// 8 rows down in the prediction (8 * 16)
+
+	pfDctFourT4( pRes,			pEncMb,			iEncStride, pBestPred,		kiPredStride );
+	pfDctFourT4( pRes + 64,		pEncMb + 8,		iEncStride, pBestPred + 8,	kiPredStride );
+	pfDctFourT4( pRes + 128,	pEncLower,		iEncStride, pPredLower,		kiPredStride );
+	pfDctFourT4( pRes + 192,	pEncLower + 8,	iEncStride, pPredLower + 8,	kiPredStride );
+}
+
+/*!
+ * \brief	Transform, quantize and reconstruct the luma of an intra 16x16 macroblock
+ *
+ * Steps, in order:
+ *  - residual transform of pEncMb[0] against pMemPredLuma into pCoeffLevel (WelsDctMb)
+ *  - Hadamard transform, quantization and scan of the DC terms into pDct->iLumaI16x16Dc
+ *  - quantization and AC scan of the 16 blocks into pDct->iLumaBlock
+ *  - per-block non-zero counts stored in pCurMb->pNonZeroCount
+ *  - reconstruction into pCsMb[0]: full inverse transform when any AC level is non-zero,
+ *    DC-only inverse transform when only DC levels are non-zero, plain copy of the
+ *    prediction otherwise
+ *
+ * \param	pEncCtx		encoder context (function table and current layer strides)
+ * \param	pCurMb		current macroblock; uiLumaQp is read, pNonZeroCount and uiCbp are written
+ * \param	pMbCache	MB cache holding source, prediction, coefficient and reconstruction buffers
+ */
+void WelsEncRecI16x16Y(sWelsEncCtx *pEncCtx, SMB *pCurMb, SMbCache *pMbCache)
+{
+	ENFORCE_STACK_ALIGN_1D(int16_t, aDctT4Dc, 16, 16)
+	SWelsFuncPtrList *pFuncList	= pEncCtx->pFuncList;
+	SDqLayer* pCurDqLayer	    = pEncCtx->pCurDqLayer;
+	const int32_t kiEncStride	        = pCurDqLayer->iEncStride[0];	
+	int16_t *pRes				     	= pMbCache->pCoeffLevel;
+	uint8_t *pPred				        = pMbCache->SPicData.pCsMb[0];
+	const int32_t kiRecStride     	= pCurDqLayer->iCsStride[0];
+	int16_t *pBlock				        = pMbCache->pDct->iLumaBlock[0]; 
+	uint8_t *pBestPred		    	= pMbCache->pMemPredLuma;
+	const uint8_t* kpNoneZeroCountIdx	= &g_kuiMbCountScan4Idx[0];	
+	uint8_t i, uiQp						    = pCurMb->uiLumaQp;
+	uint32_t uiNoneZeroCount, uiNoneZeroCountMbAc				= 0, uiCountI16x16Dc;
+
+	int16_t* pMF = g_kiQuantMF[uiQp], *pFF	= g_iQuantIntraFF[uiQp];
+
+	WelsDctMb(pRes,  pMbCache->SPicData.pEncMb[0], kiEncStride, pBestPred, pEncCtx->pFuncList->pfDctFourT4);
+
+	// DC path: Hadamard of the DC terms, quantize, scan, then count non-zero DC levels
+	pFuncList->pfTransformHadamard4x4Dc(aDctT4Dc, pRes);
+	pFuncList->pfQuantizationDc4x4( aDctT4Dc, pFF[0]<<1, pMF[0]>>1);
+	pFuncList->pfScan4x4( pMbCache->pDct->iLumaI16x16Dc, aDctT4Dc);
+	uiCountI16x16Dc = pFuncList->pfGetNoneZeroCount(pMbCache->pDct->iLumaI16x16Dc);
+
+	// AC path: each pass quantizes 64 coefficients and scans them as four runs of 16
+	for(i = 0; i < 4; i++)
+	{	
+		pFuncList->pfQuantizationFour4x4(pRes, pFF,  pMF);
+		pFuncList->pfScan4x4Ac(pBlock,		pRes		);
+		pFuncList->pfScan4x4Ac(pBlock + 16, pRes + 16	);
+		pFuncList->pfScan4x4Ac(pBlock + 32, pRes + 32	);
+		pFuncList->pfScan4x4Ac(pBlock + 48, pRes + 48	);
+		pRes += 64; 
+		pBlock += 64;	
+	}
+	// rewind both pointers to the start of the macroblock
+	pRes -= 256;
+	pBlock -= 256;
+
+	// non-zero count of every block, stored at the position given by g_kuiMbCountScan4Idx
+	for(i=0; i<16; i++)	{
+		uiNoneZeroCount = pFuncList->pfGetNoneZeroCount(pBlock);
+		pCurMb->pNonZeroCount[*kpNoneZeroCountIdx++] = uiNoneZeroCount;
+		uiNoneZeroCountMbAc += uiNoneZeroCount;
+		pBlock += 16;
+	}	
+
+	// DC dequantization is only needed when at least one DC level survived;
+	// QP below 12 takes the two-step path, other QPs the combined function
+	if( uiCountI16x16Dc > 0 ){
+		if(uiQp < 12) 		
+		{
+			WelsIHadamard4x4Dc(aDctT4Dc);
+			WelsDequantLumaDc4x4(aDctT4Dc, uiQp);
+		}
+		else
+			pFuncList->pfDequantizationIHadamard4x4(aDctT4Dc, g_kuiDequantCoeff[uiQp][0]>>2);
+	}
+
+	if( uiNoneZeroCountMbAc > 0 )
+	{		
+		// AC levels present: flag all four luma 8x8 in the CBP and dequantize the AC terms
+		pCurMb->uiCbp = 15;	
+        pFuncList->pfDequantizationFour4x4(pRes, g_kuiDequantCoeff[uiQp]);
+		pFuncList->pfDequantizationFour4x4(pRes+64, g_kuiDequantCoeff[uiQp]);
+		pFuncList->pfDequantizationFour4x4(pRes+128, g_kuiDequantCoeff[uiQp]);
+		pFuncList->pfDequantizationFour4x4(pRes+192, g_kuiDequantCoeff[uiQp]);
+
+		// put the DC values back at the first coefficient of each block (stride of 16 in pRes)
+		pRes[0]  = aDctT4Dc[0];		pRes[16] = aDctT4Dc[1];  
+		pRes[32] = aDctT4Dc[4];		pRes[48] = aDctT4Dc[5];  
+		pRes[64] = aDctT4Dc[2];		pRes[80] = aDctT4Dc[3];  
+		pRes[96] = aDctT4Dc[6];		pRes[112]= aDctT4Dc[7];  
+		pRes[128]= aDctT4Dc[8];		pRes[144]= aDctT4Dc[9];  
+		pRes[160]= aDctT4Dc[12];		pRes[176]= aDctT4Dc[13]; 
+		pRes[192]= aDctT4Dc[10];		pRes[208]= aDctT4Dc[11]; 
+		pRes[224]= aDctT4Dc[14];		pRes[240]= aDctT4Dc[15]; 
+
+		// inverse transform of the four quadrants on top of the prediction
+		pFuncList->pfIDctFourT4(pPred,					              kiRecStride, pBestPred,		       16, pRes		);
+		pFuncList->pfIDctFourT4(pPred + 8,				          kiRecStride, pBestPred + 8,	   16, pRes + 64 );
+		pFuncList->pfIDctFourT4(pPred + kiRecStride*8,	      kiRecStride, pBestPred + 128,  16, pRes + 128);
+		pFuncList->pfIDctFourT4(pPred + kiRecStride*8 + 8, kiRecStride, pBestPred + 136,  16, pRes + 192);
+	} 
+	else if( uiCountI16x16Dc > 0 ){
+		// only DC levels: DC-only reconstruction
+		pFuncList->pfIDctI16x16Dc(pPred,	kiRecStride, pBestPred,	16, aDctT4Dc);
+	} 
+	else{	
+		// no residual at all: the reconstruction is the prediction
+		pFuncList->pfCopy16x16Aligned(pPred, kiRecStride, pBestPred, 16);
+	}
+}
+/*!
+ * \brief	Transform, quantize and reconstruct one 4x4 luma block of an intra 4x4 macroblock
+ *
+ * \param	pEncCtx		encoder context (function table, current layer, stride tables)
+ * \param	pCurMb		current macroblock; uiLumaQp is read, pNonZeroCount and uiCbp are written
+ * \param	pMbCache	MB cache; pCoeffLevel is used as scratch, pBestPredI4x4Blk4 is the prediction
+ * \param	uiI4x4Idx	index of the 4x4 block inside the macroblock
+ */
+void WelsEncRecI4x4Y( sWelsEncCtx *pEncCtx, SMB *pCurMb, SMbCache *pMbCache, uint8_t uiI4x4Idx)
+{
+	SWelsFuncPtrList *pFuncList	= pEncCtx->pFuncList;
+	SDqLayer* pCurDqLayer		= pEncCtx->pCurDqLayer;
+	int32_t iEncStride			= pCurDqLayer->iEncStride[0];
+	uint8_t uiQp					= pCurMb->uiLumaQp;
+
+	int16_t *pResI4x4 = pMbCache->pCoeffLevel;
+	uint8_t *pPredI4x4;
+
+	uint8_t *pPred     = pMbCache->SPicData.pCsMb[0];
+	int32_t iRecStride = pCurDqLayer->iCsStride[0];
+
+	uint32_t uiOffset = g_kuiMbCountScan4Idx[uiI4x4Idx];
+	uint8_t* pEncMb = pMbCache->SPicData.pEncMb[0];
+	uint8_t *pBestPred = pMbCache->pBestPredI4x4Blk4;
+	int16_t* pBlock = pMbCache->pDct->iLumaBlock[uiI4x4Idx];
+
+	int16_t *pMF = g_kiQuantMF[uiQp], *pFF = g_iQuantIntraFF[uiQp];
+
+	// per-block sample offsets; the reconstruction table is selected by whether uiTemporalId is 0
+	int32_t *pStrideEncBlockOffset = pEncCtx->pStrideTab->pStrideEncBlockOffset[pEncCtx->uiDependencyId];
+	int32_t *pStrideDecBlockOffset = pEncCtx->pStrideTab->pStrideDecBlockOffset[pEncCtx->uiDependencyId][0==pEncCtx->uiTemporalId];
+	int32_t iNoneZeroCount = 0;
+
+	// residual transform (prediction stride is 4), quantization and scan
+	pFuncList->pfDctT4( pResI4x4, &(pEncMb[pStrideEncBlockOffset[uiI4x4Idx]]), iEncStride, pBestPred, 4 );
+	pFuncList->pfQuantization4x4(pResI4x4, pFF, pMF);
+	pFuncList->pfScan4x4(pBlock, pResI4x4);
+	
+	iNoneZeroCount = pFuncList->pfGetNoneZeroCount(pBlock);
+	pCurMb->pNonZeroCount[uiOffset] = iNoneZeroCount;
+
+	pPredI4x4 = pPred + pStrideDecBlockOffset[uiI4x4Idx]; 
+	if ( iNoneZeroCount > 0 )
+	{
+		// set the CBP bit of the 8x8 that contains this block (uiI4x4Idx / 4)
+		pCurMb->uiCbp |= 1 << (uiI4x4Idx>>2);
+		pFuncList->pfDequantization4x4( pResI4x4, g_kuiDequantCoeff[uiQp]);
+		pFuncList->pfIDctT4(pPredI4x4, iRecStride, pBestPred, 4, pResI4x4);
+	}
+	else
+		// no residual: the reconstruction is the prediction
+		WelsCopy4x4(pPredI4x4, iRecStride, pBestPred, 4);
+}
+
+/*!
+ * \brief	Quantize the luma residual of an inter macroblock and drop cheap coefficients
+ *
+ * The transformed residual in pMbCache->pCoeffLevel is quantized in four groups of four
+ * 4x4 blocks. A cost is accumulated per group (iSingleCtr8x8) and for the whole MB
+ * (iSingleCtrMb); groups or the whole MB whose cost stays below the thresholds are zeroed,
+ * the remaining groups are dequantized in place and flagged in pCurMb->uiCbp.
+ *
+ * \param	pFuncList	function table
+ * \param	pCurMb		current macroblock; uiLumaQp is read, pNonZeroCount and uiCbp are written
+ * \param	pMbCache	MB cache; pCoeffLevel is read and modified, pDct->iLumaBlock receives the scanned levels
+ */
+void WelsEncInterY(SWelsFuncPtrList *pFuncList, SMB * pCurMb, SMbCache *pMbCache)
+{    
+	PQuantizationMaxFunc pfQuantizationFour4x4Max	= pFuncList->pfQuantizationFour4x4Max;
+	PSetMemoryZero pfSetMemZeroSize8				        = pFuncList->pfSetMemZeroSize8;
+	PSetMemoryZero pfSetMemZeroSize64			        = pFuncList->pfSetMemZeroSize64;
+	PScanFunc pfScan4x4			                                    = pFuncList->pfScan4x4;
+	PCalculateSingleCtrFunc pfCalculateSingleCtr4x4		= pFuncList->pfCalculateSingleCtr4x4;
+	PGetNoneZeroCountFunc pfGetNoneZeroCount	    = pFuncList->pfGetNoneZeroCount;
+	PDeQuantizationFunc pfDequantizationFour4x4		= pFuncList->pfDequantizationFour4x4;
+	int16_t *pRes					                                    = pMbCache->pCoeffLevel;
+	int32_t iSingleCtrMb		= 0, iSingleCtr8x8[4];
+	int16_t* pBlock				= pMbCache->pDct->iLumaBlock[0]; 
+	uint8_t uiQp					= pCurMb->uiLumaQp;
+	int16_t *pMF					= g_kiQuantMF[uiQp], *pFF = g_kiQuantInterFF[uiQp], aMax[16];
+	int32_t i, j, iNoneZeroCountMbDcAc	= 0, iNoneZeroCount=0;	
+
+	for(i = 0; i < 4; i++)
+	{	
+		// quantizes 64 coefficients; aMax gets one entry per 4x4 block
+		// (presumably the largest quantized level of that block - confirm against pfQuantizationFour4x4Max)
+		pfQuantizationFour4x4Max(pRes, pFF,  pMF, aMax+(i<<2));
+		iSingleCtr8x8[i] = 0;
+		for(j = 0; j < 4; j++)
+		{
+			if(aMax[(i<<2)+j] == 0)
+				pfSetMemZeroSize8(pBlock, 32);	// 32 bytes = the 16 int16_t levels of this block
+			else	
+			{
+				pfScan4x4(pBlock, pRes);		
+				if(aMax[(i<<2)+j] > 1)
+					iSingleCtr8x8[i] += 9;	
+				else if(iSingleCtr8x8[i] < 6)
+					iSingleCtr8x8[i] += pfCalculateSingleCtr4x4(pBlock);
+			}
+			pRes += 16; 
+			pBlock += 16;	
+		}
+		iSingleCtrMb += iSingleCtr8x8[i];
+	}
+	// rewind both pointers to the start of the macroblock
+	pBlock -= 256;
+	pRes -= 256;
+
+	memset(pCurMb->pNonZeroCount, 0, 16);  
+    
+   
+	if( iSingleCtrMb < 6 )	 //from JVT-O079
+    {		
+		// whole MB is cheap: clear 768 bytes (384 int16_t) starting at pRes
+		iNoneZeroCountMbDcAc = 0;
+		pfSetMemZeroSize64( pRes,  768 );	// confirmed_safe_unsafe_usage
+	}
+	else
+	{
+		const uint8_t* kpNoneZeroCountIdx = g_kuiMbCountScan4Idx;
+		for(i = 0; i < 4; i++)
+		{
+			if( iSingleCtr8x8[i] >= 4 ){				
+				// keep this group: record counts, dequantize in place, set its CBP bit
+				for( j = 0; j < 4; j++ ){
+					iNoneZeroCount = pfGetNoneZeroCount(pBlock);
+					pCurMb->pNonZeroCount[*kpNoneZeroCountIdx++] = iNoneZeroCount;
+					iNoneZeroCountMbDcAc += iNoneZeroCount;
+       				pBlock += 16; 
+				}
+				pfDequantizationFour4x4(pRes, g_kuiDequantCoeff[uiQp]);
+				pCurMb->uiCbp |= 1 << i;
+			}
+			else {	// set zero for an 8x8 pBlock
+				pfSetMemZeroSize64(pRes, 128);	// confirmed_safe_unsafe_usage
+				kpNoneZeroCountIdx += 4;
+				pBlock += 64; 
+			}	
+			pRes += 64;
+		}
+	}
+	// NOTE(review): iNoneZeroCountMbDcAc is accumulated above but never read in this function
+}
+
+/*!
+ * \brief	Quantize one chroma plane of a macroblock and prepare its coefficients for reconstruction
+ *
+ * The DC terms go through pfQuantizationHadamard2x2, the four 4x4 blocks through
+ * pfQuantizationFour4x4Max. For inter MBs a cost is accumulated and the AC part is zeroed
+ * when it stays below 7; otherwise the AC levels are counted, dequantized in place and
+ * uiCbp is updated. Dequantized DC values are finally written back into pRes.
+ *
+ * \param	pFuncList	function table
+ * \param	pCurMb		current macroblock; uiMbType and uiChromaQp are read, pNonZeroCount and uiCbp are written
+ * \param	pMbCache	MB cache; pDct->iChromaDc and pDct->iChromaBlock receive the scanned levels
+ * \param	pRes		64 transformed coefficients of the plane, modified in place
+ * \param	iUV			1 or 2, selects the plane
+ */
+void    WelsEncRecUV(SWelsFuncPtrList *pFuncList, SMB * pCurMb, SMbCache *pMbCache, int16_t * pRes, int32_t iUV)
+{
+	PQuantizationHadamardFunc pfQuantizationHadamard2x2		= pFuncList->pfQuantizationHadamard2x2;
+	PQuantizationMaxFunc pfQuantizationFour4x4Max	= pFuncList->pfQuantizationFour4x4Max;	
+	PSetMemoryZero pfSetMemZeroSize8				        = pFuncList->pfSetMemZeroSize8;
+	PSetMemoryZero pfSetMemZeroSize64				    = pFuncList->pfSetMemZeroSize64;
+	PScanFunc pfScan4x4Ac		                                	= pFuncList->pfScan4x4Ac;
+	PCalculateSingleCtrFunc pfCalculateSingleCtr4x4		= pFuncList->pfCalculateSingleCtr4x4;
+	PGetNoneZeroCountFunc pfGetNoneZeroCount	    = pFuncList->pfGetNoneZeroCount;
+	PDeQuantizationFunc pfDequantizationFour4x4		= pFuncList->pfDequantizationFour4x4;
+	const int32_t kiInterFlag				                            = !IS_INTRA( pCurMb->uiMbType);
+	const uint8_t	kiQp                                                   = pCurMb->uiChromaQp;
+	uint8_t i, uiNoneZeroCount, uiNoneZeroCountMbAc	= 0, uiNoneZeroCountMbDc = 0;
+	uint8_t uiNoneZeroCountOffset	                            = (iUV - 1)<<1;	//UV==1 or 2 
+	uint8_t uiSubMbIdx				                                = 16 + ((iUV - 1)<<2);			//uiSubMbIdx == 16 or 20
+	int16_t* iChromaDc			= pMbCache->pDct->iChromaDc[iUV-1], *pBlock = pMbCache->pDct->iChromaBlock[(iUV - 1)<<2];		
+	int16_t aDct2x2[4], j, aMax[4];
+	int32_t iSingleCtr8x8		= 0;
+	// intra MBs read g_kiQuantInterFF six rows further (index kiQp + 6)
+	int16_t* pMF = g_kiQuantMF[kiQp], *pFF = g_kiQuantInterFF[(!kiInterFlag)*6+kiQp];
+
+	// DC: returns the number of non-zero DC levels, aDct2x2 and iChromaDc are filled by the callee
+	uiNoneZeroCountMbDc = pfQuantizationHadamard2x2(pRes, pFF[0]<<1, pMF[0]>>1, aDct2x2, iChromaDc);
+
+	pfQuantizationFour4x4Max(pRes, pFF,  pMF, aMax);
+
+	for(j = 0; j < 4; j++)
+	{	
+		if(aMax[j] == 0)
+			pfSetMemZeroSize8(pBlock, 32);	// 32 bytes = the 16 int16_t levels of this block
+		else	
+		{
+			pfScan4x4Ac(pBlock, pRes);	
+			if(kiInterFlag)				
+			{
+				if(aMax[j] > 1)
+					iSingleCtr8x8 += 9;	
+				else if(iSingleCtr8x8 < 7)
+					iSingleCtr8x8 += pfCalculateSingleCtr4x4(pBlock);
+			}
+			else
+				iSingleCtr8x8 = INT_MAX;	// intra: never discard coded AC levels
+		}
+		pRes += 16; 
+		pBlock += 16;	
+	}	
+	pRes -= 64;
+
+	if(  iSingleCtr8x8 < 7 )	//from JVT-O079
+	{		
+		// AC part dropped: clear 128 bytes at pRes and the plane's non-zero counts
+		pfSetMemZeroSize64(pRes, 128);	// confirmed_safe_unsafe_usage
+		ST16( &pCurMb->pNonZeroCount[16+uiNoneZeroCountOffset], 0 );
+		ST16( &pCurMb->pNonZeroCount[20+uiNoneZeroCountOffset], 0 );
+	}
+	else
+	{
+		const uint8_t* kpNoneZeroCountIdx = &g_kuiMbCountScan4Idx[uiSubMbIdx];
+		pBlock -= 64;
+		for(i=0; i<4; i++){
+			uiNoneZeroCount = pfGetNoneZeroCount(pBlock);
+			pCurMb->pNonZeroCount[*kpNoneZeroCountIdx++] = uiNoneZeroCount;
+			uiNoneZeroCountMbAc += uiNoneZeroCount;
+			pBlock += 16;
+		}
+		pfDequantizationFour4x4(pRes, g_kuiDequantCoeff[pCurMb->uiChromaQp]);
+		// chroma CBP (upper bits of uiCbp) becomes 2, luma bits are kept
+		pCurMb->uiCbp &= 0x0F;
+		pCurMb->uiCbp |= 0x20;	
+	}
+
+	if (uiNoneZeroCountMbDc > 0)
+	{	
+		WelsDequantIHadamard2x2Dc(aDct2x2, g_kuiDequantCoeff[kiQp][0] >> 1);
+		// chroma CBP becomes 1 unless it is already 2
+		if ( 2 != (pCurMb->uiCbp >> 4) )
+			pCurMb->uiCbp |= (0x01 << 4) ;
+		// DC values go back to the first coefficient of each 4x4 block
+ 		pRes[0]	= aDct2x2[0];
+ 		pRes[16]	= aDct2x2[1];
+ 		pRes[32]	= aDct2x2[2];
+ 		pRes[48]	= aDct2x2[3];                   
+	}
+}
+
+
+/*!
+ * \brief	Reconstruct a skipped macroblock by copying the skip prediction
+ *
+ * Copies pMbCache->pSkipMb into the three planes of SPicData.pCsMb (16x16 from offset 0,
+ * 8x8 from offset 256 and 8x8 from offset 320) and clears 24 bytes of pNonZeroCount.
+ *
+ * \param	pCurLayer	current layer, provides iCsStride[0..2]
+ * \param	pFuncList	function table
+ * \param	pCurMb		current macroblock; pNonZeroCount is cleared
+ * \param	pMbCache	MB cache holding the skip prediction and the destination pointers
+ */
+void    WelsRecPskip(SDqLayer *pCurLayer, SWelsFuncPtrList *pFuncList, SMB * pCurMb, SMbCache *pMbCache)
+{
+	const int32_t* kpCsStride	= pCurLayer->iCsStride;
+	uint8_t** ppCsPlane			= &pMbCache->SPicData.pCsMb[0];
+
+	// plane 0
+	pFuncList->pfCopy16x16Aligned( ppCsPlane[0], kpCsStride[0], pMbCache->pSkipMb, 16 );
+	// plane 1 and plane 2
+	pFuncList->pfCopy8x8Aligned( ppCsPlane[1], kpCsStride[1], pMbCache->pSkipMb + 256, 8 );
+	pFuncList->pfCopy8x8Aligned( ppCsPlane[2], kpCsStride[2], pMbCache->pSkipMb + 320, 8 );
+
+	// a skipped MB carries no coefficients
+	pFuncList->pfSetMemZeroSize8( pCurMb->pNonZeroCount, 24 );
+}
+
+/*!
+ * \brief	Check whether the luma residual of a macroblock is cheap enough for P_SKIP
+ *
+ * Quantizes pMbCache->pCoeffLevel in four groups of four 4x4 blocks. The check fails as soon
+ * as one block reports a maximum above 1, or as soon as the accumulated cost of the blocks
+ * with a maximum of 1 reaches 6 (from JVT-O079).
+ *
+ * \param	pEncCtx		encoder context (function table)
+ * \param	pCurMb		current macroblock, uiLumaQp is read
+ * \param	pMbCache	MB cache; pCoeffLevel is quantized in place, pDct->iLumaBlock is used as scan target
+ *
+ * \return	TRUE when the luma can be skipped, FALSE otherwise
+ */
+BOOL_T WelsTryPYskip(sWelsEncCtx * pEncCtx, SMB * pCurMb, SMbCache *pMbCache)
+{
+	SWelsFuncPtrList *pFunc	= pEncCtx->pFuncList;
+	const uint8_t kuiQp		= pCurMb->uiLumaQp;
+	int16_t* pMF			= g_kiQuantMF[kuiQp];
+	int16_t* pFF			= g_kiQuantInterFF[kuiQp];
+	int16_t *pCoeff			= pMbCache->pCoeffLevel;
+	int16_t* pScan			= pMbCache->pDct->iLumaBlock[0];
+	int32_t iCost			= 0;
+	uint16_t aMax[4];
+	int32_t iGroup, iBlk;
+
+	for ( iGroup = 0; iGroup < 4; ++ iGroup )
+	{
+		pFunc->pfQuantizationFour4x4Max( pCoeff, pFF, pMF, (int16_t*)aMax );
+
+		for ( iBlk = 0; iBlk < 4; ++ iBlk, pCoeff += 16, pScan += 16 )
+		{
+			if ( aMax[iBlk] > 1 )
+				return FALSE;	// would add 9 to the cost, can't be P_SKIP
+
+			if ( 1 == aMax[iBlk] )
+			{
+				pFunc->pfScan4x4( pScan, pCoeff );
+				iCost += pFunc->pfCalculateSingleCtr4x4( pScan );
+			}
+
+			if ( iCost >= 6 )
+				return FALSE;	//from JVT-O079
+		}
+	}
+
+	return TRUE;
+}
+
+/*!
+ * \brief	Check whether one chroma plane of a macroblock is cheap enough for P_SKIP
+ *
+ * Fails when pfQuantizationHadamard2x2Skip returns non-zero, when a 4x4 block reports a
+ * maximum above 1, or when the accumulated cost of the blocks with a maximum of 1
+ * reaches 7 (from JVT-O079).
+ *
+ * \param	pEncCtx		encoder context (function table, PPS of the current layer)
+ * \param	pCurMb		current macroblock, uiLumaQp is read
+ * \param	pMbCache	MB cache; coefficients are taken at pCoeffLevel + 256 when iUV is 1, at pCoeffLevel + 320 otherwise
+ * \param	iUV			plane selector, 1 or 2
+ *
+ * \return	TRUE when the plane can be skipped, FALSE otherwise
+ */
+BOOL_T    WelsTryPUVskip(sWelsEncCtx * pEncCtx, SMB * pCurMb, SMbCache *pMbCache, int32_t iUV)
+{
+	SWelsFuncPtrList *pFunc	= pEncCtx->pFuncList;
+	int16_t* pCoeff			= pMbCache->pCoeffLevel + ( (1 == iUV) ? 256 : (256+64) );
+
+	const uint8_t kuiQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + pEncCtx->pCurDqLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset)];
+
+	int16_t* pMF = g_kiQuantMF[kuiQp];
+	int16_t* pFF = g_kiQuantInterFF[kuiQp];
+
+	uint16_t aMax[4];
+	int32_t iCost	= 0;
+	int32_t iBlk;
+	int16_t* pScan;
+
+	// DC check first
+	if ( pFunc->pfQuantizationHadamard2x2Skip( pCoeff, pFF[0]<<1, pMF[0]>>1 ) )
+		return FALSE;
+
+	pScan = pMbCache->pDct->iChromaBlock[(iUV-1)<<2];
+	pFunc->pfQuantizationFour4x4Max( pCoeff, pFF, pMF, (int16_t*)aMax );
+
+	for ( iBlk = 0; iBlk < 4; ++ iBlk, pCoeff += 16, pScan += 16 )
+	{
+		if ( aMax[iBlk] > 1 )
+			return FALSE;	// would add 9 to the cost, can't be P_SKIP
+
+		if ( 1 == aMax[iBlk] )
+		{
+			pFunc->pfScan4x4Ac( pScan, pCoeff );
+			iCost += pFunc->pfCalculateSingleCtr4x4( pScan );
+		}
+
+		if ( iCost >= 7 )
+			return FALSE;	//from JVT-O079
+	}
+
+	return TRUE;
+}
+
+} // namespace WelsSVCEnc
--- /dev/null
+++ b/codec/encoder/core/src/svc_encode_slice.cpp
@@ -1,0 +1,1194 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_encode_slice.c
+ *
+ * \brief	svc encoding slice 
+ *
+ * \date	2009.07.27 Created
+ *
+ *************************************************************************************
+ */
+
+#include <string.h>
+#include <assert.h>
+#include "ls_defines.h"
+#include "svc_encode_slice.h"
+#include "svc_enc_golomb.h"
+#include "svc_base_layer_md.h"
+#include "svc_encode_mb.h"
+#include "mv_pred.h"
+#include "svc_set_mb_syn_cavlc.h"
+#include "encode_mb_aux.h"
+#include "decode_mb_aux.h"
+#include "svc_mode_decision.h"
+#include "cpu_core.h"
+#include "svc_motion_estimate.h"
+#include "sample.h"
+#include "wels_func_ptr_def.h"
+#include "utils.h"
+
+namespace WelsSVCEnc {
+//#define ENC_TRACE
+ 
+typedef void (*PWelsCodingSliceFunc)( sWelsEncCtx *pCtx, SSlice *pSlice );
+typedef void (*PWelsSliceHeaderWriteFunc)( SBitStringAux* pBs, SDqLayer* pCurLayer, SSlice *pSlice, int32_t* pPpsIdDelta );
+
+/*!
+ * \brief	Copy the non-zero counts of a macroblock into the MB cache layout
+ *
+ * pNonZeroCount[0..15] is copied four entries at a time to cache positions 9, 17, 25 and 33;
+ * pNonZeroCount[16..23] is copied two entries at a time to cache positions 14, 38, 22 and 46.
+ *
+ * \param	pMb			source macroblock
+ * \param	pMbCache	destination cache, iNonZeroCoeffCount is written
+ */
+void UpdateNonZeroCountCache(SMB *pMb, SMbCache *pMbCache)
+{
+	// cache positions receiving the pairs pNonZeroCount[16], [18], [20] and [22]
+	static const int32_t kiPairCachePos[4] = { 14, 38, 22, 46 };
+	int32_t iRow;
+	int32_t iPair;
+
+	// four entries per step, cache rows are 8 entries apart
+	for ( iRow = 0; iRow < 4; ++ iRow )
+	{
+		ST32(&pMbCache->iNonZeroCoeffCount[9 + (iRow << 3)], LD32(&pMb->pNonZeroCount[iRow << 2]));
+	}
+
+	// two entries per step
+	for ( iPair = 0; iPair < 4; ++ iPair )
+	{
+		ST16(&pMbCache->iNonZeroCoeffCount[kiPairCachePos[iPair]], LD16(&pMb->pNonZeroCount[16 + (iPair << 1)]));
+	}
+}
+
+/*!
+ * \brief	Initialize the scalable-extension flags of a slice header
+ *
+ * bSliceSkipFlag is always cleared. When the dependency id of the layer's NAL header
+ * extension is above 0, the adaptive and default base-mode / motion-prediction /
+ * residual-prediction flags are cleared as well.
+ *
+ * \param	pCurLayer	current layer, sLayerInfo.sNalHeaderExt.uiDependencyId is read
+ * \param	pSlice		slice whose sSliceHeaderExt is initialized
+ */
+void WelsSliceHeaderScalExtInit( SDqLayer* pCurLayer, SSlice *pSlice )
+{
+	SSliceHeaderExt* pExt	= &pSlice->sSliceHeaderExt;
+	const uint8_t kuiDid	= pCurLayer->sLayerInfo.sNalHeaderExt.uiDependencyId;
+
+	pExt->bSliceSkipFlag = false;
+
+	if ( 0 == kuiDid )	// nothing more to do outside a spatial enhancement layer
+		return;
+
+	// both adaptive and default flags should equal 0
+	pExt->bAdaptiveBaseModeFlag		= false;
+	pExt->bAdaptiveMotionPredFlag	= false;
+	pExt->bAdaptiveResidualPredFlag	= false;
+
+	pExt->bDefaultBaseModeFlag		= false;
+	pExt->bDefaultMotionPredFlag	= false;
+	pExt->bDefaultResidualPredFlag	= false;
+}
+
+/*!
+ * \brief	Fill the slice header (and its extension) of a slice before encoding
+ *
+ * Sets slice type, first MB, frame number, IDR picture id, POC lsb, the P-slice reference
+ * override, the slice QP delta and the deblocking parameters, then initializes the
+ * extension flags.
+ *
+ * \param	pEncCtx		encoder context (slice type, frame number, global QP, ...)
+ * \param	pCurLayer	current layer (slice context, SPS/PPS, loop filter settings)
+ * \param	pSlice		slice to initialize; uiSliceIdx and bSliceHeaderExtFlag are read
+ */
+void WelsSliceHeaderExtInit( sWelsEncCtx* pEncCtx, SDqLayer* pCurLayer, SSlice *pSlice )
+{
+	SSliceHeaderExt* pExt	= &pSlice->sSliceHeaderExt;
+	SSliceHeader* pHdr		= &pExt->sSliceHeader;
+
+	pHdr->eSliceType		= pEncCtx->eSliceType;
+	pHdr->iFirstMbInSlice	= WelsGetFirstMbOfSlice( pCurLayer->pSliceEncCtx, pSlice->uiSliceIdx );
+	pHdr->iFrameNum			= pEncCtx->iFrameNum;
+	pHdr->uiIdrPicId		= pEncCtx->sPSOVector.uiIdrPicId;
+	pHdr->iPicOrderCntLsb	= pEncCtx->pEncPic->iFramePoc;
+
+	pExt->bStoreRefBasePicFlag = false;
+
+	if ( P_SLICE == pEncCtx->eSliceType )
+	{
+		if ( pHdr->uiRefCount > 0 &&
+			pHdr->uiRefCount < pCurLayer->sLayerInfo.pSpsP->iNumRefFrames )
+		{
+			// fewer references in use than the SPS allows: signal the count explicitly
+			pHdr->bNumRefIdxActiveOverrideFlag	= true;
+			pHdr->uiNumRefIdxL0Active			= pHdr->uiRefCount;
+		}
+		else
+		{
+			// written explicitly in every case (the original notes this solves a debug/release mismatch)
+			pHdr->bNumRefIdxActiveOverrideFlag	= false;
+			pHdr->uiNumRefIdxL0Active			= 1;
+		}
+	}
+
+	pHdr->iSliceQpDelta = pEncCtx->iGlobalQp - pCurLayer->sLayerInfo.pPpsP->iPicInitQp;
+
+	// deblocking setup, taken over from the layer
+	pHdr->uiDisableDeblockingFilterIdc	= pCurLayer->iLoopFilterDisableIdc;
+	pHdr->iSliceAlphaC0Offset			= pCurLayer->iLoopFilterAlphaC0Offset;	// need update iSliceAlphaC0Offset & iSliceBetaOffset for the slice header if loop_filter_idc != 1
+	pHdr->iSliceBetaOffset				= pCurLayer->iLoopFilterBetaOffset;
+	pExt->uiDisableInterLayerDeblockingFilterIdc = pCurLayer->uiDisableInterLayerDeblockingFilterIdc;
+
+	if ( !pSlice->bSliceHeaderExtFlag )
+	{
+		// both adaptive and default flags should equal 0
+		pExt->bAdaptiveBaseModeFlag		= false;
+		pExt->bAdaptiveMotionPredFlag	= false;
+		pExt->bAdaptiveResidualPredFlag	= false;
+
+		pExt->bDefaultBaseModeFlag		= false;
+		pExt->bDefaultMotionPredFlag	= false;
+		pExt->bDefaultResidualPredFlag	= false;
+	}
+	else
+	{
+		WelsSliceHeaderScalExtInit( pCurLayer, pSlice );
+	}
+}
+
+/*!
+ * \brief	Count macroblocks per slice type and MB type (compiled only when MB_TYPES_CHECK is defined)
+ *
+ * \param	iMbCount	counter table indexed by [slice type][MB type category]; nothing is done when NULL
+ * \param	keSt		slice type used as first index
+ * \param	kpMb		macroblock whose uiMbType selects the counter to increment
+ *
+ * \note	MB types not listed below are not counted
+ */
+#if defined(MB_TYPES_CHECK)
+void WelsCountMbType(int32_t (*iMbCount)[18], const EWelsSliceType keSt, const SMB* kpMb)
+{	
+	if (NULL == iMbCount)
+		return;
+	
+	switch( kpMb->uiMbType ) {
+	case MB_TYPE_INTRA4x4:
+		++ iMbCount[keSt][Intra4x4];
+		break;
+	case MB_TYPE_INTRA16x16:
+		++ iMbCount[keSt][Intra16x16];
+		break;
+	case MB_TYPE_SKIP:
+		++ iMbCount[keSt][PSkip];
+		break;
+	case MB_TYPE_16x16:
+		++ iMbCount[keSt][Inter16x16];
+		break;
+	case MB_TYPE_16x8:
+		++ iMbCount[keSt][Inter16x8];
+		break;
+	case MB_TYPE_8x16:
+		// index with the keSt parameter: the former 'eSt' was an undeclared identifier
+		++ iMbCount[keSt][Inter8x16];
+		break;
+	case MB_TYPE_8x8:
+		++ iMbCount[keSt][Inter8x8];
+		break;
+	case MB_TYPE_INTRA_BL:
+		++ iMbCount[keSt][7];	// column 7 is used for intra base-layer MBs
+		break;
+	default:
+		break;
+	}
+}
+#endif//MB_TYPES_CHECK
+
+/*!
+ * \brief	Write the reference picture list reordering syntax of a slice header
+ *
+ * Nothing is written for I and SI slices. For other slice types a set flag bit is
+ * written, followed by the entries of sRefReordering up to and including the first
+ * entry whose uiReorderingOfPicNumsIdc is 3.
+ *
+ * \param	pBs				bit string writer
+ * \param	sSliceHeader	slice header providing eSliceType and sRefReordering
+ */
+void WriteReferenceReorder( SBitStringAux *pBs, SSliceHeader *sSliceHeader )
+{
+	SRefPicListReorderSyntax *pReorder	= &sSliceHeader->sRefReordering;
+	const uint8_t kuiSliceType			= sSliceHeader->eSliceType % 5;
+	uint16_t uiIdc;
+	int16_t iEntry = 0;
+
+	if ( I_SLICE == kuiSliceType || SI_SLICE == kuiSliceType )
+		return;
+
+	BsWriteOneBit( pBs, true );
+
+	do
+	{
+		uiIdc = pReorder->SReorderingSyntax[iEntry].uiReorderingOfPicNumsIdc;
+		BsWriteUE( pBs, uiIdc );
+
+		switch ( uiIdc )
+		{
+		case 0:
+		case 1:
+			BsWriteUE( pBs, pReorder->SReorderingSyntax[iEntry].uiAbsDiffPicNumMinus1 );
+			break;
+		case 2:
+			BsWriteUE( pBs, pReorder->SReorderingSyntax[iEntry].iLongTermPicNum );
+			break;
+		default:
+			break;
+		}
+
+		++ iEntry;
+	} while ( 3 != uiIdc );	// idc 3 terminates the list
+}
+
+/*!
+ * \brief	Write the reference picture marking syntax of a slice header
+ *
+ * For IDR pictures two flags are written (bNoOutputOfPriorPicsFlag, bLongTermRefFlag).
+ * Otherwise bAdaptiveRefPicMarkingModeFlag is written and, when it is set, the entries
+ * of SMmcoRef up to and including the first entry whose iMmcoType is 0.
+ *
+ * \param	pBs				bit string writer
+ * \param	pSliceHeader	slice header providing sRefMarking
+ * \param	pNalHdrExt		NAL header extension, bIdrFlag is read
+ */
+void WriteRefPicMarking( SBitStringAux *pBs, SSliceHeader *pSliceHeader, SNalUnitHeaderExt *pNalHdrExt )
+{
+	SRefPicMarking *pMarking	= &pSliceHeader->sRefMarking;
+	int32_t iMmcoType;
+	int16_t iEntry = 0;
+
+	if ( pNalHdrExt->bIdrFlag )
+	{
+		BsWriteOneBit( pBs, pMarking->bNoOutputOfPriorPicsFlag );
+		BsWriteOneBit( pBs, pMarking->bLongTermRefFlag );
+		return;
+	}
+
+	BsWriteOneBit( pBs, pMarking->bAdaptiveRefPicMarkingModeFlag );
+	if ( !pMarking->bAdaptiveRefPicMarkingModeFlag )
+		return;
+
+	do
+	{
+		iMmcoType = pMarking->SMmcoRef[iEntry].iMmcoType;
+		BsWriteUE( pBs, iMmcoType );
+
+		switch ( iMmcoType )
+		{
+		case 1:
+			BsWriteUE( pBs, pMarking->SMmcoRef[iEntry].iDiffOfPicNum - 1 );
+			break;
+		case 2:
+			BsWriteUE( pBs, pMarking->SMmcoRef[iEntry].iLongTermPicNum );
+			break;
+		case 3:	// picture number difference first, then the long-term frame index
+			BsWriteUE( pBs, pMarking->SMmcoRef[iEntry].iDiffOfPicNum - 1 );
+			BsWriteUE( pBs, pMarking->SMmcoRef[iEntry].iLongTermFrameIdx );
+			break;
+		case 4:
+			BsWriteUE( pBs, pMarking->SMmcoRef[iEntry].iMaxLongTermFrameIdx + 1 );
+			break;
+		case 6:
+			BsWriteUE( pBs, pMarking->SMmcoRef[iEntry].iLongTermFrameIdx );
+			break;
+		default:
+			break;
+		}
+
+		++ iEntry;
+	} while ( 0 != iMmcoType );	// type 0 terminates the list
+}
+
+/*!
+ * \brief	Write a slice header (without scalable extension) to the bit string
+ *
+ * \param	pBs			bit string writer
+ * \param	pCurLayer	current layer; SPS, PPS and NAL header extension are read
+ * \param	pSlice		slice whose sSliceHeaderExt.sSliceHeader is written
+ * \param	pPpsIdDelta	table indexed by PPS id; the entry is added to the PPS id before writing
+ */
+void WelsSliceHeaderWrite( SBitStringAux* pBs, SDqLayer* pCurLayer, SSlice *pSlice, int32_t* pPpsIdDelta )
+{
+	SWelsSPS* pSps = pCurLayer->sLayerInfo.pSpsP;
+	SWelsPPS* pPps = pCurLayer->sLayerInfo.pPpsP;
+	SSliceHeader* pSliceHeader      = &pSlice->sSliceHeaderExt.sSliceHeader;	
+	SNalUnitHeaderExt* pNalHead   = &pCurLayer->sLayerInfo.sNalHeaderExt;	
+
+	BsWriteUE( pBs, pSliceHeader->iFirstMbInSlice );
+	BsWriteUE( pBs, pSliceHeader->eSliceType );   /* same type things */
+
+	BsWriteUE( pBs, pSliceHeader->pPps->iPpsId + pPpsIdDelta[pSliceHeader->pPps->iPpsId] );
+
+	BsWriteBits( pBs, pSps->uiLog2MaxFrameNum, pSliceHeader->iFrameNum );
+
+	if( pNalHead->bIdrFlag ) /* NAL IDR */
+	{
+		BsWriteUE( pBs, pSliceHeader->uiIdrPicId );
+	}
+
+	BsWriteBits( pBs, pSps->iLog2MaxPocLsb, pSliceHeader->iPicOrderCntLsb );
+
+	// the active reference count is only written for P slices, and only when overridden
+	if ( P_SLICE == pSliceHeader->eSliceType )
+	{
+		BsWriteOneBit( pBs, pSliceHeader->bNumRefIdxActiveOverrideFlag );
+		if ( pSliceHeader->bNumRefIdxActiveOverrideFlag )
+		{
+			BsWriteUE( pBs, pSliceHeader->uiNumRefIdxL0Active - 1 );
+		}
+	}
+
+	if ( !pNalHead->bIdrFlag )
+		WriteReferenceReorder( pBs, pSliceHeader );
+
+	// marking syntax only for pictures with a non-zero uiNalRefIdc
+	if ( pNalHead->sNalHeader.uiNalRefIdc )
+	{
+		WriteRefPicMarking( pBs, pSliceHeader, pNalHead );
+	}	
+
+	BsWriteSE( pBs, pSliceHeader->iSliceQpDelta );      /* pSlice qp delta */
+
+	if( pPps->bDeblockingFilterControlPresentFlag )
+	{
+		// the idc stored in the header (0..6) is folded onto the values 0, 1 and 2 before writing
+		switch( pSliceHeader->uiDisableDeblockingFilterIdc )
+		{
+		case 0:
+		case 3:
+		case 4:
+		case 6:
+			BsWriteUE( pBs, 0 );
+			break;
+		case 1:
+			BsWriteUE( pBs, 1 );
+			break;
+		case 2:
+		case 5:
+			BsWriteUE( pBs, 2 );
+			break;
+		default :
+			// NOTE(review): nothing is written to the bit string for an unexpected idc
+			fprintf( stderr, "pData error for deblocking" );
+			break;
+		}
+		// offsets are written halved, and only when the idc is not 1
+		if ( 1 != pSliceHeader->uiDisableDeblockingFilterIdc )
+		{
+			BsWriteSE( pBs, pSliceHeader->iSliceAlphaC0Offset >> 1 );
+			BsWriteSE( pBs, pSliceHeader->iSliceBetaOffset >> 1 );
+		}
+	}	
+}
+
+/*!
+ * \brief	Write a slice header with scalable extension to the bit string
+ *
+ * \param	pBs			bit string writer
+ * \param	pCurLayer	current layer; SPS, PPS, subset SPS and NAL header extension are read
+ * \param	pSlice		slice whose sSliceHeaderExt is written
+ * \param	pPpsIdDelta	table indexed by PPS id; the entry is added to the PPS id before writing
+ */
+void WelsSliceHeaderExtWrite( SBitStringAux* pBs, SDqLayer* pCurLayer, SSlice *pSlice, int32_t *pPpsIdDelta )
+{
+	SWelsSPS* pSps           = pCurLayer->sLayerInfo.pSpsP;	
+	SWelsPPS* pPps           = pCurLayer->sLayerInfo.pPpsP;
+	SSubsetSps* pSubSps = pCurLayer->sLayerInfo.pSubsetSpsP;
+	SSliceHeaderExt* pSliceHeadExt = &pSlice->sSliceHeaderExt;
+	SSliceHeader* pSliceHeader      = &pSliceHeadExt->sSliceHeader;
+	SNalUnitHeaderExt* pNalHead   = &pCurLayer->sLayerInfo.sNalHeaderExt;
+
+	BsWriteUE( pBs, pSliceHeader->iFirstMbInSlice );
+	BsWriteUE( pBs, pSliceHeader->eSliceType );   /* same type things */
+
+	BsWriteUE( pBs, pSliceHeader->pPps->iPpsId + pPpsIdDelta[pSliceHeader->pPps->iPpsId] );
+
+	BsWriteBits( pBs, pSps->uiLog2MaxFrameNum, pSliceHeader->iFrameNum );
+
+	if( pNalHead->bIdrFlag ) /* NAL IDR */
+	{
+		BsWriteUE( pBs, pSliceHeader->uiIdrPicId );
+	}
+
+	BsWriteBits( pBs, pSps->iLog2MaxPocLsb, pSliceHeader->iPicOrderCntLsb );
+//	{
+		// the active reference count is only written for P slices, and only when overridden
+		if ( P_SLICE == pSliceHeader->eSliceType )
+		{
+			BsWriteOneBit( pBs, pSliceHeader->bNumRefIdxActiveOverrideFlag );
+			if ( pSliceHeader->bNumRefIdxActiveOverrideFlag )
+			{
+				BsWriteUE( pBs, pSliceHeader->uiNumRefIdxL0Active - 1 );
+			}
+		}
+
+		if ( !pNalHead->bIdrFlag )
+			WriteReferenceReorder( pBs, pSliceHeader );
+
+		if ( pNalHead->sNalHeader.uiNalRefIdc )
+		{
+			WriteRefPicMarking( pBs, pSliceHeader, pNalHead );
+
+			if ( !pSubSps->sSpsSvcExt.bSliceHeaderRestrictionFlag )
+			{
+				BsWriteOneBit( pBs, pSliceHeadExt->bStoreRefBasePicFlag );
+			}
+		}
+//	}
+
+	BsWriteSE( pBs, pSliceHeader->iSliceQpDelta );      /* pSlice qp delta */
+
+	if( pPps->bDeblockingFilterControlPresentFlag )
+	{
+		// the idc is written as stored; the offsets are written halved, and only when the idc is not 1
+		BsWriteUE( pBs, pSliceHeader->uiDisableDeblockingFilterIdc );
+		if ( 1 != pSliceHeader->uiDisableDeblockingFilterIdc )
+		{
+			BsWriteSE( pBs, pSliceHeader->iSliceAlphaC0Offset >> 1 );
+			BsWriteSE( pBs, pSliceHeader->iSliceBetaOffset >> 1 );
+		}
+	}	
+
+#if !defined(DISABLE_FMO_FEATURE)
+	// slice group change cycle: only for more than one slice group with map type 3..5
+	// and a non-zero change rate
+	if ( pPps->uiNumSliceGroups > 1  &&
+		pPps->uiSliceGroupMapType >= 3 && 
+		pPps->uiSliceGroupMapType <= 5 )
+	{
+		int32_t iNumBits;
+		if ( pPps->uiSliceGroupChangeRate )
+		{
+			iNumBits = WELS_CEILLOG2(1 + pPps->uiPicSizeInMapUnits / pPps->uiSliceGroupChangeRate);
+			BsWriteBits( pBs, iNumBits, pSliceHeader->iSliceGroupChangeCycle );	
+		}
+	}
+#endif//!DISABLE_FMO_FEATURE
+
+	// NOTE(review): the condition is the constant false, so nothing inside this block is ever written
+	if ( false )
+	{
+		BsWriteOneBit( pBs, pSliceHeadExt->bSliceSkipFlag );
+		if ( pSliceHeadExt->bSliceSkipFlag )
+		{
+			BsWriteUE( pBs, pSliceHeadExt->uiNumMbsInSlice - 1 );
+		}
+		else
+		{
+			BsWriteOneBit( pBs, pSliceHeadExt->bAdaptiveBaseModeFlag );
+			if ( !pSliceHeadExt->bAdaptiveBaseModeFlag )  
+			{
+				BsWriteOneBit( pBs, pSliceHeadExt->bDefaultBaseModeFlag );
+			}
+
+			if ( !pSliceHeadExt->bDefaultBaseModeFlag )
+			{
+				BsWriteOneBit( pBs, 0 );
+				BsWriteOneBit( pBs, 0 );
+			}
+
+			BsWriteOneBit( pBs, pSliceHeadExt->bAdaptiveResidualPredFlag );
+			if ( !pSliceHeadExt->bAdaptiveResidualPredFlag )
+			{
+				BsWriteOneBit( pBs, 0);
+			}
+		}
+		if ( 1 == pSubSps->sSpsSvcExt.bAdaptiveTcoeffLevelPredFlag )
+		{
+			BsWriteOneBit( pBs, pSliceHeadExt->bTcoeffLevelPredFlag );
+		}
+
+	}
+
+	// without the slice header restriction two fixed 4-bit fields follow: 0 and 15
+	if ( !pSubSps->sSpsSvcExt.bSliceHeaderRestrictionFlag )
+	{
+		BsWriteBits( pBs, 4, 0 );
+		BsWriteBits( pBs, 4, 15 );
+	}
+}
+
+/*!
+ * \brief	Transform and quantize the luma of an inter macroblock
+ *
+ * Runs WelsDctMb on the source luma against pMemPredLuma, then WelsEncInterY on the result.
+ * Per the original note, this is called only for base-layer inter MBs and spatial-layer
+ * (uiQualityId = 0) inter MBs, and covers the inter part only.
+ *
+ * \param	pEncCtx	encoder context (function table, current layer stride)
+ * \param	pSlice	slice owning the MB cache
+ * \param	pCurMb	current macroblock
+ */
+void WelsInterMbEncode( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb )
+{
+	SMbCache* pCache			= &pSlice->sMbCacheInfo;
+	SWelsFuncPtrList* pFunc		= pEncCtx->pFuncList;
+
+	WelsDctMb( pCache->pCoeffLevel,
+		pCache->SPicData.pEncMb[0],
+		pEncCtx->pCurDqLayer->iEncStride[0],
+		pCache->pMemPredLuma,
+		pFunc->pfDctFourT4 );
+
+	WelsEncInterY( pFunc, pCurMb, pCache );
+}
+
+
+/*!
+ * \brief	Encode and reconstruct both chroma planes of a macroblock predicted from pBestPredIntraChroma
+ *
+ * For each plane: forward transform into pCoeffLevel (offset 0 for the first plane, 64 for the
+ * second), WelsEncRecUV, then inverse transform into SPicData.pCsMb. Per the original note,
+ * this is used for I slices only.
+ *
+ * \param	pEncCtx		encoder context (function table, current layer strides)
+ * \param	pCurMb		current macroblock
+ * \param	pMbCache	MB cache holding source, prediction, coefficient and reconstruction buffers
+ */
+void WelsIMbChromaEncode( sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache *pMbCache )
+{
+	SWelsFuncPtrList *pFunc		= pEncCtx->pFuncList;
+	SDqLayer* pLayer			= pEncCtx->pCurDqLayer;
+	const int32_t kiSrcStride	= pLayer->iEncStride[1];
+	const int32_t kiRecStride	= pLayer->iCsStride[1];
+	int16_t *pCoeffBase			= pMbCache->pCoeffLevel;
+	uint8_t* pPredBase			= pMbCache->pBestPredIntraChroma;
+	uint8_t* pSrcPlane[2]		= { pMbCache->SPicData.pEncMb[1], pMbCache->SPicData.pEncMb[2] };
+	uint8_t* pRecPlane[2]		= { pMbCache->SPicData.pCsMb[1], pMbCache->SPicData.pCsMb[2] };
+	int32_t iPlane;
+
+	// iPlane 0 is cb, iPlane 1 is cr; each plane owns 64 coefficients and 64 prediction samples
+	for ( iPlane = 0; iPlane < 2; ++ iPlane )
+	{
+		int16_t* pCoeff	= pCoeffBase + (iPlane << 6);
+		uint8_t* pPred	= pPredBase + (iPlane << 6);
+
+		pFunc->pfDctFourT4( pCoeff, pSrcPlane[iPlane], kiSrcStride, pPred, 8 );
+		WelsEncRecUV( pFunc, pCurMb, pMbCache, pCoeff, iPlane + 1 );
+		pFunc->pfIDctFourT4( pRecPlane[iPlane], kiRecStride, pPred, 8, pCoeff );
+	}
+}
+
+
+/*!
+ * \brief	Transform and quantize both chroma planes of a macroblock predicted from pMemPredChroma
+ *
+ * Both planes are forward transformed first (coefficients at pCoeffLevel + 256 and + 320),
+ * then WelsEncRecUV is run on each. No inverse transform is done here. Per the original
+ * note, this is used for P slices (intra part + inter part).
+ *
+ * \param	pEncCtx	encoder context (function table, current layer stride)
+ * \param	pSlice	slice owning the MB cache
+ * \param	pCurMb	current macroblock
+ */
+void WelsPMbChromaEncode( sWelsEncCtx* pEncCtx, SSlice *pSlice, SMB* pCurMb )
+{
+	SWelsFuncPtrList *pFunc		= pEncCtx->pFuncList;
+	SMbCache* pCache			= &pSlice->sMbCacheInfo;
+	const int32_t kiSrcStride	= pEncCtx->pCurDqLayer->iEncStride[1];
+	int16_t *pCbCoeff			= pCache->pCoeffLevel + 256;
+	int16_t *pCrCoeff			= pCbCoeff + 64;
+	uint8_t* pCbPred			= pCache->pMemPredChroma;
+	uint8_t* pCrPred			= pCbPred + 64;
+
+	pFunc->pfDctFourT4( pCbCoeff, pCache->SPicData.pEncMb[1], kiSrcStride, pCbPred, 8 );
+	pFunc->pfDctFourT4( pCrCoeff, pCache->SPicData.pEncMb[2], kiSrcStride, pCrPred, 8 );
+
+	WelsEncRecUV( pFunc, pCurMb, pCache, pCbCoeff, 1 );
+	WelsEncRecUV( pFunc, pCurMb, pCache, pCrCoeff, 2 );
+}
+
+// Inverse-transforms the coefficients of an inter (or I_BL) MB directly into the decoded picture.
+// The same pDecMb buffer is passed as both destination and prediction of the IDCT calls.
+// MBs of any other type are left untouched.
+void OutputPMbWithoutConstructCsRsNoCopy( sWelsEncCtx *pCtx, SDqLayer* pDq, SSlice *pSlice, SMB* pMb )
+{
+	//intra have been reconstructed, NO COPY from CS to pDecPic--
+	if ( !( IS_INTER( pMb->uiMbType ) || IS_I_BL(pMb->uiMbType) ) )
+	{
+		return;
+	}
+
+	SMbCache* pCache				= &pSlice->sMbCacheInfo;
+	uint8_t* pRecLuma				= pCache->SPicData.pDecMb[0];
+	uint8_t* pRecCb					= pCache->SPicData.pDecMb[1];
+	uint8_t* pRecCr					= pCache->SPicData.pDecMb[2];
+	int16_t* pCoeff					= pCache->pCoeffLevel;
+	const int32_t kiLumaStride		= pDq->pDecPic->iLineSize[0];
+	const int32_t kiChromaStride	= pDq->pDecPic->iLineSize[1];
+	PIDctFunc pfIdct				= pCtx->pFuncList->pfIDctFourT4;
+
+	// luma coefficients come first, Cb starts at entry 256 and Cr at entry 320
+	WelsIDctT4RecOnMb( pRecLuma, kiLumaStride, pRecLuma, kiLumaStride, pCoeff, pfIdct );
+	pfIdct( pRecCb, kiChromaStride, pRecCb, kiChromaStride, pCoeff + 256 );
+	pfIdct( pRecCr, kiChromaStride, pRecCr, kiChromaStride, pCoeff + 320 );
+}
+
+// for intra non-dynamic pSlice
+//encapsulate two kinds of reconstruction:
+//first. store base or highest Dependency Layer with only one quality (without CS RS reconstruction)
+//second. lower than highest Dependency Layer, and for every Dependency Layer with one quality layer(single layer) 
+// Mode decision and syntax writing for every MB of one intra slice (non-dynamic slicing).
+// Starts at the slice's iFirstMbInSlice and follows WelsGetNextMbOfSlice() until it reports -1,
+// leaves the picture, or as many MBs as the picture holds have been coded.
+void WelsISliceMdEnc( sWelsEncCtx* pEncCtx, SSlice *pSlice ) //pMd + encoding
+{
+	SDqLayer* pLayer						= pEncCtx->pCurDqLayer;
+	SSliceCtx* pSliceCtx					= pLayer->pSliceEncCtx;
+	SMbCache* pCache						= &pSlice->sMbCacheInfo;
+	SMB* pMbArray							= pLayer->sMbDataP;
+	const int32_t kiFirstMbOfSlice			= pSlice->sSliceHeaderExt.sSliceHeader.iFirstMbInSlice;
+	const int32_t kiMbCountInPicture		= pLayer->iMbWidth * pLayer->iMbHeight;
+	const int32_t kiSliceIdx				= pSlice->uiSliceIdx;
+	const uint8_t kuiChromaQpIndexOffset	= pLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset;
+	int32_t iMbIdx							= kiFirstMbOfSlice;
+	int32_t iCodedMbCount					= 0;
+	SWelsMD sMd;
+
+	do
+	{
+		SMB* pCurMb = &pMbArray[ iMbIdx ];
+
+		// start from the global QP; the rate control hook below may adjust it
+		pCurMb->uiLumaQp   = pEncCtx->iGlobalQp;
+		pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + kuiChromaQpIndexOffset)];
+		pEncCtx->pFuncList->pfRc.pfWelsRcMbInit( pEncCtx, pCurMb, pSlice );
+
+		sMd.iLambda = g_kiQpCostTable[pCurMb->uiLumaQp];
+
+		// intra mode decision, then entropy coding of the MB
+		WelsMdIntraInit( pEncCtx, pCurMb, pCache, kiFirstMbOfSlice );
+		WelsMdIntraMb( pEncCtx, &sMd, pCurMb, pCache );
+		UpdateNonZeroCountCache( pCurMb, pCache );
+
+		WelsSpatialWriteMbSyn( pEncCtx, pSlice, pCurMb );
+
+		pCurMb->uiSliceIdc = kiSliceIdx;
+
+#if defined(MB_TYPES_CHECK)
+		WelsCountMbType( pEncCtx->sPerInfo.iMbCount, I_SLICE, pCurMb );
+#endif//MB_TYPES_CHECK
+
+		pEncCtx->pFuncList->pfRc.pfWelsRcMbInfoUpdate( pEncCtx, pCurMb, sMd.iCostLuma, pSlice );
+
+		++iCodedMbCount;
+		iMbIdx = WelsGetNextMbOfSlice( pSliceCtx, iMbIdx );
+	} while ( iMbIdx != -1 && iMbIdx < kiMbCountInPicture && iCodedMbCount < kiMbCountInPicture );
+}
+
+// Only for intra dynamic slicing
+// Intra slice coding with dynamic (size constrained) slicing.
+// Same per-MB sequence as WelsISliceMdEnc(), but after each MB is written
+// DynSlcJudgeSliceBoundaryStepBack() is consulted; when it returns true the bits of that MB are
+// discarded (writer state restored) and the loop ends, leaving the MB for the next slice.
+void WelsISliceMdEncDynamic( sWelsEncCtx* pEncCtx, SSlice *pSlice ) //pMd + encoding
+{
+	SBitStringAux* pBs				= pSlice->pSliceBsa;
+	SDqLayer* pCurLayer				= pEncCtx->pCurDqLayer;
+	SSliceCtx* pSliceCtx		= pCurLayer->pSliceEncCtx;
+	SMbCache *pMbCache				= &pSlice->sMbCacheInfo;
+	SSliceHeaderExt *pSliceHdExt	= &pSlice->sSliceHeaderExt;
+	SMB* pMbList						= pCurLayer->sMbDataP;
+	SMB* pCurMb						= NULL;	
+	const int32_t kiSliceFirstMbXY	= pSliceHdExt->sSliceHeader.iFirstMbInSlice;
+	int32_t iNextMbIdx				= kiSliceFirstMbXY;	
+	const int32_t kiTotalNumMb		= pCurLayer->iMbWidth * pCurLayer->iMbHeight;
+	int32_t iCurMbIdx				= 0, iNumMbCoded = 0;	
+	const int32_t kiSliceIdx				= pSlice->uiSliceIdx;
+	// the partition of a slice is its index modulo the number of active threads
+	const int32_t kiPartitionId			= (kiSliceIdx % pEncCtx->iActiveThreadsNum);
+	const uint8_t kuiChromaQpIndexOffset= pCurLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset;
+
+	SWelsMD sMd;	
+	SDynamicSlicingStack sDss;
+	// bit position at which this slice's MB data starts; the boundary judgement uses
+	// (iCurrentPos - iStartPos) as the size written so far
+	sDss.iStartPos = BsGetBitsPos(pBs);
+
+	for ( ; ; )
+	{
+		iCurMbIdx	= iNextMbIdx;
+		pCurMb = &pMbList[ iCurMbIdx ];	
+		pCurMb->uiLumaQp   = pEncCtx->iGlobalQp;
+		pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + kuiChromaQpIndexOffset)];
+
+		pEncCtx->pFuncList->pfRc.pfWelsRcMbInit(pEncCtx, pCurMb, pSlice);
+		// if already reaches the largest number of slices, set QPs to the upper bound
+		if (pSlice->bDynamicSlicingSliceSizeCtrlFlag)
+		{			
+			pCurMb->uiLumaQp = pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId].iMaxQp;
+			pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + kuiChromaQpIndexOffset)];
+		}
+
+		sMd.iLambda = g_kiQpCostTable[pCurMb->uiLumaQp];
+
+		WelsMdIntraInit( pEncCtx, pCurMb, pMbCache, kiSliceFirstMbXY );
+		WelsMdIntraMb( pEncCtx, &sMd, pCurMb, pMbCache );
+		UpdateNonZeroCountCache( pCurMb, pMbCache );
+		//stack pBs pointer: snapshot the writer state BEFORE this MB is written so it can be rolled back
+		sDss.pBsStackBufPtr	= pBs->pBufPtr;
+		sDss.uiBsStackCurBits	= pBs->uiCurBits;
+		sDss.iBsStackLeftBits	= pBs->iLeftBits;
+
+		WelsSpatialWriteMbSyn( pEncCtx, pSlice, pCurMb );
+
+		sDss.iCurrentPos = BsGetBitsPos(pBs);
+
+		if ( DynSlcJudgeSliceBoundaryStepBack( pEncCtx, pSlice, pSliceCtx, pCurMb, &sDss ) )//islice
+		{
+			//restore the stacked pBs pointer: the bits just written for this MB are dropped
+			pBs->pBufPtr		= sDss.pBsStackBufPtr;
+			pBs->uiCurBits	= sDss.uiBsStackCurBits;
+			pBs->iLeftBits	= sDss.iBsStackLeftBits;
+
+			pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId] = iCurMbIdx-1;	// update pLastCodedMbIdxOfPartition, need to -1 due to stepping back
+			++ pCurLayer->pNumSliceCodedOfPartition[kiPartitionId];
+
+			break;
+		}
+
+		// the MB stays in this slice: commit it
+		pCurMb->uiSliceIdc = kiSliceIdx;
+
+#if defined(MB_TYPES_CHECK) 
+		WelsCountMbType( pEncCtx->sPerInfo.iMbCount, I_SLICE, pCurMb );		
+#endif//MB_TYPES_CHECK
+
+		pEncCtx->pFuncList->pfRc.pfWelsRcMbInfoUpdate(pEncCtx,pCurMb,sMd.iCostLuma,pSlice);
+
+		++iNumMbCoded;		
+
+		iNextMbIdx = WelsGetNextMbOfSlice( pSliceCtx, iCurMbIdx );
+		//whether all of MB in current pSlice encoded or not
+		if ( iNextMbIdx == -1 || iNextMbIdx >= kiTotalNumMb || iNumMbCoded >= kiTotalNumMb )
+		{
+			// MB count of this slice = distance from the previously recorded last coded MB of the partition
+			pSliceCtx->pCountMbNumInSlice[kiSliceIdx]	= iCurMbIdx - pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId];
+			pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId] = iCurMbIdx;	// update pLastCodedMbIdxOfPartition, finish coding, use iCurMbIdx directly
+			break;
+		}
+	}
+}
+
+//encapsulate two kinds of reconstruction:
+// first. store base or highest Dependency Layer with only one quality (without CS RS reconstruction)
+// second. lower than highest Dependency Layer, and for every Dependency Layer with one quality layer(single layer) 
+// Prepares the per-slice mode decision state of a P slice (non-dynamic slicing) and runs
+// WelsMdInterMbLoop() over the slice.
+// kbIsHighestDlayerFlag is copied into sMd.bMdUsingSad; sMd.sMe is zeroed unless the layer has a
+// base layer available AND the flag is set.
+void WelsPSliceMdEnc( sWelsEncCtx* pEncCtx, SSlice *pSlice,  const bool_t kbIsHighestDlayerFlag ) //pMd + encoding
+{
+	const SSliceHeader* kpSliceHeader	= &pSlice->sSliceHeaderExt.sSliceHeader;
+	const int32_t kiFirstMbOfSlice		= kpSliceHeader->iFirstMbInSlice;
+	SWelsMD sMd;
+
+	sMd.uiRef		= kpSliceHeader->uiRefIndex;
+	sMd.bMdUsingSad	= kbIsHighestDlayerFlag;
+
+	if ( !( pEncCtx->pCurDqLayer->bBaseLayerAvailableFlag && kbIsHighestDlayerFlag ) )
+	{
+		memset( &sMd.sMe, 0, sizeof(sMd.sMe) );
+	}
+
+	//pMb loop
+	WelsMdInterMbLoop( pEncCtx, pSlice, &sMd, kiFirstMbOfSlice );
+}
+
+// Dynamic-slicing counterpart of WelsPSliceMdEnc(): identical set-up of the mode decision state,
+// but the MBs are processed by WelsMdInterMbLoopOverDynamicSlice().
+void WelsPSliceMdEncDynamic( sWelsEncCtx* pEncCtx, SSlice *pSlice, const bool_t kbIsHighestDlayerFlag )
+{
+	const SSliceHeader* kpSliceHeader	= &pSlice->sSliceHeaderExt.sSliceHeader;
+	const int32_t kiFirstMbOfSlice		= kpSliceHeader->iFirstMbInSlice;
+	SWelsMD sMd;
+
+	sMd.uiRef		= kpSliceHeader->uiRefIndex;
+	sMd.bMdUsingSad	= kbIsHighestDlayerFlag;
+
+	// sMd.sMe is only kept as-is when a base layer exists and this is the highest layer
+	if ( !( pEncCtx->pCurDqLayer->bBaseLayerAvailableFlag && kbIsHighestDlayerFlag ) )
+	{
+		memset( &sMd.sMe, 0, sizeof(sMd.sMe) );
+	}
+
+	//mb loop
+	WelsMdInterMbLoopOverDynamicSlice( pEncCtx, pSlice, &sMd, kiFirstMbOfSlice );
+}
+
+// Codes one P slice (non-dynamic slicing).
+// Installs the inter mode decision routine in pFuncList->pfInterMd
+// (WelsMdInterMbEnhancelayer when a base layer is available and this is the highest dependency
+// layer, WelsMdInterMb otherwise) and then calls WelsPSliceMdEnc().
+void WelsCodePSlice( sWelsEncCtx* pEncCtx, SSlice *pSlice )
+{
+	//pSlice-level init should be outside and before this function
+	SDqLayer* pLayer				= pEncCtx->pCurDqLayer;
+	const bool_t kbBaseAvail		= pLayer->bBaseLayerAvailableFlag;
+	const bool_t kbHighestSpatial	= pEncCtx->pSvcParam->iNumDependencyLayer == (pLayer->sLayerInfo.sNalHeaderExt.uiDependencyId + 1);
+
+	//MD switch
+	pEncCtx->pFuncList->pfInterMd = ( kbBaseAvail && kbHighestSpatial )
+		? (PInterMdFunc)WelsMdInterMbEnhancelayer
+		: (PInterMdFunc)WelsMdInterMb;
+
+	WelsPSliceMdEnc( pEncCtx, pSlice, kbHighestSpatial );
+}
+
+// Codes one P slice under dynamic slicing.
+// Chooses pFuncList->pfInterMd exactly like WelsCodePSlice() and then calls
+// WelsPSliceMdEncDynamic().
+void WelsCodePOverDynamicSlice( sWelsEncCtx* pEncCtx, SSlice *pSlice )
+{
+	//pSlice-level init should be outside and before this function
+	SDqLayer* pLayer				= pEncCtx->pCurDqLayer;
+	const bool_t kbBaseAvail		= pLayer->bBaseLayerAvailableFlag;
+	const bool_t kbHighestSpatial	= pEncCtx->pSvcParam->iNumDependencyLayer == (pLayer->sLayerInfo.sNalHeaderExt.uiDependencyId + 1);
+
+	//MD switch
+	pEncCtx->pFuncList->pfInterMd = ( kbBaseAvail && kbHighestSpatial )
+		? (PInterMdFunc)WelsMdInterMbEnhancelayer
+		: (PInterMdFunc)WelsMdInterMb;
+
+	WelsPSliceMdEncDynamic( pEncCtx, pSlice, kbHighestSpatial );
+}
+
+// Slice coding dispatch table, used by WelsCodeOneSlice().
+// 1st index: 0: for P pSlice; 1: for I pSlice;
+// 2nd index: 0: for non-dynamic pSlice; 1: for dynamic pSlice (P as well as I);
+PWelsCodingSliceFunc	g_pWelsSliceCoding[2][2] =
+{
+	{ WelsCodePSlice, WelsCodePOverDynamicSlice },	// P SSlice
+	{ WelsISliceMdEnc, WelsISliceMdEncDynamic }	// I SSlice
+};
+// Slice header writer dispatch table; WelsCodeOneSlice() indexes it with the slice's bSliceHeaderExtFlag.
+PWelsSliceHeaderWriteFunc		g_pWelsWriteSliceHeader[2] =	// 0: for base; 1: for ext;
+{
+	WelsSliceHeaderWrite,
+	WelsSliceHeaderExtWrite
+};
+
+
+// Codes slice kiSliceIdx of the current dependency/quality layer:
+// initializes and writes the slice header, runs the matching slice coding routine from
+// g_pWelsSliceCoding, then appends the RBSP trailing bits and flushes the bit writer.
+// NOTE(review): kiNalType is not used anywhere in this function.
+void WelsCodeOneSlice( sWelsEncCtx* pEncCtx, const int32_t kiSliceIdx, const int32_t kiNalType )
+{	
+	SDqLayer* pCurLayer					= pEncCtx->pCurDqLayer;
+	SNalUnitHeaderExt* pNalHeadExt	= &pCurLayer->sLayerInfo.sNalHeaderExt;
+	SSlice *pCurSlice					= &pCurLayer->sLayerInfo.pSliceInLayer[kiSliceIdx];
+	SBitStringAux* pBs					= pCurSlice->pSliceBsa;
+	// 1 when the current dependency layer is configured for dynamic slicing, 0 otherwise
+	const int32_t kiDynamicSliceFlag	= (pEncCtx->pSvcParam->sDependencyLayers[pEncCtx->uiDependencyId].sMso.uiSliceMode == SM_DYN_SLICE);
+
+	assert( kiSliceIdx == pCurSlice->uiSliceIdx );
+
+	if ( I_SLICE == pEncCtx->eSliceType )
+	{
+		pNalHeadExt->bIdrFlag = 1;
+		pCurSlice->sScaleShift = 0;
+	}
+	else
+	{
+		// NOTE(review): bIdrFlag is not reset here; it is presumably cleared elsewhere before
+		// P slices are coded, since it selects the P/I row of g_pWelsSliceCoding below - confirm
+		const uint32_t kuiTemporalId = pNalHeadExt->uiTemporalId;
+		pCurSlice->sScaleShift = kuiTemporalId ? (kuiTemporalId - pEncCtx->pRefPic->uiTemporalId) : 0;
+	}
+
+	WelsSliceHeaderExtInit( pEncCtx, pCurLayer, pCurSlice );	
+
+
+	// base or extension slice header writer, chosen by bSliceHeaderExtFlag
+	g_pWelsWriteSliceHeader[pCurSlice->bSliceHeaderExtFlag]( pBs, pCurLayer, pCurSlice, &(pEncCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_PPS].iParaSetIdDelta[0]) );
+#if _DEBUG 
+	if ( pEncCtx->sPSOVector.bEnableSpsPpsIdAddition )
+	{
+		// debug-only sanity check of the PPS id after the id delta has been applied
+		const int32_t kiEncoderPpsId    = pCurSlice->sSliceHeaderExt.sSliceHeader.pPps->iPpsId;
+		const int32_t kiTmpPpsIdInBs = kiEncoderPpsId + pEncCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_PPS].iParaSetIdDelta[ kiEncoderPpsId ];
+		assert ( MAX_PPS_COUNT > kiTmpPpsIdInBs );
+		
+		//when activated, need to make sure there is an available PPS
+		assert ( pEncCtx->sPSOVector.sParaSetOffsetVariable[PARA_SET_TYPE_PPS].bUsedParaSetIdInBs[kiTmpPpsIdInBs] );
+	}
+#endif
+
+	// QP the first MB of the slice is compared against: picture init QP plus slice QP delta
+	pCurSlice->uiLastMbQp = pCurLayer->sLayerInfo.pPpsP->iPicInitQp + pCurSlice->sSliceHeaderExt.sSliceHeader.iSliceQpDelta;	
+
+	// row: P (0) / I (1) via bIdrFlag; column: non-dynamic (0) / dynamic (1) slicing
+	g_pWelsSliceCoding[pNalHeadExt->bIdrFlag][kiDynamicSliceFlag]( pEncCtx, pCurSlice );
+
+	BsRbspTrailingBits( pBs );
+
+	BsFlush( pBs );
+}
+
+//pFunc: UpdateMbNeighbourInfoForNextSlice()
+// Refreshes uiSliceIdc and uiNeighborAvail of the MBs that follow a freshly inserted slice boundary.
+// Starting at kiFirstMbIdxOfNextSlice, at most one MB row (plus one MB when the slice does not
+// start at the beginning of a row) is updated, never going past kiLastMbIdxInPartition;
+// the first MB is always processed.
+// A neighbour counts as available when it lies inside the picture and WelsMbToSliceIdc() maps
+// it to the same slice as the current MB.
+void UpdateMbNeighbourInfoForNextSlice(	SSliceCtx *pSliceCtx,
+											 SMB *pMbList,
+											 const int32_t kiFirstMbIdxOfNextSlice,
+											 const int32_t kiLastMbIdxInPartition	)
+{
+	const int32_t kiMbWidth			= pSliceCtx->iMbWidth;
+	const int32_t kiStartsMidRow	= ( kiFirstMbIdxOfNextSlice % kiMbWidth ) ? 1 : 0;
+	//need to update MB(iMbXY+1) to MB(iMbXY+1+row) in common case
+	const int32_t kiUpdateEnd		= kiFirstMbIdxOfNextSlice + kiMbWidth + kiStartsMidRow;
+	int32_t iMbIdx					= kiFirstMbIdxOfNextSlice;
+
+	for ( ; ; )
+	{
+		SMB* pMb					= &pMbList[iMbIdx];
+		const int32_t kiMbXY		= pMb->iMbXY;
+		const int32_t kiMbX			= pMb->iMbX;
+		const int32_t kiMbY			= pMb->iMbY;
+		const int32_t kiTopXY		= kiMbXY - kiMbWidth;
+		const uint8_t kuiSliceIdc	= WelsMbToSliceIdc(pSliceCtx, kiMbXY);
+		const BOOL_T kbHasLeftCol	= (kiMbX > 0);
+		const BOOL_T kbHasTopRow	= (kiMbY > 0);
+		uint32_t uiAvail			= 0;
+
+		pMb->uiSliceIdc = kuiSliceIdc;
+
+		if ( kbHasLeftCol && (kuiSliceIdc == WelsMbToSliceIdc(pSliceCtx, kiMbXY - 1)) )
+			uiAvail |= LEFT_MB_POS;
+		if ( kbHasTopRow && (kuiSliceIdc == WelsMbToSliceIdc(pSliceCtx, kiTopXY)) )
+			uiAvail |= TOP_MB_POS;
+		if ( kbHasLeftCol && kbHasTopRow && (kuiSliceIdc == WelsMbToSliceIdc(pSliceCtx, kiTopXY - 1)) )
+			uiAvail |= TOPLEFT_MB_POS;
+		if ( (kiMbX < (kiMbWidth-1)) && kbHasTopRow && (kuiSliceIdc == WelsMbToSliceIdc(pSliceCtx, kiTopXY + 1)) )
+			uiAvail |= TOPRIGHT_MB_POS;
+
+		pMb->uiNeighborAvail = (uint8_t)uiAvail;
+
+		++ iMbIdx;
+		if ( ( iMbIdx >= kiUpdateEnd ) || ( iMbIdx > kiLastMbIdxInPartition ) )
+			break;
+	}
+}
+
+
+// Closes the current slice after pCurMb and opens the next slice of the same partition at
+// iFirstMbIdxOfNextSlice.
+// The next slice index is the current MB's slice index plus the number of active threads.
+// The new slice inherits the current slice header ext; all MBs from iFirstMbIdxOfNextSlice up to
+// kiLastMbIdxInPartition are remapped to it in pOverallMbMap and their neighbour info refreshed.
+void AddSliceBoundary(sWelsEncCtx* pEncCtx, SSlice * pCurSlice, SSliceCtx *pSliceCtx, SMB* pCurMb, int32_t iFirstMbIdxOfNextSlice, const int32_t kiLastMbIdxInPartition )
+{
+	SDqLayer* pLayer				= pEncCtx->pCurDqLayer;
+	SMB* pMbArray					= pLayer->sMbDataP;
+	const int32_t kiCurMbIdx		= pCurMb->iMbXY;
+	const int32_t kiCurSliceIdc		= pSliceCtx->pOverallMbMap[ kiCurMbIdx ];
+	const int32_t kiNextSliceIdc	= kiCurSliceIdc + pEncCtx->iActiveThreadsNum;
+	const int32_t kiMbCountToRemap	= kiLastMbIdxInPartition - iFirstMbIdxOfNextSlice + 1;
+	SSlice* pNextSlice				= NULL;
+
+	//update cur pSlice info
+	pCurSlice->sSliceHeaderExt.uiNumMbsInSlice = 1 + kiCurMbIdx - pCurSlice->sSliceHeaderExt.sSliceHeader.iFirstMbInSlice;
+
+	//pNextSlice pointer/initialization
+	pNextSlice = &( pLayer->sLayerInfo.pSliceInLayer[ kiNextSliceIdc ] );
+
+#if _DEBUG
+	assert( NULL != pNextSlice );
+	// now ( pSliceCtx->iSliceNumInFrame < pSliceCtx->iMaxSliceNumConstraint ) always true by the call of this pFunc
+#endif
+
+	//init next pSlice info
+	pNextSlice->bSliceHeaderExtFlag = (NAL_UNIT_CODED_SLICE_EXT == pLayer->sLayerInfo.sNalHeaderExt.sNalHeader.eNalUnitType);
+	memcpy( &pNextSlice->sSliceHeaderExt, &pCurSlice->sSliceHeaderExt, sizeof(SSliceHeaderExt) );	// confirmed_safe_unsafe_usage
+
+	pSliceCtx->pFirstMbInSlice[kiNextSliceIdc] = iFirstMbIdxOfNextSlice;
+
+#if !defined(MT_ENABLED)
+	pNextSlice->uiSliceIdx	= kiNextSliceIdc;
+	pNextSlice->pSliceBsa	= &(pEncCtx->pOut->sBsWrite);
+#endif//!MT_ENABLED
+
+	// every remaining MB of the partition now belongs to the new slice
+	memset( pSliceCtx->pOverallMbMap + iFirstMbIdxOfNextSlice, (uint8_t)kiNextSliceIdc, kiMbCountToRemap * sizeof(uint8_t) );
+
+	//DYNAMIC_SLICING_ONE_THREAD: update pMbList slice_neighbor_info
+	UpdateMbNeighbourInfoForNextSlice( pSliceCtx, pMbArray, iFirstMbIdxOfNextSlice, kiLastMbIdxInPartition );
+}
+
+// Dynamic slicing: called after pCurMb has been written to decide whether the slice must end
+// BEFORE this MB.
+//   pCtx      - encoder context (sWelsEncCtx*)
+//   pSlice    - slice currently being coded (SSlice*)
+//   pSliceCtx - slicing context of the current layer
+//   pCurMb    - MB that has just been written
+//   pDss      - bit positions of the slice start (iStartPos) and after this MB (iCurrentPos)
+// Returns TRUE when a new slice was opened at pCurMb (via AddSliceBoundary()); the caller then has
+// to roll the bit writer back and stop the current slice. Returns FALSE otherwise.
+// When no further slice can be added and the remaining MBs risk exceeding the size constraint,
+// bDynamicSlicingSliceSizeCtrlFlag is set on the slice instead (callers then force the max QP).
+BOOL_T DynSlcJudgeSliceBoundaryStepBack(void* pCtx, void *pSlice, SSliceCtx *pSliceCtx, SMB* pCurMb, SDynamicSlicingStack* pDss )
+{
+	sWelsEncCtx *pEncCtx = (sWelsEncCtx*)pCtx;
+	SSlice * pCurSlice = (SSlice *)pSlice;
+	int32_t		   iCurMbIdx  = pCurMb->iMbXY;
+	uint32_t        uiLen = 0;
+	int32_t		   iPosBitOffset = 0;
+	const int32_t  kiActiveThreadsNum = pEncCtx->iActiveThreadsNum;
+	const int32_t  kiPartitaionId = pCurSlice->uiSliceIdx % kiActiveThreadsNum;
+	const int32_t  kiLastMbIdxInPartition	= pEncCtx->pCurDqLayer->pLastMbIdxOfPartition[kiPartitaionId];
+
+	// MB 0 has no predecessor in pOverallMbMap and is always the first MB of its slice;
+	// the (iCurMbIdx > 0) guard prevents reading pOverallMbMap[-1]
+	const BOOL_T    kbCurMbNotFirstMbOfCurSlice      = (iCurMbIdx > 0) && (pSliceCtx->pOverallMbMap[iCurMbIdx] == pSliceCtx->pOverallMbMap[iCurMbIdx-1]);
+	const BOOL_T    kbCurMbNotLastMbOfCurPartition = iCurMbIdx < kiLastMbIdxInPartition;
+	const BOOL_T    kbSliceNumNotExceedConstraint       = pSliceCtx->iSliceNumInFrame < pSliceCtx->iMaxSliceNumConstraint; /*tmp choice to avoid complex memory operation, 100520, to be modify*/
+	const BOOL_T    kbSliceNumReachConstraint               = (pSliceCtx->iSliceNumInFrame == pSliceCtx->iMaxSliceNumConstraint);
+
+	// size control by max QP is already active for this slice: never split it any more
+	if ( pCurSlice->bDynamicSlicingSliceSizeCtrlFlag ) 
+		return FALSE;
+
+	iPosBitOffset = ( pDss->iCurrentPos - pDss->iStartPos );
+#if _DEBUG
+	assert(iPosBitOffset>=0);
+#endif
+	// slice payload written so far, in bytes (bit count rounded up)
+	uiLen = ( ( iPosBitOffset>>3 ) + (( iPosBitOffset & 0x07 )? 1: 0) );	
+
+#ifdef MT_ENABLED
+	if ( pEncCtx->pSvcParam->iMultipleThreadIdc > 1 )
+		WelsMutexLock( &pEncCtx->pSliceThreading->mutexSliceNumUpdate );
+#endif//MT_ENABLED
+
+	//DYNAMIC_SLICING_ONE_THREAD: judge jump_avoiding_pack_exceed
+	if (
+		( ( kbCurMbNotFirstMbOfCurSlice
+		&& JUMPPACKETSIZE_JUDGE(uiLen,iCurMbIdx,pSliceCtx->uiSliceSizeConstraint) )/*jump_avoiding_pack_exceed*/ 
+		&& kbCurMbNotLastMbOfCurPartition )//decide to add new pSlice
+		&& ( kbSliceNumNotExceedConstraint
+#ifdef MT_ENABLED
+		&& ( ( pCurSlice->uiSliceIdx + kiActiveThreadsNum ) < pSliceCtx->iMaxSliceNumConstraint )
+#endif//MT_ENABLED	
+		)//able to add new pSlice
+
+		)
+	{	
+		
+		// the current MB becomes the first MB of the next slice
+		AddSliceBoundary( pEncCtx, pCurSlice, pSliceCtx, pCurMb, iCurMbIdx, kiLastMbIdxInPartition );
+
+		++ pSliceCtx->iSliceNumInFrame;
+
+#ifdef MT_ENABLED
+		if (pEncCtx->pSvcParam->iMultipleThreadIdc > 1)
+			WelsMutexUnlock( &pEncCtx->pSliceThreading->mutexSliceNumUpdate );
+#endif//MT_ENABLED
+
+		return TRUE;
+	}
+
+	if (
+		( kbSliceNumReachConstraint
+#ifdef MT_ENABLED
+		|| ( ( pCurSlice->uiSliceIdx + kiActiveThreadsNum ) >= pSliceCtx->iMaxSliceNumConstraint )
+#endif//MT_ENABLED
+		)
+		&& ( ( JUMPPACKETSIZE_JUDGE(uiLen,	iCurMbIdx,
+		pSliceCtx->uiSliceSizeConstraint - ( ( kiLastMbIdxInPartition - iCurMbIdx ) << ( pCurSlice->uiAssumeLog2BytePerMb ) /* assume each MB consumes two byte under largest QP */) ) )
+		&& kbCurMbNotLastMbOfCurPartition )//risk of exceeding the size constraint when pSlice num reaches constraint
+		)
+	{		
+		pCurSlice->bDynamicSlicingSliceSizeCtrlFlag = true;
+	}
+
+#ifdef MT_ENABLED
+	if (pEncCtx->pSvcParam->iMultipleThreadIdc > 1)
+		WelsMutexUnlock( &pEncCtx->pSliceThreading->mutexSliceNumUpdate );
+#endif//MT_ENABLED
+
+	return FALSE;
+}
+
+///////////////
+//  pMb loop
+///////////////
+// for inter non-dynamic pSlice
+// MB loop of a P slice with non-dynamic slicing.
+//   pWelsMd          - per-slice mode decision state (SWelsMD*), prepared by the caller
+//   kiSliceFirstMbXY - index of the first MB of the slice
+// For every MB of the slice: rate control init, mode decision through pFuncList->pfInterMd,
+// syntax writing (skipped MBs are only counted in a skip run), reconstruction and rate control update.
+// A skip run still pending when the slice ends is written after the loop.
+void WelsMdInterMbLoop( sWelsEncCtx* pEncCtx, SSlice *pSlice, void* pWelsMd, const int32_t kiSliceFirstMbXY )
+{
+	SWelsMD* pMd					= (SWelsMD*)pWelsMd;
+	SBitStringAux* pBs			= pSlice->pSliceBsa;
+	SDqLayer *pCurLayer			= pEncCtx->pCurDqLayer;
+	SSliceCtx *pSliceCtx	= pCurLayer->pSliceEncCtx;
+	SMbCache *pMbCache			= &pSlice->sMbCacheInfo;
+	SMB *pMbList					= pCurLayer->sMbDataP;
+	SMB *pCurMb					= NULL;
+	int32_t iNumMbCoded		= 0;
+	int32_t	iNextMbIdx			= kiSliceFirstMbXY;
+	int32_t	iCurMbIdx			= -1;	
+	int32_t	iMbSkipRun			= 0;
+	const int32_t kiTotalNumMb	= pCurLayer->iMbWidth * pCurLayer->iMbHeight;
+	// NOTE(review): 648 / 972 presumably match how pMvdCostTableInter is allocated for
+	// single / multiple dependency layers - confirm against the table set-up code
+	const int32_t kiMvdInterTableSize	= (pEncCtx->pSvcParam->iNumDependencyLayer == 1 ? 648: 972);
+	const int32_t kiMvdInterTableStride= 1+(kiMvdInterTableSize<<1);
+	// points kiMvdInterTableSize entries into the table; one row of kiMvdInterTableStride entries per QP
+	uint16_t *pMvdCostTableInter		= &pEncCtx->pMvdCostTableInter[kiMvdInterTableSize];
+	const int32_t kiSliceIdx				= pSlice->uiSliceIdx;
+	const uint8_t kuiChromaQpIndexOffset= pCurLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset;
+
+	for(;;)
+	{
+		//point to current pMb
+		iCurMbIdx	= iNextMbIdx;
+		pCurMb = &pMbList[ iCurMbIdx ];		
+
+		//step(1): set QP for the current MB
+		pEncCtx->pFuncList->pfRc.pfWelsRcMbInit(pEncCtx, pCurMb, pSlice);
+		
+        //step (2). save some value for future use, initial pWelsMd
+		pMd->iLambda = g_kiQpCostTable[pCurMb->uiLumaQp];
+		pMd->pMvdCost = &pMvdCostTableInter[pCurMb->uiLumaQp*kiMvdInterTableStride];
+		WelsMdIntraInit(pEncCtx, pCurMb, pMbCache, kiSliceFirstMbXY);
+        WelsMdInterInit(pEncCtx, pSlice, pCurMb, kiSliceFirstMbXY);
+		// mode decision routine installed by the caller (WelsMdInterMb or WelsMdInterMbEnhancelayer)
+		pEncCtx->pFuncList->pfInterMd(pEncCtx, pMd, pSlice, pCurMb, pMbCache);
+		//mb_qp
+
+		//step (4): save from the MD process from future use
+		WelsMdInterSaveSadAndRefMbType( (pCurLayer->pDecPic->uiRefMbType), pMbCache, pCurMb, pMd);
+
+		pEncCtx->pFuncList->pfInterMdBackgroundInfoUpdate( pCurLayer, pCurMb, pMbCache->bCollocatedPredFlag, pEncCtx->pRefPic->iPictureType );
+
+		//step (5): update cache
+		UpdateNonZeroCountCache( pCurMb, pMbCache );
+
+		//step (6): begin to write bit stream; if the pSlice size is controlled, the writing may be skipped
+		if( IS_SKIP (pCurMb->uiMbType) )
+		{
+			// a skipped MB writes nothing itself; it takes over the QP of the last coded MB
+			pCurMb->uiLumaQp	= pSlice->uiLastMbQp;
+			pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + kuiChromaQpIndexOffset)];
+			
+			iMbSkipRun++;
+		}
+		else
+		{
+			// number of skipped MBs preceding this coded MB, then the MB itself
+			BsWriteUE( pBs, iMbSkipRun );
+			iMbSkipRun = 0;
+			WelsSpatialWriteMbSyn( pEncCtx, pSlice, pCurMb );
+		}
+		
+		//step (7): reconstruct current MB
+		pCurMb->uiSliceIdc = kiSliceIdx;
+		OutputPMbWithoutConstructCsRsNoCopy( pEncCtx, pCurLayer, pSlice, pCurMb );
+		
+        #if defined(MB_TYPES_CHECK) 
+		WelsCountMbType( pEncCtx->sPerInfo.iMbCount, P_SLICE, pCurMb );		
+        #endif//MB_TYPES_CHECK			
+
+		//step (8): update status and other parameters
+		pEncCtx->pFuncList->pfRc.pfWelsRcMbInfoUpdate(pEncCtx,pCurMb,pMd->iCostLuma,pSlice);
+		
+		/*judge if all pMb in cur pSlice has been encoded*/
+		++ iNumMbCoded;
+		iNextMbIdx = WelsGetNextMbOfSlice( pSliceCtx, iCurMbIdx );
+		//whether all of MB in current pSlice encoded or not
+		if ( iNextMbIdx == -1 || iNextMbIdx >= kiTotalNumMb || iNumMbCoded >= kiTotalNumMb )
+		{
+			break;
+		}
+	}
+
+	// flush a skip run that reaches the end of the slice
+	if ( iMbSkipRun )
+	{
+		BsWriteUE( pBs, iMbSkipRun );
+	}
+}
+
+// Only for inter dynamic slicing
+// MB loop of a P slice under dynamic (size constrained) slicing.
+// Same per-MB steps as WelsMdInterMbLoop(), plus: the bit writer state and the skip run are
+// saved before each MB is written, and when DynSlcJudgeSliceBoundaryStepBack() returns true both
+// are restored and the loop ends, so the MB is left for the next slice.
+void WelsMdInterMbLoopOverDynamicSlice( sWelsEncCtx* pEncCtx, SSlice *pSlice, void* pWelsMd, const int32_t kiSliceFirstMbXY )
+{
+	SWelsMD* pMd					= (SWelsMD*)pWelsMd;
+	SBitStringAux* pBs			= pSlice->pSliceBsa;
+	SDqLayer *pCurLayer			= pEncCtx->pCurDqLayer;
+	SSliceCtx *pSliceCtx	= pCurLayer->pSliceEncCtx;
+	SMbCache *pMbCache			= &pSlice->sMbCacheInfo;
+	SMB *pMbList					= pCurLayer->sMbDataP;
+	SMB *pCurMb					= NULL;
+	int32_t iNumMbCoded		= 0;
+	const int32_t kiTotalNumMb	= pCurLayer->iMbWidth * pCurLayer->iMbHeight;
+	int32_t	iNextMbIdx			= kiSliceFirstMbXY;
+	int32_t	iCurMbIdx			= -1;
+	int32_t	iMbSkipRun			= 0;	
+	// NOTE(review): 648 / 972 presumably match how pMvdCostTableInter is allocated for
+	// single / multiple dependency layers - confirm against the table set-up code
+	const int32_t kiMvdInterTableSize	= (pEncCtx->pSvcParam->iNumDependencyLayer == 1 ? 648: 972);
+	const int32_t kiMvdInterTableStride= 1+(kiMvdInterTableSize<<1);
+	uint16_t *pMvdCostTableInter		= &pEncCtx->pMvdCostTableInter[kiMvdInterTableSize];
+	const int32_t kiSliceIdx				= pSlice->uiSliceIdx;
+	// the partition of a slice is its index modulo the number of active threads
+	const int32_t kiPartitionId			= (kiSliceIdx % pEncCtx->iActiveThreadsNum);
+	const uint8_t kuiChromaQpIndexOffset= pCurLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset;
+
+	SDynamicSlicingStack sDss;
+	// bit position at which the MB data of this slice starts
+	sDss.iStartPos = BsGetBitsPos(pBs);
+	for(;;)
+	{
+		//point to current pMb
+		iCurMbIdx	= iNextMbIdx;
+		pCurMb = &pMbList[ iCurMbIdx ];		
+
+		//step(1): set QP for the current MB
+		pEncCtx->pFuncList->pfRc.pfWelsRcMbInit(pEncCtx, pCurMb, pSlice);
+		// if already reaches the largest number of slices, set QPs to the upper bound
+		if (pSlice->bDynamicSlicingSliceSizeCtrlFlag)
+		{
+			//a clearer logic may be: 
+			//if there is no need from size control from the pSlice size, the QP will be decided by RC; else it will be set to the max QP
+			//    however, there are some parameter updating in the rc_mb_init() function, so it cannot be skipped?
+			pCurMb->uiLumaQp = pEncCtx->pWelsSvcRc[pEncCtx->uiDependencyId].iMaxQp;
+			pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + kuiChromaQpIndexOffset)];
+		}
+
+		//step (2). save some value for future use, initial pWelsMd
+		pMd->iLambda = g_kiQpCostTable[pCurMb->uiLumaQp];
+		pMd->pMvdCost = &pMvdCostTableInter[pCurMb->uiLumaQp*kiMvdInterTableStride];
+		
+		WelsMdIntraInit(pEncCtx, pCurMb, pMbCache, kiSliceFirstMbXY);
+		WelsMdInterInit(pEncCtx, pSlice, pCurMb, kiSliceFirstMbXY);
+		pEncCtx->pFuncList->pfInterMd(pEncCtx, pMd, pSlice, pCurMb, pMbCache);
+		//mb_qp
+
+		//step (4): save from the MD process from future use
+		WelsMdInterSaveSadAndRefMbType( (pCurLayer->pDecPic->uiRefMbType), pMbCache, pCurMb, pMd);
+
+		pEncCtx->pFuncList->pfInterMdBackgroundInfoUpdate( pCurLayer, pCurMb, pMbCache->bCollocatedPredFlag, pEncCtx->pRefPic->iPictureType );
+
+		//step (5): update cache
+		UpdateNonZeroCountCache( pCurMb, pMbCache );
+
+		//step (6): begin to write bit stream; if the pSlice size is controlled, the writing may be skipped
+
+		//DYNAMIC_SLICING_ONE_THREAD - MultiD
+		//stack pBs pointer: snapshot taken BEFORE the MB is written, used for the roll back below
+		sDss.pBsStackBufPtr	= pBs->pBufPtr;
+		sDss.uiBsStackCurBits	= pBs->uiCurBits;
+		sDss.iBsStackLeftBits	= pBs->iLeftBits;
+		//stack Pskip status
+		sDss.iMbSkipRunStack = iMbSkipRun;
+		//DYNAMIC_SLICING_ONE_THREAD - MultiD
+
+		if( IS_SKIP (pCurMb->uiMbType) )
+		{
+			pCurMb->uiLumaQp	= pSlice->uiLastMbQp;
+			pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + kuiChromaQpIndexOffset)];
+
+			iMbSkipRun++;
+		}
+		else
+		{
+			BsWriteUE( pBs, iMbSkipRun );
+			iMbSkipRun = 0;
+			WelsSpatialWriteMbSyn( pEncCtx, pSlice, pCurMb );
+		}		
+
+		//DYNAMIC_SLICING_ONE_THREAD - MultiD
+		sDss.iCurrentPos = BsGetBitsPos(pBs);
+		if ( DynSlcJudgeSliceBoundaryStepBack( pEncCtx, pSlice, pSliceCtx, pCurMb, &sDss ) )
+		{
+			//restore the stacked pBs pointer: the bits written for this MB are dropped
+			pBs->pBufPtr		= sDss.pBsStackBufPtr;
+			pBs->uiCurBits	= sDss.uiBsStackCurBits;
+			pBs->iLeftBits	= sDss.iBsStackLeftBits;
+
+			// restore the skip run as it was before this MB; it is flushed after the loop
+			iMbSkipRun = sDss.iMbSkipRunStack;
+
+			pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId] = iCurMbIdx-1;	// update pLastCodedMbIdxOfPartition, need to -1 due to stepping back
+			++ pCurLayer->pNumSliceCodedOfPartition[kiPartitionId];
+
+			break;
+		}
+
+		//step (7): reconstruct current MB
+		pCurMb->uiSliceIdc = kiSliceIdx;
+		OutputPMbWithoutConstructCsRsNoCopy( pEncCtx, pCurLayer, pSlice, pCurMb );
+
+#if defined(MB_TYPES_CHECK) 
+		WelsCountMbType( pEncCtx->sPerInfo.iMbCount, P_SLICE, pCurMb );		
+#endif//MB_TYPES_CHECK			
+
+		//step (8): update status and other parameters
+		pEncCtx->pFuncList->pfRc.pfWelsRcMbInfoUpdate(pEncCtx,pCurMb,pMd->iCostLuma,pSlice);
+
+		/*judge if all pMb in cur pSlice has been encoded*/
+		++ iNumMbCoded;
+		iNextMbIdx = WelsGetNextMbOfSlice( pSliceCtx, iCurMbIdx );
+		//whether all of MB in current pSlice encoded or not
+		if ( iNextMbIdx == -1 || iNextMbIdx >= kiTotalNumMb || iNumMbCoded >= kiTotalNumMb )
+		{
+			pCurLayer->pLastCodedMbIdxOfPartition[kiPartitionId] = iCurMbIdx;	// update pLastCodedMbIdxOfPartition, finish coding, use pCurMb_idx directly				
+			break;
+		}
+	}
+
+	// flush a skip run that reaches the end of the slice
+	if ( iMbSkipRun )
+	{
+		BsWriteUE( pBs, iMbSkipRun );
+	}
+}
+
+}//namespace WelsSVCEnc
--- /dev/null
+++ b/codec/encoder/core/src/svc_mode_decision.cpp
@@ -1,0 +1,189 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_mode_decision.cpp
+ *
+ * \brief	SVC Spatial Enhancement Layer MD
+ *
+ * \date	2009.7.29
+ *
+		  
+ **************************************************************************************
+ */
+#include <assert.h>
+#include <string.h>
+#include "decode_mb_aux.h"
+#include "svc_enc_golomb.h"
+#include "ls_defines.h"
+#include "md.h"
+#include "mv_pred.h"
+#include "sample.h"
+#include "svc_base_layer_md.h"
+#include "svc_encode_mb.h"
+#include "svc_encode_slice.h"
+#include "mb_cache.h"
+
+#include "svc_mode_decision.h"
+#include "svc_motion_estimate.h"
+
+#include "svc_set_mb_syn_cavlc.h"
+#include "cpu_core.h"
+#include "encode_mb_aux.h"
+#include "utils.h"
+namespace WelsSVCEnc {
+
+//
+// md in enhancement layer
+///
+// Mode decision of one MB of a P slice in the spatial enhancement layer.
+//   kuiRefMbType - MB type of the co-located MB in the reference (lower) layer
+// Order of decisions:
+//   1. background decision hook (pfInterMdBackgroundDecision); if it handles the MB we are done
+//   2. P_SKIP test; accepted at once when left, top and top-right neighbours are skipped too
+//   3. reference MB is not intra: P16x16 (unless skip was found) + secondary inter modes
+//      reference MB is intra    : I16x16 against the skip cost, then secondary intra modes
+void WelsMdSpatialelInterMbIlfmdNoilp(	sWelsEncCtx* pEncCtx, SWelsMD* pWelsMd, SSlice *pSlice,
+										    SMB* pCurMb, const Mb_Type kuiRefMbType)
+{
+	SDqLayer* pLayer			= pEncCtx->pCurDqLayer;
+	SMbCache* pCache			= &pSlice->sMbCacheInfo;
+	const uint32_t kuiAvail		= pCurMb->uiNeighborAvail;
+	const SMB* kpTopMb			= pCurMb - pLayer->iMbWidth;
+
+	// neighbour MB types are only inspected when the neighbour is flagged as available
+	const bool_t kbLeftIsPskip		= ( (kuiAvail & LEFT_MB_POS) ? IS_SKIP((pCurMb-1)->uiMbType) : false );
+	const bool_t kbTopIsPskip		= ( (kuiAvail & TOP_MB_POS) ? IS_SKIP(kpTopMb->uiMbType) : false );
+	const bool_t kbTopLeftIsPskip	= ( (kuiAvail & TOPLEFT_MB_POS) ? IS_SKIP((kpTopMb -1)->uiMbType) : false );
+	const bool_t kbTopRightIsPskip	= ( (kuiAvail & TOPRIGHT_MB_POS) ? IS_SKIP((kpTopMb +1)->uiMbType) : false );
+
+	BOOL_T bTrySkip  = kbLeftIsPskip|kbTopIsPskip|kbTopLeftIsPskip|kbTopRightIsPskip;
+	BOOL_T bKeepSkip = kbLeftIsPskip&kbTopIsPskip&kbTopRightIsPskip;
+	BOOL_T bSkip = FALSE;
+
+	if ( pEncCtx->pFuncList->pfInterMdBackgroundDecision( pEncCtx, pWelsMd, pSlice, pCurMb, pCache, &bKeepSkip ) )
+		return;
+
+	//step 1: try SKIP
+	bSkip = WelsMdInterJudgePskip( pEncCtx, pWelsMd, pSlice, pCurMb, pCache, bTrySkip );
+	if ( bSkip && bKeepSkip )
+	{
+		WelsMdInterDecidedPskip( pEncCtx, pSlice, pCurMb, pCache );
+		return;
+	}
+
+	if ( IS_SVC_INTRA(kuiRefMbType) ) //BLMODE == SVC_INTRA
+	{
+		//initial prediction memory for I_16x16
+		const int32_t kiCostI16x16 = WelsMdI16x16( pEncCtx->pFuncList, pEncCtx->pCurDqLayer, pCache, pWelsMd->iLambda );
+
+		if ( !bSkip || (pWelsMd->iCostLuma > kiCostI16x16) )
+		{
+			// intra wins (or skip was never an option)
+			pWelsMd->iCostLuma	= kiCostI16x16;
+			pCurMb->uiMbType	= MB_TYPE_INTRA16x16;
+
+			WelsMdIntraSecondaryModesEnc( pEncCtx, pWelsMd, pCurMb, pCache );
+		}
+		else
+		{
+			WelsMdInterDecidedPskip( pEncCtx, pSlice, pCurMb, pCache );
+		}
+		return;
+	}
+
+	if ( !bSkip )
+	{
+		PredictSad( pCache->sMvComponents.iRefIndexCache, pCache->iSadCost, 0, &pWelsMd->iSadPredMb );
+
+		//step 2: P_16x16
+		pWelsMd->iCostLuma	= WelsMdP16x16( pEncCtx->pFuncList, pLayer, pWelsMd, pSlice, pCurMb );
+		pCurMb->uiMbType	= MB_TYPE_16x16;
+	}
+
+	WelsMdInterSecondaryModesEnc( pEncCtx, pWelsMd, pSlice, pCurMb, pCache, bSkip );
+}
+
+
+
+// Inter mode decision entry point for the enhancement layer (installed as pfInterMd).
+// Looks up the co-located MB of the reference layer, seeds the base MVs of the motion search from
+// it, and runs WelsMdSpatialelInterMbIlfmdNoilp() with the reference MB's type.
+// pEnc is the encoder context (sWelsEncCtx*), pMd the mode decision state (SWelsMD*).
+void WelsMdInterMbEnhancelayer( void* pEnc, void* pMd, SSlice *pSlice, SMB* pCurMb, SMbCache *pMbCache )
+{
+	sWelsEncCtx* pCtx		= (sWelsEncCtx*)pEnc;
+	SWelsMD* pMdState		= (SWelsMD*)pMd;
+	const SMB* kpBaseMb		= GetRefMb( pCtx->pCurDqLayer, pCurMb );
+	const Mb_Type kuiBaseMbType	= kpBaseMb->uiMbType;
+
+	// initial sMvBase here only when pRef mb type is inter, if not sMvBase will be not used!
+	SetMvBaseEnhancelayer( pMdState, pCurMb, kpBaseMb );
+
+	//step (3): do the MD process
+	WelsMdSpatialelInterMbIlfmdNoilp( pCtx, pMdState, pSlice, pCurMb, kuiBaseMbType );
+}
+
+//////////////////////////
+//
+//SUPPORTING FUNCS
+//
+//////////////////////////
+
+///////////////////////
+// do initiation for noILP (needed by ILFMD)
+////////////////////////
+
+// Returns the MB of the reference layer (pCurLayer->pRefLayer) that is co-located with pCurMb.
+// The MB coordinates are halved because the lower layer is half size in both directions.
+SMB* GetRefMb( SDqLayer *pCurLayer, SMB *pCurMb )
+{
+	const SDqLayer* kpBaseLayer	= pCurLayer->pRefLayer;
+	const int32_t kiBaseMbX		= pCurMb->iMbX >> 1;
+	const int32_t kiBaseMbY		= pCurMb->iMbY >> 1;
+	const int32_t kiBaseMbIdx	= kiBaseMbY * kpBaseLayer->iMbWidth + kiBaseMbX;
+
+	return &kpBaseLayer->sMbDataP[kiBaseMbIdx];
+}
+
+// Seeds sMvBase of every motion estimation unit in pMd (16x16, 8x8, 16x8, 8x16) with the motion
+// vector of the co-located reference layer MB, scaled by two.
+//   pMd     - mode decision state whose sMe.*.sMvBase fields are written
+//   pCurMb  - current enhancement layer MB; the parity of its X/Y position selects which
+//             quarter of the reference MB is used
+//   kpRefMb - co-located MB of the reference layer
+// Nothing is written when the reference MB is intra coded.
+void SetMvBaseEnhancelayer( SWelsMD* pMd, SMB *pCurMb, const SMB *kpRefMb )
+{
+	const Mb_Type kuiRefMbType = kpRefMb->uiMbType;
+
+	if ( ! IS_SVC_INTRA( kuiRefMbType ))
+	{
+        SMVUnitXY sMv;
+        int32_t iRefMbPartIdx = ((pCurMb->iMbY&0x01)<<1) + (pCurMb->iMbX&0x01); //may be need modified
+        int32_t iScan4RefPartIdx = g_kuiMbCountScan4Idx[(iRefMbPartIdx<<2)];	
+        // multiply instead of "<< 1": MV components can be negative, and left-shifting a
+        // negative signed value is undefined behaviour
+        sMv.iMvX = kpRefMb->sMv[iScan4RefPartIdx].iMvX * 2;
+        sMv.iMvY = kpRefMb->sMv[iScan4RefPartIdx].iMvY * 2;
+
+		pMd->sMe.sMe16x16.sMvBase = sMv;
+
+		pMd->sMe.sMe8x8[0].sMvBase =
+		pMd->sMe.sMe8x8[1].sMvBase =
+		pMd->sMe.sMe8x8[2].sMvBase =
+		pMd->sMe.sMe8x8[3].sMvBase = sMv;
+        
+ 		pMd->sMe.sMe16x8[0].sMvBase =
+ 		pMd->sMe.sMe16x8[1].sMvBase =
+		pMd->sMe.sMe8x16[0].sMvBase =
+ 		pMd->sMe.sMe8x16[1].sMvBase = sMv; 				
+	}
+}
+
+
+
+} // namespace WelsSVCEnc
--- /dev/null
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -1,0 +1,253 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_motion_estimate.cpp
+ *
+ * \brief	Interfaces introduced in svc mb motion estimation
+ *
+ * \date	08/11/2009 Created
+ *
+ *************************************************************************************
+ */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "svc_motion_estimate.h"
+#include "svc_enc_golomb.h"
+#include "macros.h"
+#include "sample.h"
+#include "array_stack_align.h"
+#include "cpu_core.h"	// WELS_CPU_SSE41
+
+namespace WelsSVCEnc {	
+/*!
+ * \brief	mb motion estimate search, result scored with SATD
+ *
+ * \param	pFuncList	function pointer list (SAD / SATD routines)
+ * \param	pLplayer	current SDqLayer, passed as void*
+ * \param	pLpme		SWelsME to search for, passed as void*
+ * \param	pLpslice	current SSlice, passed as void*
+ *
+ * \return	NONE
+ *
+ * \note	runs WelsMotionEstimateInitialPoint() first, then stores the SATD of the
+ *			chosen reference block plus the mvd cost into pMe->uiSatdCost
+ */
+
+void WelsMotionEstimateSearchSatd (SWelsFuncPtrList *pFuncList, void* pLplayer, void* pLpme, void* pLpslice)
+{
+	SDqLayer* pLayer	= (SDqLayer *)pLplayer;
+	SWelsME* pMeUnit	= (SWelsME *)pLpme;
+	SSlice* pCurSlice	= (SSlice *)pLpslice;
+	const int32_t kiEncStride = pLayer->iEncStride[0];
+	const int32_t kiRefStride = pLayer->pRefPic->iLineSize[0];
+
+	// integer search: updates pMeUnit->sMv and pMeUnit->pRefMb
+	WelsMotionEstimateInitialPoint ( pFuncList, pMeUnit, pCurSlice, kiEncStride, kiRefStride );
+
+	// distortion of the selected position measured with SATD
+	pMeUnit->uSadPredISatd.uiSatd = pFuncList->sSampleDealingFuncs.pfSampleSatd[pMeUnit->uiPixel]( pMeUnit->pEncMb, kiEncStride, pMeUnit->pRefMb, kiRefStride );
+
+	const int32_t kiMvdX = pMeUnit->sMv.iMvX - pMeUnit->sMvp.iMvX;
+	const int32_t kiMvdY = pMeUnit->sMv.iMvY - pMeUnit->sMvp.iMvY;
+	pMeUnit->uiSatdCost = pMeUnit->uSadPredISatd.uiSatd + COST_MVD(pMeUnit->pMvdCost, kiMvdX, kiMvdY);
+}
+
+
+/*!
+ * \brief	mb motion estimate search, SAD only (no SATD re-scoring)
+ *
+ * \param	pFuncList	function pointer list (SAD routines)
+ * \param	pLplayer	current SDqLayer, passed as void*
+ * \param	pLpme		SWelsME to search for, passed as void*
+ * \param	pLpslice	current SSlice, passed as void*
+ *
+ * \return	NONE
+ */
+void WelsMotionEstimateSearchSad (SWelsFuncPtrList *pFuncList, void* pLplayer, void* pLpme, void* pLpslice)
+{
+	SDqLayer* pLayer	= (SDqLayer *)pLplayer;
+	SWelsME* pMeUnit	= (SWelsME *)pLpme;
+	SSlice* pCurSlice	= (SSlice *)pLpslice;
+	const int32_t kiEncStride = pLayer->iEncStride[0];
+	const int32_t kiRefStride = pLayer->pRefPic->iLineSize[0];
+
+	// the whole search is the initial point test (which may chain into the iterative search)
+	WelsMotionEstimateInitialPoint ( pFuncList, pMeUnit, pCurSlice, kiEncStride, kiRefStride );
+}
+
+/*!
+ * \brief	mb motion estimate initial point testing
+ *
+ * \param	pFuncList	function pointer list, pfSampleSad[pMe->uiPixel] is used
+ * \param	pMe			Wels me information, sMv / uiSadCost / uiSatdCost / pRefMb are updated
+ * \param	pSlice		supplies the mv candidates (sMvc, uiMvcNum) and the mv range (sMvMin, sMvMax)
+ * \param	iStrideEnc	stride of the source block pMe->pEncMb
+ * \param	iStrideRef	stride of the reference picture
+ *
+ * \return	NONE
+ *
+ * \note	the predictor and every candidate are rounded to integer pel and clipped to the
+ *			slice range; when the best cost is below uSadPredISatd.uiSadPred the search stops
+ *			there, otherwise WelsMotionEstimateIterativeSearch() refines the result
+ */
+void WelsMotionEstimateInitialPoint(SWelsFuncPtrList *pFuncList, SWelsME * pMe, SSlice *pSlice, int32_t iStrideEnc, int32_t iStrideRef )
+{
+	PSampleSadSatdCostFunc pfSad	= pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiPixel];
+	const uint16_t *kpMvdCost		= pMe->pMvdCost;
+	uint8_t* const kpEncMb			= pMe->pEncMb;
+	const uint32_t kuiCandNum		= pSlice->uiMvcNum;
+	const SMVUnitXY *kpCandList		= &pSlice->sMvc[0];
+	const SMVUnitXY ksMvMin			= pSlice->sMvMin;
+	const SMVUnitXY ksMvMax			= pSlice->sMvMax;
+	const SMVUnitXY ksMvp			= pMe->sMvp;
+	SMVUnitXY sBestMv;
+	uint8_t *pBestRef;
+	int32_t iBestSadCost;
+	uint32_t uiCand;
+
+	//  Step 1: Initial point prediction, start from the (rounded, clipped) sMvp
+	sBestMv.iMvX	= WELS_CLIP3( (2 + ksMvp.iMvX) >> 2, ksMvMin.iMvX, ksMvMax.iMvX );
+	sBestMv.iMvY	= WELS_CLIP3( (2 + ksMvp.iMvY) >> 2, ksMvMin.iMvY, ksMvMax.iMvY );
+
+	pBestRef = &pMe->pRefMb[sBestMv.iMvY * iStrideRef + sBestMv.iMvX];
+
+	iBestSadCost = pfSad( kpEncMb, iStrideEnc, pBestRef, iStrideRef );
+	iBestSadCost += COST_MVD(kpMvdCost, ((sBestMv.iMvX)<<2) - ksMvp.iMvX, ((sBestMv.iMvY)<<2) - ksMvp.iMvY);
+
+	for ( uiCand = 0; uiCand < kuiCandNum; ++uiCand )
+	{
+		//clipping here is essential since some out-of-range MVC may happen here (i.e., refer to baseMV)
+		const int16_t kiCandX = WELS_CLIP3( ( 2 + kpCandList[uiCand].iMvX ) >> 2, ksMvMin.iMvX, ksMvMax.iMvX );
+		const int16_t kiCandY = WELS_CLIP3( ( 2 + kpCandList[uiCand].iMvY ) >> 2, ksMvMin.iMvY, ksMvMax.iMvY );
+
+		// a candidate equal to the current best position needs no evaluation
+		if ( kiCandX == sBestMv.iMvX && kiCandY == sBestMv.iMvY )
+			continue;
+
+		uint8_t *pCandRef = &pMe->pRefMb[kiCandY*iStrideRef+kiCandX];
+		const int32_t kiCandCost = pfSad( kpEncMb, iStrideEnc, pCandRef, iStrideRef ) +
+			COST_MVD(kpMvdCost, (kiCandX<<2) - ksMvp.iMvX, (kiCandY<<2) - ksMvp.iMvY);
+
+		if ( kiCandCost < iBestSadCost )
+		{
+			sBestMv.iMvX	= kiCandX;
+			sBestMv.iMvY	= kiCandY;
+			pBestRef		= pCandRef;
+			iBestSadCost	= kiCandCost;
+		}
+	}
+
+	pMe->sMv = sBestMv;
+	pMe->uiSadCost = iBestSadCost;
+
+	if ( iBestSadCost < pMe->uSadPredISatd.uiSadPred )
+	{
+		//  Step 2: Initial early Stop
+		pMe->sMv.iMvX <<= 2;	/* -> qpel mv */
+		pMe->sMv.iMvY <<= 2;
+		pMe->pRefMb = pBestRef;	/* -> pRef */
+		pMe->uiSatdCost = iBestSadCost;	/* the SAD based cost is taken as the real cost */
+	}
+	else
+	{
+		//  Step 3: Fast search pattern
+		WelsMotionEstimateIterativeSearch ( pFuncList, pMe, iStrideEnc, iStrideRef, pBestRef );
+	}
+}
+
+/*!
+ * \brief	pick the cheapest of the four neighbour positions of the current search centre
+ *
+ * \param	iSadCost	four SAD values, used with mvd offsets (0,-4), (0,+4), (-4,0), (+4,0) in that order
+ * \param	kpMvdCost	mvd cost table handed to COST_MVD
+ * \param	pBestCost	in: cost of the centre; out: lowest cost found
+ * \param	kiDx		mvd x of the centre
+ * \param	kiDy		mvd y of the centre
+ * \param	pIx			out: x step of the winner (written only when a neighbour wins)
+ * \param	pIy			out: y step of the winner (written only when a neighbour wins)
+ *
+ * \return	true when no neighbour is cheaper than the incoming *pBestCost
+ */
+bool_t WelsMeSadCostSelect( int32_t *iSadCost, const uint16_t *kpMvdCost, int32_t *pBestCost, const int32_t kiDx, const int32_t kiDy, int32_t *pIx, int32_t *pIy)
+{
+	// per neighbour: mvd offset fed to COST_MVD and the step reported to the caller
+	static const int32_t kiMvdOffX[4]	= {  0,  0, -4,  4 };
+	static const int32_t kiMvdOffY[4]	= { -4,  4,  0,  0 };
+	static const int32_t kiStepX[4]		= {  0,  0,  1, -1 };
+	static const int32_t kiStepY[4]		= {  1, -1,  0,  0 };
+
+	const int32_t kiCentreCost = *pBestCost;
+	int32_t iCandCost[4];
+	int32_t k;
+
+	// all four costs are formed before any output is touched
+	for ( k = 0; k < 4; ++k )
+	{
+		const int32_t kiCandDx = kiDx + kiMvdOffX[k];
+		const int32_t kiCandDy = kiDy + kiMvdOffY[k];
+		iCandCost[k] = iSadCost[k]+COST_MVD(kpMvdCost, kiCandDx, kiCandDy);
+	}
+
+	// strict '<' keeps the earliest neighbour on ties
+	for ( k = 0; k < 4; ++k )
+	{
+		if ( iCandCost[k] < *pBestCost )
+		{
+			*pBestCost = iCandCost[k];
+			*pIx = kiStepX[k];
+			*pIy = kiStepY[k];
+		}
+	}
+
+	return (*pBestCost==kiCentreCost);
+}
+
+/*!
+ * \brief	iterative four-neighbour integer pel search around the initial point
+ *
+ * \param	pFuncList	function pointer list, pfSample4Sad[pMe->uiPixel] is used
+ * \param	pMe			Wels me information; sMv (integer pel) and uiSadCost are read as the start,
+ *						sMv (qpel), uiSadCost, uiSatdCost and pRefMb are written on exit
+ * \param	kiStrideEnc	stride of the source block
+ * \param	kiStrideRef	stride of the reference picture
+ * \param	pFref		reference block matching the start position
+ *
+ * \return	NONE
+ *
+ * \note	at most ITERATIVE_TIMES rounds, stops as soon as no neighbour improves the cost
+ */
+void WelsMotionEstimateIterativeSearch( SWelsFuncPtrList *pFuncList, SWelsME *pMe, const int32_t kiStrideEnc, const int32_t kiStrideRef, uint8_t *pFref )
+{
+	PSample4SadCostFunc pfSad4	= pFuncList->sSampleDealingFuncs.pfSample4Sad[pMe->uiPixel];
+	uint8_t* const kpEncMb		= pMe->pEncMb;
+	const uint16_t *kpMvdCost	= pMe->pMvdCost;
+
+	// mvd of the current centre in qpel units
+	int32_t iMvdX = ((pMe->sMv.iMvX)<<2) - pMe->sMvp.iMvX;
+	int32_t iMvdY = ((pMe->sMv.iMvY)<<2) - pMe->sMvp.iMvY;
+
+	uint8_t *pCentreRef		= pFref;
+	int32_t iCentreCost		= (pMe->uiSadCost);
+	ENFORCE_STACK_ALIGN_1D(int32_t, iSadCosts, 4, 16)	
+
+	for ( int32_t iRoundsLeft = ITERATIVE_TIMES; iRoundsLeft-- != 0; )
+	{
+		int32_t iStepX, iStepY;
+
+		pfSad4( kpEncMb, kiStrideEnc, pCentreRef, kiStrideRef, &iSadCosts[0] );
+
+		// true means the centre is still the best, the steps are not valid then
+		if ( WelsMeSadCostSelect( iSadCosts, kpMvdCost, &iCentreCost, iMvdX, iMvdY, &iStepX, &iStepY ) )
+			break;
+
+		iMvdX -= iStepX<<2 ;
+		iMvdY -= iStepY<<2 ;
+
+		pCentreRef -= (iStepX+iStepY*kiStrideRef);
+	}
+
+	/* -> qpel mv */
+	pMe->sMv.iMvX = (iMvdX + pMe->sMvp.iMvX) & 0xFFFC;
+	pMe->sMv.iMvY = (iMvdY + pMe->sMvp.iMvY) & 0xFFFC;
+	pMe->uiSatdCost = pMe->uiSadCost = (iCentreCost);
+	pMe->pRefMb = pCentreRef;
+}
+
+} // namespace WelsSVCEnc
--- /dev/null
+++ b/codec/encoder/core/src/svc_set_mb_syn_cavlc.cpp
@@ -1,0 +1,385 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	svc_set_mb_syn_cavlc.cpp
+ *
+ * \brief	Setting all syntax elements of mb and writing residual with cavlc
+ *
+ * \date	2009.8.12 Created 
+ *
+ *************************************************************************************
+ */
+
+#include "svc_enc_golomb.h"
+#include "vlc_encoder.h"
+#include "ls_defines.h"
+#include "svc_set_mb_syn_cavlc.h"
+
+namespace WelsSVCEnc {
+// Lookup tables indexed by the macroblock's uiCbp (0..47); WelsSpatialWriteMbSyn() writes the
+// looked-up value with BsWriteUE().
+// NOTE(review): presumably the coded_block_pattern -> codeNum mapping of H.264 Table 9-4 — confirm.
+
+// used for MB_TYPE_INTRA4x4 macroblocks
+const uint32_t g_kuiIntra4x4CbpMap[48] =
+{
+	3, 29, 30, 17, 31, 18, 37,  8, 32, 38, 19,  9, 20, 10, 11, 2, //15
+	16, 33, 34, 21, 35, 22, 39,  4, 36, 40, 23,  5, 24,  6,  7, 1, //31
+	41, 42, 43, 25, 44, 26, 46, 12, 45, 47, 27, 13, 28, 14, 15, 0  //47
+};
+
+// used for every macroblock that is neither intra 4x4 nor intra 16x16
+const uint32_t g_kuiInterCbpMap[48] = 
+{
+	0,  2,  3,  7,  4,  8, 17, 13,  5, 18,  9, 14, 10, 15, 16, 11, //15
+	1, 32, 33, 36, 34, 37, 44, 40, 35, 45, 38, 41, 39, 42, 43, 19, //31
+	6, 24, 25, 20, 26, 21, 46, 28, 27, 47, 22, 29, 23, 30, 31, 12 //47
+};	
+
+//============================Enhance Layer CAVLC Writing===========================
+/*!
+ * \brief	write mb type and prediction information of one macroblock (all types but inter 8x8)
+ *
+ * \param	pEncCtx		encoder context (not referenced in this function)
+ * \param	pSlice		current slice: supplies the bit string, the mb cache and the slice header
+ * \param	pCurMb		macroblock to write
+ *
+ * \return	NONE
+ *
+ * \note	nothing is written for slice types other than I_SLICE / P_SLICE, nor for mb types
+ *			that are not handled by the switch below
+ */
+void WelsSpatialWriteMbPred( sWelsEncCtx *pEncCtx, SSlice *pSlice, SMB *pCurMb )
+{
+	SMbCache* pMbCache	= &pSlice->sMbCacheInfo;
+	SBitStringAux *pBs	= pSlice->pSliceBsa;
+	SSliceHeaderExt* pSliceHeadExt = &pSlice->sSliceHeaderExt;
+	int32_t iNumRefIdxl0ActiveMinus1 = pSliceHeadExt->sSliceHeader.uiNumRefIdxL0Active - 1;
+
+	Mb_Type uiMbType = pCurMb->uiMbType;
+	// uiCbp: chroma part above bit 4, luma part in the low four bits
+	int32_t iCbpChroma = pCurMb->uiCbp >> 4;
+	int32_t iCbpLuma   = pCurMb->uiCbp & 15;
+	int32_t i = 0;
+
+	SMVUnitXY sMvd[2];
+    bool_t* pPredFlag;
+    int8_t* pRemMode;
+
+	// added to the intra mb type codes: 0 in I slices, 5 in P slices
+	int32_t iMbOffset = 0;
+
+	switch( pSliceHeadExt->sSliceHeader.eSliceType )
+    {
+        case I_SLICE:
+            iMbOffset = 0;
+            break;
+        case P_SLICE:
+            iMbOffset = 5;
+            break;
+        default:
+            return;
+    }	
+
+	switch ( uiMbType )
+	{		
+	case MB_TYPE_INTRA4x4:			
+		/* mb type */
+        BsWriteUE( pBs, iMbOffset + 0 );
+		
+        /* prediction: luma */
+        // one flag per 4x4 block (16 in total); the 3-bit mode is sent only when the flag is 0
+        pPredFlag = &pMbCache->pPrevIntra4x4PredModeFlag[0];
+        pRemMode  = &pMbCache->pRemIntra4x4PredModeFlag[0];
+		do
+        {
+            BsWriteOneBit( pBs, *pPredFlag );  /* b_prev_intra4x4_pred_mode */
+ 
+			if ( !*pPredFlag )
+            {
+                BsWriteBits( pBs, 3, *pRemMode );
+            }
+			
+			pPredFlag++;
+			pRemMode++;
+			++ i;
+        } while (i < 16);
+
+        /* prediction: chroma */		
+		BsWriteUE( pBs, g_kiMapModeIntraChroma[pMbCache->uiChmaI8x8Mode] );
+
+		break;
+
+	case MB_TYPE_INTRA16x16:		
+		/* mb type */
+		// the code combines the luma 16x16 mode, the chroma cbp (x4) and 12 when any luma cbp bit is set
+		BsWriteUE( pBs, 1 + iMbOffset + g_kiMapModeI16x16[pMbCache->uiLumaI16x16Mode] + (iCbpChroma << 2) + ( iCbpLuma == 0 ? 0 : 12 ) );
+		
+        /* prediction: chroma */		
+		BsWriteUE( pBs, g_kiMapModeIntraChroma[pMbCache->uiChmaI8x8Mode] );
+
+		break;
+
+	case MB_TYPE_16x16:	
+        BsWriteUE( pBs, 0 );//uiMbType
+		// mvd from the mb motion vector and its predictor
+		sMvd[0].sDeltaMv(pCurMb->sMv[0], pMbCache->sMbMvp[0]);
+
+		// the reference index is sent only when more than one reference is active
+		if ( iNumRefIdxl0ActiveMinus1 > 0 )
+		{
+			BsWriteTE( pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[0] );
+		}
+
+		BsWriteSE( pBs, sMvd[0].iMvX);
+		BsWriteSE( pBs, sMvd[0].iMvY);
+		
+		break;
+
+	case MB_TYPE_16x8:
+		BsWriteUE( pBs, 1 );//uiMbType
+		
+		// second partition reads sMv[8] and pRefIndex[2]
+		sMvd[0].sDeltaMv(pCurMb->sMv[0], pMbCache->sMbMvp[0]);
+		sMvd[1].sDeltaMv(pCurMb->sMv[8], pMbCache->sMbMvp[1]);
+
+		if ( iNumRefIdxl0ActiveMinus1 > 0 )
+		{
+			BsWriteTE( pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[0] );
+			BsWriteTE( pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[2] );
+		}
+		BsWriteSE( pBs, sMvd[0].iMvX );//block0
+		BsWriteSE( pBs, sMvd[0].iMvY );
+		BsWriteSE( pBs, sMvd[1].iMvX );//block1
+		BsWriteSE( pBs, sMvd[1].iMvY );
+		
+		break;
+
+	case MB_TYPE_8x16:		
+		BsWriteUE( pBs, 2 );//uiMbType
+		
+		// second partition reads sMv[2] and pRefIndex[1]
+		sMvd[0].sDeltaMv(pCurMb->sMv[0], pMbCache->sMbMvp[0]);
+		sMvd[1].sDeltaMv(pCurMb->sMv[2], pMbCache->sMbMvp[1]);
+
+		if ( iNumRefIdxl0ActiveMinus1 > 0 )
+		{
+			BsWriteTE( pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[0] );
+			BsWriteTE( pBs, iNumRefIdxl0ActiveMinus1, pCurMb->pRefIndex[1] );
+		}
+		BsWriteSE( pBs, sMvd[0].iMvX );//block0
+		BsWriteSE( pBs, sMvd[0].iMvY );
+		BsWriteSE( pBs, sMvd[1].iMvX );//block1
+		BsWriteSE( pBs, sMvd[1].iMvY );
+		
+		break;
+	}
+}
+
+/*!
+ * \brief	write mb type, sub mb types, reference indices and mvds of an inter 8x8 macroblock
+ *
+ * \param	pEncCtx		encoder context (not referenced in this function)
+ * \param	pSlice		current slice: supplies the bit string, the mb cache and the slice header
+ * \param	pCurMb		macroblock to write
+ *
+ * \return	NONE
+ */
+void WelsSpatialWriteSubMbPred( sWelsEncCtx *pEncCtx, SSlice *pSlice, SMB *pCurMb )
+{
+	SMbCache* pCache			= &pSlice->sMbCacheInfo;
+	SBitStringAux *pBitStream	= pSlice->pSliceBsa;
+	SSliceHeaderExt* pHeaderExt	= &pSlice->sSliceHeaderExt;
+	const int32_t kiMaxRefIdx	= pHeaderExt->sSliceHeader.uiNumRefIdxL0Active - 1;
+	// the four reference indices are tested with a single 32-bit load
+	const bool_t kbAnyRefNonZero	= ( LD32(pCurMb->pRefIndex) != 0 );
+	int32_t iBlk8;
+
+	/* mb type: 4 when every reference index is zero, 3 otherwise */
+	if ( kbAnyRefNonZero )
+	{
+		BsWriteUE( pBitStream, 3 );
+	}
+	else
+	{
+		BsWriteUE( pBitStream, 4 );
+	}
+
+	//step 1: sub_mb_type, always 0 for each of the four 8x8 blocks
+	for ( iBlk8 = 0; iBlk8 < 4; ++iBlk8 )
+	{
+		BsWriteUE( pBitStream, 0 );
+	}
+
+	//step 2: reference indices, only with mb type 3 and more than one active reference
+	if ( kiMaxRefIdx > 0 && kbAnyRefNonZero )
+	{
+		for ( iBlk8 = 0; iBlk8 < 4; ++iBlk8 )
+		{
+			BsWriteTE( pBitStream, kiMaxRefIdx, pCurMb->pRefIndex[iBlk8] );
+		}
+	}
+
+	//step 3: one mvd per 8x8 block, taken at scan positions 0, 4, 8 and 12
+	for ( iBlk8 = 0; iBlk8 < 4; ++iBlk8 )
+	{
+		const uint8_t kuiBlk4Idx = g_kuiMbCountScan4Idx[iBlk8 << 2];
+		BsWriteSE( pBitStream, pCurMb->sMv[kuiBlk4Idx].iMvX - pCache->sMbMvp[iBlk8].iMvX );
+		BsWriteSE( pBitStream, pCurMb->sMv[kuiBlk4Idx].iMvY - pCache->sMbMvp[iBlk8].iMvY );
+	}
+}
+
+//============================Base Layer CAVLC Writing===============================
+/*!
+ * \brief	write the complete syntax of one macroblock: type/prediction, cbp, delta qp, residual
+ *
+ * \param	pEncCtx		encoder context (the current layer's PPS chroma qp offset is read)
+ * \param	pSlice		current slice: supplies the bit string, the mb cache and uiLastMbQp
+ * \param	pCurMb		macroblock to write; uiLumaQp / uiChromaQp are rewritten when no
+ *						delta qp is sent
+ *
+ * \return	NONE
+ */
+void WelsSpatialWriteMbSyn( sWelsEncCtx *pEncCtx, SSlice *pSlice, SMB *pCurMb )
+{
+	SBitStringAux *pBitStream	= pSlice->pSliceBsa;
+	SMbCache* pCache			= &pSlice->sMbCacheInfo;
+	const Mb_Type kuiMbType		= pCurMb->uiMbType;
+
+	/* Step 1: write mb type and pred */
+	if ( IS_Inter_8x8(kuiMbType))
+		WelsSpatialWriteSubMbPred( pEncCtx, pSlice, pCurMb );
+	else
+		WelsSpatialWriteMbPred( pEncCtx, pSlice, pCurMb );
+
+	/* Step 2: write coded block pattern (intra 16x16 carries it inside the mb type) */
+	if( IS_INTRA4x4 ( kuiMbType ) )
+	{
+		BsWriteUE( pBitStream, g_kuiIntra4x4CbpMap[pCurMb->uiCbp] );
+	}
+	else if( !IS_INTRA16x16(kuiMbType) )
+	{
+		BsWriteUE( pBitStream, g_kuiInterCbpMap[pCurMb->uiCbp] );
+	}
+
+	/* Step 3: write QP and residual */
+	if( pCurMb->uiCbp > 0 || IS_INTRA16x16(kuiMbType) )
+	{
+		// the delta is taken against the previous qp before that one is replaced
+		const int32_t kiQpDelta = pCurMb->uiLumaQp - pSlice->uiLastMbQp;
+		pSlice->uiLastMbQp = pCurMb->uiLumaQp;
+
+		BsWriteSE( pBitStream, kiQpDelta );
+		WelsWriteMbResidual( pCache, pCurMb, pBitStream );
+	}
+	else
+	{
+		// no delta qp is sent: the mb takes over the last qp, chroma qp is derived again
+		pCurMb->uiLumaQp = pSlice->uiLastMbQp;
+		pCurMb->uiChromaQp = g_kuiChromaQpTable[CLIP3_QP_0_51(pCurMb->uiLumaQp + pEncCtx->pCurDqLayer->sLayerInfo.pPpsP->uiChromaQpIndexOffset)];
+	}
+}
+
+/*!
+ * \brief	write the residual blocks of one macroblock with WriteBlockResidualCavlc()
+ *
+ * \param	sMbCacheInfo	mb cache: supplies the coefficient blocks (pDct) and iNonZeroCoeffCount
+ * \param	pCurMb			macroblock to write, uiMbType and uiCbp are read
+ * \param	pBs				destination bit string
+ *
+ * \return	NONE
+ *
+ * \note	iNonZeroCoeffCount is addressed through g_kuiCache48CountScan4Idx; for each block the
+ *			entries at index-1 and index-8 are combined by WELS_NON_ZERO_COUNT_AVERAGE
+ *			(presumably the left and top neighbours in an 8 wide cache — TODO confirm)
+ */
+void WelsWriteMbResidual( SMbCache* sMbCacheInfo, SMB *pCurMb, SBitStringAux *pBs )
+{
+	int32_t i;
+	Mb_Type uiMbType					= pCurMb->uiMbType;	
+	// uiCbp: chroma part above bit 4, luma part in the low four bits
+	const int32_t kiCbpChroma		= pCurMb->uiCbp >> 4;
+	const int32_t kiCbpLuma			= pCurMb->uiCbp & 0x0F;
+	int8_t *pNonZeroCoeffCount	= sMbCacheInfo->iNonZeroCoeffCount;
+	int16_t *pBlock;
+	int8_t iA, iB, iC;
+
+	if ( IS_INTRA16x16(uiMbType) )
+	{		
+        /* DC luma */
+		// always written for intra 16x16, whatever the cbp says
+		iA = pNonZeroCoeffCount[8];
+		iB = pNonZeroCoeffCount[ 1];
+		WELS_NON_ZERO_COUNT_AVERAGE(iC,iA,iB);
+		WriteBlockResidualCavlc( sMbCacheInfo->pDct->iLumaI16x16Dc, 15, 1, LUMA_4x4, iC, pBs);
+
+		/* AC Luma */
+        if( kiCbpLuma )
+        {
+			pBlock = sMbCacheInfo->pDct->iLumaBlock[0];		
+		
+			// 16 blocks, 16 coefficients apart
+			for( i=0;i<16;i++ )
+            {
+				int32_t iIdx = g_kuiCache48CountScan4Idx[i];
+				iA = pNonZeroCoeffCount[iIdx-1];
+				iB = pNonZeroCoeffCount[iIdx-8];
+				WELS_NON_ZERO_COUNT_AVERAGE(iC,iA,iB);
+				WriteBlockResidualCavlc( pBlock, 14, pNonZeroCoeffCount[iIdx]>0, LUMA_AC, iC, pBs);
+				pBlock += 16;				
+            } 
+        }		
+	}
+	else
+	{
+        /* Luma DC AC */
+        if ( kiCbpLuma )
+		{			
+			pBlock = sMbCacheInfo->pDct->iLumaBlock[0];
+			
+			// one iteration per 8x8 block; its four 4x4 blocks are written only when the cbp bit is set
+			for( i=0; i<16; i+=4 )
+            {
+				if( kiCbpLuma & (1 << (i >> 2)) )
+				{
+					int32_t iIdx = g_kuiCache48CountScan4Idx[i];
+					// counts of the four 4x4 blocks of this 8x8 block (cache offsets 0, +1, +8, +9)
+					const int8_t kiA = pNonZeroCoeffCount[iIdx];
+					const int8_t kiB = pNonZeroCoeffCount[iIdx+1];
+					const int8_t kiC = pNonZeroCoeffCount[iIdx+8];
+					const int8_t kiD = pNonZeroCoeffCount[iIdx+9];
+					// block at offset 0: context from cache entries -1 and -8
+					iA = pNonZeroCoeffCount[iIdx-1];
+					iB = pNonZeroCoeffCount[iIdx-8];
+					WELS_NON_ZERO_COUNT_AVERAGE(iC,iA,iB);
+					WriteBlockResidualCavlc( pBlock, 15, kiA>0, LUMA_4x4, iC, pBs );
+
+					// block at offset +1: context from the offset 0 block and cache entry -7
+					iA = kiA;
+					iB = pNonZeroCoeffCount[iIdx-7];
+					WELS_NON_ZERO_COUNT_AVERAGE(iC,iA,iB);
+					WriteBlockResidualCavlc( pBlock + 16, 15, kiB>0, LUMA_4x4, iC, pBs );
+
+					// block at offset +8: context from cache entry +7 and the offset 0 block
+					iA = pNonZeroCoeffCount[iIdx+7];
+					iB = kiA;
+					WELS_NON_ZERO_COUNT_AVERAGE(iC,iA,iB);
+					WriteBlockResidualCavlc( pBlock + 32, 15, kiC>0, LUMA_4x4, iC, pBs );
+
+					// block at offset +9: context from the offset +8 and offset +1 blocks
+					iA = kiC;
+					iB = kiB;
+					WELS_NON_ZERO_COUNT_AVERAGE(iC,iA,iB);
+					WriteBlockResidualCavlc( pBlock + 48, 15, kiD>0, LUMA_4x4, iC, pBs );
+				}
+				// advance by the four blocks of the 8x8 block, written or not
+				pBlock += 64;				
+           } 
+        }				
+	}
+
+    if( kiCbpChroma )
+    {
+        /* Chroma DC residual present */
+		pBlock = sMbCacheInfo->pDct->iChromaDc[0]; // Cb
+        WriteBlockResidualCavlc( pBlock, 3, 1, CHROMA_DC, CHROMA_DC_NC_OFFSET, pBs );
+		
+		pBlock += 4; // Cr
+		WriteBlockResidualCavlc( pBlock, 3, 1, CHROMA_DC, CHROMA_DC_NC_OFFSET, pBs );
+ 
+		/* Chroma AC residual present */
+        if( kiCbpChroma & 0x02 ) 
+        {
+			// chroma entries start at position 16 of the scan table
+			const uint8_t *kCache48CountScan4Idx16base = &g_kuiCache48CountScan4Idx[16];
+			pBlock = sMbCacheInfo->pDct->iChromaBlock[0]; // Cb
+			
+			for( i=0; i<4; i++ )
+            {
+				int32_t iIdx = kCache48CountScan4Idx16base[i];
+				iA = pNonZeroCoeffCount[iIdx-1];
+				iB = pNonZeroCoeffCount[iIdx-8];
+				WELS_NON_ZERO_COUNT_AVERAGE(iC,iA,iB);
+				WriteBlockResidualCavlc( pBlock, 14, pNonZeroCoeffCount[iIdx]>0, CHROMA_AC, iC, pBs );
+				pBlock += 16;			
+            }
+
+			pBlock = sMbCacheInfo->pDct->iChromaBlock[4]; // Cr
+		
+			for( i=0;i<4;i++ )
+            {
+				// Cr counts sit 24 cache entries after the Cb ones
+				int32_t iIdx = 24+kCache48CountScan4Idx16base[i];
+				iA = pNonZeroCoeffCount[iIdx-1];
+				iB = pNonZeroCoeffCount[iIdx-8];
+				WELS_NON_ZERO_COUNT_AVERAGE(iC,iA,iB);
+				WriteBlockResidualCavlc( pBlock, 14,pNonZeroCoeffCount[iIdx]>0, CHROMA_AC, iC, pBs );
+				pBlock += 16;			
+            }
+        }
+    }	
+}
+
+} // namespace WelsSVCEnc
--- /dev/null
+++ b/codec/encoder/core/src/utils.cpp
@@ -1,0 +1,513 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	utils.cpp
+ *
+ * \brief	common tool/function utilization
+ *
+ * \date	03/10/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <assert.h>
+#include <math.h>
+#include <time.h>
+#if defined(WIN32)
+#include <windows.h>
+#include <sys/types.h>
+#include <sys/timeb.h>
+#else
+#include <sys/time.h>
+#endif
+
+#include "utils.h"
+#include "macros.h"
+#include "wels_const.h"
+#include "property.h"
+#include "cpu_core.h"
+#include "encoder_context.h"
+#include "as264_common.h"
+#include "property.h"
+#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross platforms
+
+
+namespace WelsSVCEnc {
+
+// forward declarations of the two built-in log sinks defined further down in this file
+void WelsLogDefault( void *pCtx, const int32_t kiLevel, const str_t *kpFmtStr, va_list argv );
+void WelsLogNil( void *pCtx, const int32_t kiLevel, const str_t *kpFmtStr, va_list argv );
+
+// forward declaration, defined at the end of this file
+real32_t WelsCalcPsnr(	const void *kpTarPic,
+							const int32_t kiTarStride,
+							const void *kpRefPic,
+							const int32_t kiRefStride,
+							const int32_t kiWidth,
+							const int32_t kiHeight	);
+
+// to fill default routines
+// wlog is the sink WelsLogCall() forwards to; it can be replaced through WelsSetLogCallback()
+#ifdef ENABLE_TRACE_FILE
+PWelsLogCallbackFunc wlog	= WelsLogDefault;
+#else
+PWelsLogCallbackFunc wlog	= WelsLogNil;
+#endif
+
+// bit mask of the enabled log levels, see WelsSetLogLevel() / WelsGetLogLevel()
+iWelsLogLevel		g_iLevelLog	= WELS_LOG_DEFAULT;	// default log iLevel
+// NOTE(review): not referenced anywhere in the visible part of this file — confirm it is still used
+int32_t			g_iSizeLogBuf	= 1024;			// pBuffer size for each log output
+
+/*
+ *	Log output routines
+ */
+
+/*!
+ * \brief	get log tag
+ * \param	kiLevel		log iLevel, must be exactly one bit (1 << n) with n < WELS_LOG_LEVEL_COUNT
+ * \param	pBit		out: bit position n of kiLevel, written only on success
+ * \return  tag of log iLevel (g_sWelsLogTags[n]), NULL when kiLevel is not a single valid bit
+ *          or pBit is NULL
+ */
+static inline str_t *GetLogTag( const int32_t kiLevel, int32_t *pBit )
+{
+	int32_t iBitPos;
+
+	if ( NULL == pBit || kiLevel <= 0 || kiLevel > (1 << (WELS_LOG_LEVEL_COUNT-1)) )
+		return NULL;
+
+	// look for the single bit position that reproduces kiLevel
+	for ( iBitPos = 0; iBitPos < WELS_LOG_LEVEL_COUNT; ++ iBitPos )
+	{
+		if ( (1 << iBitPos) == kiLevel )
+		{
+			*pBit	= iBitPos;
+			return (str_t *)g_sWelsLogTags[iBitPos];
+		}
+	}
+
+	return NULL;
+}
+
+/*! 
+ *************************************************************************************
+ * \brief	System trace log output in Wels
+ *
+ * \param	pCtx	instance pointer (an sWelsEncCtx); nothing is logged when it is NULL
+ * \param	kiLevel	log iLevel ( WELS_LOG_QUIET, ERROR, WARNING, INFO, DEBUG ); nothing is
+ *					logged when it shares no bit with g_iLevelLog
+ * \param	kpFmtStr	printf-style format string of the message
+ * \param 	argv	arguments consumed by kpFmtStr
+ *
+ * \return	NONE
+ *
+ * \note	The line is assembled in a stack buffer of WELS_LOG_BUF_SIZE characters as
+ *			"<code name> <lib name> v<version> <date time>.<ms>]: <tag> <message>"
+ *			(MSVC builds put "[0x<ctx> @ " in front). It is written to pEncCtx->pFileLog
+ *			only when ENABLE_TRACE_FILE is defined. Every append is bounded by the space
+ *			left in the buffer; a message that does not fit is truncated (GCC) or
+ *			dropped (MSVC >= 1500).
+ *************************************************************************************
+ */
+void WelsLogDefault( void *pCtx, const int32_t kiLevel, const str_t *kpFmtStr, va_list argv )
+{
+	sWelsEncCtx *pEncCtx	= (sWelsEncCtx *)pCtx;
+	iWelsLogLevel		 iVal	= (kiLevel & g_iLevelLog);
+
+	if ( 0 == iVal || NULL == pEncCtx )	// such iLevel not enabled
+	{
+		return;
+	}
+	else
+	{
+		str_t pBuf[WELS_LOG_BUF_SIZE+1] = {0};		
+		const int32_t kiBufSize = sizeof(pBuf) / sizeof(pBuf[0]) - 1;
+		int32_t iCurUsed = 0;
+		int32_t iBufUsed = 0;
+		int32_t iBufLeft = kiBufSize - iBufUsed;	// characters still free, terminator excluded
+		
+		if ( pEncCtx ){
+			time_t l_time;
+#if defined(WIN32)
+#if defined(_MSC_VER)
+#if _MSC_VER >= 1500
+			struct tm t_now;
+#else//VC6
+			struct tm* t_now;
+#endif//_MSC_VER >= 1500
+#endif//_MSC_VER
+#else//__GNUC__
+			struct tm* t_now;
+#endif//WIN32			
+			
+#if defined( WIN32 )
+			struct _timeb tb;
+			
+			time(&l_time);
+#ifdef _MSC_VER
+#if _MSC_VER >= 1500
+			LOCALTIME(&t_now, &l_time);
+#else
+			t_now = LOCALTIME(&l_time);
+			if ( NULL == t_now )
+			{
+				return;
+			}
+#endif//_MSC_VER >= 1500
+#endif//_MSC_VER			
+			FTIME(&tb);
+#elif defined( __GNUC__ )
+			struct timeval tv;
+			time(&l_time);
+			t_now = (struct tm *)LOCALTIME(&l_time);
+			if ( NULL == t_now )	// same guard as the VC6 path: strftime must not get NULL
+			{
+				return;
+			}
+			gettimeofday(&tv,NULL);
+#endif//WIN32
+			// NOTE(review): the "[0x%p @ " prefix is emitted under _MSC_VER only, other
+			// compilers start the line without it although the closing "]: " is still written
+			if (iBufLeft > 0){
+#ifdef _MSC_VER
+#if _MSC_VER >= 1500
+				iCurUsed = SNPRINTF( &pBuf[iBufUsed], iBufLeft, iBufLeft, "[0x%p @ ", pEncCtx );	// confirmed_safe_unsafe_usage
+#else
+				iCurUsed = SNPRINTF( &pBuf[iBufUsed], iBufLeft, "[0x%p @ ", pEncCtx );	// confirmed_safe_unsafe_usage
+#endif//_MSC_VER >= 1500
+#endif//_MSC_VER
+				if (iCurUsed >= 0){
+					iBufUsed += iCurUsed;
+					iBufLeft -= iCurUsed;
+				}				
+			}
+			else{
+				return;
+			}
+
+			// code name, lib name and version, separated by blanks; every single append is
+			// done only while there is room left
+			if ( iBufLeft > 0 ){			
+				iCurUsed = GetCodeName( &pBuf[iBufUsed], iBufLeft );
+				if ( iCurUsed > 0 ){
+					iBufUsed += iCurUsed;
+					iBufLeft -= iCurUsed;
+				}
+			}
+			if ( iBufLeft > 0 ){
+				pBuf[iBufUsed] = ' ';
+				++ iBufUsed;
+				-- iBufLeft;
+			}
+			if ( iBufLeft > 0 ){
+				iCurUsed = GetLibName( &pBuf[iBufUsed], iBufLeft );
+				if ( iCurUsed > 0 ){
+					iBufUsed += iCurUsed;
+					iBufLeft -= iCurUsed;
+				}
+			}
+			if ( iBufLeft > 0 ){
+				pBuf[iBufUsed] = ' ';
+				++ iBufUsed;
+				-- iBufLeft;
+			}
+			if ( iBufLeft > 0 ){
+				pBuf[iBufUsed] = 'v';
+				++ iBufUsed;
+				-- iBufLeft;		
+			}
+			if ( iBufLeft > 0 ){
+				iCurUsed = GetVerNum( &pBuf[iBufUsed], iBufLeft );
+				if ( iCurUsed > 0 ){
+					iBufUsed += iCurUsed;
+					iBufLeft -= iCurUsed;
+				}
+			}
+			if ( iBufLeft > 0 ){
+				pBuf[iBufUsed] = ' ';
+				++ iBufUsed;
+				-- iBufLeft;				
+			}
+
+			if (iBufLeft > 0){
+#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER >= 1500)
+				iCurUsed = strftime(&pBuf[iBufUsed], iBufLeft, "%y-%m-%d %H:%M:%S", &t_now);
+#else
+				iCurUsed = strftime(&pBuf[iBufUsed], iBufLeft, "%y-%m-%d %H:%M:%S", t_now);
+#endif//WIN32..
+				if (iCurUsed > 0){
+					iBufUsed += iCurUsed;
+					iBufLeft -= iCurUsed;
+				}
+			}
+			else{
+				return;
+			}
+
+			if (iBufLeft > 0){
+				iCurUsed = 0;	// do not count the strftime result twice when no branch below applies
+#if defined (WIN32)
+#ifdef _MSC_VER
+#if _MSC_VER >= 1500
+				iCurUsed = SNPRINTF(&pBuf[iBufUsed], iBufLeft, iBufLeft, ".%03.3u]: ", tb.millitm);	// confirmed_safe_unsafe_usage
+#else
+				iCurUsed = SNPRINTF(&pBuf[iBufUsed], iBufLeft, ".%3.3u]: ", tb.millitm);	// confirmed_safe_unsafe_usage
+#endif//_MSC_VER >= 1500
+#endif//_MSC_VER
+#elif defined (__GNUC__)
+				// tv_usec is wider than int on LP64 targets, %u needs an unsigned int
+				iCurUsed = SNPRINTF(&pBuf[iBufUsed], iBufLeft, ".%3.3u]: ", (unsigned int)(tv.tv_usec/1000));	// confirmed_safe_unsafe_usage
+#endif//WIN32
+				// a C99 snprintf reports the untruncated length, keep the bookkeeping inside pBuf
+				if (iCurUsed >= iBufLeft){
+					iCurUsed = iBufLeft - 1;
+				}
+				if (iCurUsed >= 0){
+					iBufUsed += iCurUsed;
+					iBufLeft -= iCurUsed;
+				}
+			}
+			else{
+				return;
+			}
+		}
+
+		// fixed stack corruption issue on vs2008
+		if ( iBufLeft > 0 ){
+			int32_t i_shift = 0;			
+			str_t *pStr = NULL;
+			pStr	= GetLogTag( kiLevel, &i_shift );
+			// GetLogTag() returns NULL when kiLevel is not a single level bit
+			if ( NULL != pStr ){
+				const int32_t kiLenTag = STRNLEN( pStr, 8 );	// confirmed_safe_unsafe_usage
+				// tag plus the separating blank must fit
+				if ( kiLenTag + 1 <= iBufLeft ){
+					STRCAT( &pBuf[iBufUsed], iBufLeft, pStr );	// confirmed_safe_unsafe_usage
+					iBufUsed += kiLenTag;
+					pBuf[iBufUsed] = ' ';
+					iBufUsed++;
+					iBufLeft -= kiLenTag + 1;
+				}
+			}			
+		}
+		if (iBufLeft > 0){
+#if defined(WIN32) && defined(_MSC_VER) && (_MSC_VER >= 1500)
+			// _vscprintf doesn't count the terminating '\0'
+			const int32_t kiLen = _vscprintf( kpFmtStr, argv ) + 1;
+			// format only when the whole message fits into what is left of pBuf
+			if ( kiLen > 0 && kiLen <= iBufLeft )
+				iCurUsed = VSPRINTF(&pBuf[iBufUsed], kiLen, kpFmtStr, argv);	// confirmed_safe_unsafe_usage
+			else
+				iCurUsed = 0;
+#elif defined(__GNUC__)
+			// bounded formatting: at most iBufLeft-1 characters plus the terminator are stored
+			iCurUsed = vsnprintf(&pBuf[iBufUsed], iBufLeft, kpFmtStr, argv);
+			if ( iCurUsed >= iBufLeft )	// message was truncated
+				iCurUsed = iBufLeft - 1;
+#else
+			// NOTE(review): VSPRINTF gets no size here, a message longer than iBufLeft overruns pBuf
+			iCurUsed = VSPRINTF(&pBuf[iBufUsed], kpFmtStr, argv);	// confirmed_safe_unsafe_usage
+#endif//WIN32..
+			if (iCurUsed > 0){
+				iBufUsed += iCurUsed;
+				iBufLeft -= iCurUsed;
+			}
+		}
+#ifdef ENABLE_TRACE_FILE
+		if (NULL != pEncCtx && NULL != pEncCtx->pFileLog){
+			// start over at the beginning of the file once the size limit is exceeded
+			if ( pEncCtx->uiSizeLog > MAX_TRACE_LOG_SIZE){
+				if (0 == fseek(pEncCtx->pFileLog, 0L, SEEK_SET))
+					pEncCtx->uiSizeLog = 0;
+			}
+			if ( iBufUsed > 0 && iBufUsed < WELS_LOG_BUF_SIZE )
+			{
+				iCurUsed = fwrite(pBuf, 1, iBufUsed, pEncCtx->pFileLog);
+				fflush( pEncCtx->pFileLog );
+				if ( iCurUsed == iBufUsed )
+					pEncCtx->uiSizeLog += iBufUsed;
+			}			
+		}
+		else{
+#if defined(WIN32) && defined(_DEBUG)
+			OutputDebugStringA(pBuf);
+#endif
+		}
+#endif//ENABLE_TRACE_FILE
+	}	
+}
+/*!
+ * \brief	log sink that discards everything (default when ENABLE_TRACE_FILE is not defined)
+ */
+void WelsLogNil( void *pCtx, const int32_t kiLevel, const str_t *kpFmtStr, va_list argv )
+{
+	// intentionally empty, all arguments are ignored
+	(void)pCtx;
+	(void)kiLevel;
+	(void)kpFmtStr;
+	(void)argv;
+}
+
+/*! 
+*************************************************************************************
+* \brief	reopen log file when finish setting current path
+*
+* \param	pCtx		context pCtx (an sWelsEncCtx); ignored when NULL
+* \param	pCurPath	current path string (directory the trace file is created in); ignored when NULL
+*
+* \return	NONE
+*
+* \note	Only acts when ENABLE_TRACE_FILE is defined and WelsLogDefault is the active sink.
+*			An already open trace file is closed first. When pCurPath plus the trace file
+*			name does not fit into MAX_FNAME_LEN characters no new file is opened.
+*************************************************************************************
+*/
+void WelsReopenTraceFile( void *pCtx, str_t *pCurPath )
+{
+#ifdef ENABLE_TRACE_FILE
+	sWelsEncCtx *pEncCtx	= (sWelsEncCtx *)pCtx;
+	if ( NULL == pEncCtx || NULL == pCurPath )
+		return;
+	if (wlog == WelsLogDefault)
+	{
+		str_t strTraceFile[MAX_FNAME_LEN] = {0};
+		// length of the appended name; identical for the '/' and the '\\' variant
+		const int32_t kiNameLen = (int32_t)sizeof("/wels_encoder_trace.txt") - 1;
+		int32_t len = 0;
+		if (pEncCtx->pFileLog != NULL)
+		{
+			fclose(pEncCtx->pFileLog);
+			pEncCtx->pFileLog = NULL;
+		}
+		pEncCtx->uiSizeLog	= 0;
+		len = STRNLEN( pCurPath, MAX_FNAME_LEN-1 );	// confirmed_safe_unsafe_usage
+		// path, file name and terminator have to fit into strTraceFile
+		if (len + kiNameLen >= MAX_FNAME_LEN)
+			return;
+		STRNCPY(strTraceFile, MAX_FNAME_LEN, pCurPath, len);	// confirmed_safe_unsafe_usage
+		// append at the end of the copied path with the space that is left behind it,
+		// the same calling shape WelsLogDefault uses for STRCAT
+#ifdef __GNUC__		
+		STRCAT(&strTraceFile[len], MAX_FNAME_LEN-len, "/wels_encoder_trace.txt");	// confirmed_safe_unsafe_usage
+		pEncCtx->pFileLog	= FOPEN(strTraceFile, "wt+");	// confirmed_safe_unsafe_usage
+#elif WIN32
+		STRCAT(&strTraceFile[len], MAX_FNAME_LEN-len, "\\wels_encoder_trace.txt");// confirmed_safe_unsafe_usage
+#if _MSC_VER >= 1500
+		FOPEN(&pEncCtx->pFileLog, strTraceFile, "wt+");	// confirmed_safe_unsafe_usage
+#else
+		pEncCtx->pFileLog	= FOPEN(strTraceFile, "wt+");	// confirmed_safe_unsafe_usage
+#endif//_MSC_VER>=1500
+#else		
+#endif//__GNUC__
+	}
+#endif//ENABLE_TRACE_FILE
+}
+
+/*! 
+ *************************************************************************************
+ * \brief	set log iLevel from external call
+ *
+ * \param	kiLevel	bit mask of log levels to enable; only WELS_LOG_ERROR, WELS_LOG_WARNING,
+ *					WELS_LOG_INFO and WELS_LOG_DEBUG are taken over, anything else is dropped
+ *
+ * \return	NONE
+ *
+ * \note	can be able to control log iLevel dynamically
+ *************************************************************************************
+ */
+void WelsSetLogLevel( const int32_t kiLevel )
+{
+	// the levels that may be switched on from outside
+	static const int32_t kiAcceptedLevels[4] =
+	{
+		WELS_LOG_ERROR, WELS_LOG_WARNING, WELS_LOG_INFO, WELS_LOG_DEBUG
+	};
+	iWelsLogLevel iEnabled = 0;
+	int32_t iIdx;
+
+	for ( iIdx = 0; iIdx < 4; ++ iIdx )
+	{
+		if ( kiLevel & kiAcceptedLevels[iIdx] )
+		{
+			iEnabled |= kiAcceptedLevels[iIdx];
+		}
+	}
+	g_iLevelLog	= iEnabled;
+}
+
+/*! 
+ *************************************************************************************
+ * \brief	get log iLevel from external call
+ *
+ * \param	N/A
+ *
+ * \return	current value of g_iLevelLog, the level mask used inside the codec
+ *
+ * \note	counterpart of WelsSetLogLevel()
+ *************************************************************************************
+ */
+int32_t WelsGetLogLevel( void )
+{
+	const int32_t kiCurLevel = g_iLevelLog;
+	return kiCurLevel;
+}
+
+/*! 
+ *************************************************************************************
+ * \brief	set log callback from external call
+ *
+ * \param	_log	log function routine; NULL selects the built-in WelsLogNil sink
+ *
+ * \return	NONE
+ *
+ * \note	WelsLogCall() invokes the installed routine without a NULL test, so a NULL
+ *			routine is never stored
+ *************************************************************************************
+ */
+void WelsSetLogCallback( PWelsLogCallbackFunc _log )
+{
+	if ( NULL == _log )
+	{
+		// keep wlog callable: fall back to the sink that discards everything
+		wlog	= WelsLogNil;
+		return;
+	}
+	wlog	= _log;
+}
+
+/*!
+ * \brief	forward an already started argument list to the installed log routine (wlog)
+ *
+ * \param	pCtx	instance pointer handed through to the routine
+ * \param	iLevel	log level handed through to the routine
+ * \param	kpFmt	printf-style format string
+ * \param	vl		arguments for kpFmt
+ */
+void WelsLogCall(void *pCtx, int32_t iLevel, const str_t *kpFmt, va_list vl)
+{
+	(*wlog)(pCtx, iLevel, kpFmt, vl);
+}
+
+/*!
+ * \brief	variadic log entry point, packs its arguments and passes them to WelsLogCall()
+ *
+ * \param	pCtx	instance pointer
+ * \param	iLevel	log level of this message
+ * \param	kpFmt	printf-style format string, followed by its arguments
+ */
+void WelsLog(void *pCtx, int32_t iLevel, const str_t *kpFmt, ...)
+{
+	va_list vlArgs;
+
+	va_start(vlArgs, kpFmt);
+	WelsLogCall(pCtx, iLevel, kpFmt, vlArgs);
+	va_end(vlArgs);
+}
+
+#ifndef CALC_PSNR
+// 10 / ln(10): turns the natural logarithm below into 10 * log10
+#define CONST_FACTOR_PSNR	(10.0 / log(10.0))	// for good computation
+// PSNR of a w x h picture whose sum of squared errors is s (65025 = 255 * 255);
+// every argument is parenthesized and s is really used instead of a caller-local name
+#define CALC_PSNR(w, h, s)	((real32_t)(CONST_FACTOR_PSNR * log( 65025.0 * (w) * (h) / (s) )))
+#endif//CALC_PSNR
+
+/*
+ *	PSNR calculation routines
+ */
+/*! 
+ *************************************************************************************
+ * \brief	PSNR calculation utilization in Wels
+ *
+ * \param	kpTarPic	target picture samples, one byte per sample
+ * \param	kiTarStride	stride of the target picture buffer
+ * \param 	kpRefPic	base referencing picture samples, one byte per sample
+ * \param	kiRefStride	stride of the reference picture buffer
+ * \param	kiWidth		picture width in pixel
+ * \param	kiHeight	picture height in pixel
+ *
+ * \return	actual PSNR result; -1.0 when a picture pointer is NULL, 99.99 when the
+ *			sum of squared errors is zero
+ *
+ * \note	N/A
+ *************************************************************************************
+ */
+real32_t WelsCalcPsnr(	const void *kpTarPic,
+							const int32_t kiTarStride,
+							const void *kpRefPic,
+							const int32_t kiRefStride,
+							const int32_t kiWidth,
+							const int32_t kiHeight )
+{
+	uint8_t *pTarBase = (uint8_t *)kpTarPic;
+	uint8_t *pRefBase = (uint8_t *)kpRefPic;
+	int64_t	iSqe = 0;	// sum of squared errors over the whole picture
+	int32_t iRow, iCol;
+
+	if ( NULL == pTarBase || NULL == pRefBase )
+		return (-1.0f);
+
+	for ( iRow = 0; iRow < kiHeight; ++ iRow )
+	{
+		const uint8_t *kpTarRow = &pTarBase[iRow*kiTarStride];
+		const uint8_t *kpRefRow = &pRefBase[iRow*kiRefStride];
+
+		for ( iCol = 0; iCol < kiWidth; ++ iCol )
+		{
+			const int32_t kiDiff = kpTarRow[iCol] - kpRefRow[iCol];
+			iSqe	+= kiDiff * kiDiff;
+		}
+	}
+
+	// identical pictures: report the cap instead of dividing by zero
+	if ( 0 == iSqe )
+		return (99.99f);
+
+	return CALC_PSNR( kiWidth, kiHeight, iSqe );
+}
+
+
+}
--- /dev/null
+++ b/codec/encoder/core/src/wels_preprocess.cpp
@@ -1,0 +1,1126 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#if defined(WIN32)
+#include <windows.h>
+#elif defined(MACOS)
+#include "bundleloader.h"
+#elif defined(__GNUC__)
+#include <dlfcn.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "wels_preprocess.h"
+#include "memory_align.h"
+#include "encoder.h"
+#include "extern.h"
+#include "picture_handle.h"
+#include "encoder_context.h"
+#include "utils.h"
+
+
+namespace WelsSVCEnc {
+
+#define WelsSafeDelete(p) if(p){ delete (p); (p) = NULL; }
+
+//***** entry API declaration ************************************************************************//
+// Signatures of the two entry points looked up by name ("CreateVpInterface" / "DestroyVpInterface")
+// in the dynamically loaded library. CWelsLib calls both with WELSVP_INTERFACE_VERION as the int argument.
+typedef EResult (WELSAPI *pfnCreateVpInterface)  (void **, int );
+typedef EResult (WELSAPI *pfnDestroyVpInterface) (void * , int );
+
+// Helpers defined further down in this file.
+// WelsInitScaledPic: allocates pScaledPic->pScaledInputPicture when JudgeNeedOfScaling() says so; returns 0, or -1 on allocation failure.
+int32_t WelsInitScaledPic( SWelsSvcCodingParam *pParam,  Scaled_Picture  *pScaledPic, CMemoryAlign *pMemoryAlign );
+// JudgeNeedOfScaling: fills the per-layer scaled sizes and reports whether the input picture must be downsampled for the highest layer.
+bool_t  JudgeNeedOfScaling( SWelsSvcCodingParam *pParam, Scaled_Picture * pScaledPic );
+// FreeScaledPic: releases pScaledPic->pScaledInputPicture if it is set.
+void    FreeScaledPic( Scaled_Picture  *pScaledPic, CMemoryAlign *pMemoryAlign );
+
+//******* table definition ***********************************************************************//
+// Looked up in WelsPreprocessStep3 as [iDecompStages][iCodingIndex & (uiGopSize-1)]; the value is
+// used there as the second index into pSpatialPic[kiDidx][] to pick the reference picture.
+// The trailing number on each row is the first index (row number); unlisted entries are zero-initialised.
+const uint8_t g_kuiRefTemporalIdx[MAX_TEMPORAL_LEVEL][MAX_GOP_SIZE] =
+{
+	{  0, }, // 0
+	{  0,  0, }, // 1
+	{  0,  0,  0,  1, }, // 2
+	{  0,  0,  0,  2,  0,  1,  1,  2, }, // 3
+	{  0,  0,  0,  3,  0,  2,  2,  3,  0,  1,  1,  3,  1,  2,  2,  3 }  // 4
+};
+
+// Value written to SPixMap::iSizeInBits by every pixel-map setup in this file: the bit size of one uint8_t sample.
+const int32_t g_kiPixMapSizeInBits = sizeof(uint8_t) * 8;
+
+
+/*!
+ * \brief	Record which picture and which dependency layer occupy slot iPos of the encoder's spatial index map
+ *
+ * \param	pEncCtx	encoder context owning sSpatialIndexMap
+ * \param	iPos	slot to fill
+ * \param	pPic	picture stored as the slot's pSrc
+ * \param	iDidx	dependency layer index stored as the slot's iDid
+ */
+inline  void   WelsUpdateSpatialIdxMap(sWelsEncCtx * pEncCtx, int32_t iPos, SPicture * pPic, int32_t iDidx)
+{
+	pEncCtx->sSpatialIndexMap[iPos].iDid	= iDidx;
+	pEncCtx->sSpatialIndexMap[iPos].pSrc	= pPic;
+}
+
+
+//***************************************************************************************************//
+/*!
+ * \brief	Load the welsvp shared library for the current platform
+ *
+ * \param	pEncCtx	context passed to WelsLog() when loading fails (WIN32 / MACOS branches)
+ *
+ * \note	The loaded handle (NULL on failure) is kept in m_pVpLib; both m_pInterface slots start as NULL.
+ *			Handles are pointers, so diagnostics print them with %p. The previous %x consumed an
+ *			int-sized argument and, in the dlopen branch, desynchronised the following %s.
+ */
+CWelsLib::CWelsLib(void *pEncCtx)
+{
+	m_pInterface[0] = m_pInterface[1] = NULL;
+#if defined(WIN32)
+	const str_t WelsVPLib[] = "welsvp.dll";
+	HMODULE shModule = LoadLibrary(WelsVPLib);
+	if(!shModule)
+		WelsLog( pEncCtx, WELS_LOG_ERROR, "welsvp load lib dynamic failed module=%p\n", (void *)shModule );
+
+#elif defined(MACOS)
+	const str_t WelsVPLib[] = "welsvp.bundle";
+	str_t pCurPath[256];
+	// the bundle is looked up next to the current module
+	GetCurrentModulePath(pCurPath, 256);
+	strlcat(pCurPath, WelsVPLib, 256);
+	CFBundleRef shModule = LoadBundle(pCurPath);
+	if(!shModule)
+		WelsLog( pEncCtx, WELS_LOG_ERROR, "welsvp load lib dynamic failed module=%p\n", (void *)shModule );
+
+#elif defined(__GNUC__)
+	const str_t WelsVPLib[] = "./libwelsvp.so";
+	void* shModule = NULL;
+	shModule = dlopen(WelsVPLib, RTLD_LAZY);
+	if (shModule == NULL)
+		printf("dlopen %s iRet=%p, err=%s\n", WelsVPLib, shModule, dlerror());
+#endif
+
+	m_pVpLib = (void *)shModule;
+}
+
+/*!
+ * \brief	Unload the shared library loaded by the constructor, if any
+ */
+CWelsLib::~CWelsLib()
+{
+	// nothing was loaded, nothing to release
+	if (!m_pVpLib)
+		return;
+
+#if defined(WIN32)
+	FreeLibrary((HMODULE)m_pVpLib);
+
+#elif defined(MACOS)
+	FreeBundle((CFBundleRef)m_pVpLib);
+
+#elif defined(__GNUC__)
+	dlclose(m_pVpLib);
+#endif
+
+	m_pVpLib = NULL;
+}
+
+/*!
+ * \brief	Look up an exported symbol in the loaded library
+ *
+ * \param	pName	name of the symbol to resolve
+ *
+ * \return	address of the symbol; NULL when no library is loaded or the lookup fails
+ */
+void* CWelsLib::QueryFunction(const str_t *pName)
+{
+	void *pFunc = NULL;
+	if (m_pVpLib)
+	{
+#if defined(WIN32)
+		HMODULE shModule = (HMODULE)m_pVpLib;
+		pFunc = (void *)GetProcAddress(shModule, pName);
+
+#elif defined(MACOS)
+		CFBundleRef shModule = (CFBundleRef)m_pVpLib;
+		pFunc = (void *)GetProcessAddress(shModule, pName);
+
+#elif defined(__GNUC__)
+		void* shModule = m_pVpLib;
+		pFunc = (void *)dlsym(shModule, pName);
+		if (pFunc == NULL)
+		{
+			// %s must receive the symbol name; the library handle is not a C string
+			printf("dlsym %s iRet=%p, err=%s\n", pName, pFunc, dlerror());
+		}
+#endif
+	}
+	return pFunc;
+}
+
+/*!
+ * \brief	Resolve the library's create/destroy entry points and create an interface instance
+ *
+ * \param	ppEncCtx	receives the created interface
+ *
+ * \return	0 when the factory was called and *ppEncCtx is non-NULL afterwards; 1 otherwise
+ *			(NULL argument, library not loaded, or an entry point missing)
+ *
+ * \note	Both resolved entry points are cached in m_pInterface[] for DestroyIface().
+ *			The EResult returned by the factory is not inspected; success is judged from *ppEncCtx,
+ *			so callers should pass a pointer initialised to NULL.
+ */
+int32_t CWelsLib::CreateIface(void **ppEncCtx)
+{
+	// the old code tested ppEncCtx only after use, which made the result always 0
+	if (NULL == ppEncCtx || NULL == m_pVpLib)
+		return 1;
+
+	pfnCreateVpInterface  pCreateVpInterface  = (pfnCreateVpInterface)  QueryFunction("CreateVpInterface");
+	pfnDestroyVpInterface pDestroyVpInterface = (pfnDestroyVpInterface) QueryFunction("DestroyVpInterface");
+
+	m_pInterface[0] = (void *)pCreateVpInterface;
+	m_pInterface[1] = (void *)pDestroyVpInterface;
+
+	// only create an instance that can also be destroyed later
+	if (NULL == m_pInterface[0] || NULL == m_pInterface[1])
+		return 1;
+
+	pCreateVpInterface(ppEncCtx, WELSVP_INTERFACE_VERION);
+
+	return (NULL != *ppEncCtx) ? 0 : 1;
+}
+
+/*!
+ * \brief	Destroy an interface instance through the cached destroy entry point
+ *
+ * \param	pEncCtx	interface instance to destroy; NULL is ignored
+ *
+ * \return	always 0
+ */
+int32_t CWelsLib::DestroyIface(void *pEncCtx)
+{
+	pfnDestroyVpInterface pfnDestroy = (pfnDestroyVpInterface) m_pInterface[1];
+
+	// both an instance and a resolved entry point are required
+	if (pEncCtx && pfnDestroy)
+		pfnDestroy(pEncCtx, WELSVP_INTERFACE_VERION);
+
+	return 0;
+}
+
+/***************************************************************************
+*	
+*	implement of the interface
+*	
+***************************************************************************/
+
+/*!
+ * \brief	Set up an idle pre-processor bound to an encoder context
+ *
+ * \param	pEncCtx	encoder context remembered in m_pEncCtx
+ *
+ * \note	No library is loaded here; that happens in WelsPreprocessCreate().
+ */
+CWelsPreProcess::CWelsPreProcess(void *pEncCtx)
+{
+	// no scaled input picture is held yet
+	memset(&m_sScaledPicture, 0, sizeof(m_sScaledPicture));
+
+	m_pEncCtx			= pEncCtx;
+	m_pEncLib			= NULL;
+	m_pInterfaceVp		= NULL;
+	m_bOfficialBranch	= FALSE;
+	m_bInitDone			= false;
+}
+
+/*!
+ * \brief	Release the scaled input picture and tear down the library interface
+ */
+CWelsPreProcess::~CWelsPreProcess()
+{
+	sWelsEncCtx *pEncCtx = static_cast<sWelsEncCtx *>(m_pEncCtx);
+
+	// the picture is returned to the context's memory allocator
+	FreeScaledPic(&m_sScaledPicture, pEncCtx->pMemAlign);
+	WelsPreprocessDestroy();
+}
+
+/*!
+ * \brief	Load the library wrapper and create the processing interface
+ *
+ * \return	0 on success; 1 on failure, in which case WelsPreprocessDestroy() has been called
+ *
+ * \note	Calling this while a wrapper or interface already exists is treated as a failure too,
+ *			so the existing objects are destroyed.
+ */
+int32_t CWelsPreProcess::WelsPreprocessCreate()
+{
+	const bool kbAlreadyCreated = (m_pEncLib != NULL || m_pInterfaceVp != NULL);
+
+	if (!kbAlreadyCreated)
+	{
+		m_pEncLib = new CWelsLib(m_pEncCtx);
+		if (m_pEncLib)
+		{
+			m_pEncLib->CreateIface((void **)&m_pInterfaceVp);
+			if (m_pInterfaceVp)
+				return 0;
+		}
+	}
+
+	// every failure path ends here
+	WelsPreprocessDestroy();
+	return 1;
+}
+
+/*!
+ * \brief	Destroy the processing interface and the library wrapper, if a wrapper exists
+ *
+ * \return	always 0
+ */
+int32_t CWelsPreProcess::WelsPreprocessDestroy()
+{
+	if (!m_pEncLib)
+		return 0;
+
+	// the interface must be destroyed through the wrapper before the wrapper itself goes away
+	m_pEncLib->DestroyIface((void *)m_pInterfaceVp);
+	m_pInterfaceVp = NULL;
+
+	delete m_pEncLib;
+	m_pEncLib = NULL;
+
+	return 0;
+}
+
+/*!
+ * \brief	Re-initialise the per-layer "last picture" table and the scaled input picture
+ *
+ * \param	pCtx	encoder context (sWelsEncCtx *)
+ *
+ * \return	-1 when pCtx is NULL; otherwise the first non-zero result of InitLastSpatialPictures()
+ *			or WelsInitScaledPic(), or 0 when both succeed
+ */
+int32_t CWelsPreProcess::WelsPreprocessReset ( void *pCtx )
+{
+	sWelsEncCtx *pEncCtx = (sWelsEncCtx *)pCtx;
+
+	if (NULL == pEncCtx)
+		return -1;
+
+	// drop any previously allocated scaled picture before a new one is decided on
+	FreeScaledPic(&m_sScaledPicture, pEncCtx->pMemAlign);
+
+	// do not let the second step overwrite a failure of the first one
+	const int32_t kiRet = InitLastSpatialPictures(pEncCtx);
+	if (kiRet != 0)
+		return kiRet;
+
+	return WelsInitScaledPic(pEncCtx->pSvcParam, &m_sScaledPicture, pEncCtx->pMemAlign);
+}
+
+/*!
+ * \brief	First pre-processing step: lazy initialisation, IDR-period flagging and source picture intake
+ *
+ * \param	pCtx					encoder context (sWelsEncCtx *)
+ * \param	kppSrcPicList			source pictures supplied by the caller
+ * \param	kiConfiguredLayerNum	number of entries in kppSrcPicList
+ *
+ * \return	number of spatial layers to encode, or -1 on failure
+ *
+ * \note	On the first call the processing library is created and the "official branch" is chosen:
+ *			it is taken when the caller supplies a different number of pictures than configured layers,
+ *			or, for a single layer, when the supplied picture size differs from the configured one.
+ */
+int32_t CWelsPreProcess::WelsPreprocessStep1( void *pCtx, const SSourcePicture **kppSrcPicList, const int32_t kiConfiguredLayerNum )
+{
+	sWelsEncCtx *pEncCtx			= (sWelsEncCtx *)pCtx;
+	SWelsSvcCodingParam *pSvcParam	= pEncCtx->pSvcParam;
+	const int32_t kiDlayerNum		= (int32_t)pSvcParam->iNumDependencyLayer;
+
+	if (!m_bInitDone)
+	{
+		if (WelsPreprocessCreate() != 0 || WelsPreprocessReset(pEncCtx) != 0)
+			return -1;
+
+		m_bOfficialBranch = (kiDlayerNum != kiConfiguredLayerNum);
+		if ( !m_bOfficialBranch && (kiDlayerNum == 1) )
+		{
+			// check the input source size to decide if need switch to official branch
+			// NOTICE: the layernum=1 case is confused in official/non-official cases!
+			for (int32_t iLayer = 0; iLayer < kiDlayerNum; iLayer++)
+			{
+				const SDLayerParam *kpLayer	= &pSvcParam->sDependencyLayers[iLayer];
+				const SSourcePicture *kpPic	= kppSrcPicList[iLayer];
+
+				if ( kpLayer->iFrameWidth != kpPic->iPicWidth || kpLayer->iFrameHeight != kpPic->iPicHeight )
+				{
+					m_bOfficialBranch = TRUE;
+					break;
+				}
+			}
+		}
+		m_bInitDone = TRUE;
+	}
+
+	if (m_pInterfaceVp == NULL)
+		return -1;
+
+	if ( kiConfiguredLayerNum <= 0 )
+		return -1;
+
+	// per-frame flags start cleared; the IDR flag is raised once the intra period is reached
+	pEncCtx->pVaa->bIdrPeriodFlag	= false;
+	pEncCtx->pVaa->bSceneChangeFlag	= false;
+	if( pSvcParam->uiIntraPeriod )
+		pEncCtx->pVaa->bIdrPeriodFlag = ( 1 + pEncCtx->iFrameIndex >= (int32_t)pSvcParam->uiIntraPeriod ) ? true : false;
+
+	if ( !m_bOfficialBranch )
+	{
+		// for console each spatial pictures are available there
+		MultiLayerPreprocess( pEncCtx, kppSrcPicList, kiConfiguredLayerNum );
+		return kiConfiguredLayerNum;
+	}
+
+	// Perform Down Sampling potentially due to application
+	assert( kiConfiguredLayerNum == 1 );
+	return SingleLayerPreprocess( pEncCtx, kppSrcPicList[0], &m_sScaledPicture );
+}
+
+/*!
+ * \brief	Per-layer analysis step: VAA statistics, background detection, adaptive quantisation input
+ *			and picture complexity, followed by rotation of the layer's "last picture" pair
+ *
+ * \param	pCtx	encoder context (sWelsEncCtx *)
+ * \param	kiDidx	dependency (spatial) layer index to analyse
+ *
+ * \return	always 0
+ */
+int32_t CWelsPreProcess::WelsPreprocessStep3( void *pCtx, const int32_t kiDidx )
+{
+	sWelsEncCtx *pEncCtx			= (sWelsEncCtx *)pCtx;
+	SWelsSvcCodingParam *pSvcParam	= pEncCtx->pSvcParam;
+	bool_t bNeededMbAq		= (pSvcParam->bEnableAdaptiveQuant && (pEncCtx->eSliceType == P_SLICE));
+	bool_t bCalculateBGD	= (pEncCtx->eSliceType == P_SLICE && pSvcParam->bEnableBackgroundDetection);
+
+	// current picture: last slot in use for this layer
+	const int32_t kiCurIdx = pEncCtx->uiSpatialLayersInTemporal[kiDidx] - 1;
+
+	// reference picture: table driven, unless a long term picture has to be used instead
+	int32_t iRefIdx = (int32_t)g_kuiRefTemporalIdx[pSvcParam->iDecompStages][pEncCtx->iCodingIndex & (pSvcParam->uiGopSize-1)];
+	if ( pEncCtx->uiTemporalId == 0 && pEncCtx->pLtr[pEncCtx->uiDependencyId].bReceivedT0LostFlag )
+		iRefIdx = pEncCtx->uiSpatialLayersInTemporal[kiDidx] + pEncCtx->pVaa->uiValidLongTermPicIdx;
+
+	SPicture *pCurPic = pEncCtx->pSpatialPic[kiDidx][kiCurIdx];
+	SPicture *pRefPic = pEncCtx->pSpatialPic[kiDidx][iRefIdx];
+
+	SPicture *pLastPic		= m_pLastSpatialPicture[kiDidx][0];
+	// squared differences are only requested when the reference shares its luma plane with the last picture
+	bool_t bCalculateSQDiff	= ((pLastPic->pData[0] == pRefPic->pData[0]) && bNeededMbAq);
+	bool_t bCalculateVar	= (pSvcParam->iRCMode == RC_MODE1 && pEncCtx->eSliceType == I_SLICE);
+
+	VaaCalculation( pEncCtx->pVaa, pCurPic, pRefPic, bCalculateSQDiff, bCalculateVar, bCalculateBGD);
+
+	if (pSvcParam->bEnableBackgroundDetection)
+		BackgroundDetection(pEncCtx->pVaa, pCurPic, pRefPic, bCalculateBGD && pRefPic->iPictureType != I_SLICE);
+
+	if ( bNeededMbAq )
+	{
+		// adaptive quantisation works on the layer's last picture pair, not on pCurPic / pRefPic
+		SPicture *pAqCurPic = m_pLastSpatialPicture[kiDidx][1];
+		SPicture *pAqRefPic = m_pLastSpatialPicture[kiDidx][0];
+
+		AdaptiveQuantCalculation( pEncCtx->pVaa, pAqCurPic, pAqRefPic );
+	}
+
+	if ( pSvcParam->bEnableRc )
+		AnalyzePictureComplexity( pEncCtx, pCurPic, pRefPic, kiDidx, bCalculateBGD );
+
+	WelsExchangeSpatialPictures( &m_pLastSpatialPicture[kiDidx][1], &m_pLastSpatialPicture[kiDidx][0] );
+
+	return 0;
+}
+
+
+/*
+*	SingleLayerPreprocess: down sampling if applicable
+*
+*	Copies the single source picture into the encoder, optionally denoises it, then produces the
+*	picture of every dependency layer (highest layer first) by downsampling + padding, and records
+*	each produced layer in the spatial index map.
+*
+*  @param pCtx            encoder context (sWelsEncCtx *)
+*  @param kpSrc           the one source picture supplied by the caller
+*  @param pScaledPicture  scaled sizes per layer and, if allocated, the intermediate input picture
+*  @return:	exact number of spatial layers need to encoder indeed; -1 if the source picture is
+*           missing while a lower layer has to be generated
+*/
+int32_t CWelsPreProcess::SingleLayerPreprocess( void *pCtx, const SSourcePicture *kpSrc, Scaled_Picture * pScaledPicture )
+{
+	sWelsEncCtx *pEncCtx = (sWelsEncCtx *)pCtx;
+	SWelsSvcCodingParam *pSvcParam    = pEncCtx->pSvcParam;	
+	// start at the highest dependency layer and walk downwards
+	int8_t	iDependencyId			= pSvcParam->iNumDependencyLayer - 1;
+	int32_t iPicturePos	                    = pEncCtx->uiSpatialLayersInTemporal[iDependencyId] - 1;
+
+	SPicture *pSrcPic					= NULL;	// large
+	SPicture *pDstPic					= NULL;	// small
+	SDLayerParam *pDlayerParam					= NULL;
+	int32_t iSpatialNum					= 0;
+	int32_t iSrcWidth					= 0;
+	int32_t iSrcHeight					= 0;
+	int32_t iTargetWidth					= 0;
+	int32_t iTargetHeight					= 0;		
+	int32_t iTemporalId = 0;
+	int32_t iActualSpatialLayerNum      = 0;
+
+	pDlayerParam = &pSvcParam->sDependencyLayers[iDependencyId];
+	iTargetWidth	  = pDlayerParam->iFrameWidth;
+	iTargetHeight  = pDlayerParam->iFrameHeight;	
+	iTemporalId    = pDlayerParam->uiCodingIdx2TemporalId[pEncCtx->iCodingIndex & (pSvcParam->uiGopSize-1)];	
+	iSrcWidth   = pSvcParam->SUsedPicRect.iWidth;
+	iSrcHeight  = pSvcParam->SUsedPicRect.iHeight;
+	
+	// the input lands in the intermediate picture when one was allocated, otherwise directly in the highest layer's picture
+	pSrcPic = pScaledPicture->pScaledInputPicture ? pScaledPicture->pScaledInputPicture : pEncCtx->pSpatialPic[iDependencyId][iPicturePos];
+
+	WelsMoveMemoryWrapper( pSvcParam, pSrcPic, kpSrc, iSrcWidth, iSrcHeight );
+
+	if( pSvcParam->bEnableDenoise )
+		BilateralDenoising(pSrcPic, iSrcWidth, iSrcHeight);
+
+	// different scaling in between input picture and dst highest spatial picture. 
+	int32_t iShrinkWidth  = iSrcWidth;
+	int32_t iShrinkHeight = iSrcHeight;
+	pDstPic = pSrcPic;
+	if ( pScaledPicture->pScaledInputPicture )
+	{	
+		// for highest downsampling				
+		pDstPic		= pEncCtx->pSpatialPic[iDependencyId][iPicturePos];			
+		iShrinkWidth = pScaledPicture->iScaledWidth[iDependencyId];
+		iShrinkHeight = pScaledPicture->iScaledHeight[iDependencyId];			
+	}
+	// with equal source and shrink sizes this only pads (see DownsamplePadding)
+	DownsamplePadding(pSrcPic, pDstPic, iSrcWidth, iSrcHeight, iShrinkWidth, iShrinkHeight, iTargetWidth, iTargetHeight);	
+
+	// scene change detection runs only when enabled, outside an IDR period, and when
+	// iCodingIndex is a multiple of the GOP size (masked index == 0)
+	if(pSvcParam->bEnableSceneChangeDetect && !pEncCtx->pVaa->bIdrPeriodFlag && !(pEncCtx->iCodingIndex & (pSvcParam->uiGopSize-1))){
+		SPicture *pRefPic = pEncCtx->pLtr[iDependencyId].bReceivedT0LostFlag ? 
+			pEncCtx->pSpatialPic[iDependencyId][pEncCtx->uiSpatialLayersInTemporal[iDependencyId] + pEncCtx->pVaa->uiValidLongTermPicIdx] : m_pLastSpatialPicture[iDependencyId][0];
+
+		pEncCtx->pVaa->bSceneChangeFlag = DetectSceneChange(pDstPic, pRefPic);		
+	}
+
+	// count the layers that carry a valid temporal id for this coding index
+	for( int32_t i=0;i<pSvcParam->iNumDependencyLayer;i++ ){
+		if( pSvcParam->sDependencyLayers[i].uiCodingIdx2TemporalId[pEncCtx->iCodingIndex & (pSvcParam->uiGopSize-1)]
+			!= INVALID_TEMPORAL_ID ){
+			++ iActualSpatialLayerNum;
+		}
+	}
+
+	// the index map is filled from its last used slot downwards, highest layer first
+	if ( iTemporalId != INVALID_TEMPORAL_ID )
+	{
+		WelsUpdateSpatialIdxMap(pEncCtx, iActualSpatialLayerNum - 1, pDstPic, iDependencyId);	
+		++ iSpatialNum;
+		-- iActualSpatialLayerNum;
+	}	
+
+	m_pLastSpatialPicture[iDependencyId][1]	= pEncCtx->pSpatialPic[iDependencyId][iPicturePos];	
+	-- iDependencyId;
+
+	// generate other spatial layers
+	// pSrc is 
+	//	-- padded input pic, if downsample should be applied to generate highest layer, [if] block above
+	//	-- highest layer, if no downsampling, [else] block above
+	if ( pSvcParam->iNumDependencyLayer > 1 )
+	{
+		while (iDependencyId >= 0) 
+		{
+			pDlayerParam			= &pSvcParam->sDependencyLayers[iDependencyId];
+			iTargetWidth	= pDlayerParam->iFrameWidth;
+			iTargetHeight	= pDlayerParam->iFrameHeight;					
+			iTemporalId = pDlayerParam->uiCodingIdx2TemporalId[pEncCtx->iCodingIndex & (pSvcParam->uiGopSize-1)];
+			iPicturePos		= pEncCtx->uiSpatialLayersInTemporal[iDependencyId] - 1;
+
+			// NOT work for CGS, FIXME
+			// spatial layer is able to encode indeed
+			if ( (iTemporalId != INVALID_TEMPORAL_ID) )
+			{ 
+				// down sampling performed
+				if( NULL == pSrcPic )
+					return -1;
+
+				// every lower layer is derived from the same source picture, not from the layer above
+				pDstPic	= pEncCtx->pSpatialPic[iDependencyId][iPicturePos];	// small
+				iShrinkWidth = pScaledPicture->iScaledWidth[iDependencyId];
+				iShrinkHeight = pScaledPicture->iScaledHeight[iDependencyId];
+				DownsamplePadding(pSrcPic, pDstPic, iSrcWidth, iSrcHeight, iShrinkWidth, iShrinkHeight, iTargetWidth, iTargetHeight);
+
+				WelsUpdateSpatialIdxMap(pEncCtx, iActualSpatialLayerNum - 1, pDstPic, iDependencyId);				
+
+				-- iActualSpatialLayerNum;
+				++ iSpatialNum;				
+
+				m_pLastSpatialPicture[iDependencyId][1]	= pEncCtx->pSpatialPic[iDependencyId][iPicturePos];	
+			}
+			-- iDependencyId;
+		}		
+	}
+
+	return iSpatialNum;
+}
+
+/*!
+ * \brief	Take over one caller supplied source picture per spatial layer
+ *
+ * Each source picture is matched, by size, to the next configured dependency layer, copied into that
+ * layer's current picture, optionally denoised and registered in the spatial index map.
+ *
+ * \param	pCtx			encoder context (sWelsEncCtx *)
+ * \param	kppSrcPicList	source pictures; layers are searched in ascending order without rewinding,
+ *							so the list has to follow the order of the configured layers
+ * \param	kiSpatialNum	number of entries in kppSrcPicList
+ *
+ * \return	0 on success; -1 when a source picture matches none of the remaining configured layers
+ *			(pictures handled before the mismatch stay registered)
+ */
+int32_t CWelsPreProcess::MultiLayerPreprocess( void *pCtx, const SSourcePicture **kppSrcPicList, const int32_t kiSpatialNum )
+{
+	sWelsEncCtx *pEncCtx = (sWelsEncCtx *)pCtx;
+	SWelsSvcCodingParam *pSvcParam	= pEncCtx->pSvcParam;
+	const SSourcePicture *pSrc			= NULL;
+	SPicture *pDstPic						= NULL;
+	const int32_t iSpatialLayersCfgCount = pSvcParam->iNumDependencyLayer;	// count number of spatial layers to be encoded in cfg
+	int32_t i							= 0;
+	int32_t j							= -1;
+
+	do {
+		pSrc	= kppSrcPicList[i];
+
+		// do not clear j, just let it continue to save complexity
+		// the bound is tested before the layer is read, so no entry past the configured layers is touched
+		for ( ++ j; j < iSpatialLayersCfgCount; ++ j )
+		{
+			if ( pSvcParam->sDependencyLayers[j].iFrameWidth == pSrc->iPicWidth &&
+				pSvcParam->sDependencyLayers[j].iFrameHeight== pSrc->iPicHeight )
+			{
+				break;
+			}
+		}
+
+		assert( j < iSpatialLayersCfgCount );
+		// release builds compile the assert away: refuse to index with an out-of-range layer
+		if ( j >= iSpatialLayersCfgCount )
+			return -1;
+
+		pDstPic = pEncCtx->pSpatialPic[j][pEncCtx->uiSpatialLayersInTemporal[j]-1];
+
+		WelsUpdateSpatialIdxMap(pEncCtx, i, pDstPic, j);
+
+		WelsMoveMemoryWrapper( pSvcParam, pDstPic, pSrc, pSrc->iPicWidth, pSrc->iPicHeight );
+
+		if(pSvcParam->bEnableDenoise)
+			BilateralDenoising(pDstPic, pSrc->iPicWidth, pSrc->iPicHeight);
+
+		m_pLastSpatialPicture[j][1]	= pDstPic;
+		++ i;
+	} while( i < kiSpatialNum );
+
+	if( pSvcParam->bEnableSceneChangeDetect && (kiSpatialNum == pSvcParam->iNumDependencyLayer) && !pEncCtx->pVaa->bIdrPeriodFlag )
+	{
+		// NOTE(review): pDstPic is the picture of the last processed layer while the reference is taken
+		// from layer 0 -- confirm this pairing is intended when more than one layer is configured.
+		SPicture *pRef = pEncCtx->pLtr[0].bReceivedT0LostFlag ?
+			pEncCtx->pSpatialPic[0][pEncCtx->uiSpatialLayersInTemporal[0] + pEncCtx->pVaa->uiValidLongTermPicIdx] : m_pLastSpatialPicture[0][0];
+
+		pEncCtx->pVaa->bSceneChangeFlag = DetectSceneChange(pDstPic, pRef);
+	}
+
+	return 0;
+}
+
+/*!
+ * \brief	Whether input picture need be scaled?
+ *
+ * Also fills pScaledPicture->iScaledWidth / iScaledHeight for every layer that has to be produced by
+ * scaling: the layer's actual size reduced along one axis so the input aspect ratio is kept.
+ *
+ * \param	pParam			coding parameters (input rectangle and per layer actual sizes)
+ * \param	pScaledPicture	receives the scaled sizes per dependency layer
+ *
+ * \return	false when the highest layer is at least as large as the input in both dimensions
+ *			(its scaled size is then left untouched), true otherwise
+ */
+bool_t JudgeNeedOfScaling( SWelsSvcCodingParam *pParam, Scaled_Picture * pScaledPicture )
+{
+	const int32_t kiSrcWidth	= pParam->SUsedPicRect.iWidth;
+	const int32_t kiSrcHeight	= pParam->SUsedPicRect.iHeight;
+	const int32_t kiTopIdx		= pParam->iNumDependencyLayer-1;
+	const int32_t kiTopWidth	= pParam->sDependencyLayers[kiTopIdx].iActualWidth;
+	const int32_t kiTopHeight	= pParam->sDependencyLayers[kiTopIdx].iActualHeight;
+
+	bool_t bNeedDownsampling	= true;
+	int32_t iFirstIdx			= kiTopIdx;
+
+	if ( kiTopWidth >= kiSrcWidth && kiTopHeight >= kiSrcHeight )
+	{
+		// highest D layer do not need downsampling
+		bNeedDownsampling = false;
+		iFirstIdx = kiTopIdx - 1;
+	}
+
+	for (int32_t iDid = iFirstIdx; iDid >= 0; -- iDid)
+	{
+		const SDLayerParam *kpLayer	= &pParam->sDependencyLayers[iDid];
+		const int32_t kiDstWidth	= kpLayer->iActualWidth;
+		const int32_t kiDstHeight	= kpLayer->iActualHeight;
+		// cross products compare the two aspect ratios without dividing
+		const int32_t kiSrcWxDstH	= kiSrcWidth * kiDstHeight;
+		const int32_t kiSrcHxDstW	= kiSrcHeight * kiDstWidth;
+
+		if (kiSrcWxDstH > kiSrcHxDstW)
+		{
+			// full destination width, reduced height
+			pScaledPicture->iScaledWidth[iDid]	= kiDstWidth;
+			pScaledPicture->iScaledHeight[iDid]	= kiSrcHxDstW / kiSrcWidth;
+		}
+		else
+		{
+			// full destination height, reduced (or equal) width
+			pScaledPicture->iScaledWidth[iDid]	= kiSrcWxDstH / kiSrcHeight;
+			pScaledPicture->iScaledHeight[iDid]	= kiDstHeight;
+		}
+	}
+
+	return bNeedDownsampling;
+}
+
+/*!
+ * \brief	Compute the per layer scaled sizes and allocate the intermediate input picture if it is needed
+ *
+ * \return	0 on success (including "no scaling needed"), -1 when the picture cannot be allocated
+ */
+int32_t  WelsInitScaledPic( SWelsSvcCodingParam *pParam,  Scaled_Picture  *pScaledPicture, CMemoryAlign *pMemoryAlign )
+{
+	if( !JudgeNeedOfScaling( pParam, pScaledPicture ) )
+		return 0;
+
+	// the intermediate picture has the size of the used input rectangle
+	pScaledPicture->pScaledInputPicture = AllocPicture(pMemoryAlign, pParam->SUsedPicRect.iWidth, pParam->SUsedPicRect.iHeight, false);
+
+	return ( NULL == pScaledPicture->pScaledInputPicture ) ? -1 : 0;
+}
+
+/*!
+ * \brief	Release the intermediate input picture, if one is held, and clear the pointer
+ */
+void  FreeScaledPic(Scaled_Picture  *pScaledPicture, CMemoryAlign *pMemoryAlign)
+{
+	if ( NULL == pScaledPicture->pScaledInputPicture )
+		return;
+
+	FreePicture( pMemoryAlign, &pScaledPicture->pScaledInputPicture );
+	pScaledPicture->pScaledInputPicture = NULL;
+}
+
+/*!
+ * \brief	Reset the "last picture" pair of every dependency layer
+ *
+ * For each configured layer, slot [0] is pointed at the spatial picture two positions below the
+ * layer's uiSpatialLayersInTemporal count and slot [1] is cleared; unused layers get both slots cleared.
+ *
+ * \param	pCtx	encoder context (sWelsEncCtx *)
+ *
+ * \return	always 0
+ */
+int32_t CWelsPreProcess::InitLastSpatialPictures( void *pCtx )
+{
+	sWelsEncCtx *pEncCtx	= (sWelsEncCtx *)pCtx;
+	const int32_t kiUsed	= pEncCtx->pSvcParam->iNumDependencyLayer;
+	int32_t iDid			= 0;
+
+	// configured layers
+	while (iDid < kiUsed)
+	{
+		const int32_t kiLayerInTemporal = pEncCtx->uiSpatialLayersInTemporal[iDid];
+
+		m_pLastSpatialPicture[iDid][1]	= NULL;
+		m_pLastSpatialPicture[iDid][0]	= pEncCtx->pSpatialPic[iDid][kiLayerInTemporal - 2];
+		++ iDid;
+	}
+
+	// remaining, unused layers
+	while (iDid < MAX_DEPENDENCY_LAYER)
+	{
+		m_pLastSpatialPicture[iDid][1]	= NULL;
+		m_pLastSpatialPicture[iDid][0]	= NULL;
+		++ iDid;
+	}
+
+	return 0;
+}
+//*********************************************************************************************************/
+
+/*!
+ * \brief	Colour space conversion placeholder
+ *
+ * \return	always 1; none of the arguments is used
+ */
+int32_t CWelsPreProcess::ColorspaceConvert(SWelsSvcCodingParam * pSvcParam, SPicture *pDstPic, const SSourcePicture *kpSrc, const int32_t kiWidth, const int32_t kiHeight )
+{
+	// not support yet
+	return 1;
+}
+
+/*!
+ * \brief	Run the library's METHOD_DENOISE on a picture, in place (no destination map is passed)
+ *
+ * \param	pSrc		picture whose three planes are handed to the library
+ * \param	kiWidth		width of the rectangle to process
+ * \param	kiHeight	height of the rectangle to process
+ */
+void CWelsPreProcess::BilateralDenoising ( SPicture *pSrc, const int32_t kiWidth, const int32_t kiHeight )
+{
+	SPixMap sPixMap = {0};
+
+	// describe all three planes of the picture
+	for (int32_t iPlane = 0; iPlane < 3; ++ iPlane)
+	{
+		sPixMap.pPixel[iPlane]	= pSrc->pData[iPlane];
+		sPixMap.iStride[iPlane]	= pSrc->iLineSize[iPlane];
+	}
+	sPixMap.eFormat				= VIDEO_FORMAT_I420;
+	sPixMap.iSizeInBits			= g_kiPixMapSizeInBits;
+	sPixMap.sRect.iRectWidth	= kiWidth;
+	sPixMap.sRect.iRectHeight	= kiHeight;
+
+	int32_t iMethod = METHOD_DENOISE;
+	m_pInterfaceVp->Process(iMethod, &sPixMap, NULL);
+}
+
+/*!
+ * \brief	Ask the library (METHOD_SCENE_CHANGE_DETECTION) whether the current picture starts a new scene
+ *
+ * \param	pCurPicture	current picture; only its first plane is described to the library
+ * \param	pRefPicture	reference picture; only its first plane is described to the library
+ *
+ * \return	the library's scene change flag, or false when processing does not return 0
+ */
+bool_t CWelsPreProcess::DetectSceneChange( SPicture *pCurPicture, SPicture *pRefPicture )
+{
+	int32_t iMethod				= METHOD_SCENE_CHANGE_DETECTION;
+	SSceneChangeResult sResult	= {0};
+	SPixMap sCurMap				= {0};
+	SPixMap sRefMap				= {0};
+
+	sCurMap.eFormat				= VIDEO_FORMAT_I420;
+	sCurMap.iSizeInBits			= g_kiPixMapSizeInBits;
+	sCurMap.pPixel[0]			= pCurPicture->pData[0];
+	sCurMap.iStride[0]			= pCurPicture->iLineSize[0];
+	sCurMap.sRect.iRectWidth	= pCurPicture->iWidthInPixel;
+	sCurMap.sRect.iRectHeight	= pCurPicture->iHeightInPixel;
+
+	sRefMap.eFormat				= VIDEO_FORMAT_I420;
+	sRefMap.iSizeInBits			= g_kiPixMapSizeInBits;
+	sRefMap.pPixel[0]			= pRefPicture->pData[0];
+	sRefMap.iStride[0]			= pRefPicture->iLineSize[0];
+	sRefMap.sRect.iRectWidth	= pRefPicture->iWidthInPixel;
+	sRefMap.sRect.iRectHeight	= pRefPicture->iHeightInPixel;
+
+	// the result is only fetched after a successful run
+	int32_t iRet = m_pInterfaceVp->Process(iMethod, &sCurMap, &sRefMap);
+	if (iRet != 0)
+		return false;
+
+	m_pInterfaceVp->Get(iMethod, (void*)&sResult);
+	return sResult.bSceneChangeFlag ? true : false;
+}
+
+/*!
+ * \brief	Downsample a picture when the sizes differ, then pad the result up to the target size
+ *
+ * \param	pSrc			source picture
+ * \param	pDstPic			destination picture, only written by the library when downsampling takes place
+ * \param	iSrcWidth		source width
+ * \param	iSrcHeight		source height
+ * \param	iShrinkWidth	width after downsampling
+ * \param	iShrinkHeight	height after downsampling
+ * \param	iTargetWidth	width to pad to
+ * \param	iTargetHeight	height to pad to
+ *
+ * \return	result of the library's METHOD_DOWNSAMPLE run, or 0 when no downsampling was needed
+ *
+ * \note	When source and shrink sizes are equal nothing is copied: the padding is applied to the
+ *			planes of pSrc itself.
+ */
+int32_t CWelsPreProcess::DownsamplePadding( SPicture *pSrc, SPicture *pDstPic,  int32_t iSrcWidth, int32_t iSrcHeight,
+											int32_t iShrinkWidth, int32_t iShrinkHeight, int32_t iTargetWidth, int32_t iTargetHeight )
+{
+	int32_t iRet		= 0;
+	SPixMap sSrcPixMap	= {0};
+	SPixMap sDstPicMap	= {0};
+
+	for (int32_t iPlane = 0; iPlane < 3; ++ iPlane)
+	{
+		sSrcPixMap.pPixel[iPlane]	= pSrc->pData[iPlane];
+		sSrcPixMap.iStride[iPlane]	= pSrc->iLineSize[iPlane];
+	}
+	sSrcPixMap.eFormat				= VIDEO_FORMAT_I420;
+	sSrcPixMap.iSizeInBits			= g_kiPixMapSizeInBits;
+	sSrcPixMap.sRect.iRectWidth		= iSrcWidth;
+	sSrcPixMap.sRect.iRectHeight	= iSrcHeight;
+
+	if (iSrcWidth == iShrinkWidth && iSrcHeight == iShrinkHeight)
+	{
+		// same size: the "destination" is simply the source map
+		sDstPicMap = sSrcPixMap;
+	}
+	else
+	{
+		int32_t iMethod = METHOD_DOWNSAMPLE;
+
+		for (int32_t iPlane = 0; iPlane < 3; ++ iPlane)
+		{
+			sDstPicMap.pPixel[iPlane]	= pDstPic->pData[iPlane];
+			sDstPicMap.iStride[iPlane]	= pDstPic->iLineSize[iPlane];
+		}
+		sDstPicMap.eFormat				= VIDEO_FORMAT_I420;
+		sDstPicMap.iSizeInBits			= g_kiPixMapSizeInBits;
+		sDstPicMap.sRect.iRectWidth		= iShrinkWidth;
+		sDstPicMap.sRect.iRectHeight	= iShrinkHeight;
+
+		iRet = m_pInterfaceVp->Process(iMethod, &sSrcPixMap, &sDstPicMap);
+	}
+
+	// get rid of odd line
+	const int32_t kiEvenWidth	= iShrinkWidth - (iShrinkWidth & 1);
+	const int32_t kiEvenHeight	= iShrinkHeight - (iShrinkHeight & 1);
+
+	Padding( (uint8_t *)sDstPicMap.pPixel[0], (uint8_t *)sDstPicMap.pPixel[1], (uint8_t *)sDstPicMap.pPixel[2],
+		sDstPicMap.iStride[0], sDstPicMap.iStride[1], kiEvenWidth, iTargetWidth, kiEvenHeight, iTargetHeight);
+
+	return iRet;
+}
+
+//*********************************************************************************************************/
+/*!
+ * \brief	Run the library's METHOD_VAA_STATISTICS on the first planes of a current / reference picture pair
+ *
+ * \param	pVaaInfo			receives the statistics in its sVaaCalcInfo member
+ * \param	pCurPicture			current picture
+ * \param	pRefPicture			reference picture
+ * \param	bCalculateSQDiff	forwarded as iCalcSsd
+ * \param	bCalculateVar		forwarded as iCalcVar
+ * \param	bCalculateBGD		forwarded as iCalcBgd
+ */
+void CWelsPreProcess::VaaCalculation(SVAAFrameInfo *pVaaInfo, SPicture *pCurPicture, SPicture *pRefPicture,
+                                     bool_t bCalculateSQDiff, bool_t bCalculateVar, bool_t bCalculateBGD)
+{
+	int32_t iMethod				= METHOD_VAA_STATISTICS;
+	SPixMap sCurMap				= {0};
+	SPixMap sRefMap				= {0};
+	SVAACalcParam sCalcParam	= {0};
+
+	// remember which planes the statistics belong to
+	pVaaInfo->sVaaCalcInfo.pCurY = pCurPicture->pData[0];
+	pVaaInfo->sVaaCalcInfo.pRefY = pRefPicture->pData[0];
+
+	sCurMap.eFormat				= VIDEO_FORMAT_I420;
+	sCurMap.iSizeInBits			= g_kiPixMapSizeInBits;
+	sCurMap.pPixel[0]			= pCurPicture->pData[0];
+	sCurMap.iStride[0]			= pCurPicture->iLineSize[0];
+	sCurMap.sRect.iRectWidth	= pCurPicture->iWidthInPixel;
+	sCurMap.sRect.iRectHeight	= pCurPicture->iHeightInPixel;
+
+	sRefMap.eFormat				= VIDEO_FORMAT_I420;
+	sRefMap.iSizeInBits			= g_kiPixMapSizeInBits;
+	sRefMap.pPixel[0]			= pRefPicture->pData[0];
+	sRefMap.iStride[0]			= pRefPicture->iLineSize[0];
+	sRefMap.sRect.iRectWidth	= pRefPicture->iWidthInPixel;
+	sRefMap.sRect.iRectHeight	= pRefPicture->iHeightInPixel;
+
+	sCalcParam.pCalcResult	= &pVaaInfo->sVaaCalcInfo;
+	sCalcParam.iCalcSsd		= bCalculateSQDiff;
+	sCalcParam.iCalcBgd		= bCalculateBGD;
+	sCalcParam.iCalcVar		= bCalculateVar;
+
+	m_pInterfaceVp->Set(iMethod, &sCalcParam);
+	m_pInterfaceVp->Process(iMethod, &sCurMap, &sRefMap);
+}
+
+/*!
+ * \brief	Run the library's METHOD_BACKGROUND_DETECTION, or clear the background flags when detection is off
+ *
+ * \param	pVaaInfo	frame analysis info; receives picture geometry, plane pointers and the flags
+ * \param	pCurPicture	current picture
+ * \param	pRefPicture	reference picture
+ * \param	bDetectFlag	true to run the detection, false to zero pVaaBackgroundMbFlag
+ *						(one byte per 16x16 block of the current picture)
+ */
+void CWelsPreProcess::BackgroundDetection( SVAAFrameInfo *pVaaInfo, SPicture *pCurPicture, SPicture *pRefPicture, bool_t bDetectFlag )
+{
+	if (!bDetectFlag)
+	{
+		// picture size rounded up to whole 16x16 blocks
+		const int32_t kiWidthInMb	= (pCurPicture->iWidthInPixel + 15) >> 4;
+		const int32_t kiHeightInMb	= (pCurPicture->iHeightInPixel+ 15) >> 4;
+
+		memset(pVaaInfo->pVaaBackgroundMbFlag, 0, kiWidthInMb * kiHeightInMb);
+		return;
+	}
+
+	pVaaInfo->iPicWidth		= pCurPicture->iWidthInPixel;
+	pVaaInfo->iPicHeight	= pCurPicture->iHeightInPixel;
+	pVaaInfo->iPicStride	= pCurPicture->iLineSize[0];
+	pVaaInfo->iPicStrideUV	= pCurPicture->iLineSize[1];
+
+	pVaaInfo->pCurY			= pCurPicture->pData[0];
+	pVaaInfo->pCurU			= pCurPicture->pData[1];
+	pVaaInfo->pCurV			= pCurPicture->pData[2];
+	pVaaInfo->pRefY			= pRefPicture->pData[0];
+	pVaaInfo->pRefU			= pRefPicture->pData[1];
+	pVaaInfo->pRefV			= pRefPicture->pData[2];
+
+	int32_t iMethod			= METHOD_BACKGROUND_DETECTION;
+	SPixMap sCurMap			= {0};
+	SPixMap sRefMap			= {0};
+	SBGDInterface sBgdParam	= {0};
+
+	// all three planes of both pictures are described to the library
+	for (int32_t iPlane = 0; iPlane < 3; ++ iPlane)
+	{
+		sCurMap.pPixel[iPlane]	= pCurPicture->pData[iPlane];
+		sCurMap.iStride[iPlane]	= pCurPicture->iLineSize[iPlane];
+		sRefMap.pPixel[iPlane]	= pRefPicture->pData[iPlane];
+		sRefMap.iStride[iPlane]	= pRefPicture->iLineSize[iPlane];
+	}
+
+	sCurMap.eFormat				= VIDEO_FORMAT_I420;
+	sCurMap.iSizeInBits			= g_kiPixMapSizeInBits;
+	sCurMap.sRect.iRectWidth	= pCurPicture->iWidthInPixel;
+	sCurMap.sRect.iRectHeight	= pCurPicture->iHeightInPixel;
+
+	sRefMap.eFormat				= VIDEO_FORMAT_I420;
+	sRefMap.iSizeInBits			= g_kiPixMapSizeInBits;
+	sRefMap.sRect.iRectWidth	= pRefPicture->iWidthInPixel;
+	sRefMap.sRect.iRectHeight	= pRefPicture->iHeightInPixel;
+
+	sBgdParam.pCalcRes			= &(pVaaInfo->sVaaCalcInfo);
+	sBgdParam.pBackgroundMbFlag	= pVaaInfo->pVaaBackgroundMbFlag;
+
+	m_pInterfaceVp->Set(iMethod, (void*)&sBgdParam);
+	m_pInterfaceVp->Process(iMethod, &sCurMap, &sRefMap);
+}
+
+/*!
+ * \brief	Run the library's METHOD_ADAPTIVE_QUANT on the first planes of a picture pair
+ *
+ * \param	pVaaInfo	frame analysis info; its sAdaptiveQuantParam is passed in and, when processing
+ *						returns 0, read back from the library
+ * \param	pCurPicture	current picture
+ * \param	pRefPicture	reference picture
+ */
+void CWelsPreProcess::AdaptiveQuantCalculation( SVAAFrameInfo *pVaaInfo, SPicture *pCurPicture, SPicture *pRefPicture )
+{
+	int32_t iMethod	= METHOD_ADAPTIVE_QUANT;
+	SPixMap sCurMap	= {0};
+	SPixMap sRefMap	= {0};
+
+	// the parameter block starts from the current VAA statistics and a zero delta-QP index
+	pVaaInfo->sAdaptiveQuantParam.pCalcResult = &(pVaaInfo->sVaaCalcInfo);
+	pVaaInfo->sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp = 0;
+
+	sCurMap.eFormat				= VIDEO_FORMAT_I420;
+	sCurMap.iSizeInBits			= g_kiPixMapSizeInBits;
+	sCurMap.pPixel[0]			= pCurPicture->pData[0];
+	sCurMap.iStride[0]			= pCurPicture->iLineSize[0];
+	sCurMap.sRect.iRectWidth	= pCurPicture->iWidthInPixel;
+	sCurMap.sRect.iRectHeight	= pCurPicture->iHeightInPixel;
+
+	sRefMap.eFormat				= VIDEO_FORMAT_I420;
+	sRefMap.iSizeInBits			= g_kiPixMapSizeInBits;
+	sRefMap.pPixel[0]			= pRefPicture->pData[0];
+	sRefMap.iStride[0]			= pRefPicture->iLineSize[0];
+	sRefMap.sRect.iRectWidth	= pRefPicture->iWidthInPixel;
+	sRefMap.sRect.iRectHeight	= pRefPicture->iHeightInPixel;
+
+	// the result of Set() is not evaluated; only a successful Process() triggers the read back
+	m_pInterfaceVp->Set(iMethod, (void*)&(pVaaInfo->sAdaptiveQuantParam));
+	int32_t iRet = m_pInterfaceVp->Process(iMethod, &sCurMap, &sRefMap);
+	if (iRet == 0)
+		m_pInterfaceVp->Get(iMethod, (void*)&(pVaaInfo->sAdaptiveQuantParam));
+}
+
+/*!
+ * \brief	Point *pRefMbTypeArray at the macroblock type array of the first suitable reference picture
+ *
+ * With long term reference enabled, a lost T0 flagged and temporal id 0, the long term list is searched
+ * for a picture whose reception was confirmed. Otherwise the short term list is searched for a picture
+ * that is used as reference, has a non-negative POC and a temporal id not above the current one.
+ *
+ * \param	pCtx			encoder context (sWelsEncCtx *)
+ * \param	pRefMbTypeArray	receives the array; left untouched when no picture qualifies
+ * \param	iRefPicType		not used
+ */
+void CWelsPreProcess::SetRefMbType( void *pCtx, uint32_t **pRefMbTypeArray, int32_t iRefPicType )
+{
+  sWelsEncCtx *pEncCtx	= (sWelsEncCtx *)pCtx;
+  const uint8_t uiDid	= pEncCtx->uiDependencyId;
+  const uint8_t uiTid	= pEncCtx->uiTemporalId;
+  SLTRState* pLtr		= &pEncCtx->pLtr[uiDid];
+  SRefList *pRefList	= pEncCtx->ppRefPicListExt[uiDid];
+  uint8_t uiIdx			= 0;
+
+  if (pEncCtx->pSvcParam->bEnableLongTermReference && pLtr->bReceivedT0LostFlag && uiTid == 0)
+  {
+    // long term list: first picture with confirmed reception
+    for ( uiIdx = 0; uiIdx < pRefList->uiLongRefCount; uiIdx++ )
+    {
+      SPicture *pCandidate = pRefList->pLongRefList[uiIdx];
+      if ( pCandidate == NULL || pCandidate->uiRecieveConfirmed != 1/*RECIEVE_SUCCESS*/)
+        continue;
+
+      *pRefMbTypeArray = pCandidate->uiRefMbType;
+      return;
+    }
+    return;
+  }
+
+  // short term list: first usable picture of the same or a lower temporal layer
+  for ( uiIdx = 0; uiIdx < pRefList->uiShortRefCount; uiIdx++ )
+  {
+    SPicture *pCandidate = pRefList->pShortRefList[uiIdx];
+    if ( pCandidate != NULL && pCandidate->bUsedAsRef && pCandidate->iFramePoc >= 0 && pCandidate->uiTemporalId <= uiTid)
+    {
+      *pRefMbTypeArray = pCandidate->uiRefMbType;
+      return;
+    }
+  }
+}
+
+
+/*!
+ * \brief	Run the library's METHOD_COMPLEXITY_ANALYSIS for rate control
+ *
+ * The analysis mode follows the rate control mode and the slice type:
+ * RC_MODE0 + P slice -> FRAME_SAD, RC_MODE1 + P slice -> GOM_SAD, RC_MODE1 + I slice -> GOM_VAR.
+ * Any other combination returns without doing anything.
+ *
+ * \param	pCtx			encoder context (sWelsEncCtx *)
+ * \param	pCurPicture		current picture; only its first plane is described to the library
+ * \param	pRefPicture		reference picture; only its first plane is described to the library
+ * \param	kiDependencyId	dependency layer whose rate control state (pWelsSvcRc[]) is used
+ * \param	bCalculateBGD	forwarded as iCalcBgd
+ */
+void CWelsPreProcess::AnalyzePictureComplexity( void *pCtx, SPicture *pCurPicture, SPicture *pRefPicture, const int32_t kiDependencyId, const bool_t bCalculateBGD )
+{
+	sWelsEncCtx *pEncCtx				= (sWelsEncCtx *)pCtx;
+	SWelsSvcCodingParam *pSvcParam		= pEncCtx->pSvcParam;
+	SVAAFrameInfo *pVaaInfo				= pEncCtx->pVaa;
+	SComplexityAnalysisParam *pParam	= &(pVaaInfo->sComplexityAnalysisParam);
+	SWelsSvcRc *pWelsSvcRc				= &pEncCtx->pWelsSvcRc[kiDependencyId];
+	int32_t iMode						= 0;
+
+	if( pEncCtx->eSliceType == P_SLICE && pSvcParam->iRCMode == RC_MODE0 )
+		iMode = FRAME_SAD;
+	else if ( pEncCtx->eSliceType == P_SLICE && pSvcParam->iRCMode == RC_MODE1 )
+		iMode = GOM_SAD;
+	else if ( pEncCtx->eSliceType == I_SLICE && pSvcParam->iRCMode == RC_MODE1 )
+		iMode = GOM_VAR;
+	else
+		return;
+
+	pParam->iComplexityAnalysisMode	= iMode;
+	pParam->pCalcResult				= &(pVaaInfo->sVaaCalcInfo);
+	pParam->pBackgroundMbFlag		= pVaaInfo->pVaaBackgroundMbFlag;
+	SetRefMbType(pEncCtx, &(pParam->uiRefMbType), pRefPicture->iPictureType);
+	pParam->iCalcBgd				= bCalculateBGD;
+	pParam->iFrameComplexity		= 0;
+
+	// per GOM accumulators start from zero; the SAD buffer is kept as is in FRAME_SAD mode
+	memset(pWelsSvcRc->pGomForegroundBlockNum, 0, pWelsSvcRc->iGomSize*sizeof(int32_t));
+	if ( iMode != FRAME_SAD )
+		memset( pWelsSvcRc->pCurrentFrameGomSad, 0, pWelsSvcRc->iGomSize*sizeof(int32_t) );
+
+	pParam->pGomComplexity			= pWelsSvcRc->pCurrentFrameGomSad;
+	pParam->pGomForegroundBlockNum	= pWelsSvcRc->pGomForegroundBlockNum;
+	pParam->iMbNumInGom				= pWelsSvcRc->iNumberMbGom;
+
+	int32_t iMethod	= METHOD_COMPLEXITY_ANALYSIS;
+	SPixMap sCurMap	= {0};
+	SPixMap sRefMap	= {0};
+
+	sCurMap.eFormat				= VIDEO_FORMAT_I420;
+	sCurMap.iSizeInBits			= g_kiPixMapSizeInBits;
+	sCurMap.pPixel[0]			= pCurPicture->pData[0];
+	sCurMap.iStride[0]			= pCurPicture->iLineSize[0];
+	sCurMap.sRect.iRectWidth	= pCurPicture->iWidthInPixel;
+	sCurMap.sRect.iRectHeight	= pCurPicture->iHeightInPixel;
+
+	sRefMap.eFormat				= VIDEO_FORMAT_I420;
+	sRefMap.iSizeInBits			= g_kiPixMapSizeInBits;
+	sRefMap.pPixel[0]			= pRefPicture->pData[0];
+	sRefMap.iStride[0]			= pRefPicture->iLineSize[0];
+	sRefMap.sRect.iRectWidth	= pRefPicture->iWidthInPixel;
+	sRefMap.sRect.iRectHeight	= pRefPicture->iHeightInPixel;
+
+	// the result of Set() is not evaluated; only a successful Process() triggers the read back
+	m_pInterfaceVp->Set(iMethod, (void*)pParam);
+	int32_t iRet = m_pInterfaceVp->Process(iMethod, &sCurMap, &sRefMap);
+	if (iRet == 0)
+		m_pInterfaceVp->Get(iMethod, (void*)pParam);
+}
+
+// Fill the area between the actual picture size and the padded size: luma with 0, chroma with 0x80.
+// Rows below the picture are filled over iActualWidth; the columns to the right are filled for all iPaddingHeight rows.
+void  CWelsPreProcess::Padding(uint8_t * pSrcY, uint8_t * pSrcU, uint8_t * pSrcV, int32_t iStrideY, int32_t iStrideUV,
+			  int32_t iActualWidth, int32_t iPaddingWidth, int32_t iActualHeight, int32_t iPaddingHeight)
+{
+	const int32_t kiHalfWidth = iActualWidth/2;
+	int32_t iRow;
+	for( iRow = iActualHeight; iRow < iPaddingHeight; iRow++ ) {	// empty when there are no extra rows
+		memset( pSrcY + iRow*iStrideY, 0, iActualWidth );
+		if( (iRow&1) == 0 ) {	// chroma is written on even luma rows only
+			memset( pSrcU + iRow/2*iStrideUV, 0x80, kiHalfWidth );
+			memset( pSrcV + iRow/2*iStrideUV, 0x80, kiHalfWidth );
+		}
+	}
+	if( iPaddingWidth <= iActualWidth )
+		return;
+
+	const int32_t kiExtraWidth = iPaddingWidth - iActualWidth;
+	for( iRow = 0; iRow < iPaddingHeight; iRow++ ) {
+		memset( pSrcY + iRow*iStrideY + iActualWidth, 0, kiExtraWidth );
+		if( (iRow&1) == 0 ) {
+			memset( pSrcU + iRow/2*iStrideUV + kiHalfWidth, 0x80, kiExtraWidth/2 );
+			memset( pSrcV + iRow/2*iStrideUV + kiHalfWidth, 0x80, kiExtraWidth/2 );
+		}
+	}
+}
+
+
+//TODO: may be optimised later, or dropped if these wrappers turn out to be unused
+// WelsMemcpy: copy uiSize bytes from kpSrc to dst through ::memcpy and return what ::memcpy returns.
+void * WelsMemcpy( void *dst, const void *kpSrc, uint32_t uiSize) {
+	void *pCopied = ::memcpy(dst, kpSrc, uiSize);
+	return pCopied;
+}
+// WelsMemset: set uiSize bytes at p to val through ::memset and return what ::memset returns.
+void * WelsMemset( void * p, int32_t val, uint32_t uiSize) {
+	return ::memset(p, val, uiSize);
+}
+
+// i420_to_i420_c: copy iHeight rows of iWidth luma bytes, then iHeight/2 rows of iWidth/2 bytes for U and V.
+// Non-positive sizes copy nothing. Before, a negative iHeight never met the loops' "counter reaches zero"
+// test and a negative iWidth was converted to a huge uint32_t byte count.
+void  WelsMoveMemory_c(uint8_t * pDstY, uint8_t * pDstU, uint8_t * pDstV,  int32_t iDstStrideY, int32_t iDstStrideUV,  
+                               uint8_t * pSrcY, uint8_t * pSrcU, uint8_t * pSrcV, int32_t iSrcStrideY, int32_t iSrcStrideUV, int32_t iWidth, int32_t iHeight )
+{
+	if( iWidth <= 0 || iHeight <= 0 )
+		return;
+
+	const int32_t kiWidthC  = iWidth >> 1;
+	const int32_t kiHeightC = iHeight >> 1;
+	for( int32_t iRow = 0; iRow < iHeight; iRow++ ) {
+		WelsMemcpy(pDstY, pSrcY, iWidth);
+		pDstY += iDstStrideY;
+		pSrcY += iSrcStrideY;
+	}
+	for( int32_t iRow = 0; iRow < kiHeightC; iRow++ ) {
+		WelsMemcpy(pDstU, pSrcU, kiWidthC);
+		WelsMemcpy(pDstV, pSrcV, kiWidthC);
+		pDstU += iDstStrideUV;
+		pDstV += iDstStrideUV;
+		pSrcU += iSrcStrideUV;
+		pSrcV += iSrcStrideUV;
+	}
+}
+// vp's padding: fill the part of one plane that lies between the current and the target size with
+// uiStuffValue. First the right-hand strip of the existing rows, then complete rows below them.
+void  VPpadding(uint8_t * pSrcPtr, int32_t iCurWidth, int32_t iTargetWidth, int32_t iCurHeight, int32_t iTargetHeight, 
+				int32_t iStride, uint8_t uiStuffValue)
+{
+	// Right strip: columns [iCurWidth, iTargetWidth) of rows [0, iCurHeight).
+	if( iTargetWidth > iCurWidth ) {
+		const int32_t kiExtraCols = iTargetWidth - iCurWidth;
+		uint8_t *pRowTail = pSrcPtr + iCurWidth;
+		for( int32_t iRow = 0; iRow < iCurHeight; iRow++ ) {
+			WelsMemset( pRowTail, uiStuffValue, kiExtraCols );
+			pRowTail += iStride;
+		}
+	}
+
+	if( iTargetHeight <= iCurHeight )
+		return;
+
+	// Bottom strip: columns [0, iTargetWidth) of rows [iCurHeight, iTargetHeight).
+	uint8_t *pRowHead = pSrcPtr + iCurHeight * iStride;
+	for( int32_t iRow = iCurHeight; iRow < iTargetHeight; iRow++ ) {
+		WelsMemset( pRowHead, uiStuffValue, iTargetWidth );
+		pRowHead += iStride;
+	}
+}
+
+
+// Copy the (cropped) I420 source picture into pDstPic and pad the destination up to the target size.
+// Returns without copying when the colour format is not I420, a plane pointer is NULL, or a size check fails.
+// The crop origin is pSvcParam->SUsedPicRect; the copied size is clamped to the target and made even.
+void  CWelsPreProcess::WelsMoveMemoryWrapper(SWelsSvcCodingParam * pSvcParam, SPicture *pDstPic, const SSourcePicture *kpSrc, 
+                                             const int32_t kiTargetWidth, const int32_t kiTargetHeight )
+{
+    if (VIDEO_FORMAT_I420!=(kpSrc->iColorFormat & (~VIDEO_FORMAT_VFlip)))
+        return;
+
+    // Test the plane base pointers before any offset is added: a NULL base plus a non-zero crop
+    // offset is no longer NULL, so the former post-offset checks let such pictures through to the copy.
+    if (kpSrc->pData[0] == NULL || kpSrc->pData[1] == NULL || kpSrc->pData[2] == NULL)
+        return;
+    if (pDstPic->pData[0] == NULL || pDstPic->pData[1] == NULL || pDstPic->pData[2] == NULL)
+        return;
+
+    int32_t  iSrcWidth       = kpSrc->iPicWidth;
+    int32_t  iSrcHeight      = kpSrc->iPicHeight;
+
+    if ( iSrcHeight > kiTargetHeight ) 	iSrcHeight = kiTargetHeight;
+    if ( iSrcWidth > kiTargetWidth )		iSrcWidth  = kiTargetWidth;
+
+    // copy from fr26 to fix the odd uiSize failed issue: odd sizes are reduced by one
+    if( iSrcWidth & 0x1 )		-- iSrcWidth;
+    if( iSrcHeight & 0x1 )		-- iSrcHeight;
+
+    const int32_t kiSrcTopOffsetY = pSvcParam->SUsedPicRect.iTop;
+    const int32_t kiSrcTopOffsetUV = (kiSrcTopOffsetY>>1);
+    const int32_t kiSrcLeftOffsetY = pSvcParam->SUsedPicRect.iLeft;
+    const int32_t kiSrcLeftOffsetUV = (kiSrcLeftOffsetY>>1);
+    int32_t  iSrcOffset[3]       = {0,0,0};
+    iSrcOffset[0] = kpSrc->iStride[0]*kiSrcTopOffsetY + kiSrcLeftOffsetY;
+    iSrcOffset[1] = kpSrc->iStride[1]*kiSrcTopOffsetUV + kiSrcLeftOffsetUV ;
+    iSrcOffset[2] = kpSrc->iStride[2]*kiSrcTopOffsetUV + kiSrcLeftOffsetUV;
+
+    uint8_t * pSrcY = kpSrc->pData[0] + iSrcOffset[0];
+    uint8_t * pSrcU = kpSrc->pData[1] + iSrcOffset[1];
+    uint8_t * pSrcV = kpSrc->pData[2] + iSrcOffset[2];
+    const int32_t kiSrcStrideY = kpSrc->iStride[0];
+    const int32_t kiSrcStrideUV= kpSrc->iStride[1];
+
+    uint8_t * pDstY = pDstPic->pData[0];
+    uint8_t * pDstU = pDstPic->pData[1];
+    uint8_t * pDstV = pDstPic->pData[2];
+    const int32_t kiDstStrideY = pDstPic->iLineSize[0];
+    const int32_t kiDstStrideUV = pDstPic->iLineSize[1];
+
+#define MAX_WIDTH      (4096)
+#define MAX_HEIGHT     (2304)//MAX_FS_LEVEL51 (36864); MAX_FS_LEVEL51*256/4096 = 2304
+    // Source size and crop origin must be plausible.
+    if (iSrcWidth <= 0 || iSrcWidth > MAX_WIDTH || iSrcHeight <= 0 || iSrcHeight > MAX_HEIGHT)
+        return;
+    if (kiSrcTopOffsetY >= iSrcHeight || kiSrcLeftOffsetY>= iSrcWidth || iSrcWidth > kiSrcStrideY )
+        return;
+    // Target size must be plausible and fit into one destination line.
+    if (kiTargetWidth <= 0 || kiTargetWidth > MAX_WIDTH || kiTargetHeight<= 0 || kiTargetHeight> MAX_HEIGHT)
+        return;
+    if (kiTargetWidth > kiDstStrideY)
+        return;
+
+    //i420_to_i420_c (both sizes are even here, so the former odd-size test could never fire)
+    WelsMoveMemory_c( pDstY,  pDstU,  pDstV,  kiDstStrideY, kiDstStrideUV,  
+        pSrcY,  pSrcU,  pSrcV, kiSrcStrideY, kiSrcStrideUV, iSrcWidth, iSrcHeight );
+
+    //in VP Process
+    if ( kiTargetWidth > iSrcWidth || kiTargetHeight > iSrcHeight )
+    {
+        const int32_t kiTargetWidthC  = (kiTargetWidth>>1);
+        const int32_t kiTargetHeightC = (kiTargetHeight>>1);
+        const int32_t kiSrcWidthC        = (iSrcWidth>>1);
+        const int32_t kiSrcHeightC       = (iSrcHeight>>1);
+
+        // padding pDstPic I420
+        VPpadding((uint8_t *)pDstY, iSrcWidth, kiTargetWidth, iSrcHeight, kiTargetHeight, kiDstStrideY, 0);
+        VPpadding((uint8_t *)pDstU, kiSrcWidthC, kiTargetWidthC, kiSrcHeightC, kiTargetHeightC, kiDstStrideUV, 0x80);
+        VPpadding((uint8_t *)pDstV, kiSrcWidthC, kiTargetWidthC, kiSrcHeightC, kiTargetHeightC, kiDstStrideUV, 0x80);
+    }
+
+}
+
+//*********************************************************************************************************/
+} // namespace WelsSVCEnc
--- /dev/null
+++ b/codec/encoder/plus/inc/welsCodecTrace.h
@@ -1,0 +1,78 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef WELS_CODEC_TRACE
+#define WELS_CODEC_TRACE
+
+#include <stdarg.h>
+#include "typedefs.h"
+
+#ifdef WIN32
+typedef int32_t ( *CM_WELS_TRACE)( const str_t* format, ...);
+#else
+typedef int32_t ( *CM_WELS_TRACE2)( const str_t* dllname, const str_t* format, ...);
+#endif
+// Loads an optional external trace module and forwards codec log messages to its per-level entry points.
+class welsCodecTrace
+{
+public:
+	welsCodecTrace();
+	~welsCodecTrace();
+	// The two functions below are static: they work on the static members declared further down.
+	static void TraceString(int32_t iLevel, const str_t* kpStrFormat);	// passes a ready-made string to the entry point for iLevel
+	static void CODEC_TRACE(void* pIgnore, const int32_t kiLevel, const str_t* kpStrFormat, va_list vl);	// formats, then calls TraceString
+
+	void SetTraceLevel(const int32_t kiLevel);	// negative values are ignored
+	int32_t WelsTraceModuleIsExist();	// returns m_WelsTraceExistFlag
+
+private:	
+	
+	int32_t m_WelsTraceExistFlag;	// set by the constructor once a trace module handle was obtained
+	void* m_hTraceHandle;	// handle of the loaded trace module, NULL when none is loaded
+
+public:
+	static int32_t	m_iTraceLevel;	// CODEC_TRACE drops messages whose level is above this value
+#if defined WIN32
+	static CM_WELS_TRACE m_fpDebugTrace;
+	static CM_WELS_TRACE m_fpInfoTrace;
+	static CM_WELS_TRACE m_fpWarnTrace;
+	static CM_WELS_TRACE m_fpErrorTrace;
+#else	
+	static CM_WELS_TRACE2 m_fpDebugTrace;
+	static CM_WELS_TRACE2 m_fpInfoTrace;
+	static CM_WELS_TRACE2 m_fpWarnTrace;
+	static CM_WELS_TRACE2 m_fpErrorTrace;
+#endif
+	// The entry-point pointers are static: every constructor and destructor call resets them for all instances.
+};
+
+#endif //WELS_CODEC_TRACE
--- /dev/null
+++ b/codec/encoder/plus/inc/welsEncoderExt.h
@@ -1,0 +1,130 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ *  welsEncoderExt.h
+ *
+ *  Abstract
+ *      Cisco OpenH264 encoder extension utilization interface for T26
+ *
+ *  History
+ *      4/24/2009 Created
+ *
+ *
+ *************************************************************************/
+#if !defined(AFX_WELSH264ENCODER_H__D9FAA1D1_5403_47E1_8E27_78F11EE65F02__INCLUDED_)
+#define AFX_WELSH264ENCODER_H__D9FAA1D1_5403_47E1_8E27_78F11EE65F02__INCLUDED_
+
+#include "codec_api.h"
+#include "codec_def.h"
+#include "codec_app_def.h"
+#include "welsCodecTrace.h"
+#include "encoder_context.h"
+#include "param_svc.h"
+#include "extern.h"
+
+//#define OUTPUT_BIT_STREAM
+//#define DUMP_SRC_PICTURE
+//#define REC_FRAME_COUNT
+
+class ISVCEncoder;
+namespace WelsSVCEnc {
+class CWelsH264SVCEncoder : public ISVCEncoder	// encoder class derived from ISVCEncoder; this header only declares it
+{
+public:
+	CWelsH264SVCEncoder();
+	virtual ~CWelsH264SVCEncoder();
+
+	/* Interfaces override from ISVCEncoder */
+	/*
+	 * return: CM_RETURN: 0 - success; otherwise - failed;
+	 */
+	virtual int Initialize(SVCEncodingParam* argv, const INIT_TYPE init_type);
+	virtual int Initialize(void* argv, const INIT_TYPE init_type);
+	// NOTE(review): "Unintialize" (sic) is presumably the spelling ISVCEncoder declares - confirm before renaming.
+	virtual int Unintialize();
+	
+	/*
+	 * return: EVideoFrameType [IDR: videoFrameTypeIDR; P: videoFrameTypeP; ERROR: videoFrameTypeInvalid]
+	 */
+	virtual int EncodeFrame(const unsigned char* kpSrc, SFrameBSInfo* pBsInfo);
+	virtual int EncodeFrame(const SSourcePicture ** kppSrcPicList, int nSrcPicNum, SFrameBSInfo * pBsInfo);
+	
+	/*
+	 * return: 0 - success; otherwise - failed;
+	 */
+	virtual int PauseFrame(const unsigned char* pSrc, SFrameBSInfo* pBsInfo);	
+	
+	/*
+	 * return: 0 - success; otherwise - failed;
+	 */
+	virtual int ForceIntraFrame(bool bIDR);		
+	
+	/************************************************************************
+	 * InDataFormat, IDRInterval, SVC Encode Param, Frame Rate, Bitrate,..
+	 ************************************************************************/
+	/*
+	 * return: CM_RETURN: 0 - success; otherwise - failed;
+	 */
+	virtual int SetOption(ENCODER_OPTION opt_id, void* option);
+	virtual int GetOption(ENCODER_OPTION opt_id, void* option);	
+
+private:	
+	sWelsEncCtx	*m_pEncContext;
+	// m_pWelsTrace is declared only for WIN32, _MACH_PLATFORM or GCC builds.
+#if defined(WIN32)||defined(_MACH_PLATFORM)||defined(__GNUC__) 
+	welsCodecTrace			*m_pWelsTrace;
+#endif	
+	SSourcePicture			**m_pSrcPicList;
+	int32_t						m_iSrcListSize;	// presumably the number of entries in m_pSrcPicList - TODO confirm
+
+	int32_t						m_iMaxPicWidth;
+	int32_t						m_iMaxPicHeight;
+	
+	int32_t						m_iCspInternal;
+	BOOL_T					m_bInitialFlag;	// presumably set once Initialize has succeeded - TODO confirm
+	// The members below exist only when the matching debug macro (commented out near the top of this header) is defined.
+#ifdef OUTPUT_BIT_STREAM
+	FILE*				m_pFileBs;
+	FILE*               m_pFileBsSize;
+	BOOL_T				m_bSwitch;
+	int32_t					m_iSwitchTimes;
+#endif//OUTPUT_BIT_STREAM
+
+#ifdef REC_FRAME_COUNT
+   int32_t		m_uiCountFrameNum;
+#endif//REC_FRAME_COUNT
+	// Private helpers; their definitions are not part of this header.
+	void    InitEncoder( void );	
+	int32_t RawData2SrcPic(const uint8_t * pSrc);
+	void    DumpSrcPicture(const uint8_t *pSrc);
+};
+}
+#endif // !defined(AFX_WELSH264ENCODER_H__D9FAA1D1_5403_47E1_8E27_78F11EE65F02__INCLUDED_)
--- /dev/null
+++ b/codec/encoder/plus/res/resource.h
@@ -1,0 +1,15 @@
+//{{NO_DEPENDENCIES}}
+// Microsoft Developer Studio generated include file.
+// Used by welsenc.rc
+//
+
+// Next default values for new objects
+// 
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE        101
+#define _APS_NEXT_COMMAND_VALUE         40001
+#define _APS_NEXT_CONTROL_VALUE         1000
+#define _APS_NEXT_SYMED_VALUE           101
+#endif
+#endif
--- /dev/null
+++ b/codec/encoder/plus/res/welsenc.rc
@@ -1,0 +1,115 @@
+// Microsoft Visual C++ generated resource script.
+//
+#include "resource.h"
+
+#define APSTUDIO_READONLY_SYMBOLS
+/////////////////////////////////////////////////////////////////////////////
+//
+// Generated from the TEXTINCLUDE 2 resource.
+//
+#include "afxres.h"
+
+/////////////////////////////////////////////////////////////////////////////
+#undef APSTUDIO_READONLY_SYMBOLS
+
+/////////////////////////////////////////////////////////////////////////////
+// Chinese (P.R.C.) resources
+
+#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_CHS)
+#ifdef _WIN32
+LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED
+#pragma code_page(936)
+#endif //_WIN32
+
+#ifdef APSTUDIO_INVOKED
+/////////////////////////////////////////////////////////////////////////////
+//
+// TEXTINCLUDE
+//
+
+1 TEXTINCLUDE 
+BEGIN
+    "resource.h\0"
+END
+
+2 TEXTINCLUDE 
+BEGIN
+    "#include ""afxres.h""\r\n"
+    "\0"
+END
+
+3 TEXTINCLUDE 
+BEGIN
+    "\r\n"
+    "\0"
+END
+
+#endif    // APSTUDIO_INVOKED
+
+#endif    // Chinese (P.R.C.) resources
+/////////////////////////////////////////////////////////////////////////////
+
+
+/////////////////////////////////////////////////////////////////////////////
+// English (U.S.) resources
+
+#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU)
+#ifdef _WIN32
+LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US
+#pragma code_page(1252)
+#endif //_WIN32
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Version
+//
+
+VS_VERSION_INFO VERSIONINFO
+ FILEVERSION 0,0,0,0
+ PRODUCTVERSION 0,0,0,0
+ FILEFLAGSMASK 0x3fL
+#ifdef _DEBUG
+ FILEFLAGS 0x1L
+#else
+ FILEFLAGS 0x0L
+#endif
+ FILEOS 0x40004L
+ FILETYPE 0x2L
+ FILESUBTYPE 0x0L
+BEGIN
+    BLOCK "StringFileInfo"
+    BEGIN
+        BLOCK "040904b0"
+        BEGIN
+            VALUE "Comments", "Cisco OpenH264 H.264 (Wels)  encoder"
+            VALUE "CompanyName", "Cisco Systems"
+            VALUE "FileDescription", "Cisco OpenH264 H.264  encoder"
+            VALUE "FileVersion", "0, 0, 0, 0"
+            VALUE "InternalName", "welsenc.dll"
+            VALUE "LegalCopyright", "(c) 2011-2015 Cisco and/or its affiliates. All rights reserved."
+            VALUE "OriginalFilename", "welsenc.dll"
+            VALUE "ProductName", "Cisco OpenH264 H.264  encoder"
+            VALUE "ProductVersion", "0, 0, 0, 0"
+        END
+    END
+    BLOCK "VarFileInfo"
+    BEGIN
+        VALUE "Translation", 0x409, 1200
+    END
+END
+
+#endif    // English (U.S.) resources
+/////////////////////////////////////////////////////////////////////////////
+
+
+
+#ifndef APSTUDIO_INVOKED
+/////////////////////////////////////////////////////////////////////////////
+//
+// Generated from the TEXTINCLUDE 3 resource.
+//
+
+
+/////////////////////////////////////////////////////////////////////////////
+#endif    // not APSTUDIO_INVOKED
+
--- /dev/null
+++ b/codec/encoder/plus/src/DllEntry.cpp
@@ -1,0 +1,51 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <windows.h>
+
+/////////////////////////////////////////////////////////////////////////////
+// DLL Entry Point
+HANDLE g_hInstDll;
+// DLL entry point: remembers the module instance in g_hInstDll and, on process attach, calls DisableThreadLibraryCalls.
+BOOL WINAPI DllEntryPoint(HINSTANCE hInstance, DWORD dwReason, LPVOID lpReserved)
+{
+	const bool kbProcessAttach = ( dwReason == DLL_PROCESS_ATTACH );
+
+	g_hInstDll = hInstance;
+	if ( kbProcessAttach ) {
+		// Turns off the per-thread attach/detach notifications for this DLL (see the Win32 documentation).
+		DisableThreadLibraryCalls(hInstance);
+	}
+	// Every other reason, including DLL_PROCESS_DETACH, needs no handling; lpReserved is not used.
+
+	return TRUE;
+}
\ No newline at end of file
--- /dev/null
+++ b/codec/encoder/plus/src/welsCodecTrace.cpp
@@ -1,0 +1,377 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef WIN32
+#include <windows.h>
+#include <tchar.h>
+#endif
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+
+#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross platforms
+
+#include "welsCodecTrace.h"
+#include "utils.h"
+#if defined LINUX || defined SOLARIS || defined UNIX || defined MACOS //LINUX/SOLARIS/UNIX
+#include <dlfcn.h>
+#endif
+
+#if defined(MACOS)
+#include <carbon/carbon.h>
+#include <CoreFoundation/CFBundle.h>
+#endif//MACOS
+
+#ifdef WIN32
+extern HANDLE g_hInstDll;
+#endif
+
+//#define CODEC_TRACE_ERROR 0
+//#define CODEC_TRACE_WARNING 1
+//#define CODEC_TRACE_INFO 2
+//#define CODEC_TRACE_DEDBUG 3
+
+using namespace WelsSVCEnc;
+
+#ifdef MACOS
+// (MACOS only) Open the bundle lpszbundle located relative to the image that contains this code: the last
+// four '/'-separated components of the image path are removed, then "/" and lpszbundle are appended.
+// Returns the bundle reference (release it with FreeLibrary) or NULL on any failure.
+static CFBundleRef LoadLibrary(const str_t* lpszbundle)
+{
+	// 1.get bundle path
+	str_t cBundlePath[PATH_MAX];
+	memset(cBundlePath, 0, PATH_MAX);
+	
+	// dladdr() returns 0 when it cannot resolve the address and then leaves dlInfo unset, so its
+	// result is checked before dli_fname is read (the original copied from it unconditionally).
+	Dl_info 	dlInfo;
+	static int32_t  sDummy;
+	if(0 == dladdr((void*)&sDummy, &dlInfo) || NULL == dlInfo.dli_fname)
+	{
+		return NULL;
+	}
+	
+	strlcpy(cBundlePath, dlInfo.dli_fname, PATH_MAX);	// confirmed_safe_unsafe_usage
+	
+	// Cut the path at its last '/' four times; give up when it has fewer components.
+	for(int32_t i = 4; i > 0; i--)
+	{
+		str_t * pPath = strrchr(cBundlePath,'/');	// confirmed_safe_unsafe_usage
+		if(NULL == pPath)
+		{
+			return NULL;
+		}
+		*pPath = 0;
+	}
+	
+	// Append "/<bundle name>" to the remaining directory.
+	strlcat(cBundlePath, "/", PATH_MAX);	// confirmed_safe_unsafe_usage
+	strlcat(cBundlePath, lpszbundle, PATH_MAX);	// confirmed_safe_unsafe_usage
+	
+	FSRef bundlePath;
+	OSStatus iStatus = FSPathMakeRef((uint8_t*)cBundlePath, &bundlePath, NULL);
+	if(noErr != iStatus)
+	{
+		return NULL;
+	}
+	
+	CFURLRef bundleURL = CFURLCreateFromFSRef(kCFAllocatorSystemDefault, &bundlePath);
+	if(NULL == bundleURL)
+	{
+		return NULL;
+	}
+	
+	// 2.get bundle pRef
+	CFBundleRef bundleRef = CFBundleCreate(kCFAllocatorSystemDefault, bundleURL);
+	CFRelease(bundleURL);
+	
+	// The bundle's executable is not loaded explicitly here; the CFBundleLoadExecutable call that
+	// once stood at this point was already disabled. bundleRef may be NULL when creation failed.
+	
+	return bundleRef;
+}
+
+// (MACOS only) Release a bundle reference obtained from LoadLibrary; NULL is ignored. Always returns TRUE.
+static Boolean FreeLibrary(CFBundleRef bundle)
+{
+	if(NULL == bundle)
+		return TRUE;
+
+	// Only the reference is dropped; CFBundleUnloadExecutable is not called.
+	CFRelease(bundle);
+	return TRUE;
+}
+// (MACOS only) Look up function lpszprocname in bundle; NULL on a NULL argument, a failed name conversion or a missing symbol.
+static void* GetProcessAddress(CFBundleRef bundle, const str_t* lpszprocname)
+{
+	if(NULL == bundle || NULL == lpszprocname)
+		return NULL;
+	CFStringRef cfprocname = CFStringCreateWithCString(NULL,lpszprocname,CFStringGetSystemEncoding());
+	if(NULL == cfprocname)	// creation can fail; CFRelease(NULL) below would crash
+		return NULL;
+	void *processAddress = CFBundleGetFunctionPointerForName(bundle,cfprocname);
+	CFRelease(cfprocname);
+	return processAddress;
+}
+#endif
+// Definitions of the static members: the trace threshold and the four per-level entry points (NULL until a constructor resolves them).
+int32_t	welsCodecTrace::m_iTraceLevel			= WELS_LOG_DEFAULT;
+#if defined(WIN32)
+CM_WELS_TRACE welsCodecTrace::m_fpDebugTrace	= NULL;
+CM_WELS_TRACE welsCodecTrace::m_fpInfoTrace	= NULL;
+CM_WELS_TRACE welsCodecTrace::m_fpWarnTrace	= NULL;
+CM_WELS_TRACE welsCodecTrace::m_fpErrorTrace	= NULL;
+#else
+CM_WELS_TRACE2 welsCodecTrace::m_fpDebugTrace= NULL;
+CM_WELS_TRACE2 welsCodecTrace::m_fpInfoTrace	= NULL;
+CM_WELS_TRACE2 welsCodecTrace::m_fpWarnTrace	= NULL;
+CM_WELS_TRACE2 welsCodecTrace::m_fpErrorTrace= NULL;
+#endif//WIN32
+// Constructor: clears all trace state, then tries to load the optional external trace module and to
+// resolve its four per-level entry points. m_WelsTraceExistFlag is set only when a module handle was
+// obtained. The entry-point pointers are static, so they are reset for every existing instance too.
+welsCodecTrace::welsCodecTrace()
+{
+	m_hTraceHandle = NULL;
+	m_fpDebugTrace = NULL;
+	m_fpInfoTrace = NULL;
+	m_fpWarnTrace = NULL;
+	m_fpErrorTrace = NULL;
+	m_WelsTraceExistFlag	= false;
+	
+#if defined WIN32	
+	// welstrace.dll is used only when it is already loaded in this process.
+	HMODULE handle = ::GetModuleHandle("welstrace.dll");
+//	HMODULE handle = ::GetModuleHandle("contrace.dll"); // for c7 
+	if ( NULL == handle )
+		return;
+
+	CHAR achPath[ _MAX_PATH]= {0};
+	GetModuleFileName( (HMODULE)handle, achPath, _MAX_PATH);
+
+	m_hTraceHandle = ::LoadLibrary(achPath);
+	
+	OutputDebugStringA(achPath);
+	if( m_hTraceHandle) {
+		m_fpDebugTrace = ( CM_WELS_TRACE)::GetProcAddress( ( HMODULE)m_hTraceHandle, "WELSDEBUGA");
+		m_fpInfoTrace = ( CM_WELS_TRACE)::GetProcAddress( ( HMODULE)m_hTraceHandle, "WELSINFOA");
+		m_fpWarnTrace = ( CM_WELS_TRACE)::GetProcAddress( ( HMODULE)m_hTraceHandle, "WELSWARNA");
+		m_fpErrorTrace = ( CM_WELS_TRACE)::GetProcAddress( ( HMODULE)m_hTraceHandle, "WELSERRORA");
+	}
+#elif defined MACOS
+	m_hTraceHandle = LoadLibrary("welstrace.bundle");
+	if(m_hTraceHandle) {
+		m_fpDebugTrace = ( CM_WELS_TRACE2)GetProcessAddress( (CFBundleRef)m_hTraceHandle, "WELSDEBUG2");
+		m_fpInfoTrace = ( CM_WELS_TRACE2)GetProcessAddress( (CFBundleRef)m_hTraceHandle, "WELSINFO2");
+		m_fpWarnTrace = ( CM_WELS_TRACE2)GetProcessAddress( (CFBundleRef)m_hTraceHandle, "WELSWARN2");
+		m_fpErrorTrace = ( CM_WELS_TRACE2)GetProcessAddress( (CFBundleRef)m_hTraceHandle, "WELSERROR2");
+	}
+#elif defined LINUX || defined SOLARIS || defined UNIX
+	// Build "<directory of this shared object>/libwelstrace.so" in achPath.
+	static const str_t kszTraceLib[] = "/libwelstrace.so";
+	str_t achPath[255]= {0};
+	Dl_info		DlInfo;
+	static int32_t	nMmTPAddress;
+
+	// dladdr() returns 0 on failure and then leaves DlInfo unset.
+	if (0 == dladdr( &nMmTPAddress, &DlInfo) || NULL == DlInfo.dli_fname)
+		return;
+	// At most 254 characters are copied so that the zero-initialised last byte always terminates achPath.
+	STRNCPY(achPath, 255, DlInfo.dli_fname, STRNLEN(DlInfo.dli_fname, 254));	// confirmed_safe_unsafe_usage
+	str_t* p = strrchr(achPath, '/');	// confirmed_safe_unsafe_usage
+	if ( NULL == p )
+		return;
+	// The name is 16 characters long; the former length limit of 15 cut off its final 'o'.
+	const int32_t kiLenTraceName = (int32_t)(sizeof(kszTraceLib) - 1);
+	const int32_t kiCurPos = (int32_t)(p - achPath);
+	if ( kiCurPos + kiLenTraceName >= 255 )
+		return;
+	STRNCPY(p, 255-kiCurPos, kszTraceLib, kiLenTraceName );	// confirmed_safe_unsafe_usage
+	p[kiLenTraceName] = '\0';
+
+	m_hTraceHandle = dlopen( achPath, RTLD_LAZY);
+	if (m_hTraceHandle == NULL) {
+		FILE* fp = fopen("/tmp/trace.txt", "a");
+		if(fp) {
+			fprintf(fp, "welsCodecTrace::welsCodecTrace ===> dlopen %s fail, %s\n", achPath, dlerror());
+			fclose(fp);
+		}
+		return;
+	}
+	m_fpDebugTrace = ( CM_WELS_TRACE2)dlsym( m_hTraceHandle, "WELSDEBUG2");
+	m_fpInfoTrace = ( CM_WELS_TRACE2)dlsym( m_hTraceHandle, "WELSINFO2");
+	m_fpWarnTrace = ( CM_WELS_TRACE2)dlsym( m_hTraceHandle, "WELSWARN2");
+	m_fpErrorTrace = ( CM_WELS_TRACE2)dlsym( m_hTraceHandle, "WELSERROR2");
+	if(m_fpDebugTrace == NULL) {
+		FILE* fp = fopen("/tmp/trace.txt", "a");
+		if(fp) {
+			// Written to the log file; the original sent this line to stdout although it had opened the file.
+			fprintf(fp, "welsCodecTrace::welsCodecTrace ===> dlsym failed (WELSDEBUG2) , dlerror = %s\n", dlerror());
+			fclose(fp);
+		}
+		return;	// the flag stays false; the handle is closed by the destructor
+	}
+#endif
+	if(m_hTraceHandle != NULL)
+	{
+		m_WelsTraceExistFlag	= true;
+	}
+}
+
+// Destructor: releases the trace module handle with the platform's unload call (if one was loaded) and
+// clears all trace state, including the static entry-point pointers shared by every instance.
+welsCodecTrace::~welsCodecTrace()
+{
+	if ( NULL != m_hTraceHandle ) {
+#if defined WIN32
+		::FreeLibrary( ( HMODULE)m_hTraceHandle);
+#elif defined MACOS
+		FreeLibrary( (CFBundleRef)m_hTraceHandle);
+#elif defined LINUX || defined SOLARIS || defined UNIX
+		::dlclose( m_hTraceHandle);
+#endif
+	}
+
+	// Forget the module and its entry points.
+	m_WelsTraceExistFlag = false;
+	m_fpErrorTrace = NULL;
+	m_fpWarnTrace = NULL;
+	m_fpInfoTrace = NULL;
+	m_fpDebugTrace = NULL;
+	m_hTraceHandle = NULL;
+
+//	g_bWelsLibLoaded = false;
+}
+// Report the flag the constructor sets once a trace module handle was obtained (non-zero when it was).
+int32_t welsCodecTrace::WelsTraceModuleIsExist() {
+	const int32_t kiLoaded = m_WelsTraceExistFlag;
+	return kiLoaded;
+}
+// Send an already formatted message to the trace entry point that matches iLevel.
+// Unknown levels go to the info entry point.
+// Nothing happens when the selected entry point has not been resolved (NULL).
+void welsCodecTrace::TraceString(int32_t iLevel, const str_t* str)
+{
+#ifdef WIN32
+	CM_WELS_TRACE fpTrace = NULL;
+	switch(iLevel)
+	{
+	case WELS_LOG_ERROR:
+		fpTrace = m_fpErrorTrace;
+		break;
+	case WELS_LOG_WARNING:
+		fpTrace = m_fpWarnTrace;
+		break;
+	case WELS_LOG_INFO:
+		fpTrace = m_fpInfoTrace;
+		break;
+	case WELS_LOG_DEBUG:
+		fpTrace = m_fpDebugTrace;
+		break;
+	default:
+		// The original tested m_fpDebugTrace here but called m_fpInfoTrace, which could still be NULL.
+		fpTrace = m_fpInfoTrace;
+		break;
+	}
+	if(fpTrace)
+		fpTrace("%s", str);
+#else
+	CM_WELS_TRACE2 fpTrace = NULL;
+	switch(iLevel)
+	{
+	case WELS_LOG_ERROR:
+		fpTrace = m_fpErrorTrace;
+		break;
+	case WELS_LOG_WARNING:
+		fpTrace = m_fpWarnTrace;
+		break;
+	case WELS_LOG_INFO:
+		fpTrace = m_fpInfoTrace;
+		break;
+	case WELS_LOG_DEBUG:
+		// NOTE(review): debug messages use the info entry point on non-Windows builds, as before - confirm this is intended.
+		fpTrace = m_fpInfoTrace;
+		break;
+	default:
+		fpTrace = m_fpInfoTrace;
+		break;
+	}
+	if(fpTrace)
+		fpTrace("CODEC", "%s", str);
+#endif
+}
+
+#define MAX_LOG_SIZE	1024
+// Format a log message, prefix it with "[ENCODER]: " and pass it to TraceString.
+// Messages with iLevel above m_iTraceLevel (and NULL formats) are dropped; ignore is not used.
+void welsCodecTrace::CODEC_TRACE(void* ignore, const int32_t iLevel, const str_t* Str_Format, va_list vl)
+{
+	if ( m_iTraceLevel < iLevel || NULL == Str_Format )
+	{
+		return;
+	}
+
+	str_t WStr_Format[MAX_LOG_SIZE] = {0};
+	str_t pBuf[MAX_LOG_SIZE] = {0};
+	const int32_t len	= STRNLEN("[ENCODER]: ", MAX_LOG_SIZE);	// confirmed_safe_unsafe_usage
+
+	// Copy at most MAX_LOG_SIZE-1 format characters so the zero-initialised last byte terminates the copy.
+	STRNCPY(WStr_Format, MAX_LOG_SIZE, Str_Format, STRNLEN(Str_Format, MAX_LOG_SIZE-1));	// confirmed_safe_unsafe_usage
+	STRNCPY(pBuf, MAX_LOG_SIZE, "[ENCODER]: ", len);	// confirmed_safe_unsafe_usage
+#if defined(WIN32)
+#if defined(_MSC_VER)
+#if _MSC_VER>=1500
+	VSPRINTF(pBuf + len, MAX_LOG_SIZE-len, WStr_Format, vl);	// confirmed_safe_unsafe_usage
+#else
+	VSPRINTF(pBuf + len, WStr_Format, vl);	// confirmed_safe_unsafe_usage
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+#else//__GNUC__
+	// Bounded formatting: the size-less VSPRINTF call used here before could write past pBuf for long messages.
+	vsnprintf(pBuf + len, MAX_LOG_SIZE-len, WStr_Format, vl);
+	pBuf[MAX_LOG_SIZE-1] = '\0';
+#endif//WIN32
+
+	// pBuf is passed on directly; the former extra copy into a third buffer added nothing.
+	welsCodecTrace::TraceString(iLevel, pBuf);
+}
+// Set the static trace threshold read by CODEC_TRACE; a negative iLevel leaves the current value unchanged.
+void welsCodecTrace::SetTraceLevel(const int32_t iLevel)
+{
+	if ( iLevel < 0 )
+		return;
+	m_iTraceLevel	= iLevel;
+}
+
+
--- /dev/null
+++ b/codec/encoder/plus/src/welsEncoderExt.cpp
@@ -1,0 +1,1255 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <assert.h>
+#include "welsEncoderExt.h"
+#include "welsCodecTrace.h"
+#include "typedefs.h"
+#include "wels_const.h"
+#include "utils.h"
+#include "macros.h"
+
+#include "crt_util_safe_x.h"	// Safe CRT routines like util for cross platforms
+#include "ref_list_mgr_svc.h"
+
+#include <time.h>
+#if defined(WIN32) /*&& defined(_DEBUG)*/
+
+#include <windows.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <sys/types.h>
+#include <sys/timeb.h>
+#else
+#include <sys/time.h>
+#endif
+
+namespace WelsSVCEnc {
+
+/*
+ *	CWelsH264SVCEncoder class implementation
+ */
+/*
+ *	Construct an encoder without context and without source pictures, then call InitEncoder().
+ *
+ *	With OUTPUT_BIT_STREAM defined, two debug dump files are opened as well (m_pFileBs and
+ *	m_pFileBsSize). Their names are assembled from the object address, the local time and the
+ *	millisecond part of the current time.
+ */
+CWelsH264SVCEncoder::CWelsH264SVCEncoder()
+:	m_pEncContext		( NULL ),
+#if defined(WIN32)||defined(_MACH_PLATFORM)||defined(__GNUC__)
+	m_pWelsTrace			( NULL ),
+#endif
+	m_pSrcPicList		( NULL ),
+	m_iSrcListSize		( 0 ),
+	m_iMaxPicWidth		( 0 ),
+	m_iMaxPicHeight		( 0 ),
+	m_iCspInternal		( 0 ),
+	m_bInitialFlag		( FALSE )
+{
+#ifdef REC_FRAME_COUNT
+	// Assign the member: declaring a local of the same name here would shadow it and
+	// leave the real frame counter uninitialized.
+	m_uiCountFrameNum = 0;
+#endif//REC_FRAME_COUNT
+
+#ifdef OUTPUT_BIT_STREAM
+	str_t strStreamFileName[1024] = { 0 };  //for .264
+	int32_t iBufferUsed = 0;
+	int32_t iBufferLeft = 1023;
+
+	str_t strLenFileName[1024] = { 0 }; //for .len
+	int32_t iBufferUsedSize = 0;
+	int32_t iBufferLeftSize = 1023;
+#endif//OUTPUT_BIT_STREAM
+
+#ifdef OUTPUT_BIT_STREAM
+	time_t tTime;
+
+#if defined( WIN32 )
+#if defined(_MSC_VER)
+#if _MSC_VER>=1500
+	struct tm tTimeNow;
+#else
+	struct tm *tTimeNow;
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+	struct _timeb tTimeb;
+
+	time(&tTime);
+#if defined(_MSC_VER)
+#if _MSC_VER>=1500
+	LOCALTIME(&tTimeNow, &tTime);
+#else
+	tTimeNow = LOCALTIME(&tTime);
+	// NOTE(review): this early return also skips InitEncoder() and the file handle setup below.
+	if ( NULL == tTimeNow )
+		return;
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+	FTIME(&tTimeb);
+#elif defined( __GNUC__ )
+	struct tm* tTimeNow;
+	struct timeval tTimev;
+	time(&tTime);
+	tTimeNow = (struct tm *)localtime(&tTime);
+	gettimeofday(&tTimev,NULL);
+#endif//WIN32
+
+	// first part of both names: (directory,) fixed prefix and object address
+#ifdef WIN32
+#if defined(_MSC_VER)
+#if _MSC_VER>=1500
+	iBufferUsed      += SNPRINTF(strStreamFileName,      iBufferLeft, iBufferLeft,      "enc_bs_0x%p_",   (void*)this);
+	iBufferUsedSize += SNPRINTF(strLenFileName, iBufferLeftSize, iBufferLeftSize, "enc_size_0x%p_", (void*)this);
+#else
+	iBufferUsed      += SNPRINTF(strStreamFileName,      iBufferLeft,      "enc_bs_0x%p_",   (void*)this);
+	iBufferUsedSize += SNPRINTF(strLenFileName, iBufferLeftSize, "enc_size_0x%p_", (void*)this);
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+#else
+	iBufferUsed      += SNPRINTF(strStreamFileName,      iBufferLeft,      "/tmp/enc_bs_0x%p_",  (void*)this);
+	iBufferUsedSize += SNPRINTF(strLenFileName, iBufferLeftSize, "/tmp/enc_size_0x%p", (void*)this);
+#endif//WIN32
+
+	// NOTE(review): iBufferLeft / iBufferLeftSize are reduced by the cumulative used count
+	// after every step, so they under-estimate the remaining space; harmless with 1024-byte
+	// buffers and names this short, but not an exact bookkeeping.
+
+	// second part: time stamp, stream file name
+	iBufferLeft -= iBufferUsed;
+	if ( iBufferLeft > iBufferUsed )
+	{
+#if defined(__GNUC__) && !defined(WIN32)
+		// tTimeNow is the pointer returned by localtime() on this path
+		if ( NULL != tTimeNow )
+			iBufferUsed += strftime(&strStreamFileName[iBufferUsed], iBufferLeft, "%y%m%d%H%M%S", tTimeNow);
+#else
+#if defined(_MSC_VER)
+		iBufferUsed += strftime(&strStreamFileName[iBufferUsed], iBufferLeft, "%y%m%d%H%M%S",
+#if _MSC_VER>=1500
+			&tTimeNow
+#else
+			tTimeNow
+#endif//_MSC_VER>=1500
+			);
+#endif//_MSC_VER
+#endif//__GNUC__
+		iBufferLeft -= iBufferUsed;
+	}
+
+	// second part: time stamp, size file name
+	iBufferLeftSize -= iBufferUsedSize;
+	if ( iBufferLeftSize> iBufferUsedSize )
+	{
+#if defined(__GNUC__) && !defined(WIN32)
+		if ( NULL != tTimeNow )
+			iBufferUsedSize += strftime(&strLenFileName[iBufferUsedSize], iBufferLeftSize, "%y%m%d%H%M%S", tTimeNow);
+#else
+#if defined(_MSC_VER)
+		iBufferUsedSize += strftime(&strLenFileName[iBufferUsedSize], iBufferLeftSize, "%y%m%d%H%M%S",
+#if _MSC_VER>=1500
+			&tTimeNow
+#else
+			tTimeNow
+#endif//_MSC_VER>=1500
+			);
+#endif//_MSC_VER
+#endif//__GNUC__
+		iBufferLeftSize -= iBufferUsedSize;
+	}
+
+	// third part: milliseconds and extension, stream file name
+	if ( iBufferLeft > iBufferUsed )
+	{
+#ifdef WIN32
+#if defined(_MSC_VER)
+#if _MSC_VER>=1500
+		iBufferUsed += SNPRINTF(&strStreamFileName[iBufferUsed], iBufferLeft, iBufferLeft, ".%03.3u.264", tTimeb.millitm);
+#else
+		iBufferUsed += SNPRINTF(&strStreamFileName[iBufferUsed], iBufferLeft, ".%03.3u.264", tTimeb.millitm);
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+#else
+		// the cast makes the argument match the %u conversion
+		iBufferUsed += SNPRINTF(&strStreamFileName[iBufferUsed], iBufferLeft, ".%03.3u.264", (unsigned int)(tTimev.tv_usec/1000));
+#endif//WIN32
+		iBufferLeft -= iBufferUsed;
+	}
+
+	// third part: milliseconds and extension, size file name
+	if ( iBufferLeftSize > iBufferUsedSize )
+	{
+#ifdef WIN32
+#if defined(_MSC_VER)
+#if _MSC_VER>=1500
+		iBufferUsedSize += SNPRINTF(&strLenFileName[iBufferUsedSize], iBufferLeftSize, iBufferLeftSize, ".%03.3u.len", tTimeb.millitm);
+#else
+		iBufferUsedSize += SNPRINTF(&strLenFileName[iBufferUsedSize], iBufferLeftSize, ".%03.3u.len", tTimeb.millitm);
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+#else
+		iBufferUsedSize += SNPRINTF(&strLenFileName[iBufferUsedSize], iBufferLeftSize, ".%03.3u.len", (unsigned int)(tTimev.tv_usec/1000));
+#endif//WIN32
+		iBufferLeftSize -= iBufferUsedSize;
+	}
+
+	// open both dump files for binary writing
+#if defined(__GNUC__)
+	m_pFileBs       = FOPEN(strStreamFileName,      "wb");
+	m_pFileBsSize	= FOPEN(strLenFileName, "wb");
+#else
+#if defined(_MSC_VER)
+#if _MSC_VER>=1500
+	FOPEN(&m_pFileBs, strStreamFileName,      "wb");
+	FOPEN(&m_pFileBsSize, strLenFileName, "wb");
+#else
+	m_pFileBs       = FOPEN(strStreamFileName,      "wb");
+	m_pFileBsSize	= FOPEN(strLenFileName, "wb");
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+#endif//__GNUC__
+
+	m_bSwitch	= FALSE;
+	m_iSwitchTimes	= 0;
+#endif//OUTPUT_BIT_STREAM
+
+	InitEncoder();
+}
+
+/*
+ *	Destructor: releases the trace object, closes the optional dump files and finally
+ *	calls Unintialize().
+ */
+CWelsH264SVCEncoder::~CWelsH264SVCEncoder()
+{
+	WelsLog(NULL, WELS_LOG_INFO, "CWelsH264SVCEncoder::~CWelsH264SVCEncoder()\n");
+
+#if defined(WIN32)||defined(_MACH_PLATFORM)||defined(__GNUC__)
+	// delete on a NULL pointer does nothing, so no explicit guard is needed
+	delete m_pWelsTrace;
+	m_pWelsTrace = NULL;
+#endif
+
+#ifdef REC_FRAME_COUNT
+	WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::~CWelsH264SVCEncoder(), m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", m_uiCountFrameNum, m_iCspInternal );
+	m_uiCountFrameNum = 0;
+#endif//REC_FRAME_COUNT
+
+#ifdef OUTPUT_BIT_STREAM
+	// close the bitstream dump, then the size dump
+	if ( NULL != m_pFileBs )
+	{
+		fclose( m_pFileBs );
+		m_pFileBs = NULL;
+	}
+	if ( NULL != m_pFileBsSize )
+	{
+		fclose( m_pFileBsSize );
+		m_pFileBsSize = NULL;
+	}
+	m_iSwitchTimes	= 0;
+	m_bSwitch	= FALSE;
+#endif//OUTPUT_BIT_STREAM
+
+	Unintialize();
+}
+
+/*
+ *	Create the trace object and, when its trace module reports to exist, install
+ *	welsCodecTrace::CODEC_TRACE as log callback. The log level is set to WELS_LOG_DEFAULT
+ *	in either case. Compiled to an empty body on platforms outside the #if below.
+ */
+void CWelsH264SVCEncoder::InitEncoder( void )
+{
+#if defined(WIN32)||defined(_MACH_PLATFORM)||defined(__GNUC__)
+
+#ifdef REC_FRAME_COUNT
+	WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::InitEncoder, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", m_uiCountFrameNum, m_iCspInternal );
+#endif
+
+	m_pWelsTrace	= new welsCodecTrace();
+	if ( NULL != m_pWelsTrace && m_pWelsTrace->WelsTraceModuleIsExist() )
+	{
+		// route codec log output through the trace module
+		m_pWelsTrace->SetTraceLevel( WELS_LOG_DEFAULT );
+		WelsSetLogCallback( welsCodecTrace::CODEC_TRACE );
+	}
+
+	// initial log level
+	WelsSetLogLevel( WELS_LOG_DEFAULT );
+#endif
+}
+
+/* Interfaces override from ISVCEncoder */
+
+/*
+ *	SVC Encoder Initialization (parameter based)
+ *
+ *	Copies *argv, converts it into a SWelsSvcCodingParam through ParamTranscode() and
+ *	forwards that configuration to the config based Initialize() overload.
+ *
+ *	argv      - encoding parameters, must not be NULL
+ *	iInitType - must be INIT_TYPE_PARAMETER_BASED
+ *	return    - cmInitParaError on bad arguments or failed conversion, otherwise the
+ *	            result of the config based Initialize()
+ */
+int CWelsH264SVCEncoder::Initialize(SVCEncodingParam* argv, const INIT_TYPE iInitType)
+{
+	const bool kbValidArgs = ( INIT_TYPE_PARAMETER_BASED == iInitType ) && ( NULL != argv );
+	if ( !kbValidArgs )
+	{
+		WelsLog(m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), invalid iInitType= %d, argv= 0x%p\n", iInitType, (void *)argv);
+		return cmInitParaError;
+	}
+
+	// a second initialization first tears down the running instance
+	if ( m_bInitialFlag )
+	{
+		WelsLog(m_pEncContext, WELS_LOG_WARNING, "CWelsH264SVCEncoder::Initialize(), reinitialize, m_bInitialFlag= %d\n", m_bInitialFlag);
+		Unintialize();
+	}
+
+	SVCEncodingParam		sEncodingParam;
+	SWelsSvcCodingParam	sConfig( true );
+
+	// work on a private copy of the caller's parameters
+	memcpy(&sEncodingParam, argv, sizeof(SVCEncodingParam));	// confirmed_safe_unsafe_usage
+
+#ifdef REC_FRAME_COUNT
+	WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::Initialize, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", m_uiCountFrameNum, m_iCspInternal );
+	WelsLog( m_pEncContext, WELS_LOG_INFO, "coding_param->iPicWidth= %d;coding_param->iPicHeight= %d;coding_param->iTargetBitrate= %d;coding_param->iRCMode= %d;coding_param->iTemporalLayerNum= %d;coding_param->iSpatialLayerNum= %d;coding_param->fFrameRate= %.6ff;coding_param->iInputCsp= %d;coding_param->iKeyPicCodingMode= %d;coding_param->uiIntraPeriod= %d;coding_param->bEnableSpsPpsIdAddition = %d;coding_param->bPrefixNalAddingCtrl = %d;coding_param->bEnableDenoise= %d;coding_param->bEnableBackgroundDetection= %d;coding_param->bEnableAdaptiveQuant= %d;coding_param->bEnableCropPic= %d;coding_param->bEnableLongTermReference= %d;coding_param->iLtrMarkPeriod= %d;\n",
+		sEncodingParam.iPicWidth,
+		sEncodingParam.iPicHeight,
+		sEncodingParam.iTargetBitrate,
+		sEncodingParam.iRCMode,
+		sEncodingParam.iTemporalLayerNum,
+		sEncodingParam.iSpatialLayerNum,
+		sEncodingParam.fFrameRate,
+		sEncodingParam.iInputCsp,
+		sEncodingParam.iKeyPicCodingMode,
+		sEncodingParam.iIntraPeriod,
+		sEncodingParam.bEnableSpsPpsIdAddition,
+		sEncodingParam.bPrefixNalAddingCtrl,
+		sEncodingParam.bEnableDenoise,
+		sEncodingParam.bEnableBackgroundDetection,
+		sEncodingParam.bEnableAdaptiveQuant,
+		sEncodingParam.bEnableCropPic,
+		sEncodingParam.bEnableLongTermReference,
+		sEncodingParam.iLtrMarkPeriod);
+	// one log line per configured spatial layer
+	for ( int32_t iLayer = 0; iLayer < sEncodingParam.iSpatialLayerNum; ++ iLayer )
+	{
+		SSpatialLayerConfig *pLayerCfg = &sEncodingParam.sSpatialLayers[iLayer];
+		WelsLog( m_pEncContext, WELS_LOG_INFO, "coding_param->sSpatialLayers[%d]: .iVideoWidth= %d; .iVideoHeight= %d; .fFrameRate= %.6ff; .iQualityLayerNum= %d; .iSpatialBitrate= %d; .iCgsSnrRefined= %d; .iInterSpatialLayerPredFlag= %d; .sSliceCfg.uiSliceMode= %d; .sSliceCfg.sSliceArgument.uiSliceNum= %d; .sSliceCfg.sSliceArgument.uiSliceSizeConstraint= %d;\n",
+			iLayer,
+			pLayerCfg->iVideoWidth,
+			pLayerCfg->iVideoHeight,
+			pLayerCfg->fFrameRate,
+			pLayerCfg->iQualityLayerNum,
+			pLayerCfg->iSpatialBitrate,
+			pLayerCfg->iCgsSnrRefined,
+			pLayerCfg->iInterSpatialLayerPredFlag,
+			pLayerCfg->sSliceCfg.uiSliceMode,
+			pLayerCfg->sSliceCfg.sSliceArgument.uiSliceNum,
+			pLayerCfg->sSliceCfg.sSliceArgument.uiSliceSizeConstraint
+			);
+	}
+#endif//REC_FRAME_COUNT
+
+	// translate the public parameter set into the internal configuration
+	const bool kbTranscodeFailed = ( 0 != sConfig.ParamTranscode( sEncodingParam, true ) );
+	if ( kbTranscodeFailed )
+	{
+		WelsLog(m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), parameter_translation failed.\n");
+		Unintialize();
+		return cmInitParaError;
+	}
+
+	m_iSrcListSize  = 1;
+
+	return Initialize((void *)&sConfig, INIT_TYPE_CONFIG_BASED);
+}
+
+/*
+ *	Release the first kiCount pictures of ppList and then the list itself.
+ *	Both were allocated with new / new[] in the config based Initialize().
+ */
+static void WelsReleaseSrcPicListOnFailure( SSourcePicture** ppList, const int32_t kiCount )
+{
+	if ( NULL == ppList )
+		return;
+
+	for ( int32_t i = 0; i < kiCount; ++ i )
+	{
+		delete ppList[i];
+	}
+	delete [] ppList;
+}
+
+/*
+ *	SVC Encoder Initialization (config based)
+ *
+ *	argv      - points to a SWelsSvcCodingParam; several of its fields are validated and
+ *	            adjusted in place (reference frame count, temporal layers, loop filter offsets, ..)
+ *	iInitType - must be INIT_TYPE_CONFIG_BASED
+ *	return    - cmResultSuccess on success, cmInitParaError for invalid arguments/parameters
+ *	            or a failing WelsInitEncoderExt(), cmMallocMemeError when the source picture
+ *	            list can not be allocated
+ *
+ *	On success the source picture list holds one picture per dependency layer and
+ *	m_bInitialFlag is TRUE. On every failure path no source picture list is left behind.
+ */
+int CWelsH264SVCEncoder::Initialize(void * argv, const INIT_TYPE iInitType)
+{
+	if ( INIT_TYPE_CONFIG_BASED != iInitType || NULL == argv )
+	{
+		WelsLog(m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), invalid iInitType= %d, argv= 0x%p.\n", iInitType, (void *)argv);
+		return cmInitParaError;
+	}
+
+	if ( m_bInitialFlag )
+	{
+		WelsLog(m_pEncContext, WELS_LOG_WARNING, "CWelsH264SVCEncoder::Initialize(), reinitialize, m_bInitialFlag= %d.\n", m_bInitialFlag);
+		Unintialize();
+	}
+
+	// From here on m_bInitialFlag is FALSE until the very end, hence the Unintialize()
+	// calls on the error paths below return immediately and release nothing; whatever
+	// this function allocates has to be released here.
+
+	SWelsSvcCodingParam  *pCfg = static_cast<SWelsSvcCodingParam*>(argv);
+
+	const int32_t iColorspace = pCfg->iInputCsp;
+	if ( 0 == iColorspace )
+	{
+		WelsLog(m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), invalid iInputCsp= %d.\n", iColorspace);
+		Unintialize();
+		return cmInitParaError;
+	}
+
+	// Check valid parameters
+	const int32_t iNumOfLayers = pCfg->iNumDependencyLayer;
+	if ( iNumOfLayers < 1 || iNumOfLayers > MAX_DEPENDENCY_LAYER ){
+		WelsLog(m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), invalid iNumDependencyLayer= %d, valid at range of [1, %d].\n", iNumOfLayers, MAX_DEPENDENCY_LAYER);
+		Unintialize();
+		return cmInitParaError;
+	}
+	if ( pCfg->iNumTemporalLayer < 1 )
+		pCfg->iNumTemporalLayer	= 1;
+	if ( pCfg->iNumTemporalLayer > MAX_TEMPORAL_LEVEL ){
+		WelsLog(m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), invalid iNumTemporalLayer= %d, valid at range of [1, %d].\n", pCfg->iNumTemporalLayer, MAX_TEMPORAL_LEVEL);
+		Unintialize();
+		return cmInitParaError;
+	}
+
+	// GOP size: within [1, MAX_GOP_SIZE] and a power of two
+	if ( pCfg->uiGopSize < 1 || pCfg->uiGopSize > MAX_GOP_SIZE ){
+		WelsLog(m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), invalid uiGopSize= %d, valid at range of [1, %d].\n", pCfg->uiGopSize, MAX_GOP_SIZE);
+		Unintialize();
+		return cmInitParaError;
+	}
+
+	if ( !WELS_POWER2_IF(pCfg->uiGopSize) ){
+		WelsLog(m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), invalid uiGopSize= %d, valid at range of [1, %d] and yield to power of 2.\n", pCfg->uiGopSize, MAX_GOP_SIZE);
+		Unintialize();
+		return cmInitParaError;
+	}
+
+	// intra period: 0 (unlimited) or a multiple of the GOP size that is not below it
+	if ( pCfg->uiIntraPeriod && pCfg->uiIntraPeriod < pCfg->uiGopSize )
+	{
+		WelsLog(m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), invalid uiIntraPeriod= %d, valid in case it equals to 0 for unlimited intra period or exceeds specified uiGopSize= %d.\n", pCfg->uiIntraPeriod, pCfg->uiGopSize);
+		Unintialize();
+		return cmInitParaError;
+	}
+
+	if ( ( pCfg->uiIntraPeriod && (pCfg->uiIntraPeriod & (pCfg->uiGopSize-1)) != 0) )
+	{
+		WelsLog(m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), invalid uiIntraPeriod= %d, valid in case it equals to 0 for unlimited intra period or exceeds specified uiGopSize= %d also multiple of it.\n", pCfg->uiIntraPeriod, pCfg->uiGopSize);
+		Unintialize();
+		return cmInitParaError;
+	}
+
+	// Fine tune num_ref_num
+	if (pCfg->bEnableLongTermReference){
+		pCfg->iLTRRefNum = LONG_TERM_REF_NUM;
+	}else{
+		pCfg->iLTRRefNum = 0;
+	}
+	pCfg->iNumRefFrame = ((pCfg->uiGopSize>>1)>1)?((pCfg->uiGopSize>>1)+pCfg->iLTRRefNum):(MIN_REF_PIC_COUNT+pCfg->iLTRRefNum);
+
+	pCfg->iNumRefFrame = WELS_CLIP3(pCfg->iNumRefFrame, MIN_REF_PIC_COUNT, MAX_REFERENCE_PICTURE_COUNT_NUM);
+
+	if (pCfg->uiLtrMarkPeriod == 0)
+	{
+		pCfg->uiLtrMarkPeriod = 30;
+	}
+
+	// the number of temporal layers is derived from the GOP size and overrides the input value
+	const int32_t kiDecStages = WELS_LOG2( pCfg->uiGopSize );
+	pCfg->iInputCsp			= iColorspace;
+	pCfg->iNumTemporalLayer	= (int8_t)(1 + kiDecStages);
+	pCfg->iLoopFilterAlphaC0Offset	= WELS_CLIP3( pCfg->iLoopFilterAlphaC0Offset, -6, 6 );
+	pCfg->iLoopFilterBetaOffset		= WELS_CLIP3( pCfg->iLoopFilterBetaOffset, -6, 6 );
+
+	// The new/delete pair is preferred over WelsMalloc/WelsFree here: the encoder
+	// initialization stage has not started yet, so the cache line size is not detected
+	// at this point (16 or 64 would not match).
+	m_pSrcPicList	= new SSourcePicture* [iNumOfLayers];
+
+	if ( NULL == m_pSrcPicList ){
+		WelsLog(m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), pOut of memory due m_pSrcPicList memory request.\n");
+		m_iSrcListSize = 0;
+		Unintialize();
+		return cmMallocMemeError;
+	}
+
+	// decide property list size between INIT_TYPE_PARAMETER_BASED/INIT_TYPE_CONFIG_BASED
+	m_iMaxPicWidth	= pCfg->iActualPicWidth;
+	m_iMaxPicHeight	= pCfg->iActualPicHeight;
+	m_iSrcListSize  = iNumOfLayers;
+
+	for (int32_t i = 0; i < m_iSrcListSize; ++ i)
+	{
+		// new/delete pair for the same reason as for the list above
+		m_pSrcPicList[i]	= new SSourcePicture;
+
+		if ( NULL == m_pSrcPicList[i] )
+		{
+			WelsLog(m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), pOut of memory due m_pSrcPicList[%d] memory request.\n", i);
+			// only the entries [0, i) have been allocated so far
+			WelsReleaseSrcPicListOnFailure( m_pSrcPicList, i );
+			m_pSrcPicList	= NULL;
+			m_iSrcListSize	= 0;
+			Unintialize();
+			return cmMallocMemeError;
+		}
+		InitPic( m_pSrcPicList[i], iColorspace, m_iMaxPicWidth, m_iMaxPicHeight );
+	}
+
+#if defined(OUTPUT_BIT_STREAM) || defined(ENABLE_TRACE_FILE)
+	// directory used for trace/dump files
+	str_t fpath[MAX_FNAME_LEN] = {0};
+#if defined(__GNUC__)
+	SNPRINTF(fpath, MAX_FNAME_LEN, "/tmp/");		// confirmed_safe_unsafe_usage
+
+#else//__GNUC__
+
+#if defined (_MSC_VER)
+#if _MSC_VER>=1500
+	SNPRINTF(fpath, MAX_FNAME_LEN, MAX_FNAME_LEN, ".\\" );		// confirmed_safe_unsafe_usage
+#else
+	SNPRINTF(fpath, MAX_FNAME_LEN, ".\\" );		// confirmed_safe_unsafe_usage
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+#endif //__GNUC__
+
+	strcpy(pCfg->sTracePath, fpath);		// confirmed_safe_unsafe_usage
+
+#endif //#if defined(OUTPUT_BIT_STREAM) || defined(ENABLE_TRACE_FILE)
+
+	if ( WelsInitEncoderExt( &m_pEncContext, pCfg ) )
+	{
+		WelsLog(m_pEncContext, WELS_LOG_ERROR, "CWelsH264SVCEncoder::Initialize(), WelsInitEncoderExt failed.\n");
+		// give back the complete picture list, it would be lost otherwise
+		WelsReleaseSrcPicListOnFailure( m_pSrcPicList, m_iSrcListSize );
+		m_pSrcPicList	= NULL;
+		m_iSrcListSize	= 0;
+		Unintialize();
+		return cmInitParaError;
+	}
+
+	m_iCspInternal	= iColorspace;
+	m_bInitialFlag  = TRUE;
+
+	return cmResultSuccess;
+}
+
+/*
+ *	SVC Encoder Uninitialization
+ *
+ *	Does nothing unless m_bInitialFlag is set. Otherwise, provided an encoder context
+ *	exists, the source picture list is deleted and the context is handed to
+ *	WelsUninitEncoderExt(); m_bInitialFlag is cleared afterwards. Always returns 0.
+ */
+int32_t CWelsH264SVCEncoder::Unintialize()
+{
+	if ( !m_bInitialFlag )
+	{
+		return 0;
+	}
+
+	WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::Unintialize()..\n" );
+
+#ifdef REC_FRAME_COUNT
+	WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::Unintialize, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", m_uiCountFrameNum, m_iCspInternal );
+#endif//REC_FRAME_COUNT
+
+	if ( NULL != m_pEncContext )
+	{
+		if ( NULL != m_pSrcPicList )
+		{
+			// list and pictures come from new/new[], so they go back through delete/delete[]
+			int32_t iPicIdx = 0;
+			while ( iPicIdx < m_iSrcListSize )
+			{
+				delete m_pSrcPicList[iPicIdx];	// a NULL entry is simply skipped by delete
+				++ iPicIdx;
+			}
+			delete [] m_pSrcPicList;
+
+			m_iSrcListSize	= 0;
+			m_pSrcPicList	= NULL;
+		}
+
+		WelsUninitEncoderExt( &m_pEncContext );
+		m_pEncContext	= NULL;
+	}
+
+	m_bInitialFlag = FALSE;
+
+	return 0;
+}
+	
+
+/*
+ *	Point the plane pointers of the first source picture at the raw frame pSrc.
+ *
+ *	The colour space is m_iCspInternal with the vertical flip flag masked out:
+ *	  - packed formats (YVYU, UYVY, YUY2, RGB, BGR, BGRA, RGBA, ARGB, ABGR): only plane 0 is set
+ *	  - I420 / YV12: plane 1 starts width*height bytes after plane 0, plane 2 a quarter
+ *	    of that size after plane 1
+ *
+ *	return: 0 - success; 1 - colour space not handled
+ */
+int32_t CWelsH264SVCEncoder::RawData2SrcPic(const uint8_t * pSrc)
+{
+	assert( m_iSrcListSize > 0 );
+
+	SSourcePicture* pPic		= m_pSrcPicList[0];
+	const int32_t kiLumaSize	= m_iMaxPicWidth * m_iMaxPicHeight;
+
+	pPic->pData[0] = const_cast<uint8_t*>(pSrc);
+
+	switch( m_iCspInternal & (~videoFormatVFlip) )
+	{
+		case videoFormatI420:
+		case videoFormatYV12:
+			// planar 4:2:0 layout
+			pPic->pData[1] = pPic->pData[0] + kiLumaSize;
+			pPic->pData[2] = pPic->pData[1] + ( kiLumaSize >> 2 );
+			return 0;
+
+		case videoFormatYVYU:
+		case videoFormatUYVY:
+		case videoFormatYUY2:
+		case videoFormatRGB:
+		case videoFormatBGR:
+		case videoFormatBGRA:
+		case videoFormatRGBA:
+		case videoFormatARGB:
+		case videoFormatABGR:
+			// packed layout, no separate chroma planes
+			pPic->pData[2] = NULL;
+			pPic->pData[1] = NULL;
+			return 0;
+
+		default:
+			break;
+	}
+
+	return 1;
+}
+
+
+/*
+ *	SVC core encoding (raw buffer entry point)
+ *
+ *	Maps pSrc onto the first source picture via RawData2SrcPic() and, when that succeeds,
+ *	encodes it through the picture list overload. Returns videoFrameTypeInvalid when pSrc
+ *	is NULL, the encoder is not initialized or the mapping fails; otherwise the frame type
+ *	reported by the picture list overload.
+ */
+int CWelsH264SVCEncoder::EncodeFrame(const unsigned char* pSrc, SFrameBSInfo* pBsInfo)
+{
+	if ( NULL == pSrc || NULL == m_pEncContext || !m_bInitialFlag )
+	{
+		return videoFrameTypeInvalid;
+	}
+
+	int32_t iResultType = videoFrameTypeInvalid;
+
+	const int32_t kiMapStatus = RawData2SrcPic((uint8_t *)pSrc);
+	if ( 0 == kiMapStatus )
+	{
+		iResultType = EncodeFrame(const_cast<const SSourcePicture**>(m_pSrcPicList), 1, pBsInfo);
+	}
+
+#ifdef REC_FRAME_COUNT
+	++ m_uiCountFrameNum;
+	WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::EncodeFrame(), m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", m_uiCountFrameNum, m_iCspInternal );
+#endif//REC_FRAME_COUNT
+
+#ifdef DUMP_SRC_PICTURE
+	DumpSrcPicture(pSrc);
+#endif // DUMP_SRC_PICTURE
+
+	return iResultType;
+}
+
+
+/*
+ *	SVC core encoding (picture list entry point)
+ *
+ *	Passes nSrcPicNum pictures of pSrcPicList to WelsEncoderEncodeExt() and translates the
+ *	returned core frame type into the videoFrameType* value given back to the caller.
+ *	Returns videoFrameTypeInvalid when the list is NULL, the encoder is not initialized,
+ *	nSrcPicNum is not positive, or the core reports a type that is not P/IDR/SKIP/I.
+ *
+ *	With OUTPUT_BIT_STREAM defined every encoded (not skipped) frame is appended to the
+ *	bitstream dump file and its total byte count to the size dump file; when m_bSwitch
+ *	is set both files are reopened under a new name first.
+ */
+int CWelsH264SVCEncoder::EncodeFrame(const SSourcePicture  ** pSrcPicList, int nSrcPicNum, SFrameBSInfo * pBsInfo)
+{
+	if ( NULL == pSrcPicList || NULL == m_pEncContext || !m_bInitialFlag )
+	{
+		return videoFrameTypeInvalid;
+	}
+
+	if ( nSrcPicNum <= 0 )
+	{
+		assert ( 0 );
+		return videoFrameTypeInvalid;
+	}
+
+	const int32_t kiCoreFrameType = WelsEncoderEncodeExt( m_pEncContext, pBsInfo, pSrcPicList, nSrcPicNum);
+
+	// AUTO, B (B pictures are not supported) and unknown values all stay invalid
+	int32_t iFrameType = videoFrameTypeInvalid;
+	switch( kiCoreFrameType )
+	{
+	case WELS_FRAME_TYPE_IDR:
+		iFrameType	= videoFrameTypeIDR;
+		break;
+	case WELS_FRAME_TYPE_I:
+		iFrameType	= videoFrameTypeI;
+		break;
+	case WELS_FRAME_TYPE_P:
+		iFrameType	= videoFrameTypeP;
+		break;
+	case WELS_FRAME_TYPE_SKIP:
+		iFrameType	= videoFrameTypeSkip;
+		break;
+	default:
+		break;
+	}
+
+	///////////////////for test
+#ifdef OUTPUT_BIT_STREAM
+	if ( videoFrameTypeInvalid != iFrameType && videoFrameTypeSkip != iFrameType )
+	{
+		if ( m_bSwitch )
+		{
+			// switch pending: close the current dump files and open a fresh pair
+			if ( NULL != m_pFileBs )
+			{
+				fclose( m_pFileBs );
+				m_pFileBs = NULL;
+			}
+			if ( NULL != m_pFileBsSize )
+			{
+				fclose( m_pFileBsSize );
+				m_pFileBsSize = NULL;
+			}
+
+			str_t strDumpFileName[128] = {0};
+#if defined(__GNUC__)
+
+			SNPRINTF(strDumpFileName, 128, "%sadj%d_w%d.264", m_pEncContext->sTracePath,  m_iSwitchTimes, m_pEncContext->pSvcParam->iActualPicWidth);
+			m_pFileBs = FOPEN( strDumpFileName, "wb" );
+			SNPRINTF(strDumpFileName, 128, "%sadj%d_w%d_size.iLen", m_pEncContext->sTracePath, m_iSwitchTimes, m_pEncContext->pSvcParam->iActualPicWidth);
+			m_pFileBsSize = FOPEN( strDumpFileName, "wb");
+
+#else//__GNUC__
+
+#if defined (_MSC_VER)
+#if _MSC_VER>=1500
+			SNPRINTF(strDumpFileName, 128, 128, "adj%d_w%d.264", m_iSwitchTimes, m_pEncContext->pSvcParam->iActualPicWidth);
+			FOPEN( &m_pFileBs, strDumpFileName, "wb" );
+			SNPRINTF(strDumpFileName, 128, 128, "adj%d_w%d_size.iLen", m_iSwitchTimes, m_pEncContext->pSvcParam->iActualPicWidth);
+			FOPEN( &m_pFileBsSize, strDumpFileName, "wb");
+#else
+			SNPRINTF(strDumpFileName, 128, "adj%d_w%d.264", m_iSwitchTimes, m_pEncContext->pSvcParam->iActualPicWidth);
+			m_pFileBs = FOPEN( strDumpFileName, "wb" );
+			SNPRINTF(strDumpFileName, 128, "adj%d_w%d_size.iLen", m_iSwitchTimes, m_pEncContext->pSvcParam->iActualPicWidth);
+			m_pFileBsSize = FOPEN( strDumpFileName, "wb");
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+
+#endif//__GNUC__
+
+			m_bSwitch = FALSE;
+		}
+
+		// write every layer and accumulate the byte count of the whole frame
+		int32_t iFrameBytes = 0;
+		for ( int32_t iLayerIdx = 0; iLayerIdx < pBsInfo->iLayerNum; ++ iLayerIdx )
+		{
+			SLayerBSInfo* pLayerInfo = &pBsInfo->sLayerInfo[iLayerIdx];
+
+			int32_t iLayerBytes = 0;
+			for ( int32_t iNalIdx = 0; iNalIdx < pLayerInfo->iNalCount; ++ iNalIdx )
+			{
+				iLayerBytes += pLayerInfo->iNalLengthInByte[iNalIdx];
+			}
+			iFrameBytes += iLayerBytes;
+
+			if ( NULL != m_pFileBs )
+				fwrite( pLayerInfo->pBsBuf, 1, iLayerBytes, m_pFileBs );
+		}
+
+		if ( NULL != m_pFileBsSize )
+			fwrite( &iFrameBytes, sizeof(int32_t), 1, m_pFileBsSize );
+	}
+#endif //OUTPUT_BIT_STREAM
+#ifdef DUMP_SRC_PICTURE
+	DumpSrcPicture(pSrcPicList[0]->pData[0]);
+#endif // DUMP_SRC_PICTURE
+
+	return iFrameType;
+}
+
+/*
+ *	Encode kpSrc as a pause frame: an intra frame is forced before the frame is encoded
+ *	and once more afterwards.
+ *
+ *	return: 0 - success; otherwise - failed;
+ */
+int CWelsH264SVCEncoder::PauseFrame(const unsigned char* kpSrc, SFrameBSInfo* pBsInfo)
+{
+	ForceIntraFrame(true);
+
+	const int kiFrameType = EncodeFrame(kpSrc, pBsInfo);
+
+	// forced again to avoid pause frame bitstream and
+	// normal bitstream use different video channel.
+	ForceIntraFrame(true);
+
+	if ( videoFrameTypeInvalid == kiFrameType )
+	{
+		return 1;
+	}
+	return 0;
+}
+
+
+/*
+ *	Force key frame
+ *
+ *	Calls ForceCodingIDR() on the encoder context. bIDR only shows up in the optional
+ *	REC_FRAME_COUNT log line.
+ *
+ *	return: 0 - request issued; 1 - encoder not initialized
+ */
+int CWelsH264SVCEncoder::ForceIntraFrame(bool bIDR)
+{
+	if ( NULL == m_pEncContext || !m_bInitialFlag )
+	{
+		return 1;
+	}
+
+#ifdef REC_FRAME_COUNT
+	WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::ForceIntraFrame(), bIDR= %d, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", bIDR, m_uiCountFrameNum, m_iCspInternal );
+#endif//REC_FRAME_COUNT
+
+	ForceCodingIDR( m_pEncContext );
+
+	return 0;
+}
+
+/************************************************************************
+* InDataFormat, IDRInterval, SVC Encode Param, Frame Rate, Bitrate,..
+************************************************************************/
+int CWelsH264SVCEncoder::SetOption(ENCODER_OPTION eOptionId, void* pOption)
+{
+	if ( NULL == pOption ){		
+		return cmInitParaError;
+	}
+
+	if ( NULL == m_pEncContext || FALSE == m_bInitialFlag ){		
+		return cmInitExpected;
+	}
+
+	switch( eOptionId ) {
+	case ENCODER_OPTION_INTER_SPATIAL_PRED:	// Inter spatial layer prediction flag
+		{
+			WelsLog( m_pEncContext, WELS_LOG_INFO, "ENCODER_OPTION_INTER_SPATIAL_PRED, this feature not supported at present.\n" );
+		}
+		break;
+	case ENCODER_OPTION_DATAFORMAT:	// Input color space
+		{
+			int32_t iValue = *((int32_t*)pOption);
+			int32_t iColorspace = iValue;
+			if ( iColorspace == 0 ){				
+				return cmInitParaError;
+			}
+			
+#ifdef REC_FRAME_COUNT
+			WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_DATAFORMAT, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x, iValue= %d\n", m_uiCountFrameNum, m_iCspInternal, iValue );
+#endif//REC_FRAME_COUNT
+
+			
+			int32_t iPicIdx = m_iSrcListSize -1;
+			while ( iPicIdx >= 0 )
+			{
+				if ( m_pSrcPicList[iPicIdx] == NULL )
+				{
+					-- iPicIdx;
+					if (iPicIdx < 0) return cmInitParaError;
+					continue;
+				}
+
+				if ( m_pSrcPicList[iPicIdx]->iColorFormat == iColorspace )
+				{					
+					-- iPicIdx;
+					continue;
+				}
+
+				InitPic( m_pSrcPicList[iPicIdx], iColorspace, m_iMaxPicWidth, m_iMaxPicHeight );
+			}				
+			m_iCspInternal = iColorspace;
+#ifdef REC_FRAME_COUNT
+			WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_DATAFORMAT, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", m_uiCountFrameNum, m_iCspInternal );
+#endif//REC_FRAME_COUNT
+		}		
+		break;
+	case ENCODER_OPTION_IDR_INTERVAL:	// IDR Interval
+		{
+			int32_t iValue	= *((int32_t*)pOption);
+#ifdef REC_FRAME_COUNT
+			WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_IDR_INTERVAL, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x, iValue= %d\n", m_uiCountFrameNum, m_iCspInternal, iValue );
+#endif//REC_FRAME_COUNT
+
+			if ( iValue < -1 || iValue == 0 )
+				iValue = 1;
+			if ( iValue == (int32_t)m_pEncContext->pSvcParam->uiIntraPeriod ){				
+				return cmResultSuccess;
+			}
+
+			
+			m_pEncContext->pSvcParam->uiIntraPeriod	= (uint32_t)iValue;			
+		}
+		break;
+	case ENCODER_OPTION_SVC_ENCODE_PARAM:	// SVC Encoding Parameter
+		{
+			SVCEncodingParam		sEncodingParam;
+			SWelsSvcCodingParam	sConfig( true );
+			int32_t iInputColorspace = 0;
+			int32_t iTargetWidth = 0;
+			int32_t iTargetHeight= 0;
+
+			memcpy(&sEncodingParam, pOption, sizeof(SVCEncodingParam));	// confirmed_safe_unsafe_usage
+			WelsLog( m_pEncContext, WELS_LOG_INFO, "ENCODER_OPTION_SVC_ENCODE_PARAM, sEncodingParam.iInputCsp= 0x%x\n", sEncodingParam.iInputCsp );
+			WelsLog( m_pEncContext, WELS_LOG_INFO, "coding_param->iPicWidth= %d;coding_param->iPicHeight= %d;coding_param->iTargetBitrate= %d;coding_param->iRCMode= %d;coding_param->iPaddingFlag= %d;coding_param->iTemporalLayerNum= %d;coding_param->iSpatialLayerNum= %d;coding_param->fFrameRate= %.6ff;coding_param->iInputCsp= %d;coding_param->iKeyPicCodingMode= %d;coding_param->uiIntraPeriod= %d;coding_param->bEnableSpsPpsIdAddition = %d;coding_param->bPrefixNalAddingCtrl = %d;coding_param->bEnableDenoise= %d;coding_param->bEnableBackgroundDetection= %d;coding_param->bEnableAdaptiveQuant= %d;coding_param->bEnableCropPic= %d;coding_param->bEnableLongTermReference= %d;coding_param->iLtrMarkPeriod= %d;\n",
+				sEncodingParam.iPicWidth,
+				sEncodingParam.iPicHeight,
+				sEncodingParam.iTargetBitrate,
+				sEncodingParam.iRCMode,
+				sEncodingParam.iPaddingFlag,
+				sEncodingParam.iTemporalLayerNum,
+				sEncodingParam.iSpatialLayerNum,
+				sEncodingParam.fFrameRate,
+				sEncodingParam.iInputCsp,
+				sEncodingParam.iKeyPicCodingMode,
+				sEncodingParam.iIntraPeriod,
+				sEncodingParam.bEnableSpsPpsIdAddition,
+				sEncodingParam.bPrefixNalAddingCtrl,
+				sEncodingParam.bEnableDenoise,
+				sEncodingParam.bEnableBackgroundDetection,
+				sEncodingParam.bEnableAdaptiveQuant,
+				sEncodingParam.bEnableCropPic,
+				sEncodingParam.bEnableLongTermReference,
+				sEncodingParam.iLtrMarkPeriod);
+			int32_t i = 0;
+			while (i < sEncodingParam.iSpatialLayerNum)
+			{
+				SSpatialLayerConfig *pSpatialCfg = &sEncodingParam.sSpatialLayers[i]; 
+				WelsLog( m_pEncContext, WELS_LOG_INFO, "coding_param->sSpatialLayers[%d]: .iVideoWidth= %d; .iVideoHeight= %d; .fFrameRate= %.6ff; .iQualityLayerNum= %d; .iSpatialBitrate= %d; .iCgsSnrRefined= %d; .iInterSpatialLayerPredFlag= %d; .sSliceCfg.uiSliceMode= %d; .sSliceCfg.sSliceArgument.iSliceNum= %d; .sSliceCfg.sSliceArgument.uiSliceSizeConstraint= %d;\n",
+					i, pSpatialCfg->iVideoWidth,
+					pSpatialCfg->iVideoHeight,
+					pSpatialCfg->fFrameRate,
+					pSpatialCfg->iQualityLayerNum,
+					pSpatialCfg->iSpatialBitrate,
+					pSpatialCfg->iCgsSnrRefined,
+					pSpatialCfg->iInterSpatialLayerPredFlag,
+					pSpatialCfg->sSliceCfg.uiSliceMode,					
+					pSpatialCfg->sSliceCfg.sSliceArgument.uiSliceNum,
+					pSpatialCfg->sSliceCfg.sSliceArgument.uiSliceSizeConstraint
+					);
+				++ i;
+			}
+#ifdef OUTPUT_BIT_STREAM
+			if ( sEncodingParam.sSpatialLayers[sEncodingParam.iSpatialLayerNum-1].iVideoWidth != m_pEncContext->pSvcParam->sDependencyLayers[m_pEncContext->pSvcParam->iNumDependencyLayer-1].iFrameWidth )
+			{
+				++ m_iSwitchTimes;
+				m_bSwitch = TRUE;
+			}
+#endif//OUTPUT_BIT_STREAM
+			if ( sEncodingParam.iSpatialLayerNum < 1 || sEncodingParam.iSpatialLayerNum > MAX_SPATIAL_LAYER_NUM )	// verify number of spatial layer
+			{					
+				return cmInitParaError;
+			}
+
+			iInputColorspace	= sEncodingParam.iInputCsp;			
+			if ( sConfig.ParamTranscode( sEncodingParam, true ) )
+			{					
+				return cmInitParaError;
+			}
+			if ( sConfig.iNumDependencyLayer < 1 )
+			{					
+				return cmInitParaError;
+			}
+			iTargetWidth	= sConfig.iActualPicWidth;
+			iTargetHeight	= sConfig.iActualPicHeight;				
+			if ( m_pSrcPicList[0] == NULL )
+			{					
+				return cmInitParaError;
+			}
+			if ( m_iCspInternal != iInputColorspace || m_iMaxPicWidth != iTargetWidth || m_iMaxPicHeight != iTargetHeight ){	// for color space due to changed
+				InitPic( m_pSrcPicList[0], iInputColorspace, iTargetWidth, iTargetHeight );
+				m_iMaxPicWidth	= iTargetWidth;
+				m_iMaxPicHeight	= iTargetHeight;
+				m_iCspInternal	= iInputColorspace;
+			}			
+#ifdef REC_FRAME_COUNT
+			WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_SVC_ENCODE_PARAM, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", m_uiCountFrameNum, m_iCspInternal );
+#endif//REC_FRAME_COUNT
+
+			/* New configuration available here */
+			sConfig.iInputCsp	= m_iCspInternal;	// I420 in default designed for presentation in encoder used internal
+			sConfig.DetermineTemporalSettings();
+
+			/* Check every field whether there is new request for memory block changed or else, Oct. 24, 2008 */
+			WelsEncoderParamAdjust( &m_pEncContext, &sConfig );	
+		}
+		break;
+	case ENCODER_OPTION_FRAME_RATE:	// Maximal input frame rate
+		{
+			float iValue	= *((float*)pOption);
+#ifdef REC_FRAME_COUNT
+			WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_FRAME_RATE, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x, iValue= %d\n", m_uiCountFrameNum, m_iCspInternal, iValue );
+#endif//REC_FRAME_COUNT
+			m_pEncContext->pSvcParam->fMaxFrameRate	= iValue;			
+			
+		}
+		break;
+	case ENCODER_OPTION_iBitRate:	// Target bit-rate
+		{
+			int32_t iValue = *((int32_t*)pOption);
+#ifdef REC_FRAME_COUNT
+				WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::SetOption():ENCODER_OPTION_iBitRate, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x, iValue= %d\n", m_uiCountFrameNum, m_iCspInternal, iValue );
+#endif//REC_FRAME_COUNT
+				m_pEncContext->pSvcParam->iTargetBitrate	= iValue;				
+
+		}
+		break;
+	case ENCODER_OPTION_RC_MODE:	// 0:quality mode;1:bit-rate mode
+		{
+			int32_t iValue = *((int32_t*)pOption);				
+			m_pEncContext->pSvcParam->iRCMode	= iValue;						
+		}
+		break;
+	case ENCODER_PADDING_PADDING:	// 0:disable padding;1:padding
+		{
+			int32_t iValue = *((int32_t*)pOption);				
+			m_pEncContext->pSvcParam->iPaddingFlag	= iValue;				
+		}
+		break;
+	case ENCODER_LTR_RECOVERY_REQUEST:
+		{
+			SLTRRecoverRequest* pLTR_Recover_Request = (SLTRRecoverRequest*)(pOption);
+			FilterLTRRecoveryRequest(m_pEncContext,pLTR_Recover_Request);
+		}
+		break;
+	case ENCODER_LTR_MARKING_FEEDBACK:
+		{
+			SLTRMarkingFeedback* fb = (SLTRMarkingFeedback*)(pOption);
+			FilterLTRMarkingFeedback(m_pEncContext,fb);
+		}
+		break;
+	case ENCOCER_LTR_MARKING_PERIOD:
+		{
+			uint32_t iValue = *((uint32_t*)(pOption));
+			m_pEncContext->pSvcParam->uiLtrMarkPeriod = iValue;
+		}
+		break;
+	case ENCODER_OPTION_LTR:
+		{		
+			uint32_t iValue = *((uint32_t*)(pOption));
+			m_pEncContext->pSvcParam->bEnableLongTermReference = iValue?true:false;
+			WelsLog(m_pEncContext,WELS_LOG_WARNING," CWelsH264SVCEncoder::SetOption enable LTR = %d",m_pEncContext->pSvcParam->bEnableLongTermReference);
+		}
+		break;
+	case ENCODER_OPTION_ENABLE_SSEI:
+		{
+			bool_t iValue = *((bool_t*)pOption);
+			m_pEncContext->pSvcParam->bEnableSSEI = iValue;
+			WelsLog( m_pEncContext, WELS_LOG_INFO, " CWelsH264SVCEncoder::SetOption enable SSEI = %d \n", m_pEncContext->pSvcParam->bEnableSSEI );
+		}
+		break;
+	case ENCODER_OPTION_ENABLE_PREFIX_NAL_ADDING:
+		{
+			bool_t iValue = *((bool_t*)pOption);
+			m_pEncContext->pSvcParam->bPrefixNalAddingCtrl = iValue;
+			WelsLog( m_pEncContext, WELS_LOG_INFO, " CWelsH264SVCEncoder::SetOption bPrefixNalAddingCtrl = %d \n", m_pEncContext->pSvcParam->bPrefixNalAddingCtrl );		
+		}
+		break;
+	case ENCODER_OPTION_ENABLE_SPS_PPS_ID_ADDITION:
+		{
+			bool_t iValue = *((bool_t*)pOption);
+			
+			m_pEncContext->pSvcParam->bEnableSpsPpsIdAddition = iValue;
+			WelsLog( m_pEncContext, WELS_LOG_INFO, " CWelsH264SVCEncoder::SetOption enable SPS/PPS ID = %d \n", m_pEncContext->pSvcParam->bEnableSpsPpsIdAddition );		
+		}
+		break;
+	case ENCODER_OPTION_CURRENT_PATH:
+		{
+			if (m_pEncContext->pSvcParam != NULL)
+			{
+				str_t * path = static_cast<str_t *>(pOption);
+				m_pEncContext->pSvcParam->pCurPath = path;				
+			}			
+		}
+		break;
+	default:		
+		return cmInitParaError;
+	}
+
+	return 0;
+}
+
+int CWelsH264SVCEncoder::GetOption(ENCODER_OPTION eOptionId, void* pOption)
+{
+	// Copies the current value of the option selected by eOptionId into the caller's buffer pOption.
+	// Returns 0 on success, cmInitParaError when pOption is NULL or the option id is not handled here,
+	// and cmInitExpected when there is no encoder context or m_bInitialFlag is still FALSE.
+	// The caller must pass a buffer of the type each case writes (int32_t, float or SWelsSvcCodingParam).
+	if ( pOption == NULL )
+	{
+		return cmInitParaError;
+	}
+	if ( m_pEncContext == NULL || m_bInitialFlag == FALSE )
+	{
+		return cmInitExpected;
+	}
+
+	switch ( eOptionId )
+	{
+	case ENCODER_OPTION_INTER_SPATIAL_PRED:	// only logged: pOption is left untouched and 0 is still returned
+		WelsLog( m_pEncContext, WELS_LOG_INFO, "ENCODER_OPTION_INTER_SPATIAL_PRED, this feature not supported at present.\n" );
+		break;
+
+	case ENCODER_OPTION_DATAFORMAT:	// internal color space, written as int32_t
+#ifdef REC_FRAME_COUNT
+		WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::GetOption():ENCODER_OPTION_DATAFORMAT, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", m_uiCountFrameNum, m_iCspInternal );
+#endif//REC_FRAME_COUNT
+		*static_cast<int32_t*>( pOption ) = m_iCspInternal;
+		break;
+
+	case ENCODER_OPTION_IDR_INTERVAL:	// uiIntraPeriod of the active parameters, written as int32_t
+#ifdef REC_FRAME_COUNT
+		WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::GetOption():ENCODER_OPTION_IDR_INTERVAL, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", m_uiCountFrameNum, m_iCspInternal );
+#endif//REC_FRAME_COUNT
+		*static_cast<int32_t*>( pOption ) = m_pEncContext->pSvcParam->uiIntraPeriod;
+		break;
+
+	case ENCODER_OPTION_SVC_ENCODE_PARAM:	// whole parameter struct: pOption must hold sizeof(SWelsSvcCodingParam) bytes
+#ifdef REC_FRAME_COUNT
+		WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::GetOption():ENCODER_OPTION_SVC_ENCODE_PARAM, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", m_uiCountFrameNum, m_iCspInternal );
+#endif//REC_FRAME_COUNT
+		memcpy( pOption, m_pEncContext->pSvcParam, sizeof(SWelsSvcCodingParam) );	// confirmed_safe_unsafe_usage
+		break;
+
+	case ENCODER_OPTION_FRAME_RATE:	// fMaxFrameRate, written as float
+#ifdef REC_FRAME_COUNT
+		WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::GetOption():ENCODER_OPTION_FRAME_RATE, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", m_uiCountFrameNum, m_iCspInternal );
+#endif//REC_FRAME_COUNT
+		*static_cast<float*>( pOption ) = m_pEncContext->pSvcParam->fMaxFrameRate;
+		break;
+
+	case ENCODER_OPTION_iBitRate:	// iTargetBitrate, written as int32_t
+#ifdef REC_FRAME_COUNT
+		WelsLog( m_pEncContext, WELS_LOG_INFO, "CWelsH264SVCEncoder::GetOption():ENCODER_OPTION_iBitRate, m_uiCountFrameNum= %d, m_iCspInternal= 0x%x\n", m_uiCountFrameNum, m_iCspInternal );
+#endif//REC_FRAME_COUNT
+		*static_cast<int32_t*>( pOption ) = m_pEncContext->pSvcParam->iTargetBitrate;
+		break;
+
+	default:	// every other option id is rejected by this method
+		return cmInitParaError;
+	}
+
+	return 0;
+}
+
+void CWelsH264SVCEncoder::DumpSrcPicture(const uint8_t *pSrc)
+{
+	// Debug helper: when DUMP_SRC_PICTURE is defined, appends the raw picture at pSrc to a dump file named after the picture width and m_iCspInternal; otherwise it does nothing.
+#ifdef DUMP_SRC_PICTURE
+	FILE *pFile = NULL;
+	str_t strFileName[256] = {0};
+	const int32_t iDataLength = m_iMaxPicWidth * m_iMaxPicHeight;	// pixels per picture (width * height)
+	// The dump location is hard-coded per compiler: under /tmp for GCC, a fixed Windows directory otherwise.
+#if defined(__GNUC__)
+	STRNCPY(strFileName, 256, "/tmp/pic_in_", STRNLEN("/tmp/pic_in_", 255));	// confirmed_safe_unsafe_usage
+#else
+	STRNCPY(strFileName, 256, "d:\\incoming\\mosaic_st\\pic_in_", STRNLEN("d:\\incoming\\mosaic_st\\pic_in_", 255));	// confirmed_safe_unsafe_usage
+#endif//__GNUC__
+	// Only the widths 640, 320 and 160 add a resolution tag; any other width leaves the name untagged.
+	if ( m_iMaxPicWidth == 640 )
+	{
+		STRCAT(strFileName, 256, "360p.");	// confirmed_safe_unsafe_usage
+	}
+	else if ( m_iMaxPicWidth == 320  )
+	{
+		STRCAT(strFileName, 256, "180p.");	// confirmed_safe_unsafe_usage
+	}
+	else if ( m_iMaxPicWidth == 160 )
+	{
+		STRCAT(strFileName, 256, "90p.");	// confirmed_safe_unsafe_usage
+	}
+	// Each color space appends its own extension, opens the file in append mode and writes one picture.
+	switch( m_iCspInternal) {
+		case videoFormatI420:
+		case videoFormatYV12:
+			STRCAT(strFileName, 256, "yuv");	// confirmed_safe_unsafe_usage
+#if defined(__GNUC__)
+			pFile = FOPEN(strFileName, "ab+");
+#else
+#if defined(_MSC_VER)
+#if _MSC_VER>=1500
+			FOPEN(&pFile, strFileName, "ab+");
+#else
+			pFile = FOPEN(strFileName, "ab+");
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+#endif//__GNUC__
+			// I420/YV12: (width * height * 3) / 2 bytes are written
+			if (NULL != pFile)
+			{
+				fwrite( pSrc, sizeof(uint8_t), (iDataLength * 3)>>1, pFile );
+				fflush( pFile );
+				fclose(pFile);
+			}
+			break;
+		case videoFormatRGB:	// width * height * 3 bytes are written
+			STRCAT(strFileName, 256, "rgb");	// confirmed_safe_unsafe_usage
+#if defined(__GNUC__)
+			pFile = FOPEN(strFileName, "ab+");
+#else
+#if defined(_MSC_VER)
+#if _MSC_VER>=1500
+			FOPEN(&pFile, strFileName, "ab+");
+#else
+			pFile = FOPEN(strFileName, "ab+");
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+#endif//__GNUC__
+			if ( NULL != pFile )
+			{
+				fwrite( pSrc, sizeof(uint8_t), iDataLength * 3, pFile );
+				fflush( pFile );
+				fclose( pFile );
+			}
+			break;	// previously missing: RGB fell through into BGR, appended "bgr" to the name and dumped the picture a second time
+		case videoFormatBGR:	// width * height * 3 bytes are written
+			STRCAT(strFileName, 256, "bgr");	// confirmed_safe_unsafe_usage
+#if defined(__GNUC__)
+			pFile = FOPEN(strFileName, "ab+");
+#else
+#if defined(_MSC_VER)
+#if _MSC_VER>=1500
+			FOPEN(&pFile, strFileName, "ab+");
+#else
+			pFile = FOPEN(strFileName, "ab+");
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+#endif//__GNUC__
+			if ( NULL != pFile )
+			{
+				fwrite( pSrc, sizeof(uint8_t), iDataLength * 3, pFile );
+				fflush( pFile );
+				fclose( pFile );
+			}
+			break;
+		case videoFormatYUY2:	// CALC_BI_STRIDE(width, 16) bytes per row, one row per picture line
+			STRCAT(strFileName, 256, "yuy2");	// confirmed_safe_unsafe_usage
+#if defined(__GNUC__)
+			pFile = FOPEN(strFileName, "ab+");
+#else
+#if defined(_MSC_VER)
+#if _MSC_VER>=1500
+			FOPEN(&pFile, strFileName, "ab+");
+#else
+			pFile = FOPEN(strFileName, "ab+");
+#endif//_MSC_VER>=1500
+#endif//_MSC_VER
+#endif//__GNUC__
+			if ( NULL != pFile )
+			{
+				fwrite( pSrc, sizeof(uint8_t), (CALC_BI_STRIDE(m_iMaxPicWidth,  16)) * m_iMaxPicHeight, pFile );
+				fflush( pFile );
+				fclose( pFile );
+			}
+			break;
+		default:	// any other color space is only logged, nothing is written
+			WelsLog( m_pEncContext, WELS_LOG_INFO, "Exclusive case, m_iCspInternal= 0x%x\n", m_iCspInternal);
+			break;
+	}
+#endif//DUMP_SRC_PICTURE
+}
+}
+
+using namespace WelsSVCEnc;
+
+int32_t CreateSVCEncoder(ISVCEncoder** ppEncoder)
+{
+	// Exported factory: stores a newly constructed CWelsH264SVCEncoder in *ppEncoder.
+	// Returns 0 when an encoder was stored, 1 for a NULL ppEncoder or a NULL result from new.
+	assert( ppEncoder );	// the explicit check below still applies when assertions are compiled out
+	if ( ppEncoder == NULL )
+	{
+		return 1;
+	}
+
+	ISVCEncoder* pCreated = new CWelsH264SVCEncoder();
+	*ppEncoder = pCreated;
+	return ( pCreated != NULL ) ? 0 : 1;
+}
+
+void DestroySVCEncoder(ISVCEncoder* pEncoder)
+{
+	// Counterpart of CreateSVCEncoder: deletes the encoder through its concrete type; a NULL pEncoder is ignored.
+	if ( NULL == pEncoder )
+	{
+		return;
+	}
+	delete (CWelsH264SVCEncoder*)pEncoder;
+}
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////
--- /dev/null
+++ b/codec/encoder/plus/src/wels_enc_export.def
@@ -1,0 +1,3 @@
+EXPORTS
+    CreateSVCEncoder
+    DestroySVCEncoder
\ No newline at end of file
--- /dev/null
+++ b/processing/build/linux/makefile
@@ -1,0 +1,94 @@
+NASM = 1
+NAME      = libwelsvp
+# Output locations: the library is linked into BINDIR and then copied to OUTDIR; no rule in this makefile references OBJDIR.
+OUTDIR    = ../../../bin/linux
+BINDIR    = ../../bin
+OBJDIR    = ../../obj  
+SRCDIRS   = ../../src/asm \
+            ../../src/common \
+            ../../src/adaptivequantization \
+            ../../src/backgounddetection \
+            ../../src/denoise \
+            ../../src/downsample \
+            ../../src/scenechangedetection \
+            ../../src/vaacalc \
+            ../../src/complexityanalysis 
+SRCDIRS  += ../../src/imagerotate
+
+# Shared library produced by the default goal "all".
+TARGETLIB =  $(BINDIR)/$(NAME).so
+# Tools: CC is used only for dependency generation and the DEP_OPT probe; GCC (32-bit via -m32) compiles and links; AS assembles.
+CC        = $(shell which gcc)
+AS        = $(shell which nasm)
+GCC       = gcc -m32
+# Flags: NASM = 1 adds -DX86_ASM to the C++ flags and, further down, the .asm extension to SRCEXTS.
+CPPFLAGS  = -Wall -g -O3
+ifeq ($(NASM), 1)
+CPPFLAGS += -DX86_ASM
+endif
+ASMFLAGS  = -f elf -DNOPREFIX  -I ../../src/asm/
+LDFLAGS   = -lstdc++ -ldl
+# Source discovery: every file in SRCDIRS whose extension is listed in SRCEXTS is built into a .o placed next to it.
+SRCEXTS  = .cpp
+ifeq ($(NASM), 1)
+SRCEXTS += .asm
+endif
+HDREXTS  = .h
+SOURCES  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
+HEADERS  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS))))
+SRC_CPP  = $(filter %.cpp,$(SOURCES))
+SRC_ASM  = $(filter %.asm,$(SOURCES))
+OBJS     = $(addsuffix .o, $(basename $(SOURCES)))
+DEPS     = $(OBJS:.o=.d)
+# Dependency generation: DEP_OPT is "-MM -MP" when the output of "$(CC) --version" contains GCC, otherwise "-M".
+DEP_OPT  = $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \
+                  echo "-MM -MP"; else echo "-M"; fi )
+DEPEND_cpp.d  = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS))
+DEPEND_asm.d  = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS))
+COMPILE.cpp   = $(GCC) $(CPPFLAGS) -c
+COMPILE.asm   = $(AS)  $(ASMFLAGS)
+LINK          = $(GCC) $(LDFLAGS)
+
+.PHONY: all objs tags ctags clean distclean
+# An empty .SUFFIXES clears the suffix list, which turns off make's built-in suffix rules.
+.SUFFIXES:
+
+all: $(TARGETLIB)
+# Dependency files: each .d starts with the source directory, followed by the dependency output of the compiler or assembler.
+%.d:%.cpp
+	@echo -n $(dir $<) > $@
+	@$(DEPEND_cpp.d) $< >> $@
+	
+%.d:%.asm
+	@echo -n $(dir $<) > $@
+	@$(DEPEND_asm.d) $< >> $@
+# Build the object files only, without linking.
+objs:$(OBJS)
+
+%.o:%.cpp
+	$(COMPILE.cpp) $< -o $@
+	
+%.o:%.asm
+	$(COMPILE.asm) $< -o $@	
+# Editor tag indexes over all headers and sources.
+tags: $(HEADERS) $(SOURCES)
+	etags $(HEADERS) $(SOURCES)
+
+ctags: $(HEADERS) $(SOURCES)
+	ctags $(HEADERS) $(SOURCES)
+# Link with the libraries from LDFLAGS after the objects: a linker running with --as-needed drops libraries named before the objects that use them.
+$(TARGETLIB):$(OBJS)
+	@if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi
+	$(GCC) $(OBJS) -shared -Wl,-Bsymbolic $(LDFLAGS) -o $@
+	@echo produce the lib to $(TARGETLIB).
+	@if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi
+	@cp -f $(TARGETLIB) $(OUTDIR)
+	@cp -f $(TARGETLIB) ../../../testbin
+	@echo copy the lib to $(OUTDIR).
+# clean removes the objects and the library in BINDIR; the copies in OUTDIR and ../../../testbin are left in place.
+clean:
+	rm -f $(OBJS) $(TARGETLIB)
+
+distclean: clean
+	rm -f $(DEPS) TAGS
+
--- /dev/null
+++ b/processing/build/win32/WelsVP_2008.sln
@@ -1,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP", "WelsVP_2008.vcproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
binary files /dev/null b/processing/build/win32/WelsVP_2008.suo differ
--- /dev/null
+++ b/processing/build/win32/WelsVP_2008.vcproj
@@ -1,0 +1,576 @@
+<?xml version="1.0" encoding="gb2312"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="WelsVP"
+	ProjectGUID="{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+	RootNamespace="WelsVP"
+	Keyword="Win32Proj"
+	TargetFrameworkVersion="196613"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+	</Platforms>
+	<ToolFiles>
+		<DefaultToolFile
+			FileName="masm.rules"
+		/>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory=".\..\..\..\bin\win32\Debug"
+			IntermediateDirectory=".\..\..\..\obj\vp\Debug"
+			ConfigurationType="2"
+			CharacterSet="1"
+			WholeProgramOptimization="0"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+				CommandLine=""
+			/>
+			<Tool
+				Name="MASM"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories=""
+				PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="1"
+				UsePrecompiledHeader="0"
+				AssemblerListingLocation=""
+				WarningLevel="3"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkLibraryDependencies="true"
+				OutputFile="$(OutDir)\welsvp.dll"
+				LinkIncremental="2"
+				ModuleDefinitionFile="../../src/common/WelsVP.def"
+				GenerateDebugInformation="true"
+				GenerateMapFile="true"
+				MapFileName="$(OutDir)\welsvp.map"
+				SubSystem="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+				CommandLine=""
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory=".\..\..\..\bin\win32\Release"
+			IntermediateDirectory=".\..\..\..\obj\vp\Release"
+			ConfigurationType="2"
+			CharacterSet="1"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+				CommandLine=""
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+				CommandLine=""
+			/>
+			<Tool
+				Name="MASM"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="3"
+				EnableIntrinsicFunctions="false"
+				FavorSizeOrSpeed="1"
+				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
+				RuntimeLibrary="0"
+				EnableFunctionLevelLinking="false"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="0"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				OutputFile="$(OutDir)\welsvp.dll"
+				LinkIncremental="1"
+				GenerateManifest="false"
+				EnableUAC="false"
+				ModuleDefinitionFile="../../src/common/WelsVP.def"
+				GenerateDebugInformation="false"
+				GenerateMapFile="false"
+				MapFileName=""
+				MapExports="false"
+				SubSystem="2"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+				CommandLine=""
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+			>
+			<File
+				RelativePath="..\..\src\common\cpu.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\memory.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\thread.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\util.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\WelsFrameWork.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\WelsFrameWorkEx.cpp"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Interface"
+			Filter="h;hpp;hxx;hm;inl;inc;xsd"
+			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+			>
+			<File
+				RelativePath="..\..\interface\IWelsVP.h"
+				>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCLCompilerTool"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\src\common\resource.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
+			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
+			>
+			<File
+				RelativePath="..\..\src\common\WelsVP.def"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\WelsVP.rc"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			>
+			<File
+				RelativePath="..\..\src\common\cpu.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\memory.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\thread.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\typedef.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\util.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\version.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\common\WelsFrameWork.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="ASM"
+			>
+			<File
+				RelativePath="..\..\src\asm\asm_inc.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\src\asm\cpuid.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\src\asm\denoisefilter.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm   -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\src\asm\downsample_bilinear.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\src\asm\intra_pred.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\src\asm\sad.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+			<File
+				RelativePath="..\..\src\asm\vaa.asm"
+				>
+				<FileConfiguration
+					Name="Debug|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+				<FileConfiguration
+					Name="Release|Win32"
+					>
+					<Tool
+						Name="VCCustomBuildTool"
+						CommandLine="nasm  -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						Outputs="$(IntDir)\$(InputName).obj"
+					/>
+				</FileConfiguration>
+			</File>
+		</Filter>
+		<Filter
+			Name="SceneChangeDetection"
+			>
+			<File
+				RelativePath="..\..\src\scenechangedetection\SceneChangeDetection.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\scenechangedetection\SceneChangeDetection.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Denoise"
+			>
+			<File
+				RelativePath="..\..\src\denoise\denoise.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\denoise\denoise.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\denoise\denoise_filter.cpp"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="VAACalc"
+			>
+			<File
+				RelativePath="..\..\src\vaacalc\vaacalcfuncs.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\vaacalc\vaacalculation.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\vaacalc\vaacalculation.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="BackgroundDetection"
+			>
+			<File
+				RelativePath="..\..\src\backgounddetection\BackgroundDetection.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\backgounddetection\BackgroundDetection.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="AdaptiveQuantization"
+			>
+			<File
+				RelativePath="..\..\src\adaptivequantization\AdaptiveQuantization.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\adaptivequantization\AdaptiveQuantization.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Downsample"
+			>
+			<File
+				RelativePath="..\..\src\downsample\downsample.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\downsample\downsample.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\downsample\downsamplefuncs.cpp"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="ComplexityAnalysis"
+			>
+			<File
+				RelativePath="..\..\src\complexityanalysis\ComplexityAnalysis.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\complexityanalysis\ComplexityAnalysis.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="ImageRotate"
+			>
+			<File
+				RelativePath="..\..\src\imagerotate\imagerotate.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\imagerotate\imagerotate.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\imagerotate\imagerotatefuncs.cpp"
+				>
+			</File>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- /dev/null
+++ b/processing/build/win32/WelsVP_2010.sln
@@ -1,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP_2010", "WelsVP_2010.vcxproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
binary files /dev/null b/processing/build/win32/WelsVP_2010.suo differ
--- /dev/null
+++ b/processing/build/win32/WelsVP_2010.vcxproj
@@ -1,0 +1,255 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}</ProjectGuid>
+    <RootNamespace>WelsVP</RootNamespace>
+    <Keyword>Win32Proj</Keyword>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>Unicode</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <CharacterSet>Unicode</CharacterSet>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <_ProjectFileVersion>10.0.40219.1</_ProjectFileVersion>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\bin\win32\Debug\</OutDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\obj\vp\Debug\</IntDir>
+    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+    <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\bin\win32\Release\</OutDir>
+    <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\obj\vp\Release\</IntDir>
+    <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+    <GenerateManifest Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</GenerateManifest>
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+    <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+    <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">welsvp</TargetName>
+    <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">welsvp</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <CustomBuildStep>
+      <Command>
+      </Command>
+    </CustomBuildStep>
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>true</MinimalRebuild>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <AssemblerListingLocation>
+      </AssemblerListingLocation>
+      <WarningLevel>Level3</WarningLevel>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+    </ClCompile>
+    <ProjectReference>
+      <LinkLibraryDependencies>true</LinkLibraryDependencies>
+    </ProjectReference>
+    <Link>
+      <OutputFile>$(OutDir)welsvp.dll</OutputFile>
+      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <GenerateMapFile>true</GenerateMapFile>
+      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+      <SubSystem>Windows</SubSystem>
+      <TargetMachine>MachineX86</TargetMachine>
+      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+    </Link>
+    <PostBuildEvent>
+      <Command>
+      </Command>
+    </PostBuildEvent>
+    <Bscmake>
+      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <PreBuildEvent>
+      <Command>
+      </Command>
+    </PreBuildEvent>
+    <CustomBuildStep>
+      <Command>
+      </Command>
+    </CustomBuildStep>
+    <ClCompile>
+      <Optimization>Full</Optimization>
+      <IntrinsicFunctions>false</IntrinsicFunctions>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <FunctionLevelLinking>false</FunctionLevelLinking>
+      <PrecompiledHeader>
+      </PrecompiledHeader>
+      <WarningLevel>Level3</WarningLevel>
+      <DebugInformationFormat>
+      </DebugInformationFormat>
+    </ClCompile>
+    <Link>
+      <OutputFile>$(OutDir)welsvp.dll</OutputFile>
+      <EnableUAC>false</EnableUAC>
+      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <GenerateMapFile>true</GenerateMapFile>
+      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+      <MapExports>true</MapExports>
+      <SubSystem>Windows</SubSystem>
+      <OptimizeReferences>true</OptimizeReferences>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <TargetMachine>MachineX86</TargetMachine>
+      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+    </Link>
+    <PostBuildEvent>
+      <Command>
+      </Command>
+    </PostBuildEvent>
+    <Bscmake>
+      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\src\common\cpu.cpp" />
+    <ClCompile Include="..\..\src\common\memory.cpp" />
+    <ClCompile Include="..\..\src\common\thread.cpp" />
+    <ClCompile Include="..\..\src\common\util.cpp" />
+    <ClCompile Include="..\..\src\common\WelsFrameWork.cpp" />
+    <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp" />
+    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp" />
+    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp" />
+    <ClCompile Include="..\..\src\denoise\denoise.cpp" />
+    <ClCompile Include="..\..\src\denoise\denoise_filter.cpp" />
+    <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp" />
+    <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp" />
+    <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp" />
+    <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp" />
+    <ClCompile Include="..\..\src\downsample\downsample.cpp" />
+    <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp" />
+    <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp" />
+    <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp" />
+    <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp" />
+  </ItemGroup>
+  <ItemGroup> <!-- Header files: declared as ClInclude so they are listed in the project but not passed to the C++ compile step. -->
+    <ClInclude Include="..\..\interface\IWelsVP.h" />
+    <ClInclude Include="..\..\src\common\resource.h" />
+    <ClInclude Include="..\..\src\common\cpu.h" />
+    <ClInclude Include="..\..\src\common\memory.h" />
+    <ClInclude Include="..\..\src\common\thread.h" />
+    <ClInclude Include="..\..\src\common\typedef.h" />
+    <ClInclude Include="..\..\src\common\util.h" />
+    <ClInclude Include="..\..\src\common\version.h" />
+    <ClInclude Include="..\..\src\common\WelsFrameWork.h" />
+    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h" />
+    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h" />
+    <ClInclude Include="..\..\src\denoise\denoise.h" />
+    <ClInclude Include="..\..\src\vaacalc\vaacalculation.h" />
+    <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h" />
+    <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h" />
+    <ClInclude Include="..\..\src\downsample\downsample.h" />
+    <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h" />
+    <ClInclude Include="..\..\src\imagerotate\imagerotate.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\src\common\WelsVP.def" />
+  </ItemGroup>
+  <ItemGroup>
+    <ResourceCompile Include="..\..\src\common\WelsVP.rc" />
+  </ItemGroup>
+  <ItemGroup> <!-- Assembly sources: each .asm file is built by the per-configuration nasm command below (win32 object format, PREFIX defined); the declared output is the matching .obj in $(IntDir). -->
+    <CustomBuild Include="..\..\src\asm\asm_inc.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\cpuid.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm   -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\intra_pred.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\sad.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\vaa.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
+  </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/processing/build/win32/WelsVP_2012.sln
@@ -1,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP_2012", "WelsVP_2012.vcxproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
binary files /dev/null b/processing/build/win32/WelsVP_2012.v11.suo differ
--- /dev/null
+++ b/processing/build/win32/WelsVP_2012.vcxproj
@@ -1,0 +1,248 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}</ProjectGuid>
+    <RootNamespace>WelsVP</RootNamespace>
+    <Keyword>Win32Proj</Keyword>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>DynamicLibrary</ConfigurationType>
+    <PlatformToolset>v110</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.props" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <_ProjectFileVersion>11.0.61030.0</_ProjectFileVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <OutDir>.\..\..\..\bin\win32\Debug\</OutDir>
+    <IntDir>.\..\..\..\obj\vp\Debug\</IntDir>
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>welsvp</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <OutDir>.\..\..\..\bin\win32\Release\</OutDir>
+    <IntDir>.\..\..\..\obj\vp\Release\</IntDir>
+    <LinkIncremental>false</LinkIncremental>
+    <GenerateManifest>false</GenerateManifest>
+    <TargetName>welsvp</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <CustomBuildStep>
+      <Command />
+    </CustomBuildStep>
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <MinimalRebuild>true</MinimalRebuild>
+      <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+      <PrecompiledHeader />
+      <AssemblerListingLocation />
+      <WarningLevel>Level3</WarningLevel>
+      <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+    </ClCompile>
+    <ProjectReference>
+      <LinkLibraryDependencies>true</LinkLibraryDependencies>
+    </ProjectReference>
+    <Link>
+      <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
+      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <GenerateMapFile>true</GenerateMapFile>
+      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+      <SubSystem>Windows</SubSystem>
+      <TargetMachine>MachineX86</TargetMachine>
+      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+    </Link>
+    <PostBuildEvent>
+      <Command>
+      </Command>
+    </PostBuildEvent>
+    <Bscmake>
+      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <PreBuildEvent>
+      <Command>
+      </Command>
+    </PreBuildEvent>
+    <CustomBuildStep>
+      <Command />
+    </CustomBuildStep>
+    <ClCompile>
+      <Optimization>Full</Optimization>
+      <IntrinsicFunctions>false</IntrinsicFunctions>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+      <FunctionLevelLinking>false</FunctionLevelLinking>
+      <PrecompiledHeader />
+      <WarningLevel>Level3</WarningLevel>
+      <DebugInformationFormat />
+    </ClCompile>
+    <Link>
+      <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
+      <EnableUAC>false</EnableUAC>
+      <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <GenerateMapFile>true</GenerateMapFile>
+      <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+      <MapExports>true</MapExports>
+      <SubSystem>Windows</SubSystem>
+      <OptimizeReferences>true</OptimizeReferences>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <TargetMachine>MachineX86</TargetMachine>
+      <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+      <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+      <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+    </Link>
+    <PostBuildEvent>
+      <Command>
+      </Command>
+    </PostBuildEvent>
+    <Bscmake>
+      <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+    </Bscmake>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <ClCompile Include="..\..\src\common\cpu.cpp" />
+    <ClCompile Include="..\..\src\common\memory.cpp" />
+    <ClCompile Include="..\..\src\common\thread.cpp" />
+    <ClCompile Include="..\..\src\common\util.cpp" />
+    <ClCompile Include="..\..\src\common\WelsFrameWork.cpp" />
+    <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp" />
+    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp" />
+    <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp" />
+    <ClCompile Include="..\..\src\denoise\denoise.cpp" />
+    <ClCompile Include="..\..\src\denoise\denoise_filter.cpp" />
+    <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp" />
+    <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp" />
+    <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp" />
+    <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp" />
+    <ClCompile Include="..\..\src\downsample\downsample.cpp" />
+    <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp" />
+    <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp" />
+    <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp" />
+    <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp" />
+  </ItemGroup>
+  <ItemGroup> <!-- Header files: declared as ClInclude so they are listed in the project but not passed to the C++ compile step. -->
+    <ClInclude Include="..\..\interface\IWelsVP.h" />
+    <ClInclude Include="..\..\src\common\resource.h" />
+    <ClInclude Include="..\..\src\common\cpu.h" />
+    <ClInclude Include="..\..\src\common\memory.h" />
+    <ClInclude Include="..\..\src\common\thread.h" />
+    <ClInclude Include="..\..\src\common\typedef.h" />
+    <ClInclude Include="..\..\src\common\util.h" />
+    <ClInclude Include="..\..\src\common\version.h" />
+    <ClInclude Include="..\..\src\common\WelsFrameWork.h" />
+    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h" />
+    <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h" />
+    <ClInclude Include="..\..\src\denoise\denoise.h" />
+    <ClInclude Include="..\..\src\vaacalc\vaacalculation.h" />
+    <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h" />
+    <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h" />
+    <ClInclude Include="..\..\src\downsample\downsample.h" />
+    <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h" />
+    <ClInclude Include="..\..\src\imagerotate\imagerotate.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\..\src\common\WelsVP.def" />
+  </ItemGroup>
+  <ItemGroup>
+    <ResourceCompile Include="..\..\src\common\WelsVP.rc" />
+  </ItemGroup>
+  <ItemGroup> <!-- Assembly sources: each .asm file is built by the per-configuration nasm command below (win32 object format, PREFIX defined); the declared output is the matching .obj in $(IntDir). -->
+    <CustomBuild Include="..\..\src\asm\asm_inc.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\cpuid.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm   -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\intra_pred.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\sad.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="..\..\src\asm\vaa.asm">
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm  -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+    </CustomBuild>
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
+  </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/processing/build/win32/WelsVideoProcessor.sln
@@ -1,0 +1,29 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVideoProcessor", "WelsVideoProcessor.vcproj", "{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}"
+	ProjectSection(ProjectDependencies) = postProject
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562} = {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP", "WelsVP.vcproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release|Win32 = Release|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Debug|Win32.ActiveCfg = Debug|Win32
+		{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Debug|Win32.Build.0 = Debug|Win32
+		{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Release|Win32.ActiveCfg = Release|Win32
+		{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Release|Win32.Build.0 = Release|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
+		{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- /dev/null
+++ b/processing/build/win32/WelsVideoProcessor.vcproj
@@ -1,0 +1,213 @@
+<?xml version="1.0" encoding="gb2312"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="WelsVideoProcessor"
+	ProjectGUID="{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}"
+	RootNamespace="WelsVideoProcessor"
+	Keyword="Win32Proj"
+	TargetFrameworkVersion="196613"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="$(SolutionDir)..\..\bin\$(ConfigurationName)"
+			IntermediateDirectory="$(SolutionDir)..\..\obj\$(ConfigurationName)\$(ProjectName)"
+			ConfigurationType="1"
+			CharacterSet="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="1"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkIncremental="2"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="$(SolutionDir)..\..\bin\$(ConfigurationName)"
+			IntermediateDirectory="$(SolutionDir)..\..\obj\$(ConfigurationName)\$(ProjectName)"
+			ConfigurationType="1"
+			CharacterSet="1"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				EnableIntrinsicFunctions="true"
+				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
+				RuntimeLibrary="0"
+				EnableFunctionLevelLinking="true"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkIncremental="1"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+			>
+			<File
+				RelativePath="..\..\src\testbed\stdafx.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\testbed\wels_process.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\testbed\WelsVideoProcessor.cpp"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl;inc;xsd"
+			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+			>
+			<File
+				RelativePath="..\..\src\testbed\stdafx.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\testbed\targetver.h"
+				>
+			</File>
+			<File
+				RelativePath="..\..\src\testbed\wels_process.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Resource Files"
+			Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
+			UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
+			>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- /dev/null
+++ b/processing/interface/IWelsVP.h
@@ -1,0 +1,304 @@
+/*!
+ * \copy
+ *     Copyright (c)  2004-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	    :  IWelsVP.h
+ *
+ * \brief	    :  Interface of wels video processor class
+ *
+ * \date        :  2011/01/04
+ *
+ * \description :  1. should support both C and C++ style interfaces
+ *                 2. should take feature-extension requirements into account
+ *                 3. should be careful about the usage of "char":
+ *                     1) char holding a value  : signed char/unsigned char
+ *                     2) char holding a string : char
+ *
+ *************************************************************************************
+ */
+
+#ifndef _IWELSVP_H_
+#define _IWELSVP_H_ 
+
+#ifdef _WIN32
+#define WELSAPI __stdcall	/* calling convention applied to the functions declared with WELSAPI in this header */
+#else
+#define WELSAPI 	/* expands to nothing when _WIN32 is not defined */
+#endif
+
+#define WELSVP_MAJOR_VERSION   1
+#define WELSVP_MINOR_VERSION   1
+#define WELSVP_VERSION         ((WELSVP_MAJOR_VERSION << 8) + WELSVP_MINOR_VERSION)	/* major shifted above the low 8 bits, minor added in the low bits */
+
+typedef enum /* status code returned by the IWelsVP / IWelsVPc functions and by Create/DestroyVpInterface */
+{
+	RET_SUCCESS          =  0,	/* the only non-negative code */
+	RET_FAILED           = -1,
+	RET_INVALIDPARAM     = -2,
+	RET_OUTOFMEMORY      = -3,
+	RET_NOTSUPPORTED       = -4,
+	RET_UNEXPECTED       = -5,
+	RET_NEEDREINIT		  = -6
+} EResult;
+
+typedef enum /* pixel format of an SPixMap; several enumerators below are aliases that share one value */
+{ 
+	VIDEO_FORMAT_NULL       = 0,   /* invalid format   */
+	/*rgb color formats*/
+	VIDEO_FORMAT_RGB        = 1,   /* rgb 24bits       */
+	VIDEO_FORMAT_RGBA       = 2,   /* rgba             */
+	VIDEO_FORMAT_RGB555     = 3,   /* rgb555           */
+	VIDEO_FORMAT_RGB565     = 4,   /* rgb565           */
+	VIDEO_FORMAT_BGR        = 5,   /* bgr 24bits       */
+	VIDEO_FORMAT_BGRA       = 6,   /* bgr 32bits       */
+	VIDEO_FORMAT_ABGR       = 7,   /* abgr             */
+	VIDEO_FORMAT_ARGB       = 8,   /* argb             */
+
+	/*yuv color formats*/
+	VIDEO_FORMAT_YUY2       = 20,   /* yuy2             */
+	VIDEO_FORMAT_YVYU       = 21,   /* yvyu             */
+	VIDEO_FORMAT_UYVY       = 22,   /* uyvy             */
+	VIDEO_FORMAT_I420       = 23,   /* yuv 4:2:0 planar */              
+	VIDEO_FORMAT_YV12       = 24,   /* yuv 4:2:0 planar */
+	VIDEO_FORMAT_INTERNAL   = 25,   /* Only used for SVC decoder testbed */ 
+	VIDEO_FORMAT_NV12		= 26,	/* y planar + uv packed */
+	VIDEO_FORMAT_I422       = 27,   /* yuv 4:2:2 planar */
+	VIDEO_FORMAT_I444       = 28,   /* yuv 4:4:4 planar */
+	VIDEO_FORMAT_YUYV       = 20,   /* yuv 4:2:2 packed; same value as VIDEO_FORMAT_YUY2 */
+
+	VIDEO_FORMAT_RGB24      = 1,	/* same value as VIDEO_FORMAT_RGB */
+	VIDEO_FORMAT_RGB32      = 2,	/* same value as VIDEO_FORMAT_RGBA */
+	VIDEO_FORMAT_RGB24_INV  = 5,	/* same value as VIDEO_FORMAT_BGR */
+	VIDEO_FORMAT_RGB32_INV  = 6,	/* same value as VIDEO_FORMAT_BGRA */
+	VIDEO_FORMAT_RGB555_INV = 7,	/* same value as VIDEO_FORMAT_ABGR */
+	VIDEO_FORMAT_RGB565_INV = 8,	/* same value as VIDEO_FORMAT_ARGB */
+	VIDEO_FORMAT_YUV2       = 21,	/* same value as VIDEO_FORMAT_YVYU */
+	VIDEO_FORMAT_420        = 23,	/* same value as VIDEO_FORMAT_I420 */
+
+	VIDEO_FORMAT_VFlip      = 0x80000000 /* bit 31 only; NOTE(review): not representable as int, which C before C23 requires of enumerators - confirm the C compilers in use accept it; presumably a flag OR-ed onto a format - confirm */
+} EVideoFormat;
+
+typedef enum /* value type of SPixMap::eProperty */
+{ 
+	BUFFER_HOSTMEM  = 0,
+	BUFFER_SURFACE	/* implicit value 1 */
+} EPixMapBufferProperty;
+
+typedef struct	/* rectangle given as top/left position plus width/height; used as SPixMap::sRect */
+{
+  int iRectTop;
+  int iRectLeft;
+  int iRectWidth;	/* the adaptive quantization code derives the 16x16 macroblock column count as iRectWidth >> 4 */
+  int iRectHeight;	/* likewise the macroblock row count is iRectHeight >> 4 */
+} SRect;
+
+typedef struct	/* picture descriptor handed to Process() as source and destination/reference */
+{
+	void        *pPixel[3]; 	/* plane base addresses; the adaptive quantization code reads [0] as 8-bit luma samples */
+	int          iSizeInBits;	/* NOTE(review): presumably the sample size in bits - confirm */
+	int          iStride[3];	/* per-plane row pitch; added to a uint8_t pointer to advance rows */
+	SRect        sRect;	
+	EVideoFormat eFormat;
+	EPixMapBufferProperty eProperty;// unused? candidate for removal, but that would change sizeof(SPixMap)
+} SPixMap;
+
+typedef enum	/* processing method identifiers; values are implicit, 0..10 in declaration order */
+{	
+	METHOD_NULL              = 0,
+	METHOD_COLORSPACE_CONVERT    ,// not supported yet
+	METHOD_DENOISE              ,
+	METHOD_SCENE_CHANGE_DETECTION ,
+	METHOD_DOWNSAMPLE			  ,
+	METHOD_VAA_STATISTICS        ,
+    METHOD_BACKGROUND_DETECTION  ,
+	METHOD_ADAPTIVE_QUANT ,	// stored in m_eMethod by the CAdaptiveQuantization constructor
+	METHOD_COMPLEXITY_ANALYSIS   ,
+	METHOD_IMAGE_ROTATE		  ,
+	METHOD_MASK                 
+} EMethods;
+
+//-----------------------------------------------------------------//
+//  Algorithm parameter definitions
+//-----------------------------------------------------------------//
+
+typedef struct	// NOTE(review): presumably the result exchanged with METHOD_SCENE_CHANGE_DETECTION - confirm
+{
+	int bSceneChangeFlag; // 0:false ; 1:true
+} SSceneChangeResult;
+
+typedef enum	// implicit values 0, 1, 2
+{
+	SIMILAR_SCENE,      //similar scene 
+	MEDIUM_CHANGED_SCENE,   //medium changed scene
+	LARGE_CHANGED_SCENE,   //large changed scene; NOTE(review): the trailing comma is not accepted by strict C89/C++03 compilers
+} ESceneChangeIdc;
+
+typedef struct	// frame statistics referenced by the parameter structs below; the adaptive quantization code indexes the arrays by macroblock
+{
+	unsigned char *pCurY;					// Y data of current frame
+	unsigned char *pRefY;					// Y data of the reference frame used for the difference calculation
+	int (*pSad8x8)[4];				// sad of 8x8 blocks; the four blocks of one 16x16 are stored together
+	int *pSsd16x16;					// sum of squared differences of each 16x16
+	int *pSum16x16;					// sum of each 16x16
+	int *pSumOfSquare16x16;					// sum of squares of each 16x16
+	int	(*pSumOfDiff8x8)[4];	// NOTE(review): presumably the sum of differences per 8x8 block - confirm
+	unsigned char	(*pMad8x8)[4];	// NOTE(review): presumably the maximum absolute difference per 8x8 block - confirm
+	int iFrameSad;					// sad of frame
+} SVAACalcResult;
+
+typedef struct	// NOTE(review): presumably the parameter block of METHOD_VAA_STATISTICS - confirm
+{
+	int iCalcVar;	// NOTE(review): the three iCalc* fields look like enable switches for variance / background detection / SSD - confirm
+	int iCalcBgd;
+	int iCalcSsd;
+	int iReserved;
+	SVAACalcResult	*pCalcResult;	// statistics buffers (see SVAACalcResult)
+} SVAACalcParam;
+
+typedef struct	// NOTE(review): presumably the parameter block of METHOD_BACKGROUND_DETECTION - confirm
+{
+	signed char		*pBackgroundMbFlag;	// NOTE(review): presumably one flag per macroblock - confirm
+	SVAACalcResult  *pCalcRes;	// statistics buffers (see SVAACalcResult)
+} SBGDInterface;
+
+typedef enum	// values compared against SAdaptiveQuantizationParam::iAdaptiveQuantMode
+{
+	AQ_QUALITY_MODE,   //Quality mode (0): texture average scaled by 1.0; the motion term is added only when negative
+	AQ_BITRATE_MODE,   //Bitrate mode (1): texture average scaled by 0.875; the motion term is always added
+}EAQModes;
+
+typedef struct /* per-macroblock pair of variance figures filled by the adaptive quantization code */
+{
+	unsigned short    uiMotionIndex;	/* (sum of squared |ref - src|) >> 8 minus the square of ((sum of |ref - src|) >> 8) */
+	unsigned short    uiTextureIndex;	/* (sum of squared source samples) >> 8 minus the square of ((sum of source samples) >> 8) */
+} SMotionTextureUnit;
+
+typedef struct	// parameters copied by CAdaptiveQuantization::Set(); the pointed-to buffers stay owned by the caller
+{
+	int					iAdaptiveQuantMode; // 0:quality mode, 1:bitrates mode
+	SVAACalcResult		*pCalcResult;	// cached statistics, used by Process() when its pRefY/pCurY equal the luma planes passed in
+	SMotionTextureUnit  *pMotionTextureUnit;	// written by Process(), one entry per macroblock
+
+	signed char			*pMotionTextureIndexToDeltaQp;	// written by Process(): delta QP at index row * macroblocks-per-row + column
+	double				dAverMotionTextureIndexToDeltaQp;	// written by Process(): mean of the delta QPs; the only field Get() copies out
+} SAdaptiveQuantizationParam;
+
+typedef enum /* NOTE(review): presumably the values of SComplexityAnalysisParam::iComplexityAnalysisMode - confirm */
+{
+	FRAME_SAD     =  0,
+	GOM_SAD       = -1,
+	GOM_VAR       = -2
+} EComplexityAnalysisMode;
+
+typedef struct	// NOTE(review): presumably the parameter block of METHOD_COMPLEXITY_ANALYSIS - confirm
+{
+	int  iComplexityAnalysisMode;	// NOTE(review): presumably an EComplexityAnalysisMode value - confirm
+	int  iCalcBgd;
+	int  iMbNumInGom;		
+	int  iFrameComplexity;
+	int  *pGomComplexity;
+	int  *pGomForegroundBlockNum;
+	signed char  *pBackgroundMbFlag;
+	unsigned int *uiRefMbType;	// note: a pointer despite the "ui" prefix
+	SVAACalcResult  *pCalcResult;	// statistics buffers (see SVAACalcResult)
+} SComplexityAnalysisParam;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef struct /* C-style interface: a context pointer plus function pointers that each take a context as first parameter */
+{
+	void    *pCtx;	/* context pointer; same type and name as the first parameter of every function below */
+	EResult (*Init)    (void *pCtx, int iType, void *pCfg);	/* NOTE(review): iType presumably carries an EMethods value - confirm */
+	EResult (*Uninit)  (void *pCtx, int iType);
+	EResult (*Flush)   (void *pCtx, int iType);
+	EResult (*Process) (void *pCtx, int iType, SPixMap *pSrc, SPixMap *dst); 
+	EResult (*Get)     (void *pCtx, int iType, void *pParam); 
+	EResult (*Set)     (void *pCtx, int iType, void *pParam); 
+	EResult (*SpecialFeature) (void *pCtx, int iType, void *pIn, void *pOut);
+} IWelsVPc;
+
+#if defined(__cplusplus) && !defined(CINTERFACE)  /* C++ style interface */
+
+class IWelsVP	// abstract C++ interface; same operations as the IWelsVPc table, without the explicit context argument
+{
+public:
+	virtual ~IWelsVP() {}	// virtual so that implementations can be destroyed through an IWelsVP pointer
+
+public:		
+	virtual EResult Init    (int iType, void *pCfg) = 0; 	// NOTE(review): iType presumably carries an EMethods value - confirm
+	virtual EResult Uninit  (int iType) = 0;
+	virtual EResult Flush   (int iType) = 0;
+	virtual EResult Process (int iType, SPixMap *pSrc, SPixMap *dst) = 0; 
+	virtual EResult Get     (int iType, void *pParam) = 0; 
+	virtual EResult Set     (int iType, void *pParam) = 0; 
+	virtual EResult SpecialFeature (int iType, void *pIn, void *pOut) = 0;
+};
+
+/* Recommended: invoke the interface via these macros for convenience (same macro names exist for the C style interface) */
+#define IWelsVPFunc_Init(p, a, b)                  (p)->Init(a, b)              
+#define IWelsVPFunc_Uninit(p, a)                   (p)->Uninit(a)               
+#define IWelsVPFunc_Flush(p, a)                    (p)->Flush(a)                
+#define IWelsVPFunc_Process(p, a, b, c)            (p)->Process(a, b, c)        
+#define IWelsVPFunc_Get(p, a, b)                   (p)->Get(a, b)               
+#define IWelsVPFunc_Set(p, a, b)                   (p)->Set(a, b)               
+#define IWelsVPFunc_SpecialFeature(p, a, b, c)     (p)->SpecialFeature(a, b, c)
+
+/* C++ interface version: 0x8000 added to the low 15 bits of WELSVP_VERSION */
+#define WELSVP_INTERFACE_VERION                    (0x8000 + (WELSVP_VERSION & 0x7fff)) 
+#define WELSVP_EXTERNC_BEGIN                       extern "C" {
+#define WELSVP_EXTERNC_END                         }
+
+#else    /* C style interface */
+
+/* Recommended: invoke the C interface via these macros; p is an IWelsVPc* and its own pCtx member is forwarded as the context argument */
+#define IWelsVPFunc_Init(p, a, b)                  (p)->Init((p)->pCtx, a, b)
+#define IWelsVPFunc_Uninit(p, a)                   (p)->Uninit((p)->pCtx, a)
+#define IWelsVPFunc_Flush(p, a)                    (p)->Flush((p)->pCtx, a)
+#define IWelsVPFunc_Process(p, a, b, c)            (p)->Process((p)->pCtx, a, b, c)
+#define IWelsVPFunc_Get(p, a, b)                   (p)->Get((p)->pCtx, a, b)
+#define IWelsVPFunc_Set(p, a, b)                   (p)->Set((p)->pCtx, a, b)
+#define IWelsVPFunc_SpecialFeature(p, a, b, c)     (p)->SpecialFeature((p)->pCtx, a, b, c)
+
+/* C interface version: 0x0001 added to the low 15 bits of WELSVP_VERSION (the C++ branch adds 0x8000 instead) */
+#define WELSVP_INTERFACE_VERION                    (0x0001 + (WELSVP_VERSION & 0x7fff)) 
+#define WELSVP_EXTERNC_BEGIN                      /* empty in this branch; NOTE(review): also empty for C++ built with CINTERFACE, so no extern "C" is applied there - confirm intended */
+#define WELSVP_EXTERNC_END                       
+
+#endif
+
+WELSVP_EXTERNC_BEGIN
+EResult WELSAPI CreateVpInterface   (void **ppCtx, int iVersion /*= WELSVP_INTERFACE_VERION*/);	/* NOTE(review): *ppCtx presumably receives an IWelsVP* or IWelsVPc* depending on iVersion - confirm */
+EResult WELSAPI DestroyVpInterface  (void *pCtx , int iVersion /*= WELSVP_INTERFACE_VERION*/);	/* NOTE(review): presumably expects the pointer and version used at creation - confirm */
+WELSVP_EXTERNC_END
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+#endif // _IWELSVP_H_
+
+
--- /dev/null
+++ b/processing/src/adaptivequantization/AdaptiveQuantization.cpp
@@ -1,0 +1,281 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include "AdaptiveQuantization.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+
+#define AVERAGE_TIME_MOTION                   (0.3) // factor applied to the mean motion index; alternative noted: 0.3046875 = 1/4 + 1/16 - 1/128 ~ 0.3
+#define AVERAGE_TIME_TEXTURE_QUALITYMODE  (1.0) // factor applied to the mean texture index in AQ_QUALITY_MODE; alternative noted: 0.5 = 1/2
+#define AVERAGE_TIME_TEXTURE_BITRATEMODE  (0.875) // factor applied to the mean texture index otherwise; alternative noted: 0.5 = 1/2
+#define MODEL_ALPHA                           (0.9910) // alpha in dQStep = (a - 1) / (a + alpha); alternatives noted: 1.5, 1.1102
+#define MODEL_TIME                            (5.8185) // delta QP = MODEL_TIME * dQStep; alternatives noted: 9.0, 5.9842
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CAdaptiveQuantization::CAdaptiveQuantization(int32_t iCpuFlag)
+	: m_pfVar(NULL), m_CPUFlag(iCpuFlag)
+{
+	// tag the strategy, start from all-zero parameters, then bind the variance kernel for these CPU flags
+	m_eMethod = METHOD_ADAPTIVE_QUANT;
+	WelsMemset( &m_sAdaptiveQuantParam, 0, sizeof(m_sAdaptiveQuantParam) );
+	WelsInitVarFunc(m_pfVar, m_CPUFlag);
+}
+
+CAdaptiveQuantization::~CAdaptiveQuantization()	// empty: the methods in this file allocate nothing; buffers reached through m_sAdaptiveQuantParam come from the caller of Set()
+{	
+}
+
+EResult CAdaptiveQuantization::Process(int32_t iType, SPixMap *pSrcPixMap, SPixMap *pRefPixMap)
+{
+	// Derives one delta QP per 16x16 luma macroblock (MB) from two statistics:
+	//   motion index  = variance of the |ref - src| residual of the MB,
+	//   texture index = variance of the source samples of the MB.
+	// Buffers come from the parameters installed by Set():
+	//   pMotionTextureUnit           - one entry per MB, overwritten here,
+	//   pMotionTextureIndexToDeltaQp - one delta QP per MB in raster order, overwritten here,
+	//   pCalcResult                  - cached statistics, reused when they match this frame pair.
+	// iType is not used. Returns RET_INVALIDPARAM when a required pointer is NULL or the
+	// picture is smaller than one MB (previously this led to a division by zero); otherwise
+	// RET_SUCCESS, with dAverMotionTextureIndexToDeltaQp updated for Get().
+	if ( pSrcPixMap == NULL || pRefPixMap == NULL )
+	{
+		return RET_INVALIDPARAM;
+	}
+	if ( m_sAdaptiveQuantParam.pMotionTextureUnit == NULL || m_sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp == NULL )
+	{
+		return RET_INVALIDPARAM;
+	}
+
+	const int32_t iMbWidth    = pSrcPixMap->sRect.iRectWidth  >> 4;
+	const int32_t iMbHeight   = pSrcPixMap->sRect.iRectHeight >> 4;
+	const int32_t iMbTotalNum = iMbWidth * iMbHeight;
+	if ( iMbWidth <= 0 || iMbHeight <= 0 )
+	{
+		// no complete MB: the averages below would be 0/0 and the NaN would be cast to int8_t
+		return RET_INVALIDPARAM;
+	}
+
+	uint8_t *pRefFrameY = (uint8_t *)pRefPixMap->pPixel[0];
+	uint8_t *pCurFrameY = (uint8_t *)pSrcPixMap->pPixel[0];
+	const int32_t iRefStride = pRefPixMap->iStride[0];
+	const int32_t iCurStride = pSrcPixMap->iStride[0];
+
+	SMotionTextureUnit *pMotionTexture  = m_sAdaptiveQuantParam.pMotionTextureUnit;
+	SVAACalcResult     *pVaaCalcResults = m_sAdaptiveQuantParam.pCalcResult;
+	double_t dAverageMotionIndex  = 0.0;
+	double_t dAverageTextureIndex = 0.0;
+	int32_t i = 0, j = 0;
+
+	/////////////////////////////////////// motion //////////////////////////////////
+	if ( pVaaCalcResults != NULL && pVaaCalcResults->pRefY == pRefFrameY && pVaaCalcResults->pCurY == pCurFrameY )
+	{
+		// The cached statistics were computed for exactly this frame pair: derive both
+		// indices from the stored per-MB sums without reading the pixels again.
+		int32_t iMbIndex = 0;
+		for ( j = 0; j < iMbHeight; j ++ )
+		{
+			for ( i = 0; i < iMbWidth; i++ )
+			{
+				int32_t iSumDiff = pVaaCalcResults->pSad8x8[iMbIndex][0];
+				iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][1];
+				iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][2];
+				iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][3];
+				const int32_t iSQDiff = pVaaCalcResults->pSsd16x16[iMbIndex];
+				int32_t iSum          = pVaaCalcResults->pSum16x16[iMbIndex];
+				const int32_t iSQSum  = pVaaCalcResults->pSumOfSquare16x16[iMbIndex];
+
+				// mean of squares minus square of mean; ">> 8" divides by the 256 samples of an MB
+				iSumDiff = iSumDiff>>8;
+				pMotionTexture->uiMotionIndex = (iSQDiff>>8) - (iSumDiff * iSumDiff);
+				iSum = iSum>>8;
+				pMotionTexture->uiTextureIndex = (iSQSum>>8) - (iSum * iSum);
+
+				dAverageMotionIndex += pMotionTexture->uiMotionIndex;
+				dAverageTextureIndex += pMotionTexture->uiTextureIndex;
+				pMotionTexture++;
+				++iMbIndex;
+			}
+		}
+	}
+	else
+	{
+		// No matching cached statistics: measure every MB from the luma planes.
+		if ( pRefFrameY == NULL || pCurFrameY == NULL )
+		{
+			return RET_INVALIDPARAM;
+		}
+		for ( j = 0; j < iMbHeight; j ++ )
+		{
+			uint8_t *pRefFrameTmp = pRefFrameY;
+			uint8_t *pCurFrameTmp = pCurFrameY;
+			for ( i = 0; i < iMbWidth; i++ )
+			{
+				m_pfVar( pRefFrameTmp, iRefStride, pCurFrameTmp, iCurStride, pMotionTexture);
+				dAverageMotionIndex += pMotionTexture->uiMotionIndex;
+				dAverageTextureIndex += pMotionTexture->uiTextureIndex;
+				pMotionTexture++;
+				pRefFrameTmp += MB_WIDTH_LUMA;
+				pCurFrameTmp += MB_WIDTH_LUMA;
+			}
+			pRefFrameY += (iRefStride)<<4;
+			pCurFrameY += (iCurStride)<<4;
+		}
+	}
+
+	dAverageMotionIndex = dAverageMotionIndex / iMbTotalNum;
+	dAverageTextureIndex = dAverageTextureIndex / iMbTotalNum;
+	// an average within PESN of zero is replaced by 1.0 so the ratios below stay finite
+	if ( (dAverageMotionIndex <= PESN) && (dAverageMotionIndex >= -PESN) )
+	{
+		dAverageMotionIndex = 1.0;
+	}
+	if ( (dAverageTextureIndex <= PESN) && (dAverageTextureIndex >= -PESN) )
+	{
+		dAverageTextureIndex = 1.0;
+	}
+
+	//  motion mb residual map to QP
+	//  texture mb original map to QP
+	dAverageMotionIndex = AVERAGE_TIME_MOTION * dAverageMotionIndex;
+	if ( m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_QUALITY_MODE )
+	{
+		dAverageTextureIndex = AVERAGE_TIME_TEXTURE_QUALITYMODE * dAverageTextureIndex;
+	}
+	else
+	{
+		dAverageTextureIndex = AVERAGE_TIME_TEXTURE_BITRATEMODE * dAverageTextureIndex;
+	}
+
+	int32_t iAverMotionTextureIndexToDeltaQp = 0;
+	pMotionTexture = m_sAdaptiveQuantParam.pMotionTextureUnit;
+	for ( j = 0; j < iMbHeight; j ++ )
+	{
+		for ( i = 0; i < iMbWidth; i++ )
+		{
+			double_t a = pMotionTexture->uiTextureIndex / dAverageTextureIndex;
+			double_t dQStep = (a - 1) / (a + MODEL_ALPHA);
+			const double_t dLumaTextureDeltaQp = MODEL_TIME * dQStep;// range +- 6
+			int8_t iMotionTextureIndexToDeltaQp = (int8_t)dLumaTextureDeltaQp;
+
+			a = pMotionTexture->uiMotionIndex / dAverageMotionIndex;
+			dQStep = (a - 1) / (a + MODEL_ALPHA);
+			const double_t dLumaMotionDeltaQp = MODEL_TIME * dQStep;// range +- 6
+
+			// quality mode adds the motion term only when it lowers the QP; bitrate mode always adds it
+			if ((m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_QUALITY_MODE && dLumaMotionDeltaQp < -PESN) || (m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_BITRATE_MODE))
+			{
+				iMotionTextureIndexToDeltaQp += (int8_t)dLumaMotionDeltaQp;
+			}
+
+			m_sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp[j * iMbWidth + i] = iMotionTextureIndexToDeltaQp;
+			iAverMotionTextureIndexToDeltaQp += iMotionTextureIndexToDeltaQp;
+			pMotionTexture++;
+		}
+	}
+	m_sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp = (1.0 * iAverMotionTextureIndexToDeltaQp) / iMbTotalNum;
+
+	return RET_SUCCESS;
+}
+
+
+
+EResult CAdaptiveQuantization::Set(int32_t iType, void *pParam)
+{
+	// Stores a member-wise copy of the caller's SAdaptiveQuantizationParam; iType is ignored.
+	const SAdaptiveQuantizationParam *pNewParam = (const SAdaptiveQuantizationParam *)pParam;
+	if (pNewParam != NULL)
+	{
+		m_sAdaptiveQuantParam = *pNewParam;
+		return RET_SUCCESS;
+	}
+	return RET_INVALIDPARAM;
+}
+
+EResult CAdaptiveQuantization::Get(int32_t iType, void *pParam)
+{
+	// Copies the stored average delta QP (written by Process()) into the caller's
+	// SAdaptiveQuantizationParam; iType is ignored and no other field is touched.
+	if (pParam != NULL)
+	{
+		SAdaptiveQuantizationParam &sOut = *static_cast<SAdaptiveQuantizationParam *>(pParam);
+		sOut.dAverMotionTextureIndexToDeltaQp = m_sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp;
+		return RET_SUCCESS;
+	}
+
+	return RET_INVALIDPARAM;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+
+void CAdaptiveQuantization::WelsInitVarFunc(PVarFunc &pfVar,  int32_t iCpuFlag)
+{
+	// Selects the 16x16 variance kernel: the SSE2 assembly routine when it is built in
+	// (X86_ASM) and iCpuFlag has WELS_CPU_SSE2 set, the C routine in every other case.
+#ifdef X86_ASM
+	const bool bHasSse2 = ( iCpuFlag & WELS_CPU_SSE2 ) != 0;
+	pfVar = bHasSse2 ? SampleVariance16x16_sse2 : SampleVariance16x16_c;
+#else
+	pfVar = SampleVariance16x16_c;
+#endif
+}
+
+void SampleVariance16x16_c( uint8_t * pRefY, int32_t iRefStride, uint8_t * pSrcY, int32_t iSrcStride, SMotionTextureUnit* pMotionTexture )
+{
+	// Motion index: variance of |ref - src| over the block; texture index: variance of src.
+	// Both are (sum of squares >> 8) - (sum >> 8)^2, i.e. means over 256 samples (assumes MB_WIDTH_LUMA is 16).
+	uint32_t uiDiffSqSum = 0, uiSrcSqSum = 0;
+	uint16_t uiDiffSum = 0, uiSrcSum = 0;
+	for( int32_t iRow = 0; iRow < MB_WIDTH_LUMA; ++iRow )
+	{
+		const uint8_t *pRefRow = pRefY + iRow * iRefStride;
+		const uint8_t *pSrcRow = pSrcY + iRow * iSrcStride;
+		for( int32_t iCol = 0; iCol < MB_WIDTH_LUMA; ++iCol )
+		{
+			const uint32_t uiSrc  = pSrcRow[iCol];
+			const uint32_t uiDiff = WELS_ABS(pRefRow[iCol] - pSrcRow[iCol]);
+			uiDiffSum += uiDiff;
+			uiDiffSqSum += uiDiff * uiDiff;
+			uiSrcSum += uiSrc;
+			uiSrcSqSum += uiSrc * uiSrc;
+		}
+	}
+
+	uiDiffSum = uiDiffSum >> 8;
+	pMotionTexture->uiMotionIndex = (uiDiffSqSum >> 8) - (uiDiffSum * uiDiffSum);
+	uiSrcSum = uiSrcSum >> 8;
+	pMotionTexture->uiTextureIndex = (uiSrcSqSum >> 8) - (uiSrcSum * uiSrcSum);
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/processing/src/adaptivequantization/AdaptiveQuantization.h
@@ -1,0 +1,85 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	        :  AdaptiveQuantization.h
+ *
+ * \brief	    :  adaptive quantization class of wels video processor class
+ *
+ * \date         :  2011/03/21
+ *
+ * \description  :  1. rewrite the package code of scene change detection class  
+ *
+ */
+
+#ifndef _WELSVP_ADAPTIVEQUANTIZATION_H
+#define _WELSVP_ADAPTIVEQUANTIZATION_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+typedef void (VarFunc) ( uint8_t * pRefY, int32_t iRefStrideY, uint8_t * pSrc, int32_t iSrcStrideY, SMotionTextureUnit* pMotionTexture );	// function type of the 16x16 variance kernels; results go to *pMotionTexture
+
+typedef VarFunc  * PVarFunc;	// pointer to a kernel, as stored in CAdaptiveQuantization::m_pfVar
+
+VarFunc      SampleVariance16x16_c;	// C implementation (AdaptiveQuantization.cpp)
+
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+VarFunc      SampleVariance16x16_sse2;	// declared only for X86_ASM builds; chosen when the CPU flags contain WELS_CPU_SSE2
+WELSVP_EXTERN_C_END
+#endif
+
+
+class CAdaptiveQuantization : public IStrategy	// strategy for METHOD_ADAPTIVE_QUANT: per-macroblock delta QP from motion/texture variance
+{			  
+public:
+	CAdaptiveQuantization(int32_t iCpuFlag);	// iCpuFlag: CPU feature bits used to choose the variance kernel
+	~CAdaptiveQuantization();
+
+	EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *pRef);	// fills the per-macroblock buffers referenced by the parameters given to Set()
+	EResult Set(int32_t iType, void *pParam);	// pParam: SAdaptiveQuantizationParam*, copied member-wise; NULL yields RET_INVALIDPARAM
+	EResult Get(int32_t iType, void *pParam);	// pParam: SAdaptiveQuantizationParam*, receives dAverMotionTextureIndexToDeltaQp only
+
+private:
+	void WelsInitVarFunc(PVarFunc &pfVar, int32_t iCpuFlag);	// sets pfVar to the C or the SSE2 kernel
+
+private:
+	PVarFunc			                   m_pfVar;	// variance kernel bound in the constructor
+	int32_t                                  m_CPUFlag;	// copy of the constructor argument
+	SAdaptiveQuantizationParam    m_sAdaptiveQuantParam;	// zeroed in the constructor, replaced by Set()
+};	
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/processing/src/asm/asm_inc.asm
@@ -1,0 +1,235 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  asm_inc.asm
+;*
+;*  Abstract
+;*      macro and constant
+;*
+;*  History
+;*      8/5/2009 Created
+;*
+;*
+;*************************************************************************/
+;***********************************************************************
+; Options, for DEBUG
+;***********************************************************************
+
+%if 1 			; switch: 1 selects movdqa for MOVDQ, 0 selects movdqu
+	%define MOVDQ movdqa
+%else
+	%define MOVDQ movdqu
+%endif
+
+%if 1			; switch: 1 makes WELSEMMS expand to emms, 0 makes it expand to nothing
+	%define WELSEMMS	emms
+%else
+	%define WELSEMMS
+%endif
+
+BITS 32
+
+;***********************************************************************
+; Macros 
+;***********************************************************************
+
+%macro WELS_EXTERN 1	; declares symbol %1 global
+	%ifdef PREFIX		; with PREFIX defined the exported name gets a leading underscore
+		global _%1
+		%define %1 _%1	; later uses of the plain name then expand to the underscored one
+	%else
+		global %1
+	%endif
+%endmacro
+
+%macro WELS_AbsW 2	; %1 = absolute value of each signed word of %1; %2 is overwritten
+	pxor        %2, %2	; %2 = 0
+    psubw       %2, %1	; %2 = 0 - %1
+    pmaxsw      %1, %2	; %1 = max(%1, -%1)
+%endmacro 	
+
+%macro MMX_XSwap  4	; %1 = unpack suffix (wd, dq, ...); %2 = punpckl(%2, %3), %4 = punpckh(old %2, %3)
+    movq		%4, %2	; keep a copy of %2 for the high half
+    punpckh%1   %4, %3
+    punpckl%1   %2, %3
+%endmacro
+
+; 4x4 word transpose of %1..%4 using %5 as the extra register; original note on the output order: mm1, mm4, mm5, mm3
+%macro MMX_Trans4x4W 5
+    MMX_XSwap wd, %1, %2, %5
+    MMX_XSwap wd, %3, %4, %2
+    MMX_XSwap dq, %1, %3, %4
+    MMX_XSwap dq, %5, %2, %3
+%endmacro
+
+;building block of the transpose macros: %1 = unpack suffix; %2 = punpckl(%2, %3), %4 = punpckh(old %2, %3)
+%macro SSE2_XSawp 4
+    movdqa      %4, %2
+    punpckl%1   %2, %3
+    punpckh%1   %4, %3
+%endmacro
+
+; 4x4 dword transpose; in: %1..%4 (e.g. xmm1, xmm2, xmm3, xmm4), %5 extra register; out rows in %1, %4, %5, %3 (xmm1, xmm4, xmm5, xmm3)
+%macro SSE2_Trans4x4D 5
+    SSE2_XSawp dq,  %1, %2, %5
+    SSE2_XSawp dq,  %3, %4, %2
+    SSE2_XSawp qdq, %1, %3, %4
+    SSE2_XSawp qdq, %5, %2, %3
+%endmacro
+
+;in: %1..%4 (e.g. xmm0, xmm1, xmm2, xmm3), %5 extra register; out in %1, %2, %4, %5 (xmm0, xmm1, xmm3, xmm4)
+%macro SSE2_TransTwo4x4W 5
+    SSE2_XSawp wd,  %1, %2, %5
+    SSE2_XSawp wd,  %3, %4, %2
+    SSE2_XSawp dq,  %1, %3, %4
+    SSE2_XSawp dq,  %5, %2, %3
+    SSE2_XSawp qdq, %1, %5, %2
+    SSE2_XSawp qdq, %4, %3, %5
+%endmacro
+
+;in:  m1, m2, m3, m4, m5, m6, m7, m8 (arguments %1..%8); %9 is overwritten and used as spill storage
+;out (as noted by the original author): m5, m3, m4, m8, m6, m2, m7, m1
+%macro SSE2_TransTwo8x8B 9
+	movdqa	%9,	%8
+	SSE2_XSawp bw,  %1, %2, %8
+	SSE2_XSawp bw,  %3, %4, %2
+	SSE2_XSawp bw,  %5, %6, %4
+	movdqa	%6, %9
+	movdqa	%9, %4
+	SSE2_XSawp bw,  %7, %6, %4
+	
+	SSE2_XSawp wd,  %1, %3, %6	
+	SSE2_XSawp wd,  %8, %2, %3
+	SSE2_XSawp wd,  %5, %7, %2
+	movdqa	%7, %9
+	movdqa	%9, %3	
+	SSE2_XSawp wd,  %7, %4, %3
+	
+	SSE2_XSawp dq,  %1, %5, %4	
+	SSE2_XSawp dq,  %6, %2, %5
+	SSE2_XSawp dq,  %8, %7, %2
+	movdqa	%7, %9
+	movdqa	%9, %5		
+	SSE2_XSawp dq,  %7, %3, %5
+	
+	SSE2_XSawp qdq,  %1, %8, %3
+	SSE2_XSawp qdq,  %4, %2, %8
+	SSE2_XSawp qdq,  %6, %7, %2
+	movdqa	%7, %9
+	movdqa	%9, %1		
+	SSE2_XSawp qdq,  %7, %5, %1
+	movdqa	%5, %9
+%endmacro
+
+;args: dst, tmp, zero, pix1 memory operand, pix2 memory operand (e.g. xmm0, xmm6, xmm7, [eax], [ecx])
+;%3 must hold 0; %1 receives the 8 word differences pix1[i] - pix2[i] of the 8 bytes loaded from %4 and %5
+%macro SSE2_LoadDiff8P 5
+    movq         %1, %4
+    punpcklbw    %1, %3
+    movq         %2, %5
+    punpcklbw    %2, %3
+    psubw        %1, %2
+%endmacro
+
+; word-wise: %2 = %1 + %2, %1 = %1 - (old %2); %3 is overwritten with the old %2
+%macro SSE2_SumSub 3
+	movdqa  %3, %2
+    paddw   %2, %1
+    psubw   %1, %3
+%endmacro
+
+
+%macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
+	mov %3h, %3l		; copy the low byte b0 into the high byte of the 16-bit register
+	movd %1, e%3x		; i.e. %1 = eax when %3 is a; its low word is now b0 b0
+	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0	
+	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0	
+%endmacro  
+
+;broadcast the low word of %2 into all 8 words of xmm register %1
+%macro  SSE2_Copy8Times 2
+		movd	%1, %2
+		punpcklwd %1, %1
+		pshufd	%1,	%1,	0
+%endmacro
+
+;broadcast a byte into all 16 bytes of xmm register %1; packuswb saturates, so the low word of %2 must be within 0..255
+%macro  SSE2_Copy16Times 2
+		movd		%1, %2
+		pshuflw		%1, %1, 0
+		punpcklqdq	%1, %1
+		packuswb	%1,	%1
+%endmacro
+
+
+
+;***********************************************************************
+;preprocessor constants
+;***********************************************************************
+;sets every word of %1 to 32 without a memory load: dw 32,32,32,32,32,32,32,32 for xmm
+;dw 32,32,32,32 for mm
+%macro WELS_DW32 1
+	pcmpeqw %1,%1	; all bits set
+	psrlw %1,15	; each word = 1
+	psllw %1,5	; each word = 32
+%endmacro
+
+;sets every word of %1 to 1: dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
+;dw 1, 1, 1, 1 for mm
+%macro WELS_DW1 1
+	pcmpeqw %1,%1	; all bits set
+	psrlw %1,15	; each word = 1
+%endmacro
+
+;clears %1 (xmm or mm register) by xor-ing it with itself
+%macro	WELS_Zero 1
+	pxor %1, %1
+%endmacro
+
+;sets every dword of %1 to 1: dd 1, 1, 1, 1 for xmm
+;dd 1, 1 for mm
+%macro WELS_DD1 1
+	pcmpeqw %1,%1	; all bits set
+	psrld %1,31	; each dword = 1
+%endmacro
+
+;sets every byte of %1 to 1: db 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+%macro WELS_DB1 1
+	pcmpeqw %1,%1	; all bits set
+	psrlw %1,15	; each word = 1
+	packuswb %1,%1	; pack the words to bytes: each byte = 1
+%endmacro
+
+
+
+
+
+
--- /dev/null
+++ b/processing/src/asm/cpuid.asm
@@ -1,0 +1,169 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	cpuid.asm
+;*
+;*  Abstract
+;*		verify cpuid feature support and cpuid detection
+;*
+;*  History
+;*      04/29/2009	Created
+;*
+;*************************************************************************/
+
+bits 32
+
+;******************************************************************************************
+; Macros
+;******************************************************************************************
+
+%macro WELS_EXTERN 1		; export symbol %1; when PREFIX is defined, export it as _%1 and alias %1 to _%1
+	%ifndef PREFIX
+		global %1
+	%else
+		global _%1
+		%define %1 _%1
+	%endif
+%endmacro
+
+;******************************************************************************************
+; Code
+;******************************************************************************************
+
+SECTION .text
+
+; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
+; section CPUID - CPU Identification
+
+WELS_EXTERN WelsCPUIdVerify
+ALIGN 16
+; int32_t WelsCPUIdVerify(): returns 00200000h if the EFLAGS ID flag (bit 21) can be toggled, i.e. CPUID is usable; 0 otherwise
+WelsCPUIdVerify:
+    pushfd					; keep the caller's EFLAGS on the stack for the final restore
+    pushfd					; second copy to work on
+    xor     dword [esp], 00200000h	; flip the ID flag (bit 21) in the working copy
+    popfd					; try to load the flipped value into EFLAGS
+    pushfd					; read EFLAGS back ...
+    pop     eax				; ... eax = EFLAGS as the processor actually kept them
+    xor     eax, [esp]		; bits that now differ from the original EFLAGS
+    and     eax, 00200000h	; keep only ID: non-zero only if the flip really took effect
+    popfd					; restore the caller's EFLAGS (does not touch eax)
+    ret
+
+WELS_EXTERN WelsCPUId
+ALIGN 16
+; Runs CPUID for leaf uiIndex (sub-leaf 0) and stores EAX, EBX, ECX, EDX through pFeatureA..pFeatureD
+;   void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
+;****************************************************************************************************
+WelsCPUId:
+	push	ebx				; ebx is overwritten by cpuid, edi is used as scratch: both restored before ret
+	push	edi
+	xor		ecx, ecx		; sub-leaf 0: leaves that also read ECX as input must not see stale caller data
+	mov     eax, [esp+12]	; operating index
+    cpuid					; cpuid
+	; argument offsets below include the two pushes (8 bytes) and the return address
+	; processing various information return
+	mov     edi, [esp+16]
+    mov     [edi], eax
+    mov     edi, [esp+20]
+    mov     [edi], ebx
+    mov     edi, [esp+24]
+    mov     [edi], ecx
+    mov     edi, [esp+28]
+    mov     [edi], edx
+	; restore the registers saved on entry
+	pop		edi	
+    pop     ebx
+	ret
+	
+;****************************************************************************************************
+;   int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )
+;   call after cpuid leaf 1, passing the eax and ecx values it returned; result: 1 = AVX usable, 0 = not
+;****************************************************************************************************
+WELS_EXTERN WelsCPUSupportAVX
+ALIGN 16
+WelsCPUSupportAVX:
+	mov ecx, [esp+8]			; second argument: feature flags from ecx
+	mov eax, [esp+4]			; first argument: loaded but never examined, XGETBV overwrites eax
+	; refer to detection of AVX addressed in INTEL AVX manual document
+	and ecx, 018000000H			; keep bits 27 and 28 only
+	cmp ecx, 018000000H			; both OSXSAVE and AVX feature flags must be set
+	jne .unsupported
+	; processor supports AVX instructions and XGETBV is enabled by OS
+	xor ecx, ecx				; select register 0 (XFEATURE_ENABLED_MASK) for XGETBV
+	XGETBV						; result in EDX:EAX
+	and eax, 06H				; keep bits 1 and 2 only
+	cmp eax, 06H				; OS must have enabled both XMM and YMM state support
+	jne .unsupported
+	; all checks passed
+	mov eax, 1
+	ret
+.unsupported:
+	xor eax, eax				; return 0
+	ret
+
+;****************************************************************************************************
+;   int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )
+;   call after cpuid leaf 1, passing the eax and ecx values it returned; result: 1 = FMA usable, 0 = not
+;****************************************************************************************************
+WELS_EXTERN WelsCPUSupportFMA
+ALIGN 16
+WelsCPUSupportFMA:
+	mov ecx, [esp+8]			; second argument: feature flags from ecx
+	mov eax, [esp+4]			; first argument: loaded but never examined, XGETBV overwrites eax
+	; refer to detection of FMA addressed in INTEL AVX manual document
+	and ecx, 018001000H			; keep bits 12, 27 and 28 only
+	cmp ecx, 018001000H			; OSXSAVE, AVX and FMA feature flags must all be set
+	jne .unsupported
+	; processor supports AVX,FMA instructions and XGETBV is enabled by OS
+	xor ecx, ecx				; select register 0 (XFEATURE_ENABLED_MASK) for XGETBV
+	XGETBV						; result in EDX:EAX
+	and eax, 06H				; keep bits 1 and 2 only
+	cmp eax, 06H				; OS must have enabled both XMM and YMM state support
+	jne .unsupported
+	; all checks passed
+	mov eax, 1
+	ret
+.unsupported:
+	xor eax, eax				; return 0
+	ret
+
+;******************************************************************************************
+;   void WelsEmms()
+;******************************************************************************************
+ALIGN 16
+WELS_EXTERN WelsEmms
+WelsEmms:
+	emms	; leave MMX state: marks the x87 register tags empty again
+	ret
+
+
+
--- /dev/null
+++ b/processing/src/asm/denoisefilter.asm
@@ -1,0 +1,263 @@
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  denoisefilter.asm
+;*
+;*  Abstract
+;*      denoise for SVC2.1
+;*  History
+;*      4/13/2010 Created
+;*      7/30/2010 Modified
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Constant
+;***********************************************************************
+SECTION .rodata align=16
+
+sse2_32 times 8 dw 32	; eight words of 32: loaded by BilateralLumaFilter8_sse2 as the base its pixel difference is subtracted from
+sse2_20 times 8 dw 20	; eight words of 20: centre tap multiplier used by WEIGHT_LINE3_UV
+
+
+BITS 32
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+	
+%macro	WEIGHT_LINE	9	; %1,%8 scratch; %2 neighbour words; %3 weight base; %4 weight sum; %5 weighted pixel sum; %6 centre words; %7 zero; %9 source memory
+		movq		%2,	%9		; load 8 neighbour pixels
+		punpcklbw	%2,	%7		; widen them to words (high bytes come from %7)
+		movdqa		%8,	%2
+		; absolute difference built from two unsigned saturating subtractions OR-ed together
+		movdqa		%1,	%6
+		psubusb		%1,	%8
+		psubusb		%8,	%6
+		por			%8,	%1		; ABS(curPixel - centerPixel);
+		; per-pixel weight = ((%3 - diff, saturating at 0) squared) >> 5
+		movdqa		%1,	%3
+		psubusb		%1,	%8
+
+		pmullw		%1,	%1
+		psrlw		%1,	5
+		pmullw		%2,	%1		; neighbour pixel * weight
+		paddusw		%4,	%1		; add the weight to the running weight sum (saturating)
+		paddusw		%5,	%2		; add the weighted pixel to the running sum (saturating)
+%endmacro
+
+%macro	WEIGHT_LINE1_UV	4	; %1 = source row bytes, %2 = scratch, %3 = word accumulator, %4 = zero; adds the row with taps 1 1 2 1 1
+		movdqa		%2,	%1
+		punpcklbw	%2,	%4		; low 8 bytes -> words
+		paddw		%3,	%2		; byte offset 0, weight 1
+		; byte offset 1, weight 1
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+		; byte offset 2, weight 2 (shift left by 1)
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+		; byte offset 3, weight 1
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+		; byte offset 4, weight 1
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+%endmacro
+
+%macro	WEIGHT_LINE2_UV	4	; %1 = source row bytes, %2 = scratch, %3 = word accumulator, %4 = zero; adds the row with taps 1 2 4 2 1
+		movdqa		%2,	%1
+		punpcklbw	%2,	%4		; low 8 bytes -> words
+		paddw		%3,	%2		; byte offset 0, weight 1
+		; byte offset 1, weight 2 (shift left by 1)
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+		; byte offset 2, weight 4 (shift left by 2)
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+		; byte offset 3, weight 2
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+		; byte offset 4, weight 1
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+%endmacro
+
+%macro	WEIGHT_LINE3_UV	4	; %1 = source row bytes, %2 = scratch, %3 = word accumulator, %4 = zero; adds the row with taps 2 4 20 4 2
+		movdqa		%2,	%1
+		punpcklbw	%2,	%4		; low 8 bytes -> words
+		psllw		%2,	1		; byte offset 0, weight 2
+		paddw		%3,	%2
+		; byte offset 1, weight 4 (shift left by 2)
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+		; byte offset 2, weight 20 (multiply by the sse2_20 constant)
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		pmullw		%2,	[sse2_20]
+		paddw		%3,	%2
+		; byte offset 3, weight 4
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+		; byte offset 4, weight 2
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+%endmacro
+
+ALIGN 16
+WELS_EXTERN BilateralLumaFilter8_sse2
+; Filters the 8 pixels at pixels[0..7] in place, weighting each one's 8 neighbours by similarity to it
+;  BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+;	1	2	3
+;	4	0	5
+;	6	7	8
+;	0:	the center point
+%define		pushsize	4
+%define		pixel		esp + pushsize + 4
+%define		stride		esp + pushsize + 8
+BilateralLumaFilter8_sse2:
+		push		ebx					; pushsize accounts for this single push in the argument offsets
+		
+		pxor		xmm7,	xmm7		; zero, used to widen bytes to words
+		mov			eax,	[pixel]
+		mov			ebx,	eax			; keep the destination address for the final store
+		movq		xmm6,	[eax]		; 8 centre pixels
+		punpcklbw	xmm6,	xmm7		; ... as words
+		movdqa		xmm3,	[sse2_32]
+		pxor		xmm4,	xmm4		; nTotWeight
+		pxor		xmm5,	xmm5		; nSum
+		
+		dec			eax					; eax = pixels - 1: left neighbour column of the centre row
+		mov			ecx,	[stride]
+		
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 4
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 5
+		
+		sub			eax,	ecx			; row above
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 1
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 2
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 3
+		
+		lea			eax,	[eax + ecx * 2]		; row below
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 6
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 7
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 8
+		
+		pcmpeqw		xmm0,	xmm0
+		psrlw		xmm0,	15
+		psllw		xmm0,	8			; each word = 256
+		psubusw		xmm0,	xmm4		; centre weight = 256 - nTotWeight (saturating at 0)
+		pmullw		xmm0,	xmm6		; centre pixel * centre weight
+		paddusw		xmm5,	xmm0		; nSum += weighted centre
+		psrlw		xmm5,	8			; divide by the total weight of 256
+		packuswb	xmm5,	xmm5		; words -> bytes
+		movq		[ebx],	xmm5		; overwrite the 8 centre pixels
+		
+		pop ebx
+		ret	
+
+WELS_EXTERN WaverageChromaFilter8_sse2
+; Replaces the 8 pixels at pixels[0..7] in place by a 5x5 weighted average (weights sum to 64, result = sum >> 6)
+; void		WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+;5x5 filter:
+;1	1	2	1	1
+;1	2	4	2	1
+;2	4	20	4	2
+;1	2	4	2	1
+;1	1	2	1	1
+; each movdqu below reads 16 bytes starting 2 bytes left of the filtered pixels; the macros use the first 12 of them
+ALIGN 16
+WaverageChromaFilter8_sse2:
+		mov		edx,	[esp + 4]	; pixels
+		mov		ecx,	[esp + 8]	; stride
+		
+		mov		eax,	ecx
+		add		eax,	eax			; eax = 2 * stride
+		sub		edx,	eax			; pixels - 2 * stride
+		sub		edx,	2			; ... and 2 columns to the left: top-left corner of the 5x5 window
+			
+		pxor	xmm0,	xmm0		; zero, used to widen bytes to words
+		pxor	xmm3,	xmm3		; word accumulator
+	
+		movdqu		xmm1,	[edx]				; row -2
+		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
+		
+		movdqu		xmm1,	[edx + ecx]			; row -1
+		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0	
+		
+		add		edx,	eax						; edx = pixels - 2 (centre row)
+		movdqu		xmm1,	[edx]
+		WEIGHT_LINE3_UV	xmm1,	xmm2,	xmm3,	xmm0
+		
+		movdqu		xmm1,	[edx + ecx]			; row +1
+		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0	
+		
+		movdqu		xmm1,	[edx + ecx * 2]		; row +2
+		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0		
+	
+		psrlw		xmm3,		6				; divide by 64, truncating
+		packuswb	xmm3,		xmm3			; words -> bytes
+		movq		[edx + 2],		xmm3		; edx + 2 = pixels: store the 8 filtered pixels
+
+		ret	
\ No newline at end of file
--- /dev/null
+++ b/processing/src/asm/downsample_bilinear.asm
@@ -1,0 +1,1225 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	downsample_bilinear.asm
+;*
+;*  Abstract
+;*		SIMD for pixel domain down sampling
+;*
+;*  History
+;*		10/22/2009	Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+
+;***********************************************************************
+; Some constants
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+shufb_mask_low:		; pshufb control: even source bytes go to the low byte of each word; 80h entries yield zero bytes
+	db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
+shufb_mask_high:	; pshufb control: odd source bytes go to the low byte of each word; 80h entries yield zero bytes
+	db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
+
+
+ALIGN 16
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
+; 2:1 downsampler in both directions: each output byte = pavgb of two rows, each first pavgb-averaged over byte pairs
+;	void DyadicBilinearDownsamplerWidthx32_sse(	unsigned char* pDst, const int iDstStride,
+;					unsigned char* pSrc, const int iSrcStride,
+;					const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_sse:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; 5 pushes + return address: the first argument is at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride	
+	mov ebp, [esp+44]	; iSrcHeight
+	
+	sar ebp, $1			; iSrcHeight >> 1	
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1			; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth kept in ebx
+	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
+	neg ebx				; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 32 bytes; the body runs at least once because the count is tested after it
+.xloops:
+	; 1st part horizontal loop: x16 bytes
+	;               mem  hi<-       ->lo	(capital = even byte, small = the odd byte next to it)
+	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
+	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+8]		; 1st pSrc line + 8
+	movq mm2, [esi+ecx]		; 2nd pSrc line
+	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
+
+	; to handle mm0, mm1, mm2, mm3
+	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm4, mm5		; d c D C b a B A
+	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+
+	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm5, mm6		; h g H G f e F E
+	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
+	punpcklbw mm6, mm7		; l k L K j i J I
+	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
+	punpcklbw mm7, mm0 		; p o P O n m N M
+	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
+
+	; to handle mm4, mm5, mm6, mm7
+	movq mm0, mm4		; 
+	punpckldq mm0, mm5 	; H G F E D C B A
+	punpckhdq mm4, mm5 	; h g f e d c b a
+
+	movq mm1, mm6
+	punpckldq mm1, mm7 	; P O N M L K J I
+	punpckhdq mm6, mm7 	; p o n m l k j i
+
+	; avg within MB horizon width (16 x 2 lines)
+	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, kept in mm0 until the 2nd part is done, then both are stored
+	
+	; 2nd part horizontal loop: x16 bytes
+	;               mem  hi<-       ->lo
+	;1st Line Src:	mm1: d D c C b B a A	mm2: h H g G f F e E
+	;2nd Line Src:	mm3: l L k K j J i I   	mm4: p P o O n N m M
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm1, [esi+16]		; 1st pSrc line + 16
+	movq mm2, [esi+24]		; 1st pSrc line + 24
+	movq mm3, [esi+ecx+16]	; 2nd pSrc line + 16
+	movq mm4, [esi+ecx+24]	; 2nd pSrc line + 24
+
+	; to handle mm1, mm2, mm3, mm4
+	pshufw mm5, mm1, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm6, mm5, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm5, mm6		; d c D C b a B A
+	pshufw mm5, mm5, 0d8h  	; d c b a D C B A ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm7, mm6, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm6, mm7		; h g H G f e F E
+	pshufw mm6, mm6, 0d8h  	; h g f e H G F E ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm1, mm7, 04eh	; k K i I l L j J ; 01001110 B
+	punpcklbw mm7, mm1		; l k L K j i J I
+	pshufw mm7, mm7, 0d8h  	; l k j i L K J I ; 11011000 B: mm7
+
+	pshufw mm1, mm4, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm2, mm1, 04eh	; o O m M p P n N ; 01001110 B
+	punpcklbw mm1, mm2 		; p o P O n m N M
+	pshufw mm1, mm1, 0d8h  	; p o n m P O N M ; 11011000 B: mm1
+
+	; to handle mm5, mm6, mm7, mm1
+	movq mm2, mm5
+	punpckldq mm2, mm6 	; H G F E D C B A
+	punpckhdq mm5, mm6 	; h g f e d c b a
+
+	movq mm3, mm7
+	punpckldq mm3, mm1 	; P O N M L K J I
+	punpckhdq mm7, mm1 	; p o n m l k j i
+
+	; avg within MB horizon width (16 x 2 lines)
+	pavgb mm2, mm5		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm3, mm7		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm2, mm3		; (temp_row1+temp_row2+1)>>1, result of the 2nd horizontal part
+
+	movq [edi  ], mm0
+	movq [edi+8], mm2
+
+	; next SMB
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax
+	jg near .xloops
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip the two source lines just consumed
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+
+	WELSEMMS
+	pop ebp
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
+; 2:1 downsampler in both directions, 16 source bytes (8 output bytes) per inner iteration
+;	void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_sse:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; 5 pushes + return address: the first argument is at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride	
+	mov ebp, [esp+44]	; iSrcHeight
+	
+	sar ebp, $1		; iSrcHeight >> 1	
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1		; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth kept in ebx
+	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
+	neg ebx			; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 16 bytes; the body runs at least once because the count is tested after it
+.xloops:
+	; horizontal loop: x16 bytes
+	;               mem  hi<-       ->lo	(capital = even byte, small = the odd byte next to it)
+	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
+	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+8]		; 1st pSrc line + 8
+	movq mm2, [esi+ecx]		; 2nd pSrc line
+	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
+
+	; to handle mm0, mm1, mm2, mm3
+	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm4, mm5		; d c D C b a B A
+	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+
+	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm5, mm6		; h g H G f e F E
+	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
+	punpcklbw mm6, mm7		; l k L K j i J I
+	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
+	punpcklbw mm7, mm0 		; p o P O n m N M
+	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
+
+	; to handle mm4, mm5, mm6, mm7
+	movq mm0, mm4		; 
+	punpckldq mm0, mm5 	; H G F E D C B A
+	punpckhdq mm4, mm5 	; h g f e d c b a
+
+	movq mm1, mm6
+	punpckldq mm1, mm7 	; P O N M L K J I
+	punpckhdq mm6, mm7 	; p o n m l k j i
+
+	; avg within MB horizon width (16 x 2 lines)
+	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, the 8 output bytes of this iteration
+
+	movq [edi  ], mm0	
+
+	; next SMB
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax
+	jg near .xloops
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip the two source lines just consumed
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+
+	WELSEMMS
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
+; 2:1 downsampler in both directions, 8 source bytes (4 output bytes) per inner iteration
+;	void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx8_sse:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; 5 pushes + return address: the first argument is at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride	
+	mov ebp, [esp+44]	; iSrcHeight
+	
+	sar ebp, $1		; iSrcHeight >> 1	
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1		; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth kept in ebx
+	sar eax, $2		; (iSrcWidth >> 1) / 4		; loop count = num_of_mb
+	neg ebx			; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 8 bytes; the body runs at least once because the count is tested after it
+.xloops:
+	; horizontal loop: x8 bytes
+	;               mem  hi<-       ->lo	(capital = even byte, small = the odd byte next to it)
+	;1st Line Src:	mm0: d D c C b B a A
+	;2nd Line Src:	mm1: h H g G f F e E
+	;=> target:
+	;: H G F E D C B A
+	;: h g f e d c b a
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+	movq mm0, [esi]			; 1st pSrc line	
+	movq mm1, [esi+ecx]		; 2nd pSrc line	
+
+	; to handle mm0, mm1
+	pshufw mm2, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm3, mm2, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm2, mm3		; d c D C b a B A
+	pshufw mm2, mm2, 0d8h  	; d c b a D C B A ; 11011000 B: mm2
+
+	pshufw mm4, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm5, mm4, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm4, mm5		; h g H G f e F E
+	pshufw mm4, mm4, 0d8h  	; h g f e H G F E ; 11011000 B: mm4	
+
+	; to handle mm2, mm4
+	movq mm0, mm2		; 
+	punpckldq mm0, mm4 	; H G F E D C B A
+	punpckhdq mm2, mm4 	; h g f e d c b a
+
+	; avg within horizon width (8 x 2 lines)
+	pavgb mm0, mm2		; (H+h+1)>>1, .., (A+a+1)>>1: low dword = temp_row1, high dword = temp_row2
+	pshufw mm1, mm0, 04eh	; 01001110 B: swap the two dwords so the rows face each other
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1 in both dwords; only the low dword is stored
+
+	movd [edi],	mm0	
+
+	; next unit
+	lea esi, [esi+8]
+	lea edi, [edi+4]
+
+	dec eax
+	jg near .xloops
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip the two source lines just consumed
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+
+	WELSEMMS
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+
+
+; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
+; 2:1 downsampler in both directions using pshufb; movdqa is used on pSrc rows and pDst, so those addresses must be 16-byte aligned
+;	void DyadicBilinearDownsamplerWidthx32_ssse3(	unsigned char* pDst, const int iDstStride,
+;					unsigned char* pSrc, const int iSrcStride,
+;					const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_ssse3:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; 5 pushes + return address: the first argument is at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride	
+	mov ebp, [esp+44]	; iSrcHeight
+	
+	sar ebp, $1			; iSrcHeight >> 1	
+
+	movdqa xmm7, [shufb_mask_low]	; mask low
+	movdqa xmm6, [shufb_mask_high]	; mask high
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1			; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth kept in ebx
+	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
+	neg ebx				; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 32 bytes; the body runs at least once because the count is tested after it
+.xloops:
+	; horizontal loop: x32 bytes
+	;               mem  hi<-       ->lo	(capital = even byte, small = the odd byte next to it)
+	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
+	;				xmm1: p P o O n N m M l L k K j J i I
+	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
+	;				xmm3: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: P O N M L K J I H G F E D C B A
+	;: p o n m l k j i h g f e d c b a
+	;: P ..                          A
+	;: p ..                          a
+	
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+	movdqa xmm0, [esi]			; 1st_src_line
+	movdqa xmm1, [esi+16]		; 1st_src_line + 16
+	movdqa xmm2, [esi+ecx]		; 2nd_src_line
+	movdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16	
+	
+	; packing & avg
+	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	; another implementation for xmm4 high bits
+;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm4			; horizontal pair average, one result per word (high byte 0)
+
+	movdqa xmm5, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm1
+;	psrlw xmm5, 8	
+	pavgb xmm1, xmm5
+
+	movdqa xmm4, xmm2
+	pshufb xmm2, xmm7
+	pshufb xmm4, xmm6
+;	psubb xmm4, xmm2
+;	psrlw xmm4, 8	
+	pavgb xmm2, xmm4
+
+	movdqa xmm5, xmm3
+	pshufb xmm3, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm3
+;	psrlw xmm5, 8	
+	pavgb xmm3, xmm5
+	
+	packuswb xmm0, xmm1			; 16 horizontal averages of the 1st line as bytes
+	packuswb xmm2, xmm3			; 16 horizontal averages of the 2nd line as bytes
+	pavgb xmm0, xmm2			; vertical average of the two lines
+
+	; write pDst
+	movdqa [edi], xmm0
+
+	; next SMB
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax
+	jg near .xloops
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip the two source lines just consumed
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+	
+	pop ebp
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
+; 2:1 downsampler in both directions using pshufb; movdqa is used on pSrc rows, so those addresses must be 16-byte aligned
+;	void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_ssse3:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; 5 pushes + return address: the first argument is at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride	
+	mov ebp, [esp+44]	; iSrcHeight
+	
+	sar ebp, $1		; iSrcHeight >> 1	
+	movdqa xmm7, [shufb_mask_low]	; mask low	
+	movdqa xmm6, [shufb_mask_high]	; mask high
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1		; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth kept in ebx
+	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
+	neg ebx			; - (iSrcWidth >> 1)
+	; each loop = source bandwidth: 16 bytes; the body runs at least once because the count is tested after it
+.xloops:
+	; horizontal loop: x16 bytes by source
+	;               mem  hi<-       ->lo	(capital = even byte, small = the odd byte next to it)
+	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
+	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
+	;=> target (one byte per word, high byte 0):
+	;: xmm0: H G F E D C B A,  xmm1: P O N M L K J I
+	;: xmm2: h g f e d c b a,  xmm3: p o n m l k j i
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+	movdqa xmm0, [esi]			; 1st_src_line	
+	movdqa xmm1, [esi+ecx]		; 2nd_src_line	
+	
+	; packing & avg
+	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	; another implementation for xmm2 high bits
+;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a	
+	pavgb xmm0, xmm2			; horizontal pair average of the 1st line
+
+	movdqa xmm3, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm3, xmm6
+;	psubb xmm3, xmm1
+;	psrlw xmm3, 8	
+	pavgb xmm1, xmm3			; horizontal pair average of the 2nd line
+
+	pavgb xmm0, xmm1			; vertical average of the two lines
+	packuswb xmm0, xmm1			; low 8 bytes = the 8 results; the high 8 bytes are not stored
+
+	; write pDst
+	movq [edi], xmm0	
+
+	; next SMB
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax
+	jg near .xloops
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip the two source lines just consumed
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+	
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+; the original author reports about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx32_sse4(	unsigned char* pDst, const int iDstStride,
+;					unsigned char* pSrc, const int iSrcStride,
+;					const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_sse4:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; five pushes (20 bytes) plus the return address put the first argument at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+	
+	sar ebp, $1			; iSrcHeight >> 1 = number of output rows
+
+	movdqa xmm7, [shufb_mask_low]	; mask low  (pshufb control, defined outside this routine)
+	movdqa xmm6, [shufb_mask_high]	; mask high (pshufb control, defined outside this routine)
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth (re-read from the stack for every output row)
+	sar eax, $1			; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth kept in ebx
+	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = 32-byte source chunks per row
+	neg ebx				; - (iSrcWidth >> 1), used after the row to rewind esi/edi
+	; each pass of .xloops reads 32 bytes from each of two source rows and stores 16 bytes
+.xloops:
+	; horizontal loop: 32 source bytes per pass, held as two 16-byte halves per row
+	;               mem  hi<-       ->lo
+	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
+	;				xmm1: p P o O n N m M l L k K j J i I
+	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
+	;				xmm3: p P o O n N m M l L k K j J i I
+	;=> after pairwise averaging and packing, per source line:
+	;: P O N M L K J I H G F E D C B A
+	;: p o n m l k j i h g f e d c b a
+	;: P ..                          A
+	;: p ..                          a
+	
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+	movntdqa xmm0, [esi]			; 1st_src_line (movntdqa needs a 16-byte aligned address)
+	movntdqa xmm1, [esi+16]		; 1st_src_line + 16
+	movntdqa xmm2, [esi+ecx]		; 2nd_src_line
+	movntdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16	
+	
+	; packing & avg
+	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm4			; row 1, first half: rounded average of each pixel pair
+
+	movdqa xmm5, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm1
+;	psrlw xmm5, 8
+	pavgb xmm1, xmm5			; row 1, second half
+
+	movdqa xmm4, xmm2
+	pshufb xmm2, xmm7
+	pshufb xmm4, xmm6
+;	psubb xmm4, xmm2
+;	psrlw xmm4, 8
+	pavgb xmm2, xmm4			; row 2, first half
+
+	movdqa xmm5, xmm3
+	pshufb xmm3, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm3
+;	psrlw xmm5, 8
+	pavgb xmm3, xmm5			; row 2, second half
+	
+	packuswb xmm0, xmm1			; row 1: narrow both halves into 16 bytes
+	packuswb xmm2, xmm3			; row 2: narrow both halves into 16 bytes
+	pavgb xmm0, xmm2			; rounded average of the two rows
+
+	; write pDst
+	movdqa [edi], xmm0			; 16 output pixels (movdqa needs a 16-byte aligned pDst)
+
+	; next SMB
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax
+	jg near .xloops				; bottom-tested: the body runs once even if the count was 0
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip the two source rows just consumed
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+	
+	pop ebp
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_sse4:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; five pushes (20 bytes) plus the return address put the first argument at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+	
+	sar ebp, $1		; iSrcHeight >> 1 = number of output rows
+	movdqa xmm7, [shufb_mask_low]	; mask low  (pshufb control, defined outside this routine)
+	movdqa xmm6, [shufb_mask_high]	; mask high (pshufb control, defined outside this routine)
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth (re-read from the stack for every output row)
+	sar eax, $1		; iSrcWidth >> 1
+	mov ebx, eax		; iDstWidth kept in ebx
+	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = 16-byte source chunks per row
+	neg ebx			; - (iSrcWidth >> 1), used after the row to rewind esi/edi
+	; each pass of .xloops reads 16 bytes from each of two source rows and stores 8 bytes
+.xloops:
+	; horizontal loop: 16 source bytes per pass
+	;               mem  hi<-       ->lo
+	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
+	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
+	;=> per the lane notes below, each row is split into two registers, one pixel per word:
+	;: 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A	(via xmm7)
+	;: 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a	(via xmm6)
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+	movntdqa xmm0, [esi]			; 1st_src_line (movntdqa needs a 16-byte aligned address)
+	movntdqa xmm1, [esi+ecx]		; 2nd_src_line (same alignment requirement)
+	
+	; packing & avg
+	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm2			; row 1: rounded average of each horizontal pixel pair
+
+	movdqa xmm3, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm3, xmm6
+;	psubb xmm3, xmm1
+;	psrlw xmm3, 8
+	pavgb xmm1, xmm3			; row 2: rounded average of each horizontal pixel pair
+
+	pavgb xmm0, xmm1			; rounded average of the two row results
+	packuswb xmm0, xmm1			; narrow words to bytes; the low 8 bytes come from xmm0
+
+	; write pDst
+	movq [edi], xmm0			; store only the low 8 bytes (the 8 output pixels)
+
+	; next SMB
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax
+	jg near .xloops				; bottom-tested: the body runs once even if the count was 0
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip the two source rows just consumed
+	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
+	lea edi, [edi+edx]
+	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+
+	dec ebp
+	jg near .yloops
+	
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+
+
+
+
+WELS_EXTERN	GeneralBilinearAccurateDownsampler_sse2
+;**************************************************************************************************************
+;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+;							unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;                           unsigned int uiScaleX, unsigned int uiScaleY );
+; Bilinear resample; uiScaleX/uiScaleY are Q15 source steps. Last column and last row are point-sampled. eax holds no result.
+;**************************************************************************************************************
+; A destination one pixel wide or one row high skips the interpolating loops instead of over-running them.
+ALIGN 16
+GeneralBilinearAccurateDownsampler_sse2:
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+%define		pushsize	16
+%define		localsize	28
+%define		pDstData		esp + pushsize + localsize + 4
+%define		dwDstStride		esp + pushsize + localsize + 8
+%define		dwDstWidth		esp + pushsize + localsize + 12
+%define		dwDstHeight		esp + pushsize + localsize + 16
+%define		pSrcData		esp + pushsize + localsize + 20
+%define		dwSrcStride		esp + pushsize + localsize + 24
+%define		dwSrcWidth		esp + pushsize + localsize + 28
+%define		dwSrcHeight		esp + pushsize + localsize + 32
+%define		scale			esp + 0
+%define		uiScaleX			esp + pushsize + localsize + 36
+%define		uiScaleY			esp + pushsize + localsize + 40
+%define		tmpHeight		esp + 12
+%define		yInverse		esp + 16
+%define		xInverse		esp + 20
+%define		dstStep			esp + 24
+	sub		esp,			localsize
+	; xmm0 = 0 for the unpack below; xmm7/xmm6 = per-step weight increments; xmm5 = initial weights
+	pxor	xmm0,	xmm0
+	; horizontal increment: low 15 bits of uiScaleX and of its negation
+	mov		eax,	[uiScaleX]
+	and		eax,	32767
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767
+	movd	xmm1,		eax						; uinc(uiScaleX mod 32768)
+	movd	xmm2,		ebx						; -uinc
+	psllq	xmm1,		32
+	por		xmm1,		xmm2					; 0 0  uinc  -uinc   (dword)
+	pshufd	xmm7,		xmm1,	01000100b		; xmm7: uinc -uinc uinc -uinc
+	
+	mov		eax,	[uiScaleY]
+	and		eax,	32767
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767
+	movd	xmm6,		eax						; vinc(uiScaleY mod 32768)
+	movd	xmm2,		ebx						; -vinc
+	psllq	xmm6,		32
+	por		xmm6,		xmm2					; 0 0 vinc -vinc (dword)
+	pshufd	xmm6,		xmm6,	01010000b		; xmm6: vinc vinc -vinc -vinc
+	
+	mov		edx,		40003fffh
+	movd	xmm5,		edx
+	punpcklwd	xmm5,	xmm0					; 16384 16383
+	pshufd	xmm5,		xmm5,	01000100b		; xmm5: 16384 16383 16384 16383
+	
+
+DOWNSAMPLE:
+	; per-call state: edi = output cursor, dstStep = padding after each output row, yInverse = 0.5 in Q15
+	mov		edi,			[pDstData]
+	mov		edx,			[dwDstStride]
+	mov		ecx,			[dwDstWidth]
+	sub		edx,			ecx
+	mov		[dstStep],	edx				; stride - width
+	mov		eax,			16384
+	mov		[yInverse],		eax
+	pshufd	xmm4,		xmm5,	01010000b	; initial v to 16384 16384 16383 16383
+	; tmpHeight = rows to interpolate (all but the last); none left => go straight to the last row
+	mov		eax,			[dwDstHeight]
+	dec		eax
+	mov		[tmpHeight],	eax			; mov leaves the flags of 'dec eax' intact
+	jle		near LAST_ROW
+HEIGHT:	
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15					; integer source row
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+	mov		ebp,	esi
+	add		ebp,	[dwSrcStride]		; ebp = the source row below
+	
+	mov		eax,		16384
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]
+	dec		ecx							; interpolate every column but the last
+	jle		near WIDTH_END				; 'loop' with ecx = 0 would wrap and run 2^32 times
+	movdqa	xmm3,		xmm5			; initial u to 16384 16383 16384 16383
+	
+WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		15				; integer source column
+	
+	movd	xmm1,		[esi+eax]		; xxxxxxba
+	movd	xmm2,		[ebp+eax]		; xxxxxxdc
+	pxor	xmm0,		xmm0
+	punpcklwd	xmm1,	xmm2			; xxxxdcba
+	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
+	punpcklwd	xmm1,	xmm0			; 000d000c000b000a
+	
+	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
+	pmaddwd	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
+	movdqa	xmm0,	xmm2
+	pmuludq	xmm2,	xmm1	; weights * pixels for dwords 0 and 2
+	psrlq	xmm0,	32
+	psrlq	xmm1,	32
+	pmuludq	xmm0,	xmm1	; weights * pixels for dwords 1 and 3
+	paddq	xmm2,	xmm0
+	pshufd	xmm1,	xmm2,	00001110b
+	paddq	xmm2,	xmm1	; low qword = sum of the four weighted pixels
+	psrlq	xmm2,	29
+	
+	movd	eax,	xmm2
+	inc		eax				; together with the shift below: round to nearest
+	shr		eax,	1
+	mov		[edi],	al
+	inc		edi
+	
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+	
+	paddw	xmm3,		xmm7			; inc u
+	psllw	xmm3,		1				; shift pair clears bit 15: weights stay modulo 32768
+	psrlw	xmm3,		1
+	
+	loop	WIDTH
+
+WIDTH_END:
+	; last column of the row: copy the nearest source pixel without interpolation
+	mov		eax,		[xInverse]
+	shr		eax,		15
+	mov		cl,			[esi+eax]
+	mov		[edi],		cl
+	inc		edi
+	
+	mov		eax,		[uiScaleY]
+	add		[yInverse],	eax
+	add		edi,		[dstStep]
+	
+	paddw	xmm4,	xmm6				; inc v
+	psllw	xmm4,	1					; shift pair clears bit 15: weights stay modulo 32768
+	psrlw	xmm4,	1
+	
+	dec		dword [tmpHeight]
+	jg		HEIGHT
+
+
+LAST_ROW:	
+	; last output row: every pixel is copied from the nearest source pixel
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+	
+	mov		eax,		16384
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]
+	
+LAST_ROW_WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		15
+	
+	mov		al,			[esi+eax]
+	mov		[edi],	al
+	inc		edi
+	
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+	
+	loop	LAST_ROW_WIDTH
+
+LAST_ROW_END:
+
+	add		esp,			localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		pushsize
+%undef		localsize
+%undef		pSrcData
+%undef		dwSrcWidth
+%undef		dwSrcHeight
+%undef		dwSrcStride
+%undef		pDstData
+%undef		dwDstWidth
+%undef		dwDstHeight
+%undef		dwDstStride
+%undef		scale
+%undef		uiScaleX
+%undef		uiScaleY
+%undef		tmpHeight
+%undef		yInverse
+%undef		xInverse
+%undef		dstStep
+	ret
+	
+	
+	
+	
+WELS_EXTERN	GeneralBilinearFastDownsampler_sse2
+;**************************************************************************************************************
+;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+;				unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;               unsigned int uiScaleX, unsigned int uiScaleY );
+; Bilinear resample; uiScaleX is a Q16 step, uiScaleY a Q15 step. Last column and last row are point-sampled. eax holds no result.
+;**************************************************************************************************************
+; A destination one pixel wide or one row high skips the interpolating loops instead of over-running them.
+ALIGN 16
+GeneralBilinearFastDownsampler_sse2:
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+%define		pushsize	16
+%define		localsize	28
+%define		pDstData		esp + pushsize + localsize + 4
+%define		dwDstStride		esp + pushsize + localsize + 8
+%define		dwDstWidth		esp + pushsize + localsize + 12
+%define		dwDstHeight		esp + pushsize + localsize + 16
+%define		pSrcData		esp + pushsize + localsize + 20
+%define		dwSrcStride		esp + pushsize + localsize + 24
+%define		dwSrcWidth		esp + pushsize + localsize + 28
+%define		dwSrcHeight		esp + pushsize + localsize + 32
+%define		scale			esp + 0
+%define		uiScaleX			esp + pushsize + localsize + 36
+%define		uiScaleY			esp + pushsize + localsize + 40
+%define		tmpHeight		esp + 12
+%define		yInverse		esp + 16
+%define		xInverse		esp + 20
+%define		dstStep			esp + 24
+	sub		esp,			localsize
+	; xmm0 = 0 for the whole routine; xmm7/xmm6 = per-step weight increments; xmm5 = initial u weights
+	pxor	xmm0,	xmm0
+	mov		edx,	65535
+	mov		eax,	[uiScaleX]
+	and		eax,	edx
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	65535
+	movd	xmm1,		eax						; uinc(uiScaleX mod 65536)
+	movd	xmm2,		ebx						; -uinc
+	psllq	xmm1,		32
+	por		xmm1,		xmm2					; 0 uinc 0 -uinc
+	pshuflw	xmm7,		xmm1,	10001000b		; xmm7: uinc -uinc uinc -uinc
+	
+	mov		eax,	[uiScaleY]
+	and		eax,	32767
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767
+	movd	xmm6,		eax						; vinc(uiScaleY mod 32768)
+	movd	xmm2,		ebx						; -vinc
+	psllq	xmm6,		32
+	por		xmm6,		xmm2					; 0 vinc 0 -vinc
+	pshuflw	xmm6,		xmm6,	10100000b		; xmm6: vinc vinc -vinc -vinc
+	
+	mov		edx,		80007fffh				; 32768 32767
+	movd	xmm5,		edx					
+	pshuflw	xmm5,		xmm5,		01000100b	; 32768 32767 32768 32767
+	mov		ebx,		16384					; rounding term added before the final >> 15
+	
+
+FAST_DOWNSAMPLE:
+	; per-call state: edi = output cursor, dstStep = padding after each output row, yInverse = 0.5 in Q15
+	mov		edi,			[pDstData]
+	mov		edx,			[dwDstStride]
+	mov		ecx,			[dwDstWidth]
+	sub		edx,			ecx
+	mov		[dstStep],	edx				; stride - width
+	mov		eax,		16384
+	mov		[yInverse],		eax
+	pshuflw	xmm4,		xmm5,	01010000b
+	psrlw	xmm4,		1				; initial v to 16384 16384 16383 16383
+	; tmpHeight = rows to interpolate (all but the last); none left => go straight to the last row
+	mov		eax,			[dwDstHeight]
+	dec		eax
+	mov		[tmpHeight],	eax			; mov leaves the flags of 'dec eax' intact
+	jle		near FAST_LAST_ROW
+FAST_HEIGHT:	
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15					; integer source row
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+	mov		ebp,	esi
+	add		ebp,	[dwSrcStride]		; ebp = the source row below
+	
+	mov		eax,		32768
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]
+	dec		ecx							; interpolate every column but the last
+	jle		near FAST_WIDTH_END			; 'loop' with ecx = 0 would wrap and run 2^32 times
+	movdqa	xmm3,		xmm5			; initial u to 32768 32767 32768 32767
+	
+FAST_WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		16				; integer source column
+	
+	movd	xmm1,		[esi+eax]		; xxxxxxba
+	movd	xmm2,		[ebp+eax]		; xxxxxxdc
+	punpcklwd	xmm1,	xmm2			; xxxxdcba
+	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
+	
+	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
+	pmulhuw	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
+	pmaddwd		xmm2,	xmm1	; weights * pixels, summed in pairs
+	pshufd	xmm1,	xmm2,	00000001b
+	paddd	xmm2,	xmm1	; low dword = sum of the four weighted pixels
+	movd	xmm1,	ebx
+	paddd	xmm2,	xmm1	; + 16384 so the shift below rounds to nearest
+	psrld	xmm2,	15
+	
+	packuswb	xmm2,	xmm0
+	movd	eax,	xmm2
+	mov		[edi],	al
+	inc		edi
+	
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+	
+	paddw	xmm3,		xmm7			; inc u
+	
+	loop	FAST_WIDTH
+
+FAST_WIDTH_END:
+	; last column of the row: copy the nearest source pixel without interpolation
+	mov		eax,		[xInverse]
+	shr		eax,		16
+	mov		cl,			[esi+eax]
+	mov		[edi],		cl
+	inc		edi
+	
+	mov		eax,		[uiScaleY]
+	add		[yInverse],	eax
+	add		edi,		[dstStep]
+	
+	paddw	xmm4,	xmm6				; inc v
+	psllw	xmm4,	1					; shift pair clears bit 15: weights stay modulo 32768
+	psrlw	xmm4,	1
+	
+	dec		dword [tmpHeight]
+	jg		FAST_HEIGHT
+
+
+FAST_LAST_ROW:	
+	; last output row: every pixel is copied from the nearest source pixel
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+	
+	mov		eax,		32768
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]
+	
+FAST_LAST_ROW_WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		16
+	
+	mov		al,			[esi+eax]
+	mov		[edi],	al
+	inc		edi
+	
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+	
+	loop	FAST_LAST_ROW_WIDTH
+
+FAST_LAST_ROW_END:
+
+	add		esp,			localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		pushsize
+%undef		localsize
+%undef		pSrcData
+%undef		dwSrcWidth
+%undef		dwSrcHeight
+%undef		dwSrcStride
+%undef		pDstData
+%undef		dwDstWidth
+%undef		dwDstHeight
+%undef		dwDstStride
+%undef		scale
+%undef		uiScaleX
+%undef		uiScaleY
+%undef		tmpHeight
+%undef		yInverse
+%undef		xInverse
+%undef		dstStep
+	ret
\ No newline at end of file
--- /dev/null
+++ b/processing/src/asm/intra_pred.asm
@@ -1,0 +1,145 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  intra_pred.asm
+;*
+;*  Abstract
+;*      sse2 function for intra predict operations
+;*
+;*  History
+;*      18/09/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "../../src/asm/asm_inc.asm"
+
+BITS 32
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata data
+%else
+SECTION .rodata align=16
+%endif
+
+
+align 16					; the table is used as a 128-bit memory operand, which must be 16-byte aligned
+mmx_01bytes:		times 16	db 1	; sixteen 0x01 bytes: multiplying a byte value by 0x01010101 replicates it
+
+;***********************************************************************
+; macros
+;***********************************************************************
+%macro  COPY_16_TIMES 2			; %1 = address register, %2 = xmm destination: fill %2 with the byte at [%1-1]
+		movdqa		%2,	[%1-16]			; aligned load of the 16 bytes ending just before %1
+		psrldq		%2,	15				; keep only the last byte ([%1-1]) in byte 0
+		pmuludq		%2,	[mmx_01bytes]	; byte * 0x01010101: low dword = the byte repeated four times
+		pshufd		%2,	%2, 0			; broadcast that dword to all four dwords
+%endmacro
+
+%macro  COPY_16_TIMESS 3			; as COPY_16_TIMES, but the source byte is at [%1+%3-1] (%3 = offset register)
+		movdqa		%2,	[%1+%3-16]		; aligned load of the 16 bytes ending just before %1+%3
+		psrldq		%2,	15				; keep only the last byte in byte 0
+		pmuludq		%2,	[mmx_01bytes]	; byte * 0x01010101: low dword = the byte repeated four times
+		pshufd		%2,	%2, 0			; broadcast that dword to all four dwords
+%endmacro
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+;***********************************************************************
+; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+
+%macro SSE2_PRED_H_16X16_TWO_LINE 1	; %1 = byte offset into pred (edx) for the first of two rows
+    lea     eax,	[eax+ecx*2]			; advance the reference pointer by two rows (ecx = stride)
+    ; row n: replicate the byte at [eax-1]; row n+1: replicate the byte at [eax+ecx-1]
+    COPY_16_TIMES eax,	xmm0
+    movdqa  [edx+%1],	xmm0
+    COPY_16_TIMESS eax,	xmm0,	ecx
+    movdqa  [edx+%1+0x10],	xmm0
+%endmacro
+
+WELS_EXTERN WelsI16x16LumaPredH_sse2
+WelsI16x16LumaPredH_sse2:
+    mov     edx, [esp+4]    ; pred
+    mov     eax, [esp+8]	; pRef
+    mov     ecx, [esp+12]   ; stride
+    ; rows 0 and 1: each output row is 16 copies of the byte just left of that reference row
+    COPY_16_TIMES eax,	xmm0
+    movdqa  [edx],		xmm0
+    COPY_16_TIMESS eax,	xmm0,	ecx
+    movdqa  [edx+0x10],	xmm0
+    ; rows 2..15, two per macro use; pred is written as 16 contiguous 16-byte rows
+	SSE2_PRED_H_16X16_TWO_LINE   0x20 
+	SSE2_PRED_H_16X16_TWO_LINE   0x40
+	SSE2_PRED_H_16X16_TWO_LINE   0x60
+	SSE2_PRED_H_16X16_TWO_LINE   0x80
+	SSE2_PRED_H_16X16_TWO_LINE   0xa0
+	SSE2_PRED_H_16X16_TWO_LINE   0xc0
+	SSE2_PRED_H_16X16_TWO_LINE   0xe0
+   
+    ret
+    
+;***********************************************************************
+; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;   Vertical prediction: copies the 16 bytes found one stride before pRef
+;   into 16 consecutive 16-byte rows of pred (256 bytes written).
+;   movdqa is used for the load and for every store, so (pRef - stride)
+;   and pred must both be 16-byte aligned.
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredV_sse2
+WelsI16x16LumaPredV_sse2:
+    mov     ecx, [esp+12]   ; stride
+    mov     eax, [esp+8]    ; pRef
+    mov     edx, [esp+4]    ; pred
+
+    ; fetch the row directly above the block
+    sub     eax, ecx
+    movdqa  xmm0, [eax]
+
+    ; replicate it into all 16 destination rows (offsets 0, 16, ..., 240);
+    ; the %rep block expands to the same 16 stores, in the same order
+%assign pred_v_row_off 0
+%rep 16
+    movdqa  [edx+pred_v_row_off], xmm0
+%assign pred_v_row_off pred_v_row_off+16
+%endrep
+%undef pred_v_row_off
+
+    ; nothing is returned; eax, ecx, edx and xmm0 are left modified,
+    ; no other register is touched
+
+    ret
\ No newline at end of file
--- /dev/null
+++ b/processing/src/asm/sad.asm
@@ -1,0 +1,216 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  sad.asm
+;*
+;*  Abstract
+;*      WelsSampleSad8x8_sse21
+;*
+;*  History
+;*      8/5/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+%macro SAD_8x4 0				; xmm6 += SAD of four 8-byte rows; same body as SSE2_GetSad8x4, not used in this file
+	movq   xmm0,   [eax]			; block 1, row 0 (eax = pointer, ebx = stride)
+	movq   xmm1,   [eax+ebx]		; block 1, row 1
+	lea    eax,    [eax+2*ebx]
+	movhps xmm0,   [eax]			; block 1, row 2 into the high half
+	movhps xmm1,   [eax+ebx]		; block 1, row 3 into the high half
+
+	movq   xmm2,   [ecx]			; block 2, row 0 (ecx = pointer, edx = stride)
+	movq   xmm3,   [ecx+edx]		; block 2, row 1
+	lea    ecx,    [ecx+2*edx]
+	movhps xmm2,   [ecx]			; block 2, row 2
+	movhps xmm3,   [ecx+edx]		; block 2, row 3
+	psadbw xmm0,   xmm2			; one partial sum per 64-bit half
+	psadbw xmm1,   xmm3
+	paddw  xmm6,   xmm0
+	paddw  xmm6,   xmm1			; on exit eax/ecx have advanced by two rows only
+%endmacro
+
+
+  
+%macro CACHE_SPLIT_CHECK 3 ; address (register, clobbered), width, cacheline; only the flags are the result
+and    %1,  0x1f|(%3>>1)		; offset of the address within the line (mask 0x3f when cacheline = 64)
+cmp    %1,  (32-%2)|(%3>>1)	; last offset at which a %2-byte access still fits (56 for width 8, cacheline 64)
+%endmacro
+
+
+%macro SSE2_GetSad8x4 0			; xmm6 += SAD of four 8-byte rows of [eax] (stride ebx) against [ecx] (stride edx)
+	movq   xmm0,   [eax]			; block 1, row 0
+	movq   xmm1,   [eax+ebx]		; block 1, row 1
+	lea    eax,    [eax+2*ebx]
+	movhps xmm0,   [eax]			; block 1, row 2 into the high half
+	movhps xmm1,   [eax+ebx]		; block 1, row 3 into the high half
+
+	movq   xmm2,   [ecx]			; block 2, row 0
+	movq   xmm3,   [ecx+edx]		; block 2, row 1
+	lea    ecx,    [ecx+2*edx]
+	movhps xmm2,   [ecx]			; block 2, row 2
+	movhps xmm3,   [ecx+edx]		; block 2, row 3
+	psadbw xmm0,   xmm2			; one partial sum per 64-bit half
+	psadbw xmm1,   xmm3
+	paddw  xmm6,   xmm0
+	paddw  xmm6,   xmm1			; on exit eax/ecx have advanced by two rows only
+%endmacro
+
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+WELS_EXTERN WelsSampleSad8x8_sse21
+WelsSampleSad8x8_sse21:
+    mov    ecx,    [esp+12]			; 3rd argument: pointer to the second 8x8 block (presumably the reference - confirm against the C prototype)
+	mov    edx,    ecx
+    CACHE_SPLIT_CHECK edx, 8, 64	; does an 8-byte load at ecx cross a 64-byte line?
+	jle    near   .pixel_sad_8x8_nsplit	; offset within the line <= 56: plain unaligned loads are used
+	push   ebx
+	push   edi
+	mov    eax,    [esp+12]			; 1st argument (two pushes shift the offsets by 8): first block pointer
+	mov    ebx,    [esp+16]			; 2nd argument: first block stride
+    ; split path: build each second-block row from two 8-byte-aligned loads
+    pxor   xmm7,   xmm7				; xmm7 = SAD accumulator
+    
+    mov    edi,    ecx
+    and    edi,    0x07				; edi = misalignment of ecx in bytes
+    sub    ecx,    edi   			; ecx = address rounded down to 8 bytes
+    mov    edx,    8
+    sub    edx,    edi				; edx = 8 - misalignment
+    
+    shl    edi,    3				; bytes -> bits
+    shl    edx,    3
+    movd   xmm5,   edi				; right-shift count for the lower aligned qword
+    movd   xmm6,   edx				; left-shift count for the upper aligned qword
+	mov    edi,    8
+	add    edi,    ecx				; edi = the next aligned qword
+    mov    edx,    [esp+24]			; 4th argument: second block stride
+    ; rows 0-1 (one row per 64-bit half of each register)
+    movq   xmm0,   [eax]
+	movhps xmm0,   [eax+ebx]
+		
+	movq   xmm1,   [ecx]
+	movq   xmm2,   [edi]
+	movhps xmm1,   [ecx+edx]
+	movhps xmm2,   [edi+edx]
+	psrlq  xmm1,   xmm5				; drop the bytes before the wanted start
+	psllq  xmm2,   xmm6				; move the spill-over bytes to the top
+	por    xmm1,   xmm2				; = the 8 unaligned bytes of each row
+	
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+	
+	lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+	lea    edi,    [edi+2*edx]
+	; rows 2-3
+    movq   xmm0,   [eax]
+	movhps xmm0,   [eax+ebx]
+		
+	movq   xmm1,   [ecx]
+	movq   xmm2,   [edi]
+	movhps xmm1,   [ecx+edx]
+	movhps xmm2,   [edi+edx]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+	
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+	lea    edi,    [edi+2*edx]
+	; rows 4-5
+    movq   xmm0,   [eax]
+	movhps xmm0,   [eax+ebx]
+		
+	movq   xmm1,   [ecx]
+	movq   xmm2,   [edi]
+	movhps xmm1,   [ecx+edx]
+	movhps xmm2,   [edi+edx]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+	
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+	
+	lea    eax,    [eax+2*ebx]
+	lea    ecx,    [ecx+2*edx]
+	lea    edi,    [edi+2*edx]
+	; rows 6-7
+    movq   xmm0,   [eax]
+	movhps xmm0,   [eax+ebx]
+		
+	movq   xmm1,   [ecx]
+	movq   xmm2,   [edi]
+	movhps xmm1,   [ecx+edx]
+	movhps xmm2,   [edi+edx]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+	
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+	; add the two 64-bit partial sums; the total is returned in eax
+    movhlps    xmm0, xmm7
+	paddw      xmm0, xmm7
+	movd       eax,  xmm0
+	pop        edi
+	jmp        .return				; ebx is restored at .return
+.pixel_sad_8x8_nsplit:
+    push   ebx
+    mov    eax,    [esp+8]			; 1st argument (one push shifts the offsets by 4): first block pointer
+	mov    ebx,    [esp+12]			; 2nd argument: first block stride
+	mov    edx,    [esp+20]    		; 4th argument: second block stride (ecx still holds the 3rd)
+	pxor   xmm6,   xmm6				; xmm6 = SAD accumulator used by SSE2_GetSad8x4
+	SSE2_GetSad8x4					; rows 0-3; leaves eax/ecx two rows further on
+    lea    eax,    [eax+2*ebx]		; step both pointers to row 4
+	lea    ecx,    [ecx+2*edx]
+    SSE2_GetSad8x4    				; rows 4-7
+    movhlps    xmm0, xmm6			; add the two 64-bit partial sums
+	paddw      xmm0, xmm6
+	movd       eax,  xmm0			; return value
+.return:
+	pop        ebx
+	ret
\ No newline at end of file
--- /dev/null
+++ b/processing/src/asm/vaa.asm
@@ -1,0 +1,1587 @@
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	vaa.asm
+;*
+;*	Abstract
+;*      sse2 for pVaa routines
+;*
+;*  History
+;*      04/14/2010	Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;%macro SUM_SSE2	4	; dst, pSrc, zero, pack1_8x2
+;	movdqa %1, %2
+;	punpcklbw %1, %3
+;	punpckhbw %2, %3
+;	paddw %1, %2
+;	pmaddwd %1, %4
+;	pshufd %2, %1, 04Eh	; 01001110 B
+;	paddd %1, %2
+;	pshufd %2, %1, 0B1h	; 10110001 B
+;	paddd %1, %2
+;%endmacro	; END OF SUM_SSE2
+
+; Adds the eight 16-bit words of %1 into word 0; per the original author this measured faster than the phaddw (SSSE3) sequence
+%macro SUM_WORD_8x2_SSE2	2	; dst(pSrc), tmp
+	; @sum_8x2 begin
+	pshufd %2, %1, 04Eh	; 01001110 B: swap the two 64-bit halves
+	paddw %1, %2		; each low word now holds w[i] + w[i+4]
+	pshuflw %2, %1, 04Eh	; 01001110 B: swap the two dwords of the low half
+	paddw %1, %2		; words 0/1 now hold sums of four inputs each
+	pshuflw %2, %1, 0B1h	; 10110001 B: swap adjacent words of the low half
+	paddw %1, %2		; word 0 = sum of all eight inputs (16-bit wrap-around)
+	; end of @sum_8x2
+%endmacro	; END of SUM_WORD_8x2_SSE2
+
+%macro SUM_SQR_SSE2	3	; dst, pSrc, zero: dst = sum of squares of the 16 bytes in pSrc (pSrc is clobbered)
+	movdqa %1, %2
+	punpcklbw %1, %3	; low 8 bytes widened to words (%3 must be all zero)
+	punpckhbw %2, %3	; high 8 bytes widened to words
+	pmaddwd %1, %1		; square each word and add neighbouring pairs into dwords
+	pmaddwd %2, %2
+	paddd %1, %2
+	pshufd %2, %1, 04Eh	; 01001110 B: swap the two 64-bit halves
+	paddd %1, %2
+	pshufd %2, %1, 0B1h	; 10110001 B: swap adjacent dwords
+	paddd %1, %2		; every dword of %1 now holds the full sum
+%endmacro	; END OF SUM_SQR_SSE2
+
+%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4: needs xmm7 = 0; rows at esi, esi+ecx, esi+ebx, esi+edx
+	movdqa %1, [esi    ]	; line 0
+	movdqa %2, [esi+ecx]	; line 1
+	movdqa %3, %1
+	punpcklbw %1, xmm7	; line 0, columns 0-7 as words
+	punpckhbw %3, xmm7	; line 0, columns 8-15 as words
+	movdqa %4, %2
+	punpcklbw %4, xmm7	; line 1, columns 0-7
+	punpckhbw %2, xmm7	; line 1, columns 8-15
+	paddw %1, %4		; lines 0+1, columns 0-7
+	paddw %2, %3		; lines 0+1, columns 8-15
+	movdqa %3, [esi+ebx]	; line 2
+	movdqa %4, [esi+edx]	; line 3
+	movdqa %5, %3
+	punpcklbw %3, xmm7
+	punpckhbw %5, xmm7
+	movdqa %6, %4
+	punpcklbw %6, xmm7
+	punpckhbw %4, xmm7
+	paddw %3, %6		; lines 2+3, columns 0-7
+	paddw %4, %5		; lines 2+3, columns 8-15
+	paddw %1, %3	; block 0, 1
+	paddw %2, %4	; block 2, 3
+	pshufd %3, %1, 0B1h	; horizontal add within each group of four columns starts here
+	pshufd %4, %2, 0B1h
+	paddw %1, %3
+	paddw %2, %4
+	movdqa %3, %1
+	movdqa %4, %2
+	pshuflw %5, %1, 0B1h
+	pshufhw %6, %3, 0B1h
+	paddw %1, %5		; low half of %1: total of block 0
+	paddw %3, %6		; high half of %3: total of block 1
+	pshuflw %5, %2, 0B1h
+	pshufhw %6, %4, 0B1h
+	paddw %2, %5		; low half of %2: total of block 2
+	paddw %4, %6		; high half of %4: total of block 3
+	punpcklwd %1, %2
+	punpckhwd %3, %4
+	punpcklwd %1, %3	; words 0..3 of %1 = totals of blocks 0..3
+	psraw %1, $4		; / 16 pixels per 4x4 block = block averages
+%endmacro
+
+%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4: needs xmm7 = 0; rows at esi, esi+ecx, esi+ebx, esi+edx
+	movdqa %1, [esi    ]	; line 0
+	movdqa %2, [esi+ecx]	; line 1
+	movdqa %3, %1
+	punpcklbw %1, xmm7	; line 0, columns 0-7 as words
+	punpckhbw %3, xmm7	; line 0, columns 8-15 as words
+	movdqa %4, %2
+	punpcklbw %4, xmm7	; line 1, columns 0-7
+	punpckhbw %2, xmm7	; line 1, columns 8-15
+	paddw %1, %4		; lines 0+1, columns 0-7
+	paddw %2, %3		; lines 0+1, columns 8-15
+	movdqa %3, [esi+ebx]	; line 2
+	movdqa %4, [esi+edx]	; line 3
+	movdqa %5, %3
+	punpcklbw %3, xmm7
+	punpckhbw %5, xmm7
+	movdqa %6, %4
+	punpcklbw %6, xmm7
+	punpckhbw %4, xmm7
+	paddw %3, %6		; lines 2+3, columns 0-7
+	paddw %4, %5		; lines 2+3, columns 8-15
+	paddw %1, %3	; block 0, 1
+	paddw %2, %4	; block 2, 3
+	phaddw %1, %2	; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
+	phaddw %1, xmm7	; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
+	psraw %1, $4		; / 16 pixels per 4x4 block = block averages in words 0..3
+%endmacro
+
+%macro WELS_SAD_16x2_SSE2  0		; xmm6 += SAD of two 16-byte rows of [esi] vs [edi]; both use stride ebx
+	movdqa	xmm1,	[esi]			; aligned loads: esi, edi and ebx must keep 16-byte alignment
+	movdqa	xmm2,	[edi]
+	movdqa	xmm3,	[esi+ebx]
+	movdqa	xmm4,	[edi+ebx]
+	psadbw	xmm1,	xmm2			; one partial sum per 64-bit half
+	psadbw	xmm3,	xmm4
+	paddd	xmm6,	xmm1
+	paddd	xmm6,	xmm3
+	lea		esi,	[esi+ebx*2]		; advance both pointers by the two rows consumed
+	lea		edi,	[edi+ebx*2]	
+%endmacro
+
+%macro	WELS_SAD_SUM_SQSUM_16x1_SSE2 0	; one 16-byte row: xmm6 += SAD, xmm5 += sum, xmm4 += sum of squares; needs xmm0 = 0
+	movdqa	xmm1,	[esi]
+	movdqa	xmm2,	[edi]
+	movdqa	xmm3,	xmm1
+	psadbw	xmm3,	xmm2
+	paddd	xmm6,	xmm3			; SAD of [esi] against [edi]
+	; SAD against zero = plain sum of the [esi] bytes
+	movdqa	xmm3,	xmm1
+	psadbw	xmm3,	xmm0
+	paddd	xmm5,	xmm3
+	; squares of the [esi] bytes, accumulated as four dword partial sums
+	movdqa		xmm2,	xmm1
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm2,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm2,	xmm2
+	paddd		xmm4,	xmm1
+	paddd		xmm4,	xmm2
+	; next row; both pointers use stride ebx
+	add		esi,	ebx
+	add		edi,	ebx
+%endmacro
+
+%macro	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0	; one 16-byte row; accumulates xmm7/xmm6/xmm5/xmm4 as labelled; needs xmm0 = 0
+	movdqa	xmm1,	[esi]
+	movdqa	xmm2,	[edi]
+	movdqa	xmm3,	xmm1
+	psadbw	xmm3,	xmm2
+	paddd	xmm7,	xmm3	; sad
+	; |a - b| per byte, computed as max(a,b) - min(a,b)
+	movdqa	xmm3,	xmm1
+	pmaxub	xmm3,	xmm2
+	pminub	xmm2,	xmm1
+	psubb	xmm3,	xmm2	; diff
+	; SAD against zero = plain sum of the [esi] bytes
+	movdqa	xmm2,	xmm1
+	psadbw	xmm2,	xmm0
+	paddd	xmm6,	xmm2	; sum
+	; squares of the [esi] bytes
+	movdqa		xmm2,	xmm1
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm2,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm2,	xmm2
+	paddd		xmm5,	xmm1
+	paddd		xmm5,	xmm2	; sqsum
+	; squares of the absolute differences
+	movdqa		xmm1,	xmm3
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm3,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm3,	xmm3
+	paddd		xmm4,	xmm1
+	paddd		xmm4,	xmm3	; sqdiff
+	; next row; both pointers use stride ebx
+	add		esi,	ebx
+	add		edi,	ebx
+%endmacro
+
+%macro	WELS_SAD_SD_MAD_16x1_SSE2	4	; one 16-byte row of [esi] vs [edi]; needs xmm0 = 0; uses xmm1-xmm3 as scratch
+%define sad_reg			%1
+%define	sum_cur_reg		%2
+%define sum_ref_reg		%3
+%define	mad_reg			%4
+	movdqa	xmm1,		[esi]
+	movdqa	xmm2,		[edi]
+	movdqa	xmm3,		xmm1
+	psadbw	xmm3,		xmm0
+	paddd	sum_cur_reg,			xmm3	; sum_cur
+	movdqa	xmm3,		xmm2
+	psadbw	xmm3,		xmm0
+	paddd	sum_ref_reg,			xmm3	; sum_ref
+	; |a - b| per byte, computed as max(a,b) - min(a,b)
+	movdqa	xmm3,		xmm1
+	pmaxub	xmm3,		xmm2
+	pminub	xmm2,		xmm1
+	psubb	xmm3,		xmm2	; abs diff
+	pmaxub	mad_reg,	xmm3	; max abs diff
+	; summing the absolute differences gives the SAD
+	psadbw	xmm3,		xmm0
+	paddd	sad_reg,	xmm3	; sad
+	; next row; both pointers use stride ebx
+	add			esi,		ebx
+	add			edi,		ebx
+%endmacro
+
+
+%macro	WELS_MAX_REG_SSE2	1	; byte-wise max within each 8-byte half of %1; only xmm1 is actually used as scratch
+%define max_reg  %1
+	movdqa	xmm1,		max_reg
+	psrldq	xmm1,		4
+	pmaxub	max_reg,	xmm1	; byte i = max(b[i], b[i+4])
+	movdqa	xmm1,		max_reg
+	psrldq	xmm1,		2
+	pmaxub	max_reg,	xmm1	; byte i now covers b[i], b[i+2], b[i+4], b[i+6]
+	movdqa	xmm1,		max_reg
+	psrldq	xmm1,		1
+	pmaxub	max_reg,	xmm1	; byte 0 = max of bytes 0-7, byte 8 = max of bytes 8-15
+%endmacro
+
+; WELS_SAD_BGD_SQDIFF_16x1_SSE2 sad, sum, mad, sqdiff
+; Accumulate statistics for one 16-byte row into the four registers given.
+;   in : esi, edi = row pointers (read with movdqa, so 16-byte aligned),
+;        ebx      = byte stride added to both pointers afterwards,
+;        xmm0     = expected to be all zero (the caller in this file clears it)
+;   out: %1 dwords 0 and 2 += SAD of the low / high 8 bytes,
+;        %1 dwords 1 and 3 += partial sums of the squared [esi] bytes
+;                             (their total is the row's square sum),
+;        %2 dwords 0 and 2 += byte sum of the low / high half of [esi],
+;        %2 dwords 1 and 3 += byte sum of the low / high half of [edi],
+;        %3  = per-byte running maximum of |[esi] - [edi]|,
+;        %4 += squares of |[esi] - [edi]|, four dword partial sums
+;   clobbers xmm1, xmm2, xmm3
+%macro	WELS_SAD_BGD_SQDIFF_16x1_SSE2	4
+%define sad_reg		%1
+%define	sum_reg		%2
+%define mad_reg		%3
+%define sqdiff_reg	%4
+	movdqa		xmm1,		[esi]
+	movdqa		xmm2,		xmm1
+	movdqa		xmm3,		xmm1
+	punpcklbw	xmm2,		xmm0
+	punpckhbw	xmm3,		xmm0
+	pmaddwd		xmm2,		xmm2
+	pmaddwd		xmm3,		xmm3
+	paddd		xmm2,		xmm3	; four dwords of squared-pixel partial sums
+	movdqa		xmm3,		xmm2
+	psllq		xmm2,		32		; move the even dwords up into the odd lanes
+	psrlq		xmm3,		32
+	psllq		xmm3,		32		; keep the odd dwords, clear the even lanes
+	paddd		xmm2,		xmm3	; odd lanes = pair sums, even lanes = 0
+	paddd		sad_reg,	xmm2		; sqsum
+	
+	movdqa	xmm2,		[edi]
+	movdqa	xmm3,		xmm1
+	psadbw	xmm3,		xmm0
+	paddd	sum_reg,			xmm3	; sum_cur
+	movdqa	xmm3,		xmm2
+	psadbw	xmm3,		xmm0
+	pslldq	xmm3,		4	; shift the two sums into the odd dword lanes
+	paddd	sum_reg,			xmm3	; sum_ref
+	
+	movdqa	xmm3,		xmm1
+	pmaxub	xmm3,		xmm2
+	pminub	xmm2,		xmm1
+	psubb	xmm3,		xmm2	; abs diff
+	pmaxub	mad_reg,	xmm3	; max abs diff
+	
+	movdqa	xmm1,		xmm3	; keep the abs diff bytes for the sqdiff step
+	psadbw	xmm3,		xmm0
+	paddd	sad_reg,	xmm3	; sad
+
+	movdqa		xmm3,	xmm1
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm3,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm3,	xmm3
+	paddd		sqdiff_reg,	xmm1
+	paddd		sqdiff_reg,	xmm3	; sqdiff
+	
+	add		esi,	ebx
+	add		edi,	ebx
+%endmacro
+
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+;SECTION .rodata align=16
+
+;ALIGN 16
+;pack1_8x2:
+;	dw 1, 1, 1, 1, 1, 1, 1, 1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN rc_sad_frame_sse2
+;***********************************************************************
+;	uint32_t rc_sad_frame_sse2(	uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride );
+;
+;	Returns in eax the sum of absolute differences between the two
+;	pictures. Each of the iPicHeight rows is compared over mb_width
+;	consecutive 16-byte groups; consecutive rows are iPicStride bytes apart.
+;	Rows are read with movdqa, so every row start must be 16-byte aligned.
+;	Both loops decrement their counter before testing it, so mb_width and
+;	iPicHeight must be non-zero.
+;***********************************************************************
+ALIGN 16
+rc_sad_frame_sse2:
+	push esi
+	push edi
+	push ebp
+	push ebx
+	push edx
+
+	; five pushes = 20 bytes, so the first argument sits at esp+24
+	mov esi, [esp+24]	; ref_orig
+	mov edi, [esp+28]	; cur_orig
+	mov ebx, [esp+32]	; mb_width
+	mov ecx, [esp+36]	; iPicHeight (row counter)
+	mov edx, [esp+40]	; iPicStride
+	pxor xmm0, xmm0		; running total lives in the low dword of xmm0
+.hloop:
+	mov eax, ebx		; 16-byte groups left in this row
+	mov ebp, $0			; byte offset inside the row
+.wloop:
+	movdqa xmm1, [esi+ebp]
+	movdqa xmm2, [edi+ebp]
+	psadbw xmm1, xmm2	; partial sums land in dword 0 and dword 2
+	pshufd xmm2, xmm1, 0f6h	; 11110110 B ; movhlps for float
+	paddd xmm1, xmm2	; dword 0 = SAD of all 16 bytes
+	paddd xmm0, xmm1	
+	add ebp, 010h
+	dec eax
+	jnz near .wloop
+	lea esi, [esi+edx]	; next row
+	lea edi, [edi+edx]
+	dec ecx
+	jnz near .hloop
+
+	movd eax, xmm0		; return value
+	pop edx
+	pop ebx
+	pop ebp
+	pop edi
+	pop esi
+	ret
+
+
+WELS_EXTERN SampleVariance16x16_sse2
+;***********************************************************************
+;   void SampleVariance16x16_sse2(	uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
+;
+;   Walks 16 rows of 16 bytes of y_ref / y_src (both read with movdqa) and
+;   keeps four running totals on the stack:
+;     SUM     = sum of |ref - src|        SQR     = sum of |ref - src|^2
+;     SUM_CUR = sum of src                SQR_CUR = sum of src^2
+;   It then stores two 16-bit values through pMotionTexture:
+;     [pMotionTexture]   = (SQR     >> 8) - (SUM     >> 8)^2
+;     [pMotionTexture+2] = (SQR_CUR >> 8) - (SUM_CUR >> 8)^2
+;   SUM_SQR_SSE2 and SUM_WORD_8x2_SSE2 are macros defined outside this
+;   part of the file; their use below assumes the result is left in the
+;   low dword / low word of the first operand -- TODO confirm.
+;***********************************************************************
+ALIGN 16
+SampleVariance16x16_sse2:	
+	push esi
+	push edi
+	push ebx
+	
+	sub esp, 16
+	%define SUM			[esp]
+	%define SUM_CUR		[esp+4]
+	%define SQR			[esp+8]
+	%define SQR_CUR		[esp+12]
+	%define PUSH_SIZE	28	; 12 + 16	
+
+	mov edi, [esp+PUSH_SIZE+4]	; y_ref
+	mov edx, [esp+PUSH_SIZE+8]	; y_ref_stride	
+	mov esi, [esp+PUSH_SIZE+12]	; y_src
+	mov eax, [esp+PUSH_SIZE+16]	; y_src_stride
+	mov ecx, 010h				; height = 16
+
+	pxor xmm7, xmm7
+	movdqu SUM, xmm7			; clears all four 32-bit accumulators at once
+
+.hloops:
+	movdqa xmm0, [edi]		; y_ref
+	movdqa xmm1, [esi]		; y_src
+	movdqa xmm2, xmm0		; store first for future process
+	movdqa xmm3, xmm1
+	; sum += diff;
+	movdqa xmm4, xmm0
+	psadbw xmm4, xmm1		; 2 parts, [0,..,15], [64,..,79]
+	; to be continued for sum
+	pshufd xmm5, xmm4, 0C6h	; 11000110 B
+	paddw xmm4, xmm5		; low word = SAD of the whole row
+	movd ebx, xmm4
+	add SUM, ebx
+
+	; sqr += diff * diff;
+	pmaxub xmm0, xmm1
+	pminub xmm1, xmm2
+	psubb xmm0, xmm1				; diff	
+	SUM_SQR_SSE2 xmm1, xmm0, xmm7	; dst, pSrc, zero
+	movd ebx, xmm1
+	add SQR, ebx
+
+	; sum_cur += y_src[x];
+	movdqa xmm0, xmm3		; cur_orig
+	movdqa xmm1, xmm0
+	punpcklbw xmm0, xmm7
+	punpckhbw xmm1, xmm7
+	paddw xmm0, xmm1		; 8x2
+	SUM_WORD_8x2_SSE2 xmm0, xmm1	
+	movd ebx, xmm0
+	and ebx, 0ffffh			; only the low word is used
+	add SUM_CUR, ebx
+
+	; sqr_cur += y_src[x] * y_src[x];
+	SUM_SQR_SSE2 xmm0, xmm3, xmm7	; dst, pSrc, zero
+	movd ebx, xmm0
+	add SQR_CUR, ebx
+	
+	lea edi, [edi+edx]
+	lea esi, [esi+eax]
+	dec ecx
+	jnz near .hloops
+	
+	mov ebx, 0
+	mov bx, word SUM			; only the low 16 bits of SUM are read
+	sar ebx, 8
+	imul ebx, ebx
+	mov ecx, SQR
+	sar ecx, 8
+	sub ecx, ebx
+	mov edi, [esp+PUSH_SIZE+20]	; pMotionTexture
+	mov [edi], cx				; to store uiMotionIndex
+	mov ebx, 0
+	mov bx, word SUM_CUR		; only the low 16 bits of SUM_CUR are read
+	sar ebx, 8
+	imul ebx, ebx
+	mov ecx, SQR_CUR
+	sar ecx, 8
+	sub ecx, ebx
+	mov [edi+2], cx				; to store uiTextureIndex
+	
+	%undef SUM
+	%undef SUM_CUR
+	%undef SQR
+	%undef SQR_CUR
+	%undef PUSH_SIZE
+
+	add esp, 16	
+	pop ebx
+	pop edi
+	pop esi	
+
+	ret
+
+; , 6/7/2010
+
+WELS_EXTERN AnalysisVaaInfoIntra_sse2
+;***********************************************************************
+;	int32_t AnalysisVaaInfoIntra_sse2(	uint8_t *pDataY, const int32_t linesize );
+;
+;	Runs VAA_AVG_BLOCK_SSE2 (defined outside this part of the file) four
+;	times, stepping pDataY by 4*linesize each time, and stores the low
+;	8 bytes of each result in a 32-byte, 16-byte aligned stack buffer.
+;	The buffer is then read back as 16 words and the function returns
+;	    (sum of squared words) - ((sum of words)^2 >> 4)
+;	where each square is the 16-bit pmullw result and the word sum is
+;	truncated to its low 16 bits.
+;	NOTE(review): presumably each word is a 4x4 block average, which
+;	would make the result 16x the variance of those averages -- confirm
+;	against the macro.
+;***********************************************************************
+ALIGN 16
+AnalysisVaaInfoIntra_sse2:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	; align esp down to 16 bytes; ebp remembers the adjustment so the
+	; arguments can still be addressed and esp can be restored
+	mov ebp, esp
+	and ebp, 0fh
+	sub esp, ebp
+	sub esp, 32	
+	%define PUSH_SIZE	52	; 20 + 32
+
+	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
+	mov ecx, [esp+ebp+PUSH_SIZE+8]	; linesize
+
+	mov ebx, ecx
+	sal ebx, $1			; linesize x 2 [ebx]
+	mov edx, ebx
+	add edx, ecx		; linesize x 3 [edx]
+	mov eax, ebx
+	sal eax, $1			; linesize x 4 [eax]
+	
+	pxor xmm7, xmm7
+	
+	; loops
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp], xmm0	
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+8], xmm0	
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+16], xmm0	
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+24], xmm0
+		
+	movdqa xmm0, [esp]		; block 0~7
+	movdqa xmm1, [esp+16]	; block 8~15
+	movdqa xmm2, xmm0
+	paddw xmm0, xmm1
+	SUM_WORD_8x2_SSE2 xmm0, xmm3
+	
+	; square every word, widen to dwords and add all 16 together
+	pmullw xmm1, xmm1
+	pmullw xmm2, xmm2
+	movdqa xmm3, xmm1
+	movdqa xmm4, xmm2
+	punpcklwd xmm1, xmm7
+	punpckhwd xmm3, xmm7
+	punpcklwd xmm2, xmm7
+	punpckhwd xmm4, xmm7
+	paddd xmm1, xmm2
+	paddd xmm3, xmm4
+	paddd xmm1, xmm3
+	pshufd xmm2, xmm1, 01Bh	; reverse the four dwords
+	paddd xmm1, xmm2
+	pshufd xmm2, xmm1, 0B1h	; swap neighbouring dwords
+	paddd xmm1, xmm2		; every dword now holds the full total
+	
+	movd ebx, xmm0
+	and ebx, 0ffffh		; effective low word truncated
+	mov ecx, ebx
+	imul ebx, ecx
+	sar ebx, $4
+	movd eax, xmm1
+	sub eax, ebx		; return value
+	
+	%undef PUSH_SIZE
+	add esp, 32
+	add esp, ebp
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN AnalysisVaaInfoIntra_ssse3
+;***********************************************************************
+;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t linesize );
+;
+;	Same flow as the SSE2 variant, but the per-block step uses
+;	VAA_AVG_BLOCK_SSSE3 (defined outside this part of the file), with the
+;	destination alternating between xmm0 and xmm1. Four results of
+;	8 bytes each are stored in a 16-byte aligned stack buffer, read back
+;	as 16 words, and the function returns
+;	    (sum of squared words) - ((sum of words)^2 >> 4)
+;	where each square is the 16-bit pmullw result and the word sum is
+;	truncated to its low 16 bits.
+;***********************************************************************
+ALIGN 16
+AnalysisVaaInfoIntra_ssse3:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	; align esp down to 16 bytes; ebp remembers the adjustment so the
+	; arguments can still be addressed and esp can be restored
+	mov ebp, esp
+	and ebp, 0fh
+	sub esp, ebp
+	sub esp, 32	
+	%define PUSH_SIZE	52	; 20 + 32
+
+	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
+	mov ecx, [esp+ebp+PUSH_SIZE+8]	; linesize
+
+	mov ebx, ecx
+	sal ebx, $1			; linesize x 2 [ebx]
+	mov edx, ebx
+	add edx, ecx		; linesize x 3 [edx]
+	mov eax, ebx
+	sal eax, $1			; linesize x 4 [eax]
+	
+	pxor xmm7, xmm7
+	
+	; loops
+	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp], xmm0	
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+	movq [esp+8], xmm1	
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+16], xmm0	
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+	movq [esp+24], xmm1
+		
+	movdqa xmm0, [esp]		; block 0~7
+	movdqa xmm1, [esp+16]	; block 8~15
+	movdqa xmm2, xmm0
+	paddw xmm0, xmm1
+	SUM_WORD_8x2_SSE2 xmm0, xmm3	; better performance than that of phaddw sets
+
+	; square every word, widen to dwords and add all 16 together
+	pmullw xmm1, xmm1
+	pmullw xmm2, xmm2
+	movdqa xmm3, xmm1
+	movdqa xmm4, xmm2
+	punpcklwd xmm1, xmm7
+	punpckhwd xmm3, xmm7
+	punpcklwd xmm2, xmm7
+	punpckhwd xmm4, xmm7
+	paddd xmm1, xmm2
+	paddd xmm3, xmm4
+	paddd xmm1, xmm3
+	pshufd xmm2, xmm1, 01Bh	; reverse the four dwords
+	paddd xmm1, xmm2
+	pshufd xmm2, xmm1, 0B1h	; swap neighbouring dwords
+	paddd xmm1, xmm2		; every dword now holds the full total
+	
+	movd ebx, xmm0
+	and ebx, 0ffffh		; effective low word truncated
+	mov ecx, ebx
+	imul ebx, ecx
+	sar ebx, $4
+	movd eax, xmm1
+	sub eax, ebx		; return value
+	
+	%undef PUSH_SIZE
+	add esp, 32
+	add esp, ebp
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+	
+	
+
+WELS_EXTERN abs_difference_mbrow_sse2
+;*************************************************************************************************************
+;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride, 
+;								 int32_t gom_pixel_num, int32_t *pSum)
+;
+; Adds to *pSum the sum of absolute differences between ref_orig and
+; cur_orig over 16 rows (iPicStride bytes apart) of gom_pixel_num bytes.
+; Rows are read 16 bytes at a time with movdqa, so both pointers and the
+; stride must keep 16-byte alignment. gom_pixel_num must be a positive
+; multiple of 16: the inner loop always runs at least once, and rewinding
+; by gom_pixel_num only returns to the row start for a multiple of 16.
+;*************************************************************************************************************
+ALIGN 16
+abs_difference_mbrow_sse2:
+%define		ref_orig			esp + pushsize + 4
+%define		cur_orig			esp + pushsize + 8
+%define		iPicStride			esp + pushsize + 12
+%define		gom_pixel_num		esp + pushsize + 16
+%define		pSum				esp + pushsize + 20
+%define		pushsize	12
+	push	esi
+	push	edi
+	push	ebx
+	mov		esi,	[ref_orig]
+	mov		edi,	[cur_orig]
+	mov		ebx,	[iPicStride]
+	mov		eax,	[gom_pixel_num]
+	mov		ecx,	16					;MB_WIDTH_LUMA
+	pxor	xmm0,	xmm0				; SAD accumulator (dwords 0 and 2)
+mb_width_loop_p:
+	mov		edx,	esi
+	add		edx,	eax			; end address
+gom_row_loop_p:
+	movdqa	xmm1,	[esi]
+	movdqa	xmm2,	[edi]
+	psadbw	xmm1,	xmm2
+	paddd	xmm0,	xmm1
+	add		esi,	16
+	add		edi,	16
+	cmp		esi,	edx
+	; addresses are unsigned: a signed jl would leave the row early when the
+	; row straddles the 0x80000000 boundary
+	jb		gom_row_loop_p
+	
+	sub		esi,	eax			; back to the start of the row ...
+	sub		edi,	eax
+	add		esi,	ebx			; ... then down one row
+	add		edi,	ebx
+	loop	mb_width_loop_p		; ecx counts the 16 rows
+	
+	movdqa	xmm1,	xmm0
+	psrldq	xmm1,	8
+	paddd	xmm1,	xmm0		; low dword = low-half SAD + high-half SAD
+	movd	eax,	xmm1
+	mov		edx,	[pSum]	; pSum
+	add		[edx],	eax			; accumulate, do not overwrite
+
+%undef		ref_orig
+%undef		cur_orig
+%undef		iPicStride
+%undef		gom_pixel_num
+%undef		pSum
+%undef		pushsize	
+	pop		ebx
+	pop		edi
+	pop		esi
+	ret
+
+
+
+
+WELS_EXTERN sum_sqrsum_mbrow_sse2
+;*************************************************************************************************************
+;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride, 
+;							 int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
+;
+; Over 16 rows (iPicStride bytes apart) of gom_pixel_num bytes starting at
+; cur_orig, adds the sum of the bytes to *pSum and the sum of their squares
+; to *pSqrSum. Rows are read 16 bytes at a time with movdqa, so the pointer
+; and the stride must keep 16-byte alignment. gom_pixel_num must be a
+; positive multiple of 16: the inner loop always runs at least once, and
+; rewinding by gom_pixel_num only returns to the row start for a multiple
+; of 16.
+;*************************************************************************************************************
+ALIGN 16
+sum_sqrsum_mbrow_sse2:
+%define		cur_orig			esp + pushsize + 4
+%define		iPicStride			esp + pushsize + 8
+%define		gom_pixel_num		esp + pushsize + 12
+%define		pSum				esp + pushsize + 16
+%define		pSqrSum				esp + pushsize + 20
+%define		pushsize			8
+	push		esi
+	push		ebx
+	mov			esi,	[cur_orig]
+	mov			eax,	[gom_pixel_num]
+	mov			ebx,	[iPicStride]
+	mov			ecx,	16					;MB_WIDTH_LUMA
+	pxor		xmm0,	xmm0				; zero
+	pxor		xmm1,	xmm1				; sum
+	pxor		xmm2,	xmm2				; sqr sum
+mb_width_loop_i:
+	mov			edx,	esi
+	add			edx,	eax			; end address
+gom_row_loop_i:
+	movdqa		xmm3,	[esi]
+	movdqa		xmm4,	xmm3
+	psadbw		xmm4,	xmm0		; SAD against zero = byte sum per 8-byte half
+	paddd		xmm1,	xmm4
+	movdqa		xmm4,	xmm3
+	punpcklbw	xmm4,	xmm0
+	punpckhbw	xmm3,	xmm0
+	pmaddwd		xmm4,	xmm4		; squares, adjacent pairs added into dwords
+	pmaddwd		xmm3,	xmm3
+	paddd		xmm2,	xmm3
+	paddd		xmm2,	xmm4
+	add			esi,	16
+	cmp			esi,	edx
+	; addresses are unsigned: a signed jl would leave the row early when the
+	; row straddles the 0x80000000 boundary
+	jb			gom_row_loop_i
+	
+	sub			esi,	eax			; back to the start of the row ...
+	add			esi,	ebx			; ... then down one row
+	loop		mb_width_loop_i		; ecx counts the 16 rows
+	
+	movdqa		xmm3,	xmm1
+	psrldq		xmm3,	8
+	paddd		xmm1,	xmm3		; low dword = total byte sum
+	movd		eax,	xmm1
+	mov			edx,	[pSum]
+	add			[edx],	eax			; accumulate, do not overwrite
+	
+	movdqa		xmm3,	xmm2
+	psrldq		xmm3,	8
+	paddd		xmm2,	xmm3
+	movdqa		xmm3,	xmm2
+	psrldq		xmm3,	4
+	paddd		xmm2,	xmm3		; low dword = total of the four partial sums
+	movd		eax,	xmm2
+	mov			edx,	[pSqrSum]
+	add			[edx],	eax			; accumulate, do not overwrite
+
+
+%undef		cur_orig
+%undef		iPicStride
+%undef		gom_pixel_num
+%undef		pSum
+%undef		pSqrSum
+%undef		pushsize	
+	pop			ebx
+	pop			esi
+	ret
+
+
+
+WELS_EXTERN VAACalcSad_sse2
+;*************************************************************************************************************
+;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;								int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+;
+; Walks the picture in 16x16 blocks (iPicWidth/16 across, iPicHeight/16 down).
+; For each block four dwords are written to psad8x8: the results for the
+; upper 8 rows (low dword and dword 2 of the accumulator) followed by the
+; same for the lower 8 rows. The grand total is stored to *psadframe.
+; WELS_SAD_16x2_SSE2 is defined above this part of the file; the visible end
+; of it adds into xmm6 and advances esi/edi by two rows.
+; The iPicWidth and iPicHeight argument slots are overwritten on the stack.
+; Both loops decrement before testing, so width and height must be >= 16.
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSad_sse2:
+%define		cur_data			esp + pushsize + 4
+%define		ref_data			esp + pushsize + 8
+%define		iPicWidth			esp + pushsize + 12
+%define		iPicHeight			esp + pushsize + 16
+%define		iPicStride			esp + pushsize + 20
+%define		psadframe			esp + pushsize + 24
+%define		psad8x8				esp + pushsize + 28
+%define		pushsize	12
+	push	esi
+	push	edi
+	push	ebx
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		edx,	[psad8x8]
+	mov		eax,	ebx
+	
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4								; iPicStride*16
+	pxor	xmm0,	xmm0
+	pxor	xmm7,	xmm7		; iFrameSad
+height_loop:
+	mov		ecx,	dword [iPicWidth]	; read before the pushes below move esp
+	push	esi
+	push	edi
+width_loop:
+	pxor	xmm6,	xmm6		;
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx],		xmm6	; upper 8 rows, low dword
+	psrldq	xmm6,		8
+	movd	[edx+4],	xmm6	; upper 8 rows, dword 2
+	
+	pxor	xmm6,	xmm6
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx+8],	xmm6	; lower 8 rows, low dword
+	psrldq	xmm6,		8
+	movd	[edx+12],	xmm6	; lower 8 rows, dword 2
+	
+	add		edx,	16
+	sub		esi,	eax			; back up 16 rows ...
+	sub		edi,	eax
+	add		esi,	16			; ... and step right to the next block
+	add		edi,	16
+	
+	dec		ecx
+	jnz		width_loop
+	
+	pop		edi
+	pop		esi
+	add		esi,	eax			; start of the next row of blocks
+	add		edi,	eax
+	
+	dec	dword [iPicHeight]
+	jnz		height_loop
+	
+	mov		edx,	[psadframe]
+	movdqa	xmm5,	xmm7
+	psrldq	xmm7,	8
+	paddd	xmm7,	xmm5		; low dword = dword 0 + dword 2
+	movd	[edx],	xmm7
+
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		pushsize	
+	pop		ebx
+	pop		edi
+	pop		esi
+	ret
+	
+	
+WELS_EXTERN VAACalcSadVar_sse2
+;*************************************************************************************************************
+;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;		int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+;
+; Walks the picture in 16x16 blocks (iPicWidth/16 across, iPicHeight/16 down).
+; Per block it writes
+;   psad8x8     : four dwords - SAD of the left/right 8 columns of the upper
+;                 8 rows, then of the lower 8 rows,
+;   psum16x16   : one dword - sum of the cur_data bytes of the block,
+;   psqsum16x16 : one dword - sum of the squared cur_data bytes of the block,
+; and finally stores the total SAD to *psadframe.
+; The iPicWidth, iPicHeight, psum16x16 and psqsum16x16 argument slots are
+; updated in place on the stack. Both loops decrement before testing, so
+; width and height must be >= 16.
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSadVar_sse2:
+%define		localsize		8
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		psum16x16			esp + pushsize + localsize + 32
+%define		psqsum16x16			esp + pushsize + localsize + 36
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		pushsize		16
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		edx,	[psad8x8]
+	mov		eax,	ebx
+	
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4							; iPicStride*16
+	pxor	xmm0,	xmm0
+	pxor	xmm7,	xmm7		; iFrameSad
+var_height_loop:
+	mov		ecx,	dword [iPicWidth]
+	mov		[tmp_esi],	esi		; remember the start of this row of blocks
+	mov		[tmp_edi],	edi
+var_width_loop:
+	pxor	xmm6,	xmm6		; hiQuad_loQuad pSad8x8
+	pxor	xmm5,	xmm5		; pSum16x16
+	pxor	xmm4,	xmm4		; sqsum_16x16
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx],		xmm6	; upper-left 8x8 SAD
+	psrldq	xmm6,		8
+	movd	[edx+4],	xmm6	; upper-right 8x8 SAD
+	
+	pxor	xmm6,	xmm6
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx+8],	xmm6	; lower-left 8x8 SAD
+	psrldq	xmm6,		8
+	movd	[edx+12],	xmm6	; lower-right 8x8 SAD
+	
+	mov		ebp,	[psum16x16]
+	movdqa	xmm1,	xmm5
+	psrldq	xmm1,	8
+	paddd	xmm5,	xmm1		; low dword = sum over all 16 columns
+	movd	[ebp],	xmm5
+	add		dword [psum16x16], 4
+	
+	movdqa	xmm5,	xmm4
+	psrldq	xmm5,	8
+	paddd	xmm4,	xmm5
+	movdqa	xmm3,	xmm4
+	psrldq	xmm3,	4
+	paddd	xmm4,	xmm3		; low dword = total of the four partial sums
+	
+	mov		ebp,	[psqsum16x16]
+	movd	[ebp],	xmm4
+	add		dword [psqsum16x16], 4
+	
+	add		edx,	16
+	sub		esi,	eax			; back up 16 rows ...
+	sub		edi,	eax
+	add		esi,	16			; ... and step right to the next block
+	add		edi,	16
+	
+	dec		ecx
+	jnz		var_width_loop
+	
+	mov		esi,	[tmp_esi]
+	mov		edi,	[tmp_edi]
+	add		esi,	eax			; start of the next row of blocks
+	add		edi,	eax
+	
+	dec	dword [iPicHeight]
+	jnz		var_height_loop
+	
+	mov		edx,	[psadframe]
+	movdqa	xmm5,	xmm7
+	psrldq	xmm7,	8
+	paddd	xmm7,	xmm5		; low dword = dword 0 + dword 2
+	movd	[edx],	xmm7
+
+	add		esp,	localsize	
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		psum16x16
+%undef		psqsum16x16
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		pushsize
+%undef		localsize
+	ret
+	
+	
+
+WELS_EXTERN VAACalcSadSsd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,  
+;	int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;
+; Walks the picture in 16x16 blocks (iPicWidth/16 across, iPicHeight/16 down).
+; Per block it writes
+;   psad8x8      : four dwords - SAD of the left/right 8 columns of the upper
+;                  8 rows, then of the lower 8 rows,
+;   psum16x16    : one dword - sum of the cur_data bytes of the block,
+;   psqsum16x16  : one dword - sum of the squared cur_data bytes,
+;   psqdiff16x16 : one dword - sum of the squared |cur - ref| differences,
+; and finally stores the total SAD to *psadframe.
+; The iPicWidth, iPicHeight and the three 16x16 pointer argument slots are
+; updated in place on the stack. Both loops decrement before testing, so
+; width and height must be >= 16.
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSadSsd_sse2:
+%define		localsize		12
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		psum16x16			esp + pushsize + localsize + 32
+%define		psqsum16x16			esp + pushsize + localsize + 36
+%define		psqdiff16x16		esp + pushsize + localsize + 40
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		tmp_sadframe		esp + 8
+%define		pushsize		16
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		edx,	[psad8x8]
+	mov		eax,	ebx
+	
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4							; iPicStride*16
+	pxor	xmm0,	xmm0
+	movd	[tmp_sadframe],	xmm0		; frame SAD accumulator = 0
+sqdiff_height_loop:
+	mov		ecx,	dword [iPicWidth]
+	mov		[tmp_esi],	esi			; remember the start of this row of blocks
+	mov		[tmp_edi],	edi
+sqdiff_width_loop:
+	pxor	xmm7,	xmm7		; hiQuad_loQuad pSad8x8
+	pxor	xmm6,	xmm6		; pSum16x16
+	pxor	xmm5,	xmm5		; sqsum_16x16  four dword
+	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
+	; upper 8 rows of the block
+%rep 8
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+%endrep
+	movdqa	xmm1,		xmm7
+	movd	[edx],		xmm7		; upper-left 8x8 SAD
+	psrldq	xmm7,		8
+	paddd	xmm1,		xmm7
+	movd	[edx+4],	xmm7		; upper-right 8x8 SAD
+	movd	ebp,		xmm1
+	add		[tmp_sadframe],	ebp
+	
+	pxor	xmm7,	xmm7
+	; lower 8 rows of the block
+%rep 8
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+%endrep
+	movdqa	xmm1,		xmm7
+	movd	[edx+8],	xmm7		; lower-left 8x8 SAD
+	psrldq	xmm7,		8
+	paddd	xmm1,		xmm7
+	movd	[edx+12],	xmm7		; lower-right 8x8 SAD
+	movd	ebp,		xmm1
+	add		[tmp_sadframe],	ebp
+	
+	mov		ebp,	[psum16x16]
+	movdqa	xmm1,	xmm6
+	psrldq	xmm1,	8
+	paddd	xmm6,	xmm1			; low dword = sum over all 16 columns
+	movd	[ebp],	xmm6
+	add		dword [psum16x16], 4
+	
+	mov		ebp,	[psqsum16x16]
+	pshufd	xmm6,	xmm5,	14 ;00001110
+	paddd	xmm6,	xmm5
+	pshufd	xmm5,	xmm6,	1  ;00000001
+	paddd	xmm5,	xmm6			; low dword = total of the four partial sums
+	movd	[ebp],	xmm5
+	add		dword [psqsum16x16], 4
+	
+	mov		ebp,	[psqdiff16x16]
+	pshufd	xmm5,	xmm4,	14	; 00001110
+	paddd	xmm5,	xmm4
+	pshufd	xmm4,	xmm5,	1	; 00000001
+	paddd	xmm4,	xmm5			; low dword = total of the four partial sums
+	movd	[ebp],	xmm4
+	add		dword	[psqdiff16x16],	4
+	
+	add		edx,	16
+	sub		esi,	eax				; back up 16 rows ...
+	sub		edi,	eax
+	add		esi,	16				; ... and step right to the next block
+	add		edi,	16
+	
+	dec		ecx
+	jnz		sqdiff_width_loop
+	
+	mov		esi,	[tmp_esi]
+	mov		edi,	[tmp_edi]
+	add		esi,	eax				; start of the next row of blocks
+	add		edi,	eax
+	
+	dec	dword [iPicHeight]
+	jnz		sqdiff_height_loop
+	
+	mov		ebx,	[tmp_sadframe]
+	mov		eax,	[psadframe]
+	mov		[eax],	ebx
+
+	add		esp,	localsize	
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		psum16x16
+%undef		psqsum16x16
+%undef		psqdiff16x16
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		tmp_sadframe
+%undef		pushsize
+%undef		localsize
+	ret
+	
+	
+	
+	
+
+WELS_EXTERN VAACalcSadBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 
+;				int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;
+; Walks the picture in 16x16 blocks (iPicWidth/16 across, iPicHeight/16 down).
+; Per block it writes, for the four 8x8 quadrants in the order
+; upper-left, upper-right, lower-left, lower-right:
+;   psad8x8  : four dwords - SAD of the quadrant,
+;   p_sd8x8  : four dwords - (sum of cur bytes) - (sum of ref bytes),
+;   p_mad8x8 : four bytes  - largest |cur - ref| in the quadrant,
+; and finally stores the total SAD to *psadframe.
+; psad8x8 and p_sd8x8 are written with movdqa and therefore must be 16-byte
+; aligned. The iPicWidth, iPicHeight, psad8x8, p_sd8x8 and p_mad8x8 argument
+; slots are updated in place on the stack. Both loops decrement before
+; testing, so width and height must be >= 16.
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSadBgd_sse2:
+%define		localsize		12
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		p_sd8x8				esp + pushsize + localsize + 32
+%define		p_mad8x8			esp + pushsize + localsize + 36
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		tmp_ecx				esp + 8
+%define		pushsize		16
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		eax,	ebx
+	
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4							; iPicStride*16
+	xor		ebp,	ebp							; ebp = frame SAD accumulator
+	pxor	xmm0,	xmm0
+bgd_height_loop:
+	mov		ecx,	dword [iPicWidth]
+	mov		[tmp_esi],	esi			; remember the start of this row of blocks
+	mov		[tmp_edi],	edi
+bgd_width_loop:
+	pxor	xmm7,	xmm7		; pSad8x8
+	pxor	xmm6,	xmm6		; sum_cur_8x8
+	pxor	xmm5,	xmm5		; sum_ref_8x8
+	pxor	xmm4,	xmm4		; pMad8x8
+	; upper 8 rows of the block
+%rep 8
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+%endrep
+	
+	mov			edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm4		; byte 0 / byte 8 = max of the left / right half
+	
+	mov			[tmp_ecx],	ecx		; ecx is borrowed for the byte stores below
+	movhlps		xmm1,	xmm4
+	movd		ecx,	xmm4
+	mov			[edx],	cl			; upper-left MAD
+	movd		ecx,	xmm1
+	mov			[edx+1],cl			; upper-right MAD
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+
+	; move the upper-half results into the odd dword lanes so the lower
+	; half can accumulate into the even lanes
+	pslldq		xmm7,	4
+	pslldq		xmm6,	4
+	pslldq		xmm5,	4
+	
+	pxor	xmm4,	xmm4		; pMad8x8
+	; lower 8 rows of the block
+%rep 8
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+%endrep
+	
+	mov			edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm4
+	
+	movhlps		xmm1,	xmm4
+	movd		ecx,	xmm4
+	mov			[edx],	cl			; lower-left MAD
+	movd		ecx,	xmm1
+	mov			[edx+1],cl			; lower-right MAD
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+	
+	; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
+	
+	mov		edx,	[psad8x8]
+	pshufd	xmm1,	xmm7,	10001101b		; D3 D2 D1 D0
+	movdqa	[edx],	xmm1					; aligned store: psad8x8 must be 16-byte aligned
+	add		edx,	16
+	mov		[psad8x8],	edx					; sad8x8
+	
+	paddd	xmm1,	xmm7					; D1+3 D3+2 D0+1 D2+0
+	pshufd	xmm2,	xmm1,	00000011b
+	paddd	xmm1,	xmm2					; low dword = D0+D1+D2+D3
+	movd	edx,	xmm1
+	add		ebp,	edx						; sad frame
+	
+	mov		edx,	[p_sd8x8]
+	psubd	xmm6,	xmm5					; sum_cur - sum_ref per quadrant
+	pshufd	xmm1,	xmm6,	10001101b
+	movdqa	[edx],	xmm1					; aligned store: p_sd8x8 must be 16-byte aligned
+	add		edx,	16
+	mov		[p_sd8x8],	edx
+	
+	sub		esi,	eax				; back up 16 rows ...
+	sub		edi,	eax
+	add		esi,	16				; ... and step right to the next block
+	add		edi,	16
+	
+	mov		ecx,	[tmp_ecx]
+	dec		ecx
+	jnz		bgd_width_loop
+	
+	mov		esi,	[tmp_esi]
+	mov		edi,	[tmp_edi]
+	add		esi,	eax				; start of the next row of blocks
+	add		edi,	eax
+	
+	dec		dword [iPicHeight]
+	jnz		bgd_height_loop
+	
+	mov		edx,	[psadframe]
+	mov		[edx],	ebp
+
+	add		esp,	localsize	
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		p_sd8x8
+%undef		p_mad8x8
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		tmp_ecx
+%undef		pushsize
+%undef		localsize
+	ret
+
+
+
+WELS_EXTERN VAACalcSadSsdBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 
+;		 int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, 
+;			int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;
+; Walks the picture in 16x16 blocks (iPicWidth/16 across, iPicHeight/16 down).
+; Per block it writes, with quadrants ordered upper-left, upper-right,
+; lower-left, lower-right:
+;   psad8x8      : four dwords - SAD of each 8x8 quadrant,
+;   p_sd8x8      : four dwords - (sum of cur bytes) - (sum of ref bytes),
+;   p_mad8x8     : four bytes  - largest |cur - ref| in each quadrant,
+;   psum16x16    : one dword   - sum of the cur_data bytes of the block,
+;   psqsum16x16  : one dword   - sum of the squared cur_data bytes,
+;   psqdiff16x16 : one dword   - sum of the squared |cur - ref| differences,
+; and finally stores the total SAD to *psadframe.
+; The iPicWidth, iPicHeight and all output pointer argument slots except
+; psadframe are updated in place on the stack. Both loops decrement before
+; testing, so width and height must be >= 16.
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSadSsdBgd_sse2:
+%define		localsize		16
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		psum16x16			esp + pushsize + localsize + 32
+%define		psqsum16x16			esp + pushsize + localsize + 36
+%define		psqdiff16x16		esp + pushsize + localsize + 40
+%define		p_sd8x8				esp + pushsize + localsize + 44
+%define		p_mad8x8			esp + pushsize + localsize + 48
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		tmp_sadframe		esp + 8
+%define		tmp_ecx				esp + 12
+%define		pushsize		16
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		eax,	ebx
+	
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4							; iPicStride*16
+	pxor	xmm0,	xmm0
+	movd	[tmp_sadframe],	xmm0		; frame SAD accumulator = 0
+sqdiff_bgd_height_loop:
+	mov		ecx,	dword [iPicWidth]
+	mov		[tmp_esi],	esi			; remember the start of this row of blocks
+	mov		[tmp_edi],	edi
+sqdiff_bgd_width_loop:
+	pxor	xmm7,	xmm7		; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
+	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+	pxor	xmm5,	xmm5		; pMad8x8
+	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
+	; upper 8 rows of the block
+%rep 8
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+%endrep
+	
+	mov		edx,		[psad8x8]
+	movdqa	xmm2,		xmm7
+	pshufd	xmm1,		xmm2,		00001110b
+	movd	[edx],		xmm2		; upper-left SAD
+	movd	[edx+4],	xmm1		; upper-right SAD
+	add		edx,		8
+	mov		[psad8x8],	edx			; sad8x8
+	
+	paddd	xmm1,				xmm2
+	movd	edx,				xmm1
+	add		[tmp_sadframe],		edx			; iFrameSad
+	
+	mov		edx,		[psum16x16]
+	movdqa	xmm1,		xmm6
+	pshufd	xmm2,		xmm1,		00001110b
+	paddd	xmm1,		xmm2
+	movd	[edx],		xmm1				; sum (upper half; lower half is added below)
+	
+	mov		edx,		[p_sd8x8]
+	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
+	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
+	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
+	movq	[edx],		xmm1
+	add		edx,		8
+	mov		[p_sd8x8],	edx
+	
+	mov			edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm5		; byte 0 / byte 8 = max of the left / right half
+	mov			[tmp_ecx],	ecx		; ecx is borrowed for the byte stores below
+	movhlps		xmm1,	xmm5
+	movd		ecx,	xmm5
+	mov			[edx],	cl			; upper-left MAD
+	movd		ecx,	xmm1
+	mov			[edx+1],cl			; upper-right MAD
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+	
+	psrlq	xmm7,	32
+	psllq	xmm7,	32			; clear sad
+	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+	pxor	xmm5,	xmm5		; pMad8x8
+	; lower 8 rows of the block
+%rep 8
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+%endrep
+	
+	mov		edx,		[psad8x8]
+	movdqa	xmm2,		xmm7
+	pshufd	xmm1,		xmm2,		00001110b
+	movd	[edx],		xmm2		; lower-left SAD
+	movd	[edx+4],	xmm1		; lower-right SAD
+	add		edx,		8
+	mov		[psad8x8],	edx			; sad8x8
+	
+	paddd	xmm1,				xmm2
+	movd	edx,				xmm1
+	add		[tmp_sadframe],		edx			; iFrameSad
+	
+	mov		edx,			[psum16x16]
+	movdqa	xmm1,			xmm6
+	pshufd	xmm2,			xmm1,		00001110b
+	paddd	xmm1,			xmm2
+	movd	ebp,			xmm1				; sum
+	add		[edx],			ebp					; add the lower half to the stored upper half
+	add		edx,			4
+	mov		[psum16x16],	edx
+	
+	mov		edx,			[psqsum16x16]
+	psrlq	xmm7,			32					; bring the sqsum lanes down to dwords 0 and 2
+	pshufd	xmm2,			xmm7,		00001110b
+	paddd	xmm2,			xmm7
+	movd	[edx],			xmm2				; sqsum
+	add		edx,			4
+	mov		[psqsum16x16],	edx
+	
+	mov		edx,		[p_sd8x8]
+	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
+	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
+	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
+	movq	[edx],		xmm1
+	add		edx,		8
+	mov		[p_sd8x8],	edx
+	
+	mov		edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm5
+	movhlps		xmm1,	xmm5
+	movd		ecx,	xmm5
+	mov			[edx],	cl			; lower-left MAD
+	movd		ecx,	xmm1
+	mov			[edx+1],cl			; lower-right MAD
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+	
+	mov		edx,		[psqdiff16x16]
+	pshufd	xmm1,		xmm4,		00001110b
+	paddd	xmm4,		xmm1
+	pshufd	xmm1,		xmm4,		00000001b
+	paddd	xmm4,		xmm1		; low dword = total of the four partial sums
+	movd	[edx],		xmm4
+	add		edx,		4
+	mov		[psqdiff16x16],	edx
+	
+	sub		esi,	eax				; back up 16 rows ...
+	sub		edi,	eax
+	add		esi,	16				; ... and step right to the next block
+	add		edi,	16
+	
+	mov		ecx,	[tmp_ecx]
+	dec		ecx
+	jnz		sqdiff_bgd_width_loop
+	
+	mov		esi,	[tmp_esi]
+	mov		edi,	[tmp_edi]
+	add		esi,	eax				; start of the next row of blocks
+	add		edi,	eax
+	
+	dec	dword [iPicHeight]
+	jnz		sqdiff_bgd_height_loop
+	
+	mov		edx,	[psadframe]
+	mov		ebp,	[tmp_sadframe]
+	mov		[edx],	ebp
+
+	add		esp,	localsize	
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		psum16x16
+%undef		psqsum16x16
+%undef		psqdiff16x16
+%undef		p_sd8x8
+%undef		p_mad8x8
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		tmp_sadframe
+%undef		tmp_ecx
+%undef		pushsize
+%undef		localsize
+	ret
\ No newline at end of file
--- /dev/null
+++ b/processing/src/backgounddetection/BackgroundDetection.cpp
@@ -1,0 +1,419 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "BackgroundDetection.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+// Geometry of one OU block: 1<<4 = 16 luma samples per side, half that for chroma.
+#define LOG2_BGD_OU_SIZE    (4)
+#define LOG2_BGD_OU_SIZE_UV (LOG2_BGD_OU_SIZE-1)
+#define BGD_OU_SIZE         (1<<LOG2_BGD_OU_SIZE)
+#define BGD_OU_SIZE_UV      (BGD_OU_SIZE>>1)
+// SAD threshold used by the coarse foreground/background division.
+#define BGD_THD_SAD         (2*BGD_OU_SIZE*BGD_OU_SIZE)
+// Threshold on the absolute sum of differences along one chroma edge of an OU.
+#define	BGD_THD_ASD_UV      (4*BGD_OU_SIZE_UV)
+#define LOG2_MB_SIZE        (4)
+// Number of macroblocks covered by one OU side (16>>4 = 1).
+#define OU_SIZE_IN_MB       (BGD_OU_SIZE >> 4)
+// Multiplied with BGD_OU_SIZE to form the "small SAD / small SD spread" threshold.
+#define Q_FACTOR            (8)
+// NOTE(review): not referenced in this file's visible code.
+#define BGD_DELTA_QP_THD    (3)
+
+// Bit flags naming which neighbour of an OU is foreground (see ForegroundDilation23Chroma).
+#define OU_LEFT		(0x01)
+#define OU_RIGHT	(0x02)
+#define OU_TOP		(0x04)
+#define OU_BOTTOM	(0x08)
+
+// Builds the strategy with a zeroed parameter block and no OU array allocated yet.
+// iCpuFlag is currently unused by this constructor.
+CBackgroundDetection::CBackgroundDetection(int32_t iCpuFlag)
+{
+	WelsMemset(&m_BgdParam, 0, sizeof(m_BgdParam));
+	m_iLargestFrameSize = 0;
+	m_eMethod = METHOD_BACKGROUND_DETECTION;
+}
+
+// Releases the OU array owned by this object.
+CBackgroundDetection::~CBackgroundDetection()
+{
+	FreeOUArrayMemory();
+}
+
+/*!
+ * \brief	Run background detection for the current picture against a reference picture.
+ *
+ * \param	iType		not used by this strategy
+ * \param	pSrcPixMap	current picture; supplies plane pointers, rect size and strides
+ * \param	pRefPixMap	reference picture; only its plane pointers are read
+ *
+ * \return	RET_SUCCESS after detection ran; RET_INVALIDPARAM when a pix map is NULL,
+ *			when Set() has not provided the VAA results / MB flag buffer, or when the
+ *			OU array could not be allocated.
+ */
+EResult CBackgroundDetection::Process(int32_t iType, SPixMap *pSrcPixMap, SPixMap *pRefPixMap)
+{
+	EResult eReturn = RET_INVALIDPARAM;
+
+	if (pSrcPixMap==NULL || pRefPixMap==NULL)
+		return eReturn;
+
+	// Detection reads pCalcRes and writes pBackgroundMbFlag; both only come from Set().
+	// Refuse to run instead of dereferencing NULL when Set() was never called.
+	if (m_BgdParam.pCalcRes==NULL || m_BgdParam.pBackgroundMbFlag==NULL)
+		return eReturn;
+
+	m_BgdParam.pCur[0] = (uint8_t *)pSrcPixMap->pPixel[0];
+	m_BgdParam.pCur[1] = (uint8_t *)pSrcPixMap->pPixel[1];
+	m_BgdParam.pCur[2] = (uint8_t *)pSrcPixMap->pPixel[2];
+	m_BgdParam.pRef[0] = (uint8_t *)pRefPixMap->pPixel[0];
+	m_BgdParam.pRef[1] = (uint8_t *)pRefPixMap->pPixel[1];
+	m_BgdParam.pRef[2] = (uint8_t *)pRefPixMap->pPixel[2];
+	m_BgdParam.iBgdWidth = pSrcPixMap->sRect.iRectWidth;
+	m_BgdParam.iBgdHeight = pSrcPixMap->sRect.iRectHeight;
+	// Strides are taken from the source only; the same stride is later applied to the reference planes.
+	m_BgdParam.iStride[0] = pSrcPixMap->iStride[0];
+	m_BgdParam.iStride[1] = pSrcPixMap->iStride[1];
+	m_BgdParam.iStride[2] = pSrcPixMap->iStride[2];
+
+	// The OU array is kept across calls and only re-allocated when the picture area grows.
+	int32_t iCurFrameSize = m_BgdParam.iBgdWidth * m_BgdParam.iBgdHeight;
+	if (m_BgdParam.pOU_array == NULL || iCurFrameSize > m_iLargestFrameSize)
+	{
+		FreeOUArrayMemory();
+		m_BgdParam.pOU_array = AllocateOUArrayMemory(m_BgdParam.iBgdWidth, m_BgdParam.iBgdHeight);
+		m_iLargestFrameSize = iCurFrameSize;
+	}
+
+	if (m_BgdParam.pOU_array == NULL)
+		return eReturn;
+
+	BackgroundDetection(&m_BgdParam);
+
+	return RET_SUCCESS;
+}
+
+// Stores the caller-provided MB flag buffer and VAA calculation results for later Process() calls.
+// pParam is interpreted as an SBGDInterface; iType is not used.
+// Returns RET_INVALIDPARAM when pParam is NULL, RET_SUCCESS otherwise.
+EResult CBackgroundDetection::Set(int32_t iType, void *pParam)
+{
+	SBGDInterface *pInterface = (SBGDInterface *)pParam;
+	if (pInterface == NULL)
+		return RET_INVALIDPARAM;
+
+	m_BgdParam.pCalcRes          = pInterface->pCalcRes;
+	m_BgdParam.pBackgroundMbFlag = (int8_t *)pInterface->pBackgroundMbFlag;
+	return RET_SUCCESS;
+}
+
+// Allocates one SBackgroundOU per OU of an iWidth x iHeight picture, rounding each
+// dimension up to a whole number of OUs. Returns whatever WelsMalloc returns.
+inline SBackgroundOU* CBackgroundDetection::AllocateOUArrayMemory(int32_t iWidth, int32_t iHeight)
+{
+	const int32_t kiOUCols = (iWidth  + BGD_OU_SIZE-1) >> LOG2_BGD_OU_SIZE;
+	const int32_t kiOURows = (iHeight + BGD_OU_SIZE-1) >> LOG2_BGD_OU_SIZE;
+
+	return (SBackgroundOU *)WelsMalloc( kiOUCols * kiOURows * sizeof(SBackgroundOU) );
+}
+
+// Hands the OU array to _SafeFree.
+inline void CBackgroundDetection::FreeOUArrayMemory()
+{
+	_SafeFree(m_BgdParam.pOU_array);
+}
+
+// Fills the statistics of one OU from the per-macroblock VAA results at iMbIndex.
+// Each macroblock entry holds four sub-block values (SAD, sum of differences, max abs difference).
+// iMbWidth is not used.
+void CBackgroundDetection::GetOUParameters( SVAACalcResult *sVaaCalcInfo, int32_t iMbIndex, int32_t iMbWidth, SBackgroundOU* pBgdOU)
+{
+	int32_t (*pSad8x8)[4];
+	int32_t (*pSd8x8)[4];
+	uint8_t (*pMad8x8)[4];
+
+	pSad8x8 = sVaaCalcInfo->pSad8x8;
+	pSd8x8  = sVaaCalcInfo->pSumOfDiff8x8;
+	pMad8x8 = sVaaCalcInfo->pMad8x8;
+
+	int32_t iSumSad = 0;
+	int32_t iSumSd  = 0;
+	int32_t iMaxSd  = 0, iMinSd  = 0;
+	int32_t iMaxMad = 0, iMinMad = 0;
+
+	for (int32_t k = 0; k < 4; k++)
+	{
+		const int32_t iSad = pSad8x8[iMbIndex][k];
+		const int32_t iSd  = pSd8x8[iMbIndex][k];
+		const int32_t iMad = pMad8x8[iMbIndex][k];
+
+		iSumSad += iSad;
+		iSumSd  += iSd;
+
+		// the first sub-block seeds the running extrema
+		if (k == 0 || iSd > iMaxSd)   iMaxSd  = iSd;
+		if (k == 0 || iSd < iMinSd)   iMinSd  = iSd;
+		if (k == 0 || iMad > iMaxMad) iMaxMad = iMad;
+		if (k == 0 || iMad < iMinMad) iMinMad = iMad;
+	}
+
+	pBgdOU->iSAD	= iSumSad;
+	pBgdOU->iSD	= WELS_ABS(iSumSd);
+
+	// max absolute difference of the OU and the smallest MAD among its sub-blocks
+	pBgdOU->iMAD = iMaxMad;
+	pBgdOU->iMinSubMad = iMinMad;
+
+	// spread between the largest and smallest (signed) sub-block SD
+	pBgdOU->iMaxDiffSubSd = iMaxSd - iMinSd;
+}
+
+// First pass: gathers the statistics of every whole OU of the picture and gives each one
+// an initial background flag (1 = background, 0 = foreground) from its SAD / SD / MAD values.
+void CBackgroundDetection::ForegroundBackgroundDivision(vBGDParam *pBgdParam)
+{
+	const int32_t kiWidthInOU	= pBgdParam->iBgdWidth  >> LOG2_BGD_OU_SIZE;
+	const int32_t kiHeightInOU	= pBgdParam->iBgdHeight >> LOG2_BGD_OU_SIZE;
+	const int32_t kiWidthInMb	= (15+pBgdParam->iBgdWidth)>>4;
+
+	SBackgroundOU *pOU = pBgdParam->pOU_array;
+
+	for (int32_t iRow = 0; iRow < kiHeightInOU; iRow++)
+	{
+		for (int32_t iCol = 0; iCol < kiWidthInOU; iCol++, pOU++)
+		{
+			GetOUParameters( pBgdParam->pCalcRes, (iRow*kiWidthInMb+iCol)<<(LOG2_BGD_OU_SIZE-LOG2_MB_SIZE), kiWidthInMb, pOU);
+
+			pOU->iBackgroundFlag = 0;
+
+			// a large peak difference always means foreground
+			if (pOU->iMAD > 63)
+				continue;
+
+			const bool bSmallSdSpread = (pOU->iMaxDiffSubSd <= (pOU->iSAD>>3))
+				|| (pOU->iMaxDiffSubSd <= (BGD_OU_SIZE*Q_FACTOR));
+			if (!bSmallSdSpread || !(pOU->iSAD < (BGD_THD_SAD<<1)))
+				continue;
+
+			if (pOU->iSAD <= BGD_OU_SIZE*Q_FACTOR)
+			{
+				pOU->iBackgroundFlag = 1;
+			}
+			else if (pOU->iSAD < BGD_THD_SAD)
+			{
+				pOU->iBackgroundFlag = (pOU->iSD < ((pOU->iSAD*3)>>2));
+			}
+			else
+			{
+				pOU->iBackgroundFlag = ((pOU->iSD<<1) < pOU->iSAD);
+			}
+		}
+	}
+}
+// Returns the absolute value of the summed (cur - ref) differences over BGD_OU_SIZE_UV
+// samples, stepping both pointers by iStride between samples.
+inline int32_t CBackgroundDetection::CalculateAsdChromaEdge( uint8_t *pOriRef, uint8_t *pOriCur, int32_t iStride )
+{
+	int32_t iSumDiff = 0;
+
+	for (int32_t k = 0; k < BGD_OU_SIZE_UV; k++)
+	{
+		iSumDiff += pOriCur[k*iStride] - pOriRef[k*iStride];
+	}
+
+	return WELS_ABS(iSumDiff);
+}
+
+// Luma test on the MAD of an OU and its four neighbours (left, right, top, bottom).
+// Returns non-zero when the OU should be treated as foreground, 0 otherwise.
+inline bool_t CBackgroundDetection::ForegroundDilation23Luma(SBackgroundOU *pBackgroundOU, SBackgroundOU *pOUNeighbours[])
+{
+	// only OUs whose peak difference clearly exceeds their smallest sub-block MAD are examined
+	if (!(pBackgroundOU->iMAD > (pBackgroundOU->iMinSubMad<<1)))
+		return 0;
+
+	int32_t iMaxNbrForegroundMad = 0;
+	int32_t iMaxNbrBackgroundMad = 0;
+
+	for (int32_t k = 0; k < 4; k++)
+	{
+		const SBackgroundOU *pNbr = pOUNeighbours[k];
+		// flag 0 -> mask of all ones keeps the MAD; flag 1 -> mask 0 drops it
+		const int32_t iFgMad = (pNbr->iBackgroundFlag - 1) & pNbr->iMAD;
+		// the inverse selection: keeps the MAD of background neighbours only
+		const int32_t iBgMad = ((!pNbr->iBackgroundFlag) - 1) & pNbr->iMAD;
+
+		if (k == 0 || iFgMad > iMaxNbrForegroundMad)
+			iMaxNbrForegroundMad = iFgMad;
+		if (k == 0 || iBgMad > iMaxNbrBackgroundMad)
+			iMaxNbrBackgroundMad = iBgMad;
+	}
+
+	if (iMaxNbrForegroundMad > (pBackgroundOU->iMinSubMad<<2))
+		return 1;
+
+	return (pBackgroundOU->iMAD > (iMaxNbrBackgroundMad<<1)) && (pBackgroundOU->iMAD <= ((iMaxNbrForegroundMad*3)>>1));
+}
+
+// Chroma test: for every neighbour marked in iNeighbourForegroundFlags (OU_LEFT/RIGHT/TOP/BOTTOM bits),
+// compares the matching edge of the chroma block between current and reference picture.
+// Plane index 2 is scanned completely before plane index 1.
+// Returns 1 as soon as one edge exceeds BGD_THD_ASD_UV, otherwise 0.
+inline bool_t CBackgroundDetection::ForegroundDilation23Chroma(int8_t iNeighbourForegroundFlags, int32_t iStartSamplePos, int32_t iPicStrideUV, vBGDParam *pBgdParam)
+{
+	static const int8_t kaOUPos[4]	= {OU_LEFT, OU_RIGHT, OU_TOP, OU_BOTTOM};
+	// V component first, high probability because V stands for red color and human skin colors have more weight on this component;
+	// U component, which stands for blue color, has low probability
+	static const int32_t kaPlaneOrder[2]	= {2, 1};
+
+	// first sample of the left / right / top / bottom edge, relative to the block start
+	const int32_t aEdgeStart[4]	= {0, BGD_OU_SIZE_UV-1, 0, iPicStrideUV*(BGD_OU_SIZE_UV-1)};
+	// vertical edges step by one row, horizontal edges by one sample
+	const int32_t aEdgeStep[4]	= {iPicStrideUV, iPicStrideUV, 1, 1};
+
+	for (int32_t p = 0; p < 2; p++)
+	{
+		const int32_t iPlane = kaPlaneOrder[p];
+		for (int32_t e = 0; e < 4; e++)
+		{
+			if (!(iNeighbourForegroundFlags & kaOUPos[e]))
+				continue;
+
+			uint8_t *pRefC = pBgdParam->pRef[iPlane] + iStartSamplePos + aEdgeStart[e];
+			uint8_t *pCurC = pBgdParam->pCur[iPlane] + iStartSamplePos + aEdgeStart[e];
+			if (CalculateAsdChromaEdge(pRefC, pCurC, aEdgeStep[e]) > BGD_THD_ASD_UV)
+				return 1;
+		}
+	}
+
+	return 0;
+}
+
+// Re-examines an OU currently flagged as background and may turn it into foreground,
+// depending on how many of its four neighbours are background.
+inline void CBackgroundDetection::ForegroundDilation(SBackgroundOU *pBackgroundOU, SBackgroundOU *pOUNeighbours[], vBGDParam *pBgdParam, int32_t	iChromaSampleStartPos)
+{
+	// OUs with a small SAD keep their flag untouched
+	if (!(pBackgroundOU->iSAD > BGD_OU_SIZE*Q_FACTOR))
+		return;
+
+	const int32_t iBgNeighbours = pOUNeighbours[0]->iBackgroundFlag + pOUNeighbours[1]->iBackgroundFlag
+		+ pOUNeighbours[2]->iBackgroundFlag + pOUNeighbours[3]->iBackgroundFlag;
+
+	if (iBgNeighbours == 0 || iBgNeighbours == 1)
+	{
+		pBackgroundOU->iBackgroundFlag = 0;
+		return;
+	}
+
+	if (iBgNeighbours != 2 && iBgNeighbours != 3)
+		return;
+
+	pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Luma(pBackgroundOU, pOUNeighbours);
+
+	// chroma component check, only when luma still says background
+	if (pBackgroundOU->iBackgroundFlag != 1)
+		return;
+
+	int8_t	iNeighbourForegroundFlags = !pOUNeighbours[0]->iBackgroundFlag | ((!pOUNeighbours[1]->iBackgroundFlag)<<1)
+		| ((!pOUNeighbours[2]->iBackgroundFlag)<<2) | ((!pOUNeighbours[3]->iBackgroundFlag)<<3);
+	pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Chroma(iNeighbourForegroundFlags, iChromaSampleStartPos, pBgdParam->iStride[1], pBgdParam);
+}
+// Re-examines an OU currently flagged as foreground and may turn it into background
+// when its SAD is comparable to that of its background neighbours.
+inline void CBackgroundDetection::BackgroundErosion(SBackgroundOU *pBackgroundOU, SBackgroundOU *pOUNeighbours[])
+{
+	if (!(pBackgroundOU->iMaxDiffSubSd <= (BGD_OU_SIZE*Q_FACTOR)))
+		return;
+
+	int32_t iBgNeighbours = 0;	// number of background neighbours
+	int32_t iBgNeighbourSad = 0;	// SAD summed over background neighbours only
+	for (int32_t k = 0; k < 4; k++)
+	{
+		const SBackgroundOU *pNbr = pOUNeighbours[k];
+		iBgNeighbours += pNbr->iBackgroundFlag;
+		// -flag is an all-ones mask for flag 1 and zero for flag 0
+		iBgNeighbourSad += pNbr->iSAD & (-pNbr->iBackgroundFlag);
+	}
+
+	if (!(pBackgroundOU->iSAD*iBgNeighbours <= ((3*iBgNeighbourSad)>>1)))
+		return;
+
+	if (iBgNeighbours == 4)
+	{
+		pBackgroundOU->iBackgroundFlag = 1;
+		return;
+	}
+
+	// left and right both background, or top and bottom both background
+	const bool bOpposingPair = (pOUNeighbours[0]->iBackgroundFlag & pOUNeighbours[1]->iBackgroundFlag)
+		|| (pOUNeighbours[2]->iBackgroundFlag & pOUNeighbours[3]->iBackgroundFlag);
+	if (bOpposingPair)
+	{
+		pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Luma(pBackgroundOU, pOUNeighbours);
+	}
+}
+
+// Writes iBackgroundMbFlag to the single entry pBackgroundMbFlag points at.
+// iPicWidthInMb is not used.
+inline void CBackgroundDetection::SetBackgroundMbFlag(int8_t *pBackgroundMbFlag,int32_t iPicWidthInMb, int32_t iBackgroundMbFlag)
+{
+	pBackgroundMbFlag[0] = iBackgroundMbFlag;
+}
+
+// Demotes pCurOU (and its MB flag) to foreground when its SAD is above the small-SAD
+// threshold and at most one of its four neighbours is background.
+// Reads pCurOU-1, pCurOU+1 and pCurOU -/+ iPicWidthInOU, so the caller must pass an interior OU.
+inline void CBackgroundDetection::UpperOUForegroundCheck(SBackgroundOU *pCurOU, int8_t *pBackgroundMbFlag, int32_t iPicWidthInOU, int32_t iPicWidthInMb)
+{
+	if (!(pCurOU->iSAD > BGD_OU_SIZE*Q_FACTOR))
+		return;
+
+	const int32_t iBgNeighbours = (pCurOU-1)->iBackgroundFlag
+		+ (pCurOU+1)->iBackgroundFlag
+		+ (pCurOU-iPicWidthInOU)->iBackgroundFlag
+		+ (pCurOU+iPicWidthInOU)->iBackgroundFlag;
+
+	if (iBgNeighbours <= 1)
+	{
+		SetBackgroundMbFlag(pBackgroundMbFlag,iPicWidthInMb,0);
+		pCurOU->iBackgroundFlag = 0;
+	}
+}
+
+// Second pass: walks the OU array in raster order, refines every OU's background flag from
+// its four neighbours and writes the result to the per-macroblock flag buffer.
+// At picture borders the missing neighbour is replaced by the OU itself.
+void CBackgroundDetection::ForegroundDilationAndBackgroundErosion(vBGDParam *pBgdParam)
+{
+	const int32_t kiStrideUV		= pBgdParam->iStride[1];
+	const int32_t kiWidthInOU		= pBgdParam->iBgdWidth  >> LOG2_BGD_OU_SIZE;
+	const int32_t kiHeightInOU		= pBgdParam->iBgdHeight >> LOG2_BGD_OU_SIZE;
+	const int32_t kiOURowStrideUV	= kiStrideUV << (LOG2_BGD_OU_SIZE-1);
+	const int32_t kiWidthInMb		= (15+pBgdParam->iBgdWidth)>>4;
+	const int32_t kiMbFlagRowStep	= OU_SIZE_IN_MB*kiWidthInMb;
+
+	SBackgroundOU *pCurOU	= pBgdParam->pOU_array;
+	int8_t *pMbFlagRow		= (int8_t *)pBgdParam->pBackgroundMbFlag;
+	SBackgroundOU *pOUNeighbours[4];//0: left; 1: right; 2: top; 3: bottom
+
+	pOUNeighbours[2] = pCurOU;//top OU of the first row is the OU itself
+	for (int32_t iRow = 0; iRow < kiHeightInOU; iRow++)
+	{
+		const bool bLastRow = (iRow == kiHeightInOU-1);
+		int8_t *pMbFlag = pMbFlagRow;
+
+		pOUNeighbours[0] = pCurOU;//left OU
+		pOUNeighbours[3] = bLastRow ? pCurOU : pCurOU + kiWidthInOU;//bottom OU
+
+		for (int32_t iCol = 0; iCol < kiWidthInOU; iCol++)
+		{
+			pOUNeighbours[1] = (iCol < kiWidthInOU-1) ? pCurOU + 1 : pCurOU;//right OU
+
+			if (pCurOU->iBackgroundFlag)
+				ForegroundDilation(pCurOU, pOUNeighbours, pBgdParam, iRow*kiOURowStrideUV+(iCol<<LOG2_BGD_OU_SIZE_UV));
+			else
+				BackgroundErosion(pCurOU, pOUNeighbours);
+
+			// re-check the OU above, now that the OU below it has been decided;
+			// only interior OUs qualify because the check reads all four of their neighbours
+			if (iRow>1 && iCol>0 && iCol<kiWidthInOU-1 && pOUNeighbours[2]->iBackgroundFlag==1)
+			{
+				UpperOUForegroundCheck(pOUNeighbours[2], pMbFlag-kiMbFlagRowStep, kiWidthInOU, kiWidthInMb);
+			}
+
+			SetBackgroundMbFlag(pMbFlag,kiWidthInMb,pCurOU->iBackgroundFlag);
+
+			// preparation for the next OU
+			pMbFlag += OU_SIZE_IN_MB;
+			pOUNeighbours[0] = pCurOU;
+			pOUNeighbours[2]++;
+			pOUNeighbours[3]++;
+			pCurOU++;
+		}
+
+		// the row just finished becomes the top row of the next one
+		pOUNeighbours[2] = pCurOU - kiWidthInOU;
+		pMbFlagRow += kiMbFlagRowStep;
+	}
+}
+
+// Runs the two detection passes in order on the given parameter block.
+void CBackgroundDetection::BackgroundDetection( vBGDParam *pBgdParam )
+{
+	ForegroundBackgroundDivision(pBgdParam);            // step 1: coarse foreground/background division
+	ForegroundDilationAndBackgroundErosion(pBgdParam);  // step 2: foreground dilation and background erosion
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/processing/src/backgounddetection/BackgroundDetection.h
@@ -1,0 +1,104 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	       :  BackgroundDetection.h
+ *
+ * \brief	     :  background detection class of wels video processor class
+ *
+ * \date        :  2011/03/17
+ *
+ * \description :  1. rewrite the package code of background detection class  
+ *
+ */
+
+#ifndef _WELSVP_BACKGROUNDDETECTION_H
+#define _WELSVP_BACKGROUNDDETECTION_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+// Per-OU statistics and classification result used by background detection.
+typedef struct
+{
+	int32_t	iBackgroundFlag;	// 1: background, 0: foreground
+	int32_t	iSAD;				// sum of the four sub-block SADs
+	int32_t	iSD;				// absolute value of the summed sub-block sums of differences
+	int32_t	iMAD;				// largest sub-block max-absolute-difference
+	int32_t	iMinSubMad;			// smallest sub-block max-absolute-difference
+	int32_t	iMaxDiffSubSd;		// largest minus smallest sub-block sum of differences
+} SBackgroundOU;
+
+// Strategy that flags macroblocks as background by comparing the current picture with a reference.
+// Set() supplies the VAA results and the output flag buffer; Process() runs the detection.
+class CBackgroundDetection : public IStrategy
+{
+public:
+	CBackgroundDetection(int32_t iCpuFlag);
+	~CBackgroundDetection();
+
+	EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *pRef);
+	EResult Set    (int32_t iType, void *pParam);
+
+private:
+	// Everything one detection run needs; filled by Process() and Set().
+	struct vBGDParam
+	{
+		uint8_t   *pCur[3];				// planes of the current picture
+		uint8_t   *pRef[3];				// planes of the reference picture
+		int32_t	   iBgdWidth;
+		int32_t	   iBgdHeight;
+		int32_t    iStride[3];
+		SBackgroundOU	  *pOU_array;	// one entry per OU, owned by this object
+		int8_t	  *pBackgroundMbFlag;	// output flags, provided through Set()
+		SVAACalcResult  *pCalcRes;		// input statistics, provided through Set()
+	}m_BgdParam;
+
+	int32_t     m_iLargestFrameSize;	// width*height the OU array was last allocated for
+
+private:
+	inline SBackgroundOU* AllocateOUArrayMemory(int32_t iWidth, int32_t iHeight);
+	inline void     FreeOUArrayMemory();
+	inline int32_t  CalculateAsdChromaEdge( uint8_t *pOriRef, uint8_t *pOriCur, int32_t iStride );
+	inline bool_t   ForegroundDilation23Luma(SBackgroundOU *pBackgroundOU, SBackgroundOU *pOUNeighbours[]);//Foreground_Dilation_2_3_Luma
+	inline bool_t   ForegroundDilation23Chroma(int8_t iNeighbourForegroundFlags, int32_t iStartSamplePos, int32_t iPicStrideUV, vBGDParam *pBgdParam);//Foreground_Dilation_2_3_Chroma
+	inline void     ForegroundDilation(SBackgroundOU *pBackgroundOU, SBackgroundOU *pOUNeighbours[], vBGDParam *pBgdParam, int32_t	iChromaSampleStartPos);
+	inline void     BackgroundErosion(SBackgroundOU *pBackgroundOU, SBackgroundOU *pOUNeighbours[]);
+	inline void     SetBackgroundMbFlag(int8_t *pBackgroundMbFlag,int32_t iPicWidthInMb, int32_t iBackgroundMbFlag);
+	inline void     UpperOUForegroundCheck(SBackgroundOU *pCurOU, int8_t *pBackgroundMbFlag, int32_t iPicWidthInOU, int32_t iPicWidthInMb);
+
+	void    GetOUParameters( SVAACalcResult *sVaaCalcInfo, int32_t iMbIndex, int32_t iMbWidth, SBackgroundOU* pBackgroundOU);
+	void    ForegroundBackgroundDivision(vBGDParam *pBgdParam);
+	void    ForegroundDilationAndBackgroundErosion(vBGDParam *pBgdParam);
+	void    BackgroundDetection( vBGDParam *pBgdParam );
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/processing/src/common/WelsFrameWork.cpp
@@ -1,0 +1,318 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "WelsFrameWork.h"
+#include "cpu.h"
+#include "../denoise/denoise.h"
+#include "../downsample/downsample.h"
+#include "../scenechangedetection/SceneChangeDetection.h"
+#include "../vaacalc/vaacalculation.h"
+#include "../backgounddetection/BackgroundDetection.h"
+#include "../adaptivequantization/AdaptiveQuantization.h"
+#include "../complexityanalysis/ComplexityAnalysis.h"
+#include "../imagerotate/imagerotate.h"
+
+
+/* interface API implement */
+
+// Creates a video-processing interface. Bit 0x8000 of iVersion selects the IWelsVP
+// overload; otherwise any of the low 15 bits selects the IWelsVPc overload.
+// An iVersion of 0 yields RET_INVALIDPARAM.
+EResult WELSAPI CreateVpInterface  (void **ppCtx, int iVersion)
+{
+	if ((iVersion & 0x8000) != 0)
+		return nsWelsVP::CreateSpecificVpInterface((IWelsVP **)ppCtx);
+
+	if ((iVersion & 0x7fff) != 0)
+		return nsWelsVP::CreateSpecificVpInterface((IWelsVPc **)ppCtx);
+
+	return RET_INVALIDPARAM;
+}
+
+// Destroys an interface created by CreateVpInterface; iVersion selects the overload
+// the same way as on creation. An iVersion of 0 yields RET_INVALIDPARAM.
+EResult WELSAPI DestroyVpInterface  (void *pCtx, int iVersion)
+{
+	if ((iVersion & 0x8000) != 0)
+		return nsWelsVP::DestroySpecificVpInterface((IWelsVP *)pCtx);
+
+	if ((iVersion & 0x7fff) != 0)
+		return nsWelsVP::DestroySpecificVpInterface((IWelsVPc *)pCtx);
+
+	return RET_INVALIDPARAM;
+}
+
+WELSVP_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////
+
+// Creates a CVpFrameWork and returns it through *ppCtx.
+// Returns RET_INVALIDPARAM when ppCtx is NULL, RET_SUCCESS when the object was stored in
+// *ppCtx, and RET_FAILED when the allocation yielded no object.
+EResult CreateSpecificVpInterface(IWelsVP **ppCtx)
+{
+	EResult  eReturn = RET_FAILED;
+
+	// *ppCtx is written below, so a NULL output slot cannot be served
+	if (ppCtx == NULL)
+		return RET_INVALIDPARAM;
+
+	CVpFrameWork *pFr = new CVpFrameWork(1, eReturn);
+	if (pFr)
+	{
+		*ppCtx  = (IWelsVP *)pFr;
+		eReturn = RET_SUCCESS;
+	}
+
+	return eReturn;
+}
+
+// Deletes the given interface object via _SafeDelete; always reports RET_SUCCESS.
+EResult DestroySpecificVpInterface  (IWelsVP *pCtx)
+{
+	_SafeDelete(pCtx);
+	return RET_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Detects the CPU features, creates one strategy per method (slot i holds method i + 1,
+// NULL for methods CreateStrategy does not support) and initialises the mutex.
+// uiThreadsNum is not used; eReturn is always set to RET_SUCCESS.
+CVpFrameWork::CVpFrameWork(uint32_t uiThreadsNum, EResult &eReturn)
+{
+	int32_t iCoreNum = 0;
+	uint32_t uiCPUFlag = WelsCPUFeatureDetect(&iCoreNum);
+
+	for (int32_t i = 0; i < MAX_STRATEGY_NUM; i++)
+	{
+		// assign directly: the slot is still uninitialised here and must not be read first
+		m_pStgChain[i] = CreateStrategy(WelsStaticCast(EMethods, i + 1), uiCPUFlag);
+	}
+
+	WelsMutexInit(&m_mutes);
+
+	eReturn = RET_SUCCESS;
+}
+
+// Uninitialises and deletes every strategy that was created, then destroys the mutex.
+CVpFrameWork::~CVpFrameWork()
+{
+	for (int32_t i = 0; i < MAX_STRATEGY_NUM; i++)
+	{
+		if (!m_pStgChain[i])
+			continue;
+
+		Uninit(m_pStgChain[i]->m_eMethod);
+		_SafeDelete(m_pStgChain[i]);
+	}
+
+	WelsMutexDestroy(&m_mutes);
+}
+
+// (Re)initialises the strategy selected by iType with pCfg: the strategy is first
+// uninitialised, then its Init() is called under the framework mutex.
+// Returns the strategy's result, RET_SUCCESS when the slot holds no strategy, or
+// RET_INVALIDPARAM when iType does not map to a slot of the strategy table.
+EResult CVpFrameWork::Init(int32_t iType, void *pCfg)
+{
+	EResult eReturn   = RET_SUCCESS;
+	int32_t iCurIdx    = WelsStaticCast(int32_t, WelsVpGetValidMethod(iType)) - 1;
+
+	// slot index is method - 1; reject METHOD_NULL (index -1) and anything past the table
+	if (iCurIdx < 0 || iCurIdx >= MAX_STRATEGY_NUM)
+		return RET_INVALIDPARAM;
+
+	Uninit(iType);
+
+	WelsMutexLock(&m_mutes);
+
+	IStrategy *pStrategy = m_pStgChain[iCurIdx];
+	if (pStrategy)
+		eReturn = pStrategy->Init(0, pCfg);
+
+	WelsMutexUnlock(&m_mutes);
+
+	return eReturn;
+}
+
+// Calls Uninit() on the strategy selected by iType, under the framework mutex.
+// Returns the strategy's result, RET_SUCCESS when the slot holds no strategy, or
+// RET_INVALIDPARAM when iType does not map to a slot of the strategy table.
+EResult CVpFrameWork::Uninit(int32_t iType)
+{
+	EResult eReturn        = RET_SUCCESS;
+	int32_t iCurIdx    = WelsStaticCast(int32_t, WelsVpGetValidMethod(iType)) - 1;
+
+	// slot index is method - 1; reject METHOD_NULL (index -1) and anything past the table
+	if (iCurIdx < 0 || iCurIdx >= MAX_STRATEGY_NUM)
+		return RET_INVALIDPARAM;
+
+	WelsMutexLock(&m_mutes);
+
+	IStrategy *pStrategy = m_pStgChain[iCurIdx];
+	if (pStrategy)
+		eReturn = pStrategy->Uninit(0);
+
+	WelsMutexUnlock(&m_mutes);
+
+	return eReturn;
+}
+
+// No-op at framework level: nothing is forwarded to the strategies; always RET_SUCCESS.
+EResult CVpFrameWork::Flush(int32_t iType)
+{
+	return RET_SUCCESS;
+}
+
+// Validates the pix maps and runs the strategy selected by iType under the framework mutex.
+// The strategy receives local copies of the pix maps (zero-filled for a NULL argument), so
+// changes it makes to the SPixMap structures themselves are not visible to the caller.
+// Returns RET_INVALIDPARAM when iType does not map to a strategy slot or CheckValid rejects
+// the input, RET_NOTSUPPORTED when the slot holds no strategy, else the strategy's result.
+EResult CVpFrameWork::Process(int32_t iType, SPixMap *pSrcPixMap, SPixMap *pDstPixMap)
+{
+	EResult eReturn        = RET_NOTSUPPORTED;
+	EMethods eMethod    = WelsVpGetValidMethod(iType);
+	int32_t iCurIdx    = WelsStaticCast(int32_t, eMethod) - 1;
+	SPixMap sSrcPic;
+	SPixMap sDstPic;
+	memset(&sSrcPic, 0, sizeof(sSrcPic));// confirmed_safe_unsafe_usage
+	memset(&sDstPic, 0, sizeof(sDstPic));// confirmed_safe_unsafe_usage
+
+	// slot index is method - 1; CheckValid only rejects METHOD_NULL, so bound the index here as well
+	if (iCurIdx < 0 || iCurIdx >= MAX_STRATEGY_NUM)
+		return RET_INVALIDPARAM;
+
+	if (pSrcPixMap) sSrcPic = *pSrcPixMap;
+	if (pDstPixMap) sDstPic = *pDstPixMap;
+	if (!CheckValid(eMethod, sSrcPic, sDstPic))
+		return RET_INVALIDPARAM;
+
+	WelsMutexLock(&m_mutes);
+
+	IStrategy *pStrategy = m_pStgChain[iCurIdx];
+	if (pStrategy)
+		eReturn = pStrategy->Process(0, &sSrcPic, &sDstPic);
+
+	WelsMutexUnlock(&m_mutes);
+
+	return eReturn;
+}
+
+// Forwards a Get() request to the strategy selected by iType, under the framework mutex.
+// Returns RET_INVALIDPARAM for a NULL pParam or an iType that maps to no strategy slot,
+// RET_SUCCESS when the slot holds no strategy, else the strategy's result.
+EResult CVpFrameWork::Get(int32_t iType, void *pParam)
+{
+	EResult eReturn        = RET_SUCCESS;
+	int32_t iCurIdx    = WelsStaticCast(int32_t, WelsVpGetValidMethod(iType)) - 1;
+
+	if (!pParam)
+		return RET_INVALIDPARAM;
+
+	// slot index is method - 1; reject METHOD_NULL (index -1) and anything past the table
+	if (iCurIdx < 0 || iCurIdx >= MAX_STRATEGY_NUM)
+		return RET_INVALIDPARAM;
+
+	WelsMutexLock(&m_mutes);
+
+	IStrategy *pStrategy = m_pStgChain[iCurIdx];
+	if (pStrategy)
+		eReturn = pStrategy->Get(0, pParam);
+
+	WelsMutexUnlock(&m_mutes);
+
+	return eReturn;
+}
+
+// Forwards a Set() request to the strategy selected by iType, under the framework mutex.
+// Returns RET_INVALIDPARAM for a NULL pParam or an iType that maps to no strategy slot,
+// RET_SUCCESS when the slot holds no strategy, else the strategy's result.
+EResult CVpFrameWork::Set(int32_t iType, void *pParam)
+{
+	EResult eReturn        = RET_SUCCESS;
+	int32_t iCurIdx    = WelsStaticCast(int32_t, WelsVpGetValidMethod(iType)) - 1;
+
+	if (!pParam)
+		return RET_INVALIDPARAM;
+
+	// slot index is method - 1; reject METHOD_NULL (index -1) and anything past the table
+	if (iCurIdx < 0 || iCurIdx >= MAX_STRATEGY_NUM)
+		return RET_INVALIDPARAM;
+
+	WelsMutexLock(&m_mutes);
+
+	IStrategy *pStrategy = m_pStgChain[iCurIdx];
+	if (pStrategy)
+		eReturn = pStrategy->Set(0, pParam);
+
+	WelsMutexUnlock(&m_mutes);
+
+	return eReturn;
+}
+
+// No-op at framework level: the arguments are ignored; always RET_SUCCESS.
+EResult CVpFrameWork::SpecialFeature(int32_t iType, void *pIn, void *pOut)
+{
+	return RET_SUCCESS;
+}
+
+// Sanity-checks the method and the two pix maps before a strategy runs.
+// A pix map whose first plane pointer is NULL is treated as absent and not checked.
+// Returns TRUE when every applicable check passes, FALSE otherwise.
+bool_t  CVpFrameWork::CheckValid(EMethods eMethod, SPixMap &pSrcPixMap, SPixMap &pDstPixMap)
+{
+	if (eMethod == METHOD_NULL)
+		return FALSE;
+
+	// except for colour-space conversion, only I420/YV12 input is accepted and the
+	// destination (if present) must use the same format as the source
+	if (eMethod != METHOD_COLORSPACE_CONVERT && pSrcPixMap.pPixel[0])
+	{
+		if (pSrcPixMap.eFormat != VIDEO_FORMAT_I420 && pSrcPixMap.eFormat != VIDEO_FORMAT_YV12)
+			return FALSE;
+		if (pDstPixMap.pPixel[0] && pDstPixMap.eFormat != pSrcPixMap.eFormat)
+			return FALSE;
+	}
+
+	if (pSrcPixMap.pPixel[0])
+	{
+		// size within (0, MAX_WIDTH] x (0, MAX_HEIGHT]
+		if (pSrcPixMap.sRect.iRectWidth <= 0 || pSrcPixMap.sRect.iRectWidth > MAX_WIDTH)
+			return FALSE;
+		if (pSrcPixMap.sRect.iRectHeight <= 0 || pSrcPixMap.sRect.iRectHeight > MAX_HEIGHT)
+			return FALSE;
+		// origin below the size, and a row must fit into the luma stride
+		if (pSrcPixMap.sRect.iRectTop >= pSrcPixMap.sRect.iRectHeight || pSrcPixMap.sRect.iRectLeft >= pSrcPixMap.sRect.iRectWidth)
+			return FALSE;
+		if (pSrcPixMap.sRect.iRectWidth > pSrcPixMap.iStride[0])
+			return FALSE;
+	}
+
+	if (pDstPixMap.pPixel[0])
+	{
+		if (pDstPixMap.sRect.iRectWidth <= 0 || pDstPixMap.sRect.iRectWidth > MAX_WIDTH)
+			return FALSE;
+		if (pDstPixMap.sRect.iRectHeight <= 0 || pDstPixMap.sRect.iRectHeight > MAX_HEIGHT)
+			return FALSE;
+		if (pDstPixMap.sRect.iRectTop >= pDstPixMap.sRect.iRectHeight || pDstPixMap.sRect.iRectLeft >= pDstPixMap.sRect.iRectWidth)
+			return FALSE;
+		if (pDstPixMap.sRect.iRectWidth > pDstPixMap.iStride[0])
+			return FALSE;
+	}
+
+	return TRUE;
+}
+
+// Factory: returns a newly allocated strategy object for m_eMethod, constructed with
+// iCpuFlag, or NULL for colour-space conversion (not supported yet) and unknown methods.
+IStrategy* CVpFrameWork::CreateStrategy(EMethods m_eMethod, int32_t iCpuFlag)
+{
+	switch (m_eMethod)
+	{
+	case METHOD_DENOISE:
+		return WelsDynamicCast(IStrategy *, new CDenoiser(iCpuFlag));
+	case METHOD_SCENE_CHANGE_DETECTION:
+		return WelsDynamicCast(IStrategy *, new CSceneChangeDetection(iCpuFlag));
+	case METHOD_DOWNSAMPLE:
+		return WelsDynamicCast(IStrategy *, new CDownsampling(iCpuFlag));
+	case METHOD_VAA_STATISTICS:
+		return WelsDynamicCast(IStrategy *, new CVAACalculation(iCpuFlag));
+	case METHOD_BACKGROUND_DETECTION:
+		return WelsDynamicCast(IStrategy *, new CBackgroundDetection(iCpuFlag));
+	case METHOD_ADAPTIVE_QUANT:
+		return WelsDynamicCast(IStrategy *, new CAdaptiveQuantization(iCpuFlag));
+	case METHOD_COMPLEXITY_ANALYSIS:
+		return WelsDynamicCast(IStrategy *, new CComplexityAnalysis(iCpuFlag));
+	case METHOD_IMAGE_ROTATE:
+		return WelsDynamicCast(IStrategy *, new CImageRotating(iCpuFlag));
+	case METHOD_COLORSPACE_CONVERT:
+		//not support yet
+	default:
+		return NULL;
+	}
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/processing/src/common/WelsFrameWork.h
@@ -1,0 +1,121 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  WelsFrameWork.h
+ *
+ * \brief	    :  framework of wels video processor class
+ *
+ * \date        :  2011/01/04
+ *
+ * \description :  
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_WELSFRAMEWORK_H
+#define _WELSVP_WELSFRAMEWORK_H
+
+#include "../../interface/IWelsVP.h"
+#include "util.h"
+#include "thread.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+EResult CreateSpecificVpInterface (IWelsVP **ppCtx);
+EResult DestroySpecificVpInterface(IWelsVP *pCtx );
+
+EResult CreateSpecificVpInterface (IWelsVPc **ppCtx);
+EResult DestroySpecificVpInterface(IWelsVPc *pCtx );
+
+#define MAX_STRATEGY_NUM (METHOD_MASK - 1)
+
+// Base class of all processing strategies. Every operation except Process() has a default
+// implementation that does nothing and returns RET_SUCCESS.
+class IStrategy : public IWelsVP
+{
+public:
+	IStrategy()
+		: m_eMethod(METHOD_NULL)
+		, m_eFormat(VIDEO_FORMAT_I420)
+		, m_iIndex(0)
+		, m_bInit(FALSE)
+	{
+	}
+
+	virtual ~IStrategy() {}
+
+public:
+	virtual EResult Init(int32_t iType, void *pCfg)  { return RET_SUCCESS; }
+	virtual EResult Uninit(int32_t iType)              { return RET_SUCCESS; }
+	virtual EResult Flush(int32_t iType)               { return RET_SUCCESS; }
+	virtual EResult Get(int32_t iType, void *pParam) { return RET_SUCCESS; }
+	virtual EResult Set(int32_t iType, void *pParam) { return RET_SUCCESS; }
+	virtual EResult SpecialFeature(int32_t iType, void *pIn, void *pOut) { return RET_SUCCESS; }
+	// the one operation every concrete strategy has to provide
+	virtual EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *pDst) = 0;
+
+public:
+	EMethods       m_eMethod;	// method implemented by the strategy; METHOD_NULL until a subclass sets it
+	EVideoFormat m_eFormat;
+	int32_t           m_iIndex;
+	bool_t            m_bInit;
+};
+
+// Front end of the video processor: owns one strategy per method and dispatches each call
+// to the strategy selected by iType, serialised by a mutex.
+class CVpFrameWork : public IWelsVP
+{
+public:
+	CVpFrameWork(uint32_t uiThreadsNum, EResult &ret);
+	~CVpFrameWork();
+
+public:
+	EResult Init(int32_t iType, void *pCfg);
+	EResult Uninit(int32_t iType);
+	EResult Flush(int32_t iType);
+	EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *pDst);
+	EResult Get(int32_t iType, void *pParam);
+	EResult Set(int32_t iType, void *pParam);
+	EResult SpecialFeature(int32_t iType, void *pIn, void *pOut);
+
+private:
+	bool_t  CheckValid(EMethods eMethod, SPixMap &sSrc, SPixMap &sDst);
+	IStrategy *CreateStrategy(EMethods eMethod, int32_t iCpuFlag);
+
+private:
+	// slot i holds the strategy for method i + 1, or NULL when none was created
+	IStrategy *m_pStgChain[MAX_STRATEGY_NUM];
+
+	// guards the calls forwarded to the strategies
+	WELS_MUTEX m_mutes;
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/processing/src/common/WelsFrameWorkEx.cpp
@@ -1,0 +1,109 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "WelsFrameWork.h"
+
+#include <new>
+
+///////////////////////////////////////////////////////////////////////
+
+WELSVP_NAMESPACE_BEGIN
+
+// C-callable trampoline: forwards to IWelsVP::Init on the object behind pCtx.
+// A NULL context yields RET_INVALIDPARAM.
+EResult Init (void *pCtx, int32_t iType, void *pCfg)
+{
+	if (!pCtx)
+		return RET_INVALIDPARAM;
+
+	IWelsVP *pVp = WelsStaticCast(IWelsVP *, pCtx);
+	return pVp->Init(iType, pCfg);
+}
+// C-callable trampoline: forwards to IWelsVP::Uninit on the object behind pCtx.
+// A NULL context yields RET_INVALIDPARAM.
+EResult Uninit (void *pCtx, int32_t iType)
+{
+	if (!pCtx)
+		return RET_INVALIDPARAM;
+
+	IWelsVP *pVp = WelsStaticCast(IWelsVP *, pCtx);
+	return pVp->Uninit(iType);
+}
+// C-callable trampoline: forwards to IWelsVP::Flush on the object behind pCtx.
+// A NULL context yields RET_INVALIDPARAM.
+EResult Flush (void *pCtx, int32_t iType)
+{
+	if (!pCtx)
+		return RET_INVALIDPARAM;
+
+	IWelsVP *pVp = WelsStaticCast(IWelsVP *, pCtx);
+	return pVp->Flush(iType);
+}
+// C-callable trampoline: forwards to IWelsVP::Process on the object behind pCtx.
+// A NULL context yields RET_INVALIDPARAM.
+EResult Process (void *pCtx, int32_t iType, SPixMap *pSrc, SPixMap *dst)
+{
+	if (!pCtx)
+		return RET_INVALIDPARAM;
+
+	IWelsVP *pVp = WelsStaticCast(IWelsVP *, pCtx);
+	return pVp->Process(iType, pSrc, dst);
+}
+// C-callable trampoline: forwards to IWelsVP::Get on the object behind pCtx.
+// A NULL context yields RET_INVALIDPARAM.
+EResult Get (void *pCtx, int32_t iType, void *pParam)
+{
+	if (!pCtx)
+		return RET_INVALIDPARAM;
+
+	IWelsVP *pVp = WelsStaticCast(IWelsVP *, pCtx);
+	return pVp->Get(iType, pParam);
+}
+// C-callable trampoline: forwards to IWelsVP::Set on the object behind pCtx.
+// A NULL context yields RET_INVALIDPARAM.
+EResult Set (void *pCtx, int32_t iType, void *pParam)
+{
+	if (!pCtx)
+		return RET_INVALIDPARAM;
+
+	IWelsVP *pVp = WelsStaticCast(IWelsVP *, pCtx);
+	return pVp->Set(iType, pParam);
+}
+// C-callable trampoline: forwards to IWelsVP::SpecialFeature on the object behind pCtx.
+// A NULL context yields RET_INVALIDPARAM.
+EResult SpecialFeature (void *pCtx, int32_t iType, void *pIn, void *pOut)
+{
+	if (!pCtx)
+		return RET_INVALIDPARAM;
+
+	IWelsVP *pVp = WelsStaticCast(IWelsVP *, pCtx);
+	return pVp->SpecialFeature(iType, pIn, pOut);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*!
+ * \brief	Create the function-pointer (IWelsVPc) flavour of the interface.
+ *
+ * An IWelsVP object is created first through the IWelsVP overload; on success
+ * it is wrapped in a newly allocated IWelsVPc whose function pointers are the
+ * trampolines defined above and whose pCtx member carries the IWelsVP object.
+ *
+ * \param	pCtx	receives the new IWelsVPc on success; left untouched on failure
+ *
+ * \return	RET_SUCCESS on success,
+ *			RET_INVALIDPARAM when pCtx is NULL,
+ *			RET_OUTOFMEMORY when the wrapper cannot be allocated,
+ *			otherwise whatever the IWelsVP overload reported.
+ */
+EResult CreateSpecificVpInterface(IWelsVPc **pCtx)
+{
+	EResult  ret     = RET_FAILED;
+	IWelsVP *pWelsVP = NULL;
+
+	if (pCtx == NULL)
+		return RET_INVALIDPARAM;
+
+	ret = CreateSpecificVpInterface(&pWelsVP);
+	if (ret != RET_SUCCESS)
+		return ret;
+
+	// nothrow so that an allocation failure really reaches the NULL check
+	// below instead of escaping as an exception
+	IWelsVPc *pVPc = new (std::nothrow) IWelsVPc;
+	if (pVPc == NULL)
+	{
+		// do not leak the IWelsVP object that was just created
+		DestroySpecificVpInterface(pWelsVP);
+		return RET_OUTOFMEMORY;
+	}
+
+	pVPc->Init    = Init;
+	pVPc->Uninit  = Uninit;
+	pVPc->Flush   = Flush;
+	pVPc->Process = Process;
+	pVPc->Get     = Get;
+	pVPc->Set     = Set;
+	pVPc->SpecialFeature = SpecialFeature;
+	pVPc->pCtx       = WelsStaticCast(void *, pWelsVP);
+	*pCtx            = pVPc;
+
+	return ret;
+}
+
+// Tear down an IWelsVPc created by CreateSpecificVpInterface(IWelsVPc **):
+// first the IWelsVP object stored in pCtx->pCtx, then the wrapper itself.
+// Always reports RET_SUCCESS; a NULL argument is a no-op.
+EResult DestroySpecificVpInterface(IWelsVPc *pCtx)
+{
+	if (!pCtx)
+		return RET_SUCCESS;
+
+	IWelsVP *pWelsVP = WelsStaticCast(IWelsVP *, pCtx->pCtx);
+	DestroySpecificVpInterface(pWelsVP);
+	_SafeDelete(pCtx);
+
+	return RET_SUCCESS;
+}
+
+WELSVP_NAMESPACE_END
binary files /dev/null b/processing/src/common/WelsVP.aps differ
--- /dev/null
+++ b/processing/src/common/WelsVP.def
@@ -1,0 +1,36 @@
+;*!
+;* \copy
+;*     Copyright (c)  2011-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+
+LIBRARY		    welsvp.dll
+EXPORTS
+                CreateVpInterface    PRIVATE
+                DestroyVpInterface   PRIVATE      
\ No newline at end of file
--- /dev/null
+++ b/processing/src/common/WelsVP.rc
@@ -1,0 +1,115 @@
+// Microsoft Visual C++ generated resource script.
+//
+#include "resource.h"
+
+#define APSTUDIO_READONLY_SYMBOLS
+/////////////////////////////////////////////////////////////////////////////
+//
+// Generated from the TEXTINCLUDE 2 resource.
+//
+#include "afxres.h"
+
+/////////////////////////////////////////////////////////////////////////////
+#undef APSTUDIO_READONLY_SYMBOLS
+
+/////////////////////////////////////////////////////////////////////////////
+// Chinese (P.R.C.) resources
+
+#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_CHS)
+#ifdef _WIN32
+LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED
+#pragma code_page(936)
+#endif //_WIN32
+
+#ifdef APSTUDIO_INVOKED
+/////////////////////////////////////////////////////////////////////////////
+//
+// TEXTINCLUDE
+//
+
+1 TEXTINCLUDE 
+BEGIN
+    "resource.h\0"
+END
+
+2 TEXTINCLUDE 
+BEGIN
+    "#include ""afxres.h""\r\n"
+    "\0"
+END
+
+3 TEXTINCLUDE 
+BEGIN
+    "\r\n"
+    "\0"
+END
+
+#endif    // APSTUDIO_INVOKED
+
+#endif    // Chinese (P.R.C.) resources
+/////////////////////////////////////////////////////////////////////////////
+
+
+/////////////////////////////////////////////////////////////////////////////
+// English (U.S.) resources
+
+#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU)
+#ifdef _WIN32
+LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US
+#pragma code_page(1252)
+#endif //_WIN32
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Version
+//
+
+VS_VERSION_INFO VERSIONINFO
+ FILEVERSION 0,0,0,0
+ PRODUCTVERSION 0,0,0,0
+ FILEFLAGSMASK 0x3fL
+#ifdef _DEBUG
+ FILEFLAGS 0x1L
+#else
+ FILEFLAGS 0x0L
+#endif
+ FILEOS 0x40004L
+ FILETYPE 0x2L
+ FILESUBTYPE 0x0L
+BEGIN
+    BLOCK "StringFileInfo"
+    BEGIN
+        BLOCK "040904b0"
+        BEGIN
+            VALUE "Comments", "Cisco OpenH264  video preprocessing"
+            VALUE "CompanyName", "Cisco Systems"
+            VALUE "FileDescription", "Cisco OpenH264  video preprocessing"
+            VALUE "FileVersion", "0, 0, 0, 0"
+            VALUE "InternalName", "welsvp.dll"
+            VALUE "LegalCopyright", "(C) 2011-2015 Cisco and/or its affiliates. All rights reserved."
+            VALUE "OriginalFilename", "welsvp.dll"
+            VALUE "ProductName", "Cisco OpenH264 video preprocessing"
+            VALUE "ProductVersion", "0, 0, 0, 0"
+        END
+    END
+    BLOCK "VarFileInfo"
+    BEGIN
+        VALUE "Translation", 0x409, 1200
+    END
+END
+
+#endif    // English (U.S.) resources
+/////////////////////////////////////////////////////////////////////////////
+
+
+
+#ifndef APSTUDIO_INVOKED
+/////////////////////////////////////////////////////////////////////////////
+//
+// Generated from the TEXTINCLUDE 3 resource.
+//
+
+
+/////////////////////////////////////////////////////////////////////////////
+#endif    // not APSTUDIO_INVOKED
+
--- /dev/null
+++ b/processing/src/common/cpu.cpp
@@ -1,0 +1,213 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	cpu.c
+ *
+ * \brief	CPU compatibility detection
+ *
+ * \date	04/29/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "util.h"
+#include "cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+/* Vendor identification strings compared (strcmp) against the name that
+ * WelsCPUFeatureDetect assembles from the outputs of WelsCPUId(0, ...). */
+#define    CPU_Vender_AMD    "AuthenticAMD"
+#define    CPU_Vender_INTEL  "GenuineIntel"
+#define    CPU_Vender_CYRIX  "CyrixInstead"
+
+
+#if defined(X86_ASM)
+
+/*!
+ * \brief	Probe the processor through CPUID and translate the answers into WELS_CPU_* flags.
+ *
+ * \param	pNumberOfLogicProcessors	optional output; when non-NULL it receives bits 23..16 of
+ *			the second output register of WelsCPUId(1, ...). It is not written on the
+ *			early "return 0" paths.
+ *
+ * \return	bitwise OR of WELS_CPU_* flags, or 0 when CPUID is not available, when
+ *			WelsCPUId(0, ...) reports 0 as its first output, or when MMX is not reported.
+ */
+uint32_t WelsCPUFeatureDetect( int32_t *pNumberOfLogicProcessors )
+{
+    uint32_t uiCPU = 0;	
+    uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0;
+	int32_t  CacheLineSize = 0;
+	/* zero-initialised, so the 12 bytes written below stay NUL-terminated */
+	int8_t   chVenderName[16] = { 0 };	
+	
+    if( !WelsCPUIdVerify() )
+    {
+        /* cpuid is not supported in cpu */
+        return 0;
+    }
+	
+	/* index 0: outputs B, D, C land at offsets 0, 4, 8 of the vendor name buffer */
+	WelsCPUId( 0, &uiFeatureA, (uint32_t*)&chVenderName[0],(uint32_t*)&chVenderName[8],(uint32_t*)&chVenderName[4] );
+    if( uiFeatureA == 0 )
+    {
+		/* maximum input value for basic cpuid information */
+        return 0;
+    }
+	
+	/* index 1: feature bits; uiFeatureC / uiFeatureD are tested below */
+	WelsCPUId( 1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD );
+    if( (uiFeatureD & 0x00800000) == 0 )
+    {
+        /* Basic MMX technology is not support in cpu, mean nothing for us so return here */
+        return 0;
+    }
+	
+    uiCPU = WELS_CPU_MMX;
+    if( uiFeatureD & 0x02000000 )
+    {
+        /* SSE technology is identical to AMD MMX extensions */
+        uiCPU |= WELS_CPU_MMXEXT|WELS_CPU_SSE;
+    }
+    if( uiFeatureD & 0x04000000 )
+    {
+        /* SSE2 support here */
+        uiCPU |= WELS_CPU_SSE2;
+    }
+	if ( uiFeatureD & 0x00000001 )
+	{
+		/* x87 FPU on-chip checking */
+		uiCPU |= WELS_CPU_FPU;
+	}
+	if ( uiFeatureD & 0x00008000 )
+	{
+		/* CMOV instruction checking */
+		uiCPU |= WELS_CPU_CMOV;
+	}
+	/* the HTT flag is only honoured when the vendor string is Intel's */
+	if ( !strcmp((const str_t*)chVenderName,CPU_Vender_INTEL) )	// confirmed_safe_unsafe_usage
+	{
+		if ( uiFeatureD & 0x10000000 )
+		{
+			/* Multi-Threading checking: contains of multiple logic processors */
+			uiCPU |= WELS_CPU_HTT;
+		}
+	}	
+
+	if( uiFeatureC & 0x00000001 ){
+		/* SSE3 support here */
+		uiCPU |= WELS_CPU_SSE3;
+	}
+	if( uiFeatureC & 0x00000200 ){
+		/* SSSE3 support here */
+		uiCPU |= WELS_CPU_SSSE3;
+	}
+	if( uiFeatureC & 0x00080000 ){
+		/* SSE4.1 support here, 45nm Penryn processor */
+		uiCPU |= WELS_CPU_SSE41; 
+	}
+	if( uiFeatureC & 0x00100000 ){
+		/* SSE4.2 support here, next generation Nehalem processor */
+		uiCPU |= WELS_CPU_SSE42;
+	}
+	if ( WelsCPUSupportAVX( uiFeatureA, uiFeatureC ) )	// 
+	{
+		/* AVX supported */
+		uiCPU |= WELS_CPU_AVX;
+	}
+	if ( WelsCPUSupportFMA( uiFeatureA, uiFeatureC ) )	// 
+	{
+		/* AVX FMA supported */
+		uiCPU |= WELS_CPU_FMA;
+	}
+	if ( uiFeatureC & 0x02000000 )
+	{
+		/* AES checking */
+		uiCPU |= WELS_CPU_AES;
+	}
+	if ( uiFeatureC & 0x00400000 )
+	{
+		/* MOVBE checking */
+		uiCPU |= WELS_CPU_MOVBE;
+	}
+
+	if ( pNumberOfLogicProcessors != NULL )
+	{
+		// stored unconditionally, i.e. whether or not WELS_CPU_HTT was set above
+		*pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX		
+	}	
+	
+	/* index 0x80000000: the first output is compared against 0x80000001 below
+	   before that index is queried; from here on uiFeatureA..D no longer hold
+	   the index-1 values */
+    WelsCPUId( 0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD );
+
+	if( (!strcmp((const str_t*)chVenderName,CPU_Vender_AMD)) && (uiFeatureA>=0x80000001) ){	// confirmed_safe_unsafe_usage
+		WelsCPUId(0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD );
+		if( uiFeatureD&0x00400000 ){
+			uiCPU |= WELS_CPU_MMXEXT;
+		}
+		if( uiFeatureD&0x80000000 ){
+			uiCPU |= WELS_CPU_3DNOW;
+		}
+	}
+
+	if( !strcmp((const str_t*)chVenderName,CPU_Vender_INTEL) ){	// confirmed_safe_unsafe_usage
+		int32_t  family, model;
+
+		WelsCPUId(1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+		/* base fields (bits 11..8 / 7..4) combined with the extended fields (bits 27..20 / 19..16) */
+		family = ((uiFeatureA>>8)&0xf) + ((uiFeatureA>>20)&0xff);
+        model  = ((uiFeatureA>>4)&0xf) + ((uiFeatureA>>12)&0xf0);
+
+		/* NOTE(review): SSE2/SSE3 are masked off for family 6, models 9/13/14;
+		   the reason is not stated here — confirm before changing */
+		if( (family==6) && (model==9 || model==13 || model==14) ){
+			uiCPU &= ~(WELS_CPU_SSE2|WELS_CPU_SSE3);
+		}
+	}
+
+	// get cache line size (only attempted for Intel and Cyrix vendor strings)
+	if( (!strcmp((const str_t*)chVenderName,CPU_Vender_INTEL)) || !(strcmp((const str_t*)chVenderName,CPU_Vender_CYRIX)) ){	// confirmed_safe_unsafe_usage
+		WelsCPUId(1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+
+		CacheLineSize = (uiFeatureB&0xff00)>>5;	// ((clflush_line_size >> 8) << 3), CLFLUSH_line_size * 8 = CacheLineSize_in_byte
+
+		/* any other size leaves all WELS_CPU_CACHELINE_* flags clear */
+		if( CacheLineSize == 128 ){
+			uiCPU |= WELS_CPU_CACHELINE_128;
+		}
+		else if( CacheLineSize == 64 ){
+			uiCPU |= WELS_CPU_CACHELINE_64;
+		}
+		else if( CacheLineSize == 32 ){
+			uiCPU |= WELS_CPU_CACHELINE_32;
+		}
+		else if( CacheLineSize == 16 ){
+			uiCPU |= WELS_CPU_CACHELINE_16;
+		}
+	}
+	
+    return uiCPU;
+}
+
+
+/*!
+ * \brief	Call WelsEmms() when the given feature mask contains any of the
+ *			MMX / MMXEXT / 3DNow! / 3DNow!-ext flags; otherwise do nothing.
+ *
+ * \param	kuiCPU	WELS_CPU_* feature mask
+ */
+void WelsCPURestore( const uint32_t kuiCPU )
+{
+	const uint32_t kuiMmxFamily = WELS_CPU_MMX | WELS_CPU_MMXEXT | WELS_CPU_3DNOW | WELS_CPU_3DNOWEXT;
+
+	if( (kuiCPU & kuiMmxFamily) == 0 )
+		return;
+
+	WelsEmms();
+}
+
+#endif
+
+
+WELSVP_NAMESPACE_END
+
+
--- /dev/null
+++ b/processing/src/common/cpu.h
@@ -1,0 +1,102 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	cpu.h
+ *
+ * \brief	CPU feature compatibility detection
+ *
+ * \date	04/29/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_CPU_H
+#define _WELSVP_CPU_H
+
+#include "typedef.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+/*
+ *	WELS CPU feature flags
+ */ 
+#define WELS_CPU_MMX        0x00000001    /* mmx */
+#define WELS_CPU_MMXEXT     0x00000002    /* mmx-ext*/
+#define WELS_CPU_SSE        0x00000004    /* sse */
+#define WELS_CPU_SSE2       0x00000008    /* sse 2 */
+#define WELS_CPU_SSE3       0x00000010    /* sse 3 */
+#define WELS_CPU_SSE41      0x00000020    /* sse 4.1 */
+#define WELS_CPU_3DNOW      0x00000040    /* 3dnow! */
+#define WELS_CPU_3DNOWEXT   0x00000080    /* 3dnow! ext */
+#define WELS_CPU_ALTIVEC    0x00000100    /* altivec */
+#define WELS_CPU_SSSE3      0x00000200    /* ssse3 */
+#define WELS_CPU_SSE42      0x00000400    /* sse 4.2 */
+
+/* Additional CPU feature flags */
+#define WELS_CPU_AVX		0x00000800	/* Advanced Vector eXtensions */
+#define WELS_CPU_FPU		0x00001000	/* x87-FPU on chip */
+#define WELS_CPU_HTT		0x00002000	/* Hyper-Threading Technology (HTT), multi-threading enabled feature:
+										   physical processor package is capable of supporting more than one logical processor
+										*/
+#define WELS_CPU_CMOV		0x00004000	/* Conditional Move Instructions;
+										   also, if x87-FPU is present as indicated by the CPUID.FPU feature bit, then FCOMI and FCMOV are supported
+										*/
+#define WELS_CPU_MOVBE		0x00008000	/* MOVBE instruction */
+#define WELS_CPU_AES		0x00010000	/* AES instruction extensions */
+#define WELS_CPU_FMA		0x00020000	/* AVX VEX FMA instruction sets */
+
+#define WELS_CPU_CACHELINE_16    0x10000000    /* CacheLine Size 16 */
+#define WELS_CPU_CACHELINE_32    0x20000000    /* CacheLine Size 32 */
+#define WELS_CPU_CACHELINE_64    0x40000000    /* CacheLine Size 64 */
+#define WELS_CPU_CACHELINE_128   0x80000000    /* CacheLine Size 128 */
+
+/*
+ *	Interfaces for CPU core feature detection as below
+ */
+
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+
+/* Non-zero when the CPUID instruction can be used (cpu.cpp gives up with 0 flags otherwise). */
+int32_t WelsCPUIdVerify();
+
+/* Query CPUID with the given index; the four results are written through
+ * pFeatureA..pFeatureD (cpu.cpp's comments refer to pFeatureB as EBX). */
+void  WelsCPUId( uint32_t uiIndex, uint32_t *pFeatureA, uint32_t *pFeatureB, uint32_t *pFeatureC, uint32_t *pFeatureD );
+/* Both take the first and third outputs of WelsCPUId(1, ...) and return non-zero when the feature is usable. */
+int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx );
+int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx );
+
+/* NOTE(review): presumably executes EMMS to leave MMX state — implemented outside this file, confirm. */
+void  WelsEmms();
+
+WELSVP_EXTERN_C_END
+#endif
+
+/* Returns a WELS_CPU_* bitmask; pNumberOfLogicProcessors may be NULL.
+ * NOTE(review): declared unconditionally, but the definition in cpu.cpp is
+ * compiled only when X86_ASM is defined. */
+uint32_t WelsCPUFeatureDetect( int32_t *pNumberOfLogicProcessors );
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/processing/src/common/memory.cpp
@@ -1,0 +1,128 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "memory.h"
+
+WELSVP_NAMESPACE_BEGIN
+/////////////////////////////////////////////////////////////////////////////////
+
+ /*!
+  * \brief	Allocate a zero-filled block whose address is aligned with the mask ALIGNBYTES - 1.
+  *
+  * Layout of the underlying malloc() block:
+  *   [ padding ][ int32_t payload size ][ void* raw malloc address ][ payload ... ]
+  * WelsFree and InternalReallocate rely on the two header fields sitting
+  * directly below the returned pointer.
+  *
+  * \param	kuiSize	payload size in bytes
+  * \param	pTag	not used by this implementation
+  *
+  * \return	pointer to the payload, or NULL when malloc() fails or when the
+  *			payload size plus bookkeeping overhead does not fit in 32 bits.
+  *
+  * \note	the masking assumes ALIGNBYTES is a power of two — TODO confirm.
+  */
+ void * WelsMalloc( const uint32_t kuiSize, str_t *pTag )
+ {
+	 const int32_t kiSizeVoidPointer	= sizeof( void ** );
+	 const int32_t kiSizeInt32		= sizeof( int32_t );
+	 const int32_t kiAlignedBytes	= ALIGNBYTES - 1;
+
+	 const uint32_t kuiOverhead	= kiAlignedBytes + kiSizeVoidPointer + kiSizeInt32;
+	 const uint32_t kuiTotalSize	= kuiSize + kuiOverhead;
+
+	 // the sum is computed modulo 2^32: refuse requests that wrapped around
+	 // instead of handing out a block smaller than the caller asked for
+	 if ( kuiTotalSize < kuiSize )
+		 return NULL;
+
+	 uint8_t* pBuf		= (uint8_t *) ::malloc( kuiTotalSize );
+	 uint8_t* pAlignedBuf = NULL;
+
+	 if ( NULL == pBuf )
+		 return NULL;
+
+	 // to fill zero values
+	 WelsMemset( pBuf, 0, kuiTotalSize );
+
+	 // skip the worst-case overhead, then round down to the alignment boundary
+	 pAlignedBuf = pBuf + kiAlignedBytes + kiSizeVoidPointer + kiSizeInt32;
+	 pAlignedBuf -= WelsCastFromPointer(pAlignedBuf) & kiAlignedBytes;
+	 *( (void **) ( pAlignedBuf - kiSizeVoidPointer ) ) = pBuf;
+	 *( (int32_t *) ( pAlignedBuf - (kiSizeVoidPointer + kiSizeInt32) ) ) = kuiSize;
+
+	 return (pAlignedBuf);
+ }
+
+ /////////////////////////////////////////////////////////////////////////////
+
+ /*!
+  * \brief	Release a block obtained from WelsMalloc.
+  *
+  * \param	pPointer	payload pointer returned by WelsMalloc; NULL is ignored
+  * \param	pTag		not used by this implementation
+  */
+ void WelsFree( void* pPointer, str_t *pTag )
+ {
+	 if( NULL == pPointer )
+		 return;
+
+	 // WelsMalloc stored the raw malloc() address in the pointer-sized slot
+	 // immediately below the payload
+	 void **ppRawSlot = ( (void **) pPointer ) - 1;
+	 ::free( *ppRawSlot );
+ }
+
+ /////////////////////////////////////////////////////////////////////////////
+
+ /*!
+  * \brief	Move a WelsMalloc block into a new block of kuiSize bytes.
+  *
+  * \param	pPointer	block from WelsMalloc, or NULL (then this is a plain WelsMalloc)
+  * \param	kuiSize		requested payload size of the new block
+  * \param	pTag		forwarded to WelsMalloc / WelsFree
+  *
+  * \return	the new block with min(old size, kuiSize) bytes copied over and the
+  *			old block released; the old block itself when allocation failed but
+  *			it is already at least kuiSize bytes; NULL otherwise (the old block
+  *			is then still allocated and owned by the caller).
+  */
+ void* InternalReallocate( void *pPointer, const uint32_t kuiSize, str_t *pTag )
+ {
+	 if ( pPointer == NULL )
+		 return WelsMalloc( kuiSize, pTag );
+
+	 // payload size recorded by WelsMalloc below the raw-pointer slot
+	 const uint32_t kuiOldSize = *( (int32_t*) ( (uint8_t*) pPointer - sizeof( void ** ) - sizeof( int32_t ) ) );
+
+	 uint8_t* pNew = (uint8_t*)WelsMalloc( kuiSize, pTag );
+	 if ( NULL == pNew )
+	 {
+		 if ( kuiOldSize > 0 && kuiSize > 0 && kuiOldSize >= kuiSize )
+			 return (pPointer);
+		 return NULL;
+	 }
+
+	 if ( 0 == kuiSize )
+	 {
+		 // a zero-sized request still reports NULL, but the block that was
+		 // just allocated must not be leaked
+		 WelsFree( pNew, pTag );
+		 return NULL;
+	 }
+
+	 // an empty old block has nothing to copy, yet growing it is still valid
+	 if ( kuiOldSize > 0 )
+		 memcpy( pNew, pPointer, ( kuiOldSize < kuiSize ) ? kuiOldSize : kuiSize );
+
+	 WelsFree( pPointer, pTag );
+	 return (pNew);
+ }
+
+ /////////////////////////////////////////////////////////////////////////////
+
+ /*!
+  * \brief	Grow a WelsMalloc block when its current capacity is too small.
+  *
+  * \param	pPointer	current block (may be NULL)
+  * \param	pRealSize	in: current capacity of the block; out: capacity of the
+  *						new block when a reallocation took place
+  * \param	kuiSize		capacity required by the caller
+  * \param	pTag		forwarded to the allocator
+  *
+  * \return	pPointer itself when *pRealSize >= kuiSize; otherwise a new block of
+  *			(kuiSize rounded up to a multiple of 16) + 32 bytes. NULL when
+  *			pRealSize is NULL, the rounded size overflows 32 bits, or the
+  *			allocation fails; in those cases *pRealSize is not modified.
+  */
+ void* WelsRealloc( void *pPointer, uint32_t *pRealSize, const uint32_t kuiSize, str_t *pTag )
+ {
+	 if ( NULL == pRealSize )
+		 return NULL;
+
+	 if ( *pRealSize >= kuiSize )	// large enough of original block, so do nothing
+		 return (pPointer);
+
+	 // new request: round up to a multiple of 16, then add 32 bytes of slack
+	 uint32_t uiNewSize = kuiSize + 15;
+	 uiNewSize -= (uiNewSize & 15);
+	 uiNewSize += 32;
+
+	 // without wrap-around the result is always larger than kuiSize
+	 if ( uiNewSize < kuiSize )
+		 return NULL;
+
+	 void *pLocalPointer = InternalReallocate( pPointer, uiNewSize, pTag );
+	 if ( NULL != pLocalPointer )
+		 *pRealSize	= uiNewSize;
+
+	 return (pLocalPointer);
+ }
+
+ WELSVP_NAMESPACE_END
--- /dev/null
+++ b/processing/src/common/memory.h
@@ -1,0 +1,113 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  memory.h
+ *
+ * \brief	    :  memory definition for wels video processor class
+ *
+ * \date        :  2011/02/22
+ *
+ * \description :  
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_MEMORY_H
+#define _WELSVP_MEMORY_H
+
+#include "util.h"
+#include "typedef.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+// Thin wrapper over ::memset with a 32-bit size; returns what ::memset returns.
+inline_t void * WelsMemset( void * pPointer, int32_t iValue, uint32_t uiSize)
+{
+	return ::memset(pPointer, iValue, uiSize);
+}
+
+// Thin wrapper over ::memcpy with a 32-bit size; returns what ::memcpy returns.
+inline_t void * WelsMemcpy( void *pDst, const void *kpSrc, uint32_t uiSize)
+{
+	return ::memcpy(pDst, kpSrc, uiSize);
+}
+
+// Thin wrapper over ::memcmp with a 32-bit size; returns what ::memcmp returns.
+inline_t int32_t WelsMemcmp( const void *kpBuf1, const void *kpBuf2, uint32_t uiSize)
+{
+	return ::memcmp( kpBuf1, kpBuf2, uiSize);
+}
+
+/*! 
+*************************************************************************************
+* \brief	zero-filled, aligned allocation used throughout Wels
+*
+* \param 	kuiSize	size in bytes of the memory block required
+* \param 	pTag	optional tag; not used by the implementation in memory.cpp
+*
+* \return	pointer to the allocated block, NULL on failure
+*
+* \note	release the block with WelsFree only
+*************************************************************************************
+*/
+void * WelsMalloc( const uint32_t kuiSize, str_t *pTag = NULL );
+
+/*! 
+*************************************************************************************
+* \brief	release a block obtained from WelsMalloc / WelsRealloc
+*
+* \param 	pPointer	the block itself (not its address); NULL is ignored
+* \param 	pTag		optional tag; not used by the implementation in memory.cpp
+*
+* \return	NONE
+*
+* \note	the caller's pointer is not reset
+*************************************************************************************
+*/
+void WelsFree( void * pPointer, str_t *pTag = NULL );
+
+/*! 
+*************************************************************************************
+* \brief	reallocation in Wels. Does nothing and keeps using the old block
+*		in case the block is currently large enough
+*
+* \param 	pPointer	memory block allocated earlier
+* \param	pRealSize	pointer to the current size of the block; updated to the
+*						size of the new block when a reallocation takes place
+* \param	kuiSize		new size of memory block requested
+* \param	pTag		optional tag forwarded to the allocator
+*
+* \return	pPointer when the block is already large enough, otherwise the
+*			reallocated block; NULL on failure (the old block stays allocated)
+*
+* \note	a new block is sized as kuiSize rounded up to a multiple of 16, plus 32 bytes
+*************************************************************************************
+*/
+void * WelsRealloc( void  *pPointer, uint32_t *pRealSize, const uint32_t kuiSize, str_t *pTag = NULL );
+
+//////////////////////////////////////////////////////////////////////////////////////
+WELSVP_NAMESPACE_END
+
+#endif
+
+
--- /dev/null
+++ b/processing/src/common/resource.h
@@ -1,0 +1,15 @@
+//{{NO_DEPENDENCIES}}
+// Microsoft Visual C++ generated include file.
+// Used by WelsVP.rc
+//
+
+// Next default values for new objects
+// 
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE        101
+#define _APS_NEXT_COMMAND_VALUE         40001
+#define _APS_NEXT_CONTROL_VALUE         1000
+#define _APS_NEXT_SYMED_VALUE           101
+#endif
+#endif
--- /dev/null
+++ b/processing/src/common/thread.cpp
@@ -1,0 +1,101 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	thread.cpp
+ *
+ * \brief	Interfaces introduced in thread programming
+ *
+ * \date	11/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "thread.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+#if defined(WIN32)
+
+/* Win32: initialise the critical section behind *mutex; always reports WELS_THREAD_ERROR_OK. */
+WELS_THREAD_ERROR_CODE    WelsMutexInit( WELS_MUTEX   * mutex )
+{
+	InitializeCriticalSection(mutex);
+
+	return WELS_THREAD_ERROR_OK;
+}
+
+/* Win32: enter the critical section behind *mutex; always reports WELS_THREAD_ERROR_OK. */
+WELS_THREAD_ERROR_CODE    WelsMutexLock( WELS_MUTEX   * mutex )
+{
+	EnterCriticalSection(mutex);
+
+	return WELS_THREAD_ERROR_OK;
+}
+
+/* Win32: leave the critical section behind *mutex; always reports WELS_THREAD_ERROR_OK. */
+WELS_THREAD_ERROR_CODE    WelsMutexUnlock( WELS_MUTEX * mutex )
+{
+	LeaveCriticalSection(mutex);
+
+	return WELS_THREAD_ERROR_OK;
+}
+
+/* Win32: delete the critical section behind *mutex; always reports WELS_THREAD_ERROR_OK. */
+WELS_THREAD_ERROR_CODE    WelsMutexDestroy( WELS_MUTEX * mutex )
+{
+    DeleteCriticalSection(mutex);
+
+	return WELS_THREAD_ERROR_OK;
+}
+
+#elif  defined(__GNUC__)
+
+/* pthread build: create *mutex with default attributes; the pthread result is returned as-is. */
+WELS_THREAD_ERROR_CODE    WelsMutexInit( WELS_MUTEX   * mutex )
+{
+	return pthread_mutex_init(mutex, NULL);
+}
+
+/* pthread build: lock *mutex; the pthread result is returned as-is. */
+WELS_THREAD_ERROR_CODE    WelsMutexLock( WELS_MUTEX   * mutex )
+{
+	return pthread_mutex_lock(mutex);
+}
+
+/* pthread build: unlock *mutex; the pthread result is returned as-is. */
+WELS_THREAD_ERROR_CODE    WelsMutexUnlock( WELS_MUTEX * mutex )
+{
+	return pthread_mutex_unlock(mutex);
+}
+
+/* pthread build: destroy *mutex; the pthread result is returned as-is. */
+WELS_THREAD_ERROR_CODE    WelsMutexDestroy( WELS_MUTEX * mutex )
+{
+    return pthread_mutex_destroy(mutex);
+}
+
+#endif
+
+WELSVP_NAMESPACE_END
+
+
+
--- /dev/null
+++ b/processing/src/common/thread.h
@@ -1,0 +1,89 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	thread.h
+ *
+ * \brief	Interfaces introduced in thread programming
+ *
+ * \date	11/17/2009 Created 
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_THREAD_H
+#define _WELSVP_THREAD_H
+
+#include "typedef.h"
+
+#if defined(WIN32)
+
+#include <windows.h>
+
+#elif defined(__GNUC__) 
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <signal.h>
+#include <errno.h>
+
+#endif//WIN32
+
+WELSVP_NAMESPACE_BEGIN
+
+// Platform-specific handle types. Only the WIN32 and __GNUC__ branches exist;
+// any other toolchain gets no WELS_THREAD_HANDLE / WELS_MUTEX definition.
+#if defined(WIN32)
+
+// Win32: thread handle is a HANDLE, mutex is a CRITICAL_SECTION.
+typedef  HANDLE            WELS_THREAD_HANDLE;
+typedef  CRITICAL_SECTION  WELS_MUTEX;
+
+#elif defined(__GNUC__) 
+
+// GCC builds: pthread types.
+typedef   pthread_t         WELS_THREAD_HANDLE;
+typedef   pthread_mutex_t   WELS_MUTEX;
+
+#endif
+
+// Status type returned by the thread/mutex wrappers (long_t is a plain long).
+typedef long_t WELS_THREAD_ERROR_CODE;
+
+// Status constants. The non-zero ones are written as unsigned long while
+// WELS_THREAD_ERROR_CODE is a signed long, so comparisons against them go
+// through the usual signed/unsigned conversion.
+// NOTE(review): 0x00000102 presumably mirrors Win32 WAIT_TIMEOUT -- confirm.
+// NOTE(review): "GENERIAL" looks like a typo for "GENERAL"; the name is part of
+// the interface, so it is left unchanged here.
+#define   WELS_THREAD_ERROR_OK					0
+#define   WELS_THREAD_ERROR_GENERIAL			((unsigned long)(-1))
+#define   WELS_THREAD_ERROR_WAIT_OBJECT_0		0
+#define	  WELS_THREAD_ERROR_WAIT_TIMEOUT		((unsigned long)0x00000102L)  
+#define	  WELS_THREAD_ERROR_WAIT_FAILED		    WELS_THREAD_ERROR_GENERIAL
+
+// Mutex wrappers implemented in thread.cpp.
+// WIN32 build: each always returns WELS_THREAD_ERROR_OK.
+// pthread build: each returns the result of the matching pthread_mutex_* call.
+WELS_THREAD_ERROR_CODE   WelsMutexInit( WELS_MUTEX   * mutex );
+WELS_THREAD_ERROR_CODE   WelsMutexLock( WELS_MUTEX   * mutex );
+WELS_THREAD_ERROR_CODE   WelsMutexUnlock( WELS_MUTEX * mutex );
+WELS_THREAD_ERROR_CODE   WelsMutexDestroy( WELS_MUTEX * mutex );
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/processing/src/common/typedef.h
@@ -1,0 +1,102 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  typedef.h
+ *
+ * \brief	    :  basic type definition 
+ *
+ * \date        :  2011/01/04
+ *
+ * \description :  1. Define platform-independent basic types;
+ *                 2. Define specific namespace to avoid name pollution;
+ *                 3. C++ ONLY;             
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_TYPEDEF_H
+#define _WELSVP_TYPEDEF_H
+
+// Open / close an extern "C" linkage block.
+#define WELSVP_EXTERN_C_BEGIN       extern "C" {
+#define WELSVP_EXTERN_C_END         }
+
+// Open / close the nsWelsVP namespace that wraps the video-processing code.
+#define WELSVP_NAMESPACE_BEGIN      namespace nsWelsVP {
+#define WELSVP_NAMESPACE_END        }
+
+WELSVP_NAMESPACE_BEGIN
+
+// Fixed-width integer aliases and the inline keyword, chosen per compiler.
+// These typedefs are declared inside namespace nsWelsVP.
+#if defined(WIN32) || defined(_WIN32) || defined(_MSC_VER)
+
+// NOTE(review): this branch maps int8_t to plain char, while the GCC branch
+// below uses signed char explicitly -- confirm plain char is signed for every
+// Windows build configuration in use.
+typedef char               int8_t   ;
+typedef unsigned char      uint8_t  ;
+typedef short              int16_t  ;
+typedef unsigned short     uint16_t ;
+typedef int                int32_t  ;
+typedef unsigned int       uint32_t ;
+typedef __int64            int64_t  ;
+typedef unsigned __int64   uint64_t ;
+#define inline_t           _inline
+
+#else	// GCC
+
+typedef signed char        int8_t   ; // [comment]: some compilers may identify the type "char" as "unsigned char" as default, so declare it explicit 
+typedef unsigned char      uint8_t  ;
+typedef signed short       int16_t  ;
+typedef unsigned short     uint16_t ;
+typedef signed int         int32_t  ;
+typedef unsigned int       uint32_t ;
+typedef long long          int64_t  ;
+typedef unsigned long long uint64_t ;
+#define inline_t           inline
+
+#endif 
+
+// Project-wide aliases for character, long and boolean values.
+typedef char    str_t    ; // [comment]: specific use plain char only for character parameters
+typedef long    long_t   ;
+typedef int32_t bool_t   ;
+
+// Floating-point aliases, only declared for the listed platforms/compilers.
+#if defined(WIN32) || defined(_MACH_PLATFORM) || defined(__GNUC__)
+typedef float   float_t  ;
+typedef double  double_t ; 
+#endif
+
+// Fallback NULL for translation units that have not defined it yet.
+#ifndef NULL
+#define NULL    0
+#endif
+
+// Boolean constants as an anonymous enum inside nsWelsVP.
+// NOTE(review): if FALSE/TRUE are already defined as macros before this header
+// is included (some platform headers do this), these enumerators would be
+// rewritten by the preprocessor -- confirm the include order.
+enum
+{
+   FALSE = 0,
+   TRUE  = !FALSE
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/processing/src/common/util.cpp
@@ -1,0 +1,46 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "util.h"
+
+WELSVP_NAMESPACE_BEGIN
+/////////////////////////////////////////////////////////////////////////////////
+
+
+/*!
+ * \brief	Compare two C strings by forwarding to the global strcmp
+ * \param	kpStr1	first string
+ * \param	kpStr2	second string
+ * \return	the value returned by strcmp(kpStr1, kpStr2)
+ */
+int32_t  WelsStrCmp(const str_t * kpStr1, const str_t * kpStr2)
+{
+	const int32_t kiOrder = ::strcmp(kpStr1, kpStr2);
+
+	return kiOrder;
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/processing/src/common/util.h
@@ -1,0 +1,108 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  util.h
+ *
+ * \brief	    :  utils for wels video processor class
+ *
+ * \date        :  2011/01/04
+ *
+ * \description :  
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_UTIL_H
+#define _WELSVP_UTIL_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+
+#include "typedef.h"
+#include "memory.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+// Size limits and the luma macroblock edge length used by the processing code.
+#define MAX_WIDTH      (4096)
+#define MAX_HEIGHT     (2304)//MAX_FS_LEVEL51 (36864); MAX_FS_LEVEL51*256/4096 = 2304
+#define MB_WIDTH_LUMA  (16)
+#define PESN		   (1e-6)	// desired float precision
+
+// Macroblock-type bit flags; IS_INTRA is non-zero when any intra bit is set.
+#define MB_TYPE_INTRA4x4		0x00000001
+#define MB_TYPE_INTRA16x16	0x00000002
+#define MB_TYPE_INTRA_PCM		0x00000004
+#define MB_TYPE_INTRA			  (MB_TYPE_INTRA4x4 | MB_TYPE_INTRA16x16 | MB_TYPE_INTRA_PCM)
+#define IS_INTRA(type) ((type)&MB_TYPE_INTRA)
+
+// Min/max/abs/clamp helpers. All of these may evaluate an argument more than
+// once, so do not pass expressions with side effects.
+// NOTE(review): WELS_SIGN shifts by a fixed 31 bits after casting to long_t
+// (a plain long); where long is wider than 32 bits this only yields 0 / -1 for
+// values that fit in 32 bits -- confirm the callers' value range.
+#define WELS_MAX(x, y)	((x) > (y) ? (x) : (y))
+#define WELS_MIN(x, y)	((x) < (y) ? (x) : (y))
+#define WELS_SIGN(a)	((long_t)(a) >> 31)
+#define WELS_ABS(a)		((WELS_SIGN(a) ^ (long_t)(a)) - WELS_SIGN(a))
+#define WELS_CLAMP(x, minv, maxv)  WELS_MIN(WELS_MAX(x, minv), maxv)
+
+#define ALIGNBYTES         (16)       /* Worst case is requiring alignment to a 16-byte boundary */
+// Round iInput up to the next multiple of ALIGNBYTES / 2 / 4 / 8.
+// WELS_ALIGN previously referenced ALIGNMENT, which this header never defines;
+// it now uses ALIGNBYTES declared just above. The argument is parenthesized in
+// every variant so that expressions such as (a | b) or (x << n) expand correctly.
+#define WELS_ALIGN(iInput)   (((iInput)+(ALIGNBYTES-1)) & ~(ALIGNBYTES-1))
+#define WELS_ALIGN2(iInput)  (((iInput)+1) & ~1)
+#define WELS_ALIGN4(iInput)  (((iInput)+3) & ~3)
+#define WELS_ALIGN8(iInput)  (((iInput)+7) & ~7)
+
+// Thin wrappers around the C++ cast operators.
+// NOTE(review): WelsCastFromPointer converts a pointer to long_t (a plain long);
+// on platforms where long is narrower than a pointer this cast cannot hold the
+// full address -- confirm the supported targets.
+#define WelsCastFromPointer(p)      (reinterpret_cast<long_t>(p))
+#define WelsStaticCast(type, p)  (static_cast<type>(p))
+#define WelsDynamicCast(type, p) (dynamic_cast<type>(p))
+
+// Field extraction from a packed type value: bits 0-7 and bits 8-15.
+#define GET_METHOD(x)  ((x) & 0xff)          // mask method as the lowest 8bits
+#define GET_SPECIAL(x) (((x) >> 8) & 0xff)   // mask special flag as 8bits
+
+// Extract the method id (lowest 8 bits of a) and clamp it into the range
+// [METHOD_NULL+1, METHOD_MASK-1] before converting it to EMethods.
+inline_t EMethods WelsVpGetValidMethod(int32_t a)
+{
+   const int32_t kiFirstValid = METHOD_NULL + 1;
+   const int32_t kiLastValid  = METHOD_MASK - 1;
+   int32_t iMethod = GET_METHOD(a);
+
+   // lower bound first, then upper bound (same order as WELS_MIN(WELS_MAX(..)))
+   if (!(iMethod > kiFirstValid))
+      iMethod = kiFirstValid;
+   if (!(iMethod < kiLastValid))
+      iMethod = kiLastValid;
+
+   return WelsStaticCast(EMethods, iMethod);
+}
+
+
+// Release p (via WelsFree / delete) when it is non-null, then reset it to NULL.
+// p is evaluated more than once and must be an assignable expression.
+// NOTE(review): these expand to a bare `if (...) { ... }`, so using them as the
+// body of an unbraced if/else can mis-associate the else -- wrap call sites in
+// braces.
+#define _SafeFree(p)		if (p) { WelsFree(p); (p) = NULL; }
+#define _SafeDelete(p)		if (p) { delete (p); (p) = NULL; }
+
+
+//////////////////////////////////////////////////////////////////////////////////////
+
+// String comparison helper; the definition in util.cpp forwards to ::strcmp.
+int32_t   WelsStrCmp(const str_t * kpStr1, const str_t * kpStr2);
+
+
+//////////////////////////////////////////////////////////////////////////////////////
+WELSVP_NAMESPACE_END
+
+#endif
+
+
--- /dev/null
+++ b/processing/src/complexityanalysis/ComplexityAnalysis.cpp
@@ -1,0 +1,325 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "ComplexityAnalysis.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Construct the strategy with a zeroed parameter block and no GOM SAD function
+// selected yet. iCpuFlag is not used by this constructor.
+CComplexityAnalysis::CComplexityAnalysis(int32_t iCpuFlag)
+{
+	WelsMemset( &m_sComplexityAnalysisParam, 0, sizeof(m_sComplexityAnalysisParam) );
+
+	m_pfGomSad = NULL;
+	m_eMethod  = METHOD_COMPLEXITY_ANALYSIS;
+}
+
+// Destructor: no cleanup is performed here.
+CComplexityAnalysis::~CComplexityAnalysis()
+{
+}
+
+// Run the analysis selected by m_sComplexityAnalysisParam.iComplexityAnalysisMode.
+// iType is not consulted. Returns RET_SUCCESS for FRAME_SAD / GOM_SAD / GOM_VAR
+// and RET_INVALIDPARAM for any other mode.
+EResult CComplexityAnalysis::Process(int32_t iType, SPixMap *pSrcPixMap, SPixMap *pRefPixMap)
+{
+	switch (m_sComplexityAnalysisParam.iComplexityAnalysisMode)
+	{
+	case FRAME_SAD:
+		AnalyzeFrameComplexityViaSad( pSrcPixMap, pRefPixMap );
+		return RET_SUCCESS;
+
+	case GOM_SAD:
+		AnalyzeGomComplexityViaSad( pSrcPixMap, pRefPixMap );
+		return RET_SUCCESS;
+
+	case GOM_VAR:
+		AnalyzeGomComplexityViaVar( pSrcPixMap, pRefPixMap );
+		return RET_SUCCESS;
+
+	default:
+		break;
+	}
+
+	// unknown analysis mode
+	return RET_INVALIDPARAM;
+}
+
+
+// Replace the stored parameter block with a copy of *pParam, which is
+// interpreted as an SComplexityAnalysisParam. iType is not consulted.
+// Returns RET_INVALIDPARAM when pParam is NULL, RET_SUCCESS otherwise.
+EResult CComplexityAnalysis::Set(int32_t iType, void *pParam)
+{
+	if (pParam != NULL)
+	{
+		SComplexityAnalysisParam *pNewParam = static_cast<SComplexityAnalysisParam *>(pParam);
+
+		m_sComplexityAnalysisParam = *pNewParam;
+		return RET_SUCCESS;
+	}
+
+	return RET_INVALIDPARAM;
+}
+
+// Copy the current iFrameComplexity into the caller's SComplexityAnalysisParam.
+// Only that one field is written; iType is not consulted.
+// Returns RET_INVALIDPARAM when pParam is NULL, RET_SUCCESS otherwise.
+EResult CComplexityAnalysis::Get(int32_t iType, void *pParam)
+{
+	if (pParam != NULL)
+	{
+		SComplexityAnalysisParam *pOut = static_cast<SComplexityAnalysisParam *>(pParam);
+
+		pOut->iFrameComplexity = m_sComplexityAnalysisParam.iFrameComplexity;
+		return RET_SUCCESS;
+	}
+
+	return RET_INVALIDPARAM;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+// Frame-level complexity: take the precomputed frame SAD from pCalcResult, or,
+// when iCalcBgd is set, the SAD recomputed without background macroblocks.
+void CComplexityAnalysis::AnalyzeFrameComplexityViaSad( SPixMap *pSrcPixMap, SPixMap *pRefPixMap )
+{
+	SComplexityAnalysisParam &sParam = m_sComplexityAnalysisParam;
+
+	sParam.iFrameComplexity = sParam.pCalcResult->iFrameSad;
+
+	if ( !sParam.iCalcBgd )	// BGD control off: keep the plain frame SAD
+		return;
+
+	sParam.iFrameComplexity = (int32_t)GetFrameSadExcludeBackground( pSrcPixMap, pRefPixMap );
+}
+
+// Sum the four 8x8 SADs of every macroblock that is either not flagged as
+// background or whose reference MB type is intra. For each such macroblock the
+// foreground counter of its GOM (pGomForegroundBlockNum[gom]) is incremented;
+// the counters are not reset here. pRefPixMap is not used.
+// Returns the accumulated SAD.
+int32_t CComplexityAnalysis::GetFrameSadExcludeBackground( SPixMap *pSrcPixMap, SPixMap *pRefPixMap )
+{
+	const int32_t kiPicWidth  = pSrcPixMap->sRect.iRectWidth;
+	const int32_t kiPicHeight = pSrcPixMap->sRect.iRectHeight;
+	const int32_t kiMbCols    = kiPicWidth  >> 4;
+	const int32_t kiMbRows    = kiPicHeight >> 4;
+	const int32_t kiMbTotal   = kiMbCols * kiMbRows;
+
+	const int32_t kiGomSize   = m_sComplexityAnalysisParam.iMbNumInGom;
+	const int32_t kiGomCount  = (kiMbTotal + kiGomSize - 1 ) / kiGomSize;
+
+	uint8_t  *pBgdFlags = (uint8_t *)m_sComplexityAnalysisParam.pBackgroundMbFlag;
+	uint32_t *pRefTypes = (uint32_t *)m_sComplexityAnalysisParam.uiRefMbType;
+	SVAACalcResult *pCalc = m_sComplexityAnalysisParam.pCalcResult;
+	int32_t  *pFgCount  = m_sComplexityAnalysisParam.pGomForegroundBlockNum;
+
+	uint32_t uiSadTotal = 0;
+
+	for ( int32_t iGom = 0; iGom < kiGomCount; ++iGom )
+	{
+		// macroblock index range [iMb, kiLast) covered by this GOM
+		const int32_t kiLast = WELS_MIN( (iGom + 1) * kiGomSize, kiMbTotal);
+		int32_t iMb = iGom * kiGomSize;
+
+		while ( iMb < kiLast )
+		{
+			const bool bForeground = ( pBgdFlags[iMb] == 0 ) || IS_INTRA(pRefTypes[iMb]);
+
+			if ( bForeground )
+			{
+				pFgCount[iGom]++;
+				for ( int32_t k = 0; k < 4; ++k )
+				{
+					uiSadTotal += pCalc->pSad8x8[iMb][k];
+				}
+			}
+			++iMb;
+		}
+	}
+
+	return (uiSadTotal);
+}
+
+
+// Select the per-macroblock SAD accumulator: the background-aware variant when
+// iCalcBgd is non-zero, the unconditional one otherwise.
+void InitGomSadFunc(PGOMSadFunc &pfGomSad, uint8_t iCalcBgd)
+{
+	pfGomSad = iCalcBgd ? GomSampleSadExceptBackground : GomSampleSad;
+}
+
+// Count one macroblock as foreground and add its four 8x8 SADs to *pGomSad.
+// pBackgroundMbFlag is ignored by this variant.
+void GomSampleSad(uint32_t *pGomSad, int32_t *pGomForegroundBlockNum, int32_t *pSad8x8, uint8_t pBackgroundMbFlag)
+{
+  ++(*pGomForegroundBlockNum);
+
+  for (int32_t k = 0; k < 4; ++k)
+  {
+    *pGomSad += pSad8x8[k];
+  }
+}
+
+// Like GomSampleSad, but only when pBackgroundMbFlag is 0: a non-zero flag
+// leaves both *pGomSad and *pGomForegroundBlockNum untouched.
+void GomSampleSadExceptBackground(uint32_t *pGomSad, int32_t *pGomForegroundBlockNum, int32_t *pSad8x8, uint8_t pBackgroundMbFlag)
+{
+  if ( pBackgroundMbFlag != 0 )
+    return;
+
+  ++(*pGomForegroundBlockNum);
+
+  for (int32_t k = 0; k < 4; ++k)
+  {
+    *pGomSad += pSad8x8[k];
+  }
+}
+
+// Per-GOM SAD complexity.
+// For every GOM (group of iMbNumInGom consecutive macroblocks in raster order)
+// the SAD accumulator chosen by InitGomSadFunc is applied to each macroblock;
+// the GOM total is stored in pGomComplexity[j] and the sum over all GOMs in
+// iFrameComplexity. pGomForegroundBlockNum[j] is incremented by the accumulator
+// and is not reset here.
+//
+// The previous version walked each GOM row by row and computed plane pointers,
+// strides and a sample index that were never read (the index for the source
+// plane was even derived from the reference stride). Those dead computations
+// are removed; the row-chunked walk visited exactly the macroblocks
+// [start, end) in increasing order, so a single loop is equivalent.
+// pRefPixMap is no longer touched by this function.
+void CComplexityAnalysis::AnalyzeGomComplexityViaSad( SPixMap *pSrcPixMap, SPixMap *pRefPixMap )
+{
+	int32_t iWidth     = pSrcPixMap->sRect.iRectWidth;
+	int32_t iHeight    = pSrcPixMap->sRect.iRectHeight;
+	int32_t iMbWidth  = iWidth  >> 4;
+	int32_t iMbHeight = iHeight >> 4;
+	int32_t iMbNum    = iMbWidth * iMbHeight;
+
+	int32_t iMbNumInGom = m_sComplexityAnalysisParam.iMbNumInGom;
+	int32_t iGomMbNum = (iMbNum + iMbNumInGom - 1 ) / iMbNumInGom;
+
+	uint8_t *pBackgroundMbFlag = (uint8_t *)m_sComplexityAnalysisParam.pBackgroundMbFlag;
+	uint32_t*uiRefMbType = (uint32_t *)m_sComplexityAnalysisParam.uiRefMbType;
+	SVAACalcResult *pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
+	int32_t  *pGomForegroundBlockNum = (int32_t *)m_sComplexityAnalysisParam.pGomForegroundBlockNum;
+	int32_t  *pGomComplexity = (int32_t *)m_sComplexityAnalysisParam.pGomComplexity;
+
+	uint32_t uiFrameSad = 0;
+
+	InitGomSadFunc( m_pfGomSad, m_sComplexityAnalysisParam.iCalcBgd );
+
+	for ( int32_t j = 0; j < iGomMbNum; j ++ )
+	{
+		const int32_t iGomMbStartIndex = j * iMbNumInGom;
+		const int32_t iGomMbEndIndex   = WELS_MIN( (j + 1) * iMbNumInGom, iMbNum);
+		uint32_t uiGomSad = 0;
+
+		for ( int32_t i = iGomMbStartIndex; i < iGomMbEndIndex; i ++ )
+		{
+			// a macroblock counts as background only if flagged AND its reference MB is not intra
+			m_pfGomSad(&uiGomSad, pGomForegroundBlockNum + j, pVaaCalcResults->pSad8x8[i], pBackgroundMbFlag[i] && !IS_INTRA(uiRefMbType[i]) );
+		}
+
+		pGomComplexity[j] = uiGomSad;
+		uiFrameSad += pGomComplexity[j];
+	}
+
+	m_sComplexityAnalysisParam.iFrameComplexity = uiFrameSad;
+}
+
+
+// Per-GOM variance-based complexity.
+// For every GOM (group of iMbNumInGom consecutive macroblocks in raster order)
+// this stores  sum(x^2) - (sum(x))^2 / N  in pGomComplexity[j], using the
+// per-macroblock pSum16x16 / pSumOfSquare16x16 values from pCalcResult, where
+// N is the number of luma samples in the GOM.
+//
+// Fixes relative to the previous version:
+//  * N was computed from the first macroblock row of the GOM only, although
+//    the sums cover the whole GOM; it now uses every macroblock in the GOM.
+//  * the sums and the squared sum are kept in 64 bits; uiSampleSum * uiSampleSum
+//    wrapped in 32-bit arithmetic once a GOM held more than a few macroblocks.
+//  * plane pointer / stride / sample-index computations that were never read
+//    are removed, and the row-chunked walk is replaced by the equivalent
+//    single loop over [start, end).
+// pRefPixMap is not used.
+void CComplexityAnalysis::AnalyzeGomComplexityViaVar( SPixMap *pSrcPixMap, SPixMap *pRefPixMap )
+{
+	int32_t iWidth     = pSrcPixMap->sRect.iRectWidth;
+	int32_t iHeight    = pSrcPixMap->sRect.iRectHeight;
+	int32_t iMbWidth  = iWidth  >> 4;
+	int32_t iMbHeight = iHeight >> 4;
+	int32_t iMbNum    = iMbWidth * iMbHeight;
+
+	int32_t iMbNumInGom = m_sComplexityAnalysisParam.iMbNumInGom;
+	int32_t iGomMbNum = (iMbNum + iMbNumInGom - 1 ) / iMbNumInGom;
+
+	SVAACalcResult *pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
+	int32_t  *pGomComplexity = (int32_t *)m_sComplexityAnalysisParam.pGomComplexity;
+
+	for ( int32_t j = 0; j < iGomMbNum; j ++ )
+	{
+		const int32_t iGomMbStartIndex = j * iMbNumInGom;
+		const int32_t iGomMbEndIndex   = WELS_MIN( (j + 1) * iMbNumInGom, iMbNum);
+		// luma samples covered by this GOM (16x16 per macroblock)
+		const int32_t iGomSampleNum    = (iGomMbEndIndex - iGomMbStartIndex) * MB_WIDTH_LUMA * MB_WIDTH_LUMA;
+
+		uint64_t uiSampleSum = 0;
+		uint64_t uiSquareSum = 0;
+
+		for ( int32_t i = iGomMbStartIndex; i < iGomMbEndIndex; i ++ )
+		{
+			uiSampleSum += pVaaCalcResults->pSum16x16[i];
+			uiSquareSum += pVaaCalcResults->pSumOfSquare16x16[i];
+		}
+
+		pGomComplexity[j] = (int32_t)(uiSquareSum - (uiSampleSum * uiSampleSum / iGomSampleNum));
+	}
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/processing/src/complexityanalysis/ComplexityAnalysis.h
@@ -1,0 +1,83 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+* \file	        :  ComplexityAnalysis.h
+*
+* \brief	    :  complexity analysis class of wels video processor class
+*
+* \date         :  2011/03/28
+*
+* \description  :  1. rewrite the package code of complexity analysis class  
+*
+*************************************************************************************
+*/
+
+#ifndef _WELSVP_COMPLEXITYANALYSIS_H
+#define _WELSVP_COMPLEXITYANALYSIS_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+// Signature of a per-macroblock SAD accumulator: adds the four values at
+// pSad8x8[0..3] to *pGomSad and bumps *pGomForegroundBlockNum. The last
+// argument is a flag value (not a pointer, despite the "p" prefix).
+typedef  void (GOMSadFunc) (uint32_t *pGomSad, int32_t *pGomForegroundBlockNum, int32_t *pSad8x8, uint8_t pBackgroundMbFlag);
+
+// Pointer to a GOMSadFunc.
+typedef GOMSadFunc  * PGOMSadFunc;
+
+// Accumulators defined in ComplexityAnalysis.cpp: the first always accumulates,
+// the second only when the flag argument is 0.
+GOMSadFunc      GomSampleSad;
+GOMSadFunc      GomSampleSadExceptBackground;
+
+// Complexity-analysis strategy. Set() stores an SComplexityAnalysisParam,
+// Process() runs the analysis selected by its iComplexityAnalysisMode, and
+// Get() reports the resulting iFrameComplexity.
+class CComplexityAnalysis : public IStrategy
+{			  
+public:
+	// iCpuFlag is accepted but not used by the constructor.
+	CComplexityAnalysis(int32_t iCpuFlag);
+	~CComplexityAnalysis();
+
+	// Dispatch on the configured mode; RET_INVALIDPARAM for an unknown mode.
+	EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *pRef);
+	// Copy *pParam (an SComplexityAnalysisParam) into the object.
+	EResult Set(int32_t iType, void *pParam);
+	// Write iFrameComplexity into *pParam (an SComplexityAnalysisParam).
+	EResult Get(int32_t iType, void *pParam);
+
+private:
+	// FRAME_SAD mode and its background-excluding helper.
+	void AnalyzeFrameComplexityViaSad(SPixMap *pSrc, SPixMap *pRef);
+	int32_t GetFrameSadExcludeBackground( SPixMap *pSrc, SPixMap *pRef );
+
+	// GOM_SAD and GOM_VAR modes.
+	void AnalyzeGomComplexityViaSad(SPixMap *pSrc, SPixMap *pRef);
+	void AnalyzeGomComplexityViaVar(SPixMap *pSrc, SPixMap *pRef);
+
+private:
+	// SAD accumulator chosen by InitGomSadFunc; NULL until GOM_SAD analysis runs.
+	PGOMSadFunc m_pfGomSad;
+	// Last parameter block passed to Set(); zero-filled by the constructor.
+	SComplexityAnalysisParam m_sComplexityAnalysisParam;
+};	
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/processing/src/denoise/denoise.cpp
@@ -1,0 +1,138 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "denoise.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+// Bytes per row for iWidth pixels of iBitcount bits each, with the bit count
+// rounded up to a multiple of 32 before converting to bytes.
+#define CALC_BI_STRIDE(iWidth, iBitcount)  ((((iWidth) * (iBitcount) + 31) & ~31) >> 3)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Set up the denoiser with its default radius, sigma and component mask, then
+// bind the filter function table according to the CPU flags.
+CDenoiser::CDenoiser(int32_t iCpuFlag)
+{
+	m_eMethod = METHOD_DENOISE;
+	m_CPUFlag = iCpuFlag;
+
+	// default filter parameters
+	m_uiType        = DENOISE_ALL_COMPONENT;
+	m_fSigmaGrey    = DENOISE_GRAY_SIGMA;
+	m_uiSpaceRadius = DENOISE_GRAY_RADIUS;
+
+	// clear the function table before filling it in
+	WelsMemset(&m_pfDenoise, 0, sizeof(m_pfDenoise));
+	InitDenoiseFunc(m_pfDenoise, m_CPUFlag);
+}
+
+// Destructor: no cleanup is performed here.
+CDenoiser::~CDenoiser()
+{
+}
+
+// Fill the filter table: C implementations by default, replaced by the SSE2
+// versions when built with X86_ASM and iCpuFlag has WELS_CPU_SSE2 set.
+void CDenoiser::InitDenoiseFunc(SDenoiseFuncs &denoiser,  int32_t iCpuFlag)
+{
+	denoiser.pfWaverageChromaFilter8 = WaverageChromaFilter8_c;
+	denoiser.pfBilateralLumaFilter8  = BilateralLumaFilter8_c;
+
+#if defined(X86_ASM)
+	const bool bHasSse2 = ( iCpuFlag & WELS_CPU_SSE2 ) != 0;
+
+	if ( bHasSse2 )
+	{
+		denoiser.pfWaverageChromaFilter8 = WaverageChromaFilter8_sse2;
+		denoiser.pfBilateralLumaFilter8  = BilateralLumaFilter8_sse2;
+	}
+#endif
+}
+
+// Denoise pSrc in place. The planes enabled in m_uiType are filtered: plane 0
+// with the luma filter at full size, planes 1 and 2 with the chroma filter at
+// half width and half height. iType and dst are not used.
+// Returns RET_INVALIDPARAM when any of the three plane pointers is NULL.
+EResult CDenoiser::Process(int32_t iType, SPixMap *pSrc, SPixMap *dst)
+{
+	uint8_t *pPlaneY = (uint8_t *)pSrc->pPixel[0];
+	uint8_t *pPlaneU = (uint8_t *)pSrc->pPixel[1];
+	uint8_t *pPlaneV = (uint8_t *)pSrc->pPixel[2];
+
+	if (!pPlaneY || !pPlaneU || !pPlaneV)
+	{
+		return RET_INVALIDPARAM;
+	}
+
+	const int32_t kiLumaWidth    = pSrc->sRect.iRectWidth;
+	const int32_t kiLumaHeight   = pSrc->sRect.iRectHeight;
+	const int32_t kiChromaWidth  = kiLumaWidth >> 1;
+	const int32_t kiChromaHeight = kiLumaHeight >> 1;
+
+	if (m_uiType & DENOISE_Y_COMPONENT)
+	{
+		BilateralDenoiseLuma(pPlaneY, kiLumaWidth, kiLumaHeight, pSrc->iStride[0]);
+	}
+
+	if (m_uiType & DENOISE_U_COMPONENT)
+	{
+		WaverageDenoiseChroma(pPlaneU, kiChromaWidth, kiChromaHeight, pSrc->iStride[1]);
+	}
+
+	if (m_uiType & DENOISE_V_COMPONENT)
+	{
+		WaverageDenoiseChroma(pPlaneV, kiChromaWidth, kiChromaHeight, pSrc->iStride[2]);
+	}
+
+	return RET_SUCCESS;
+}
+
+// Denoise a luma plane in place, skipping a border of m_uiSpaceRadius pixels on
+// every side. Each row is processed 8 pixels at a time by pfBilateralLumaFilter8
+// and the remaining pixels of the row one at a time by Gauss3x3Filter.
+//
+// Fix: the tail loop used to start at w + TAIL_OF_LINE8. When the 8-wide loop
+// exits, w >= iWidth - m_uiSpaceRadius - TAIL_OF_LINE8, so that start value was
+// already past the tail loop's own bound and the tail pixels were never
+// filtered. The tail now resumes at w, the first pixel the 8-wide loop did not
+// cover.
+void CDenoiser::BilateralDenoiseLuma(uint8_t * pSrcY, int32_t iWidth, int32_t iHeight, int32_t iStride)
+{
+	int32_t w;
+
+	pSrcY = pSrcY + m_uiSpaceRadius * iStride;
+	for(int32_t h = m_uiSpaceRadius;h < iHeight - m_uiSpaceRadius; h++)
+	{
+		// 8-pixel groups that fit entirely inside the filtered region
+		for(w = m_uiSpaceRadius; w < iWidth - m_uiSpaceRadius - TAIL_OF_LINE8; w+=8)
+		{
+			m_pfDenoise.pfBilateralLumaFilter8(pSrcY + w, iStride);
+		}
+		// leftover pixels at the end of the row
+		for( ; w < iWidth - m_uiSpaceRadius; w++)
+		{
+			Gauss3x3Filter(pSrcY + w, iStride);
+		}
+		pSrcY += iStride;
+	}
+}
+
+// Denoise a chroma plane in place, skipping a border of UV_WINDOWS_RADIUS
+// pixels on every side. Each row is processed 8 pixels at a time by
+// pfWaverageChromaFilter8 and the remaining pixels one at a time by
+// Gauss3x3Filter.
+//
+// Fix: the tail loop used to start at w + TAIL_OF_LINE8. When the 8-wide loop
+// exits, w >= iWidth - UV_WINDOWS_RADIUS - TAIL_OF_LINE8, so that start value
+// was already past the tail loop's own bound and the tail pixels were never
+// filtered. The tail now resumes at w, the first pixel the 8-wide loop did not
+// cover.
+void CDenoiser::WaverageDenoiseChroma(uint8_t *pSrcUV, int32_t iWidth, int32_t iHeight, int32_t iStride)
+{
+	int32_t w;
+
+	pSrcUV = pSrcUV + UV_WINDOWS_RADIUS * iStride;
+	for(int32_t h = UV_WINDOWS_RADIUS; h < iHeight - UV_WINDOWS_RADIUS; h++)
+	{
+		// 8-pixel groups that fit entirely inside the filtered region
+		for( w = UV_WINDOWS_RADIUS; w < iWidth - UV_WINDOWS_RADIUS - TAIL_OF_LINE8; w+=8)
+		{
+			m_pfDenoise.pfWaverageChromaFilter8(pSrcUV + w, iStride);
+		}
+
+		// leftover pixels at the end of the row
+		for( ; w < iWidth - UV_WINDOWS_RADIUS; w++)
+		{
+			Gauss3x3Filter(pSrcUV + w,iStride);
+		}
+		pSrcUV += iStride;
+	}
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/processing/src/denoise/denoise.h
@@ -1,0 +1,113 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  denoise.h
+ *
+ * \brief	    :  denoise class of wels video processor class
+ *
+ * \date        :  2011/03/15
+ *
+ * \description :  1. rewrite the package code of denoise class  
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_DENOISE_H
+#define _WELSVP_DENOISE_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+
+#define DENOISE_GRAY_RADIUS (1)	// offset from a sample to the top-left corner of its 3x3 luma window
+#define DENOISE_GRAY_SIGMA  (2)
+// radius of the 5x5 chroma window, and the number of samples that follow the first one in an 8-sample run
+#define UV_WINDOWS_RADIUS   (2)
+#define TAIL_OF_LINE8		(7)
+// bit flags tested against CDenoiser::m_uiType to select the planes to filter
+#define DENOISE_Y_COMPONENT (1)
+#define DENOISE_U_COMPONENT (2)
+#define DENOISE_V_COMPONENT (4)
+#define DENOISE_ALL_COMPONENT (7)
+
+
+WELSVP_NAMESPACE_BEGIN
+
+void Gauss3x3Filter(uint8_t *pixels, int32_t stride);	// filters the single sample at *pixels in place
+// signature shared by the 8-sample filters: pointer to the first sample and the row stride
+typedef void (DenoiseFilterFunc)(uint8_t *pixels, int32_t stride);
+
+typedef DenoiseFilterFunc *DenoiseFilterFuncPtr;
+// the *_c implementations: each call filters 8 consecutive samples in place
+DenoiseFilterFunc     BilateralLumaFilter8_c;
+DenoiseFilterFunc     WaverageChromaFilter8_c;
+// the *_sse2 variants are only declared for X86_ASM builds, inside WELSVP_EXTERN_C_BEGIN/END
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+	DenoiseFilterFunc     BilateralLumaFilter8_sse2 ;
+	DenoiseFilterFunc     WaverageChromaFilter8_sse2 ;
+WELSVP_EXTERN_C_END
+#endif
+
+typedef  struct TagDenoiseFuncs 	// filter entry points CDenoiser calls through m_pfDenoise
+{
+	DenoiseFilterFuncPtr	pfBilateralLumaFilter8;	// luma filter, works on 8 samples per call
+	DenoiseFilterFuncPtr	pfWaverageChromaFilter8;	// chroma filter, works on 8 samples per call
+} SDenoiseFuncs;
+
+class CDenoiser : public IStrategy	// strategy that denoises the Y/U/V planes of a picture in place
+{			  
+public:
+	CDenoiser(int32_t iCpuFlag);
+	~CDenoiser();
+	// filters pSrc in place according to m_uiType; iType and dst are not used by the implementation
+	EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *dst);
+
+private:
+	void InitDenoiseFunc(SDenoiseFuncs &pf, int32_t cpu);
+	void BilateralDenoiseLuma(uint8_t * p_y_data, int32_t width, int32_t height, int32_t stride);
+	void WaverageDenoiseChroma(uint8_t *pSrcUV, int32_t width, int32_t height, int32_t stride);
+
+private:
+	float_t	 m_fSigmaGrey;			//sigma for grey scale similarity, suggestion 2.5-3
+	uint32_t  m_uiFilterWindow;				//filter window diameter
+	uint16_t	 m_uiSpaceRadius;			//filter window radius: 1-3x3, 2-5x5, 3-7x7. Larger size, slower speed
+	uint16_t	 m_uiType;					//components to denoise, a mask of 1-Y, 2-U, 4-V: 7-YUV, 3-YU, 5-YV, 6-UV
+	uint32_t  *m_pGreyWeightTable;		//weight table for grey scale
+	// filter entry points (presumably filled by InitDenoiseFunc) and the CPU flags (presumably iCpuFlag) - TODO confirm
+	SDenoiseFuncs m_pfDenoise;
+	int32_t      m_CPUFlag;
+};	
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/processing/src/denoise/denoise_filter.cpp
@@ -1,0 +1,134 @@
+/*!
+ * \copy
+ *     Copyright (c)  2010-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	denoise_filter.cpp
+ *
+ * \brief	svc denoising
+ *
+ * \date	4/1/2010 Created
+ *
+ */
+
+#include "denoise.h"
+#include "../common/typedef.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+/*!
+ * \brief	Bilateral-style 3x3 filter applied in place to 8 consecutive samples.
+ *			A neighbour differing from the centre by d <= 32 gets the weight
+ *			((32 - d) * (32 - d)) >> 5, other neighbours get none; the centre
+ *			takes what is left of 256 and the weighted sum is divided by 256.
+ * \param	pSample	first of the 8 samples; one row above/below and one column
+ *					left/right of the run are read
+ * \param	iStride	distance between vertically adjacent samples
+ */
+void	BilateralLumaFilter8_c(uint8_t *pSample, int32_t iStride)
+{
+	uint8_t aFiltered[8];	// buffered so later samples of the run still read this run's unfiltered values
+
+	for (int32_t i = 0; i < 8; i++) {
+		const uint8_t *pCentre = pSample + i;
+		const int32_t iCentre = *pCentre;
+		const uint8_t *pLine = pCentre - iStride - DENOISE_GRAY_RADIUS;
+		int32_t iSum = 0, iWeightSum = 0;
+
+		for (int32_t y = 0; y < 3; y++, pLine += iStride) {
+			for (int32_t x = 0; x < 3; x++) {
+				if (x == 1 && y == 1)
+					continue;	// the centre is weighted after the loops
+				const int32_t iNeighbour = pLine[x];
+				const int32_t iCloseness = 32 - WELS_ABS(iNeighbour - iCentre);
+				if (iCloseness < 0)
+					continue;	// too different from the centre to contribute
+				const int32_t iWeight = (iCloseness * iCloseness) >> 5;
+				iSum += iNeighbour * iWeight;
+				iWeightSum += iWeight;
+			}
+		}
+		iSum += iCentre * (256 - iWeightSum);
+		aFiltered[i] = iSum >> 8;
+	}
+	WelsMemcpy(pSample, aFiltered, 8);
+}
+
+
+/***************************************************************************
+5x5 filter:
+1	1	2	1	1
+1	2	4	2	1
+2	4	20	4	2
+1	2	4	2	1
+1	1	2	1	1
+***************************************************************************/
+#define SUM_LINE1(pSample)	((pSample)[0] + (pSample)[1] + ((pSample)[2]<<1) + (pSample)[3] + (pSample)[4])
+#define SUM_LINE2(pSample)	((pSample)[0] + ((pSample)[1]<<1) + ((pSample)[2]<<2) + ((pSample)[3]<<1) + (pSample)[4])
+#define SUM_LINE3(pSample)	(((pSample)[0]<<1) + ((pSample)[1]<<2) + ((pSample)[2]*20) + ((pSample)[3]<<2) + ((pSample)[4]<<1))
+/*!
+ * \brief	5x5 weighted average (kernel above, weights sum to 64) of 8 consecutive samples, in place.
+ * \param	pSample	first of the 8 samples; two rows above/below and two columns
+ *					left/right of the run are read
+ * \param	iStride	distance between vertically adjacent samples
+ */
+void	WaverageChromaFilter8_c(uint8_t *pSample, int32_t iStride)
+{
+	// SUM_LINEn weight one window row each; their argument is parenthesized so "pRow + i" expands safely
+	const uint8_t *pRow0 = pSample - UV_WINDOWS_RADIUS * iStride - UV_WINDOWS_RADIUS;
+	const uint8_t *pRow1 = pRow0 + iStride, *pRow2 = pRow1 + iStride;
+	const uint8_t *pRow3 = pRow2 + iStride, *pRow4 = pRow3 + iStride;
+	uint8_t aFiltered[8];	// buffered so the whole run is computed before any sample is overwritten
+	for (int32_t i = 0; i < 8; i++) {
+		const int32_t iSum = SUM_LINE1(pRow0 + i) + SUM_LINE2(pRow1 + i) + SUM_LINE3(pRow2 + i)
+			+ SUM_LINE2(pRow3 + i) + SUM_LINE1(pRow4 + i);
+		aFiltered[i] = (uint8_t)(iSum >> 6);	// the weights sum to 64
+	}
+	WelsMemcpy(pSample, aFiltered, 8);
+}
+
+/***************************************************************************
+edge of y/uv use a 3x3 Gauss filter, radius = 1:
+1	2	1
+2	4	2	
+1	2	1
+***************************************************************************/
+// In-place 3x3 weighted average of the sample at pSrc (kernel above, weights sum to 16);
+// reads the 8 surrounding samples, with rows iStride apart.
+void	Gauss3x3Filter(uint8_t *pSrc, int32_t iStride)
+{
+	const uint8_t *pAbove = pSrc - iStride;
+	const uint8_t *pBelow = pSrc + iStride;
+	const int32_t iCorners = pAbove[-1] + pAbove[1] + pBelow[-1] + pBelow[1];
+	const int32_t iEdges = pAbove[0] + pSrc[-1] + pSrc[1] + pBelow[0];
+
+	// corners weigh 1, edge neighbours 2, the centre 4; >> 4 divides by the total of 16
+	*pSrc = (uint8_t)((iCorners + (iEdges << 1) + (pSrc[0] << 2)) >> 4);
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/processing/src/downsample/downsample.cpp
@@ -1,0 +1,145 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "downsample.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Keeps the CPU flags, tags the strategy as METHOD_DOWNSAMPLE and fills the kernel table.
+CDownsampling::CDownsampling(int32_t iCpuFlag) : m_iCPUFlag(iCpuFlag)
+{
+	m_eMethod = METHOD_DOWNSAMPLE;
+	WelsMemset(&m_pfDownsample, 0, sizeof(m_pfDownsample));	// clear the table before it is filled
+	InitDownsampleFuncs(m_pfDownsample, iCpuFlag);
+}
+
+// Empty: the members declared by CDownsampling are a function-pointer table and an integer.
+CDownsampling::~CDownsampling()
+{}
+
+/*!
+ * \brief	Fill the kernel table: the *_c kernels first, then (X86_ASM builds only) the
+ *			SIMD kernels for every feature bit set in iCpuFlag. The bits are checked in the
+ *			order SSE, SSE2, SSSE3, SSE4.1, so a later match overrides an earlier one.
+ */
+void CDownsampling::InitDownsampleFuncs(SDownsampleFuncs &sDownsampleFunc,  int32_t iCpuFlag)
+{
+	// pfHalfAverage is indexed by source-width alignment: 0 = x32, 1 = x16, 2 = x8, 3 = any width
+	for (int32_t iAlign = 0; iAlign < 4; iAlign++)
+		sDownsampleFunc.pfHalfAverage[iAlign] = DyadicBilinearDownsampler_c;
+	sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearFastDownsampler_c;
+	sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsampler_c;
+#if defined(X86_ASM)
+	PHalveDownsampleFunc *pHalve = sDownsampleFunc.pfHalfAverage;
+	if (iCpuFlag & WELS_CPU_SSE) {
+		pHalve[0] = DyadicBilinearDownsamplerWidthx32_sse;
+		pHalve[1] = DyadicBilinearDownsamplerWidthx16_sse;
+		pHalve[2] = DyadicBilinearDownsamplerWidthx8_sse;
+	}
+	if (iCpuFlag & WELS_CPU_SSE2) {
+		sDownsampleFunc.pfGeneralRatioLuma   = GeneralBilinearFastDownsamplerWrap_sse2;
+		sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;
+	}
+	if (iCpuFlag & WELS_CPU_SSSE3) {
+		pHalve[0] = DyadicBilinearDownsamplerWidthx32_ssse3;
+		pHalve[1] = DyadicBilinearDownsamplerWidthx16_ssse3;
+	}
+	if (iCpuFlag & WELS_CPU_SSE41) {
+		pHalve[0] = DyadicBilinearDownsamplerWidthx32_sse4;
+		pHalve[1] = DyadicBilinearDownsamplerWidthx16_sse4;
+	}
+#endif//X86_ASM
+}
+
+/*!
+ * \brief	Downsample the three planes of pSrcPixMap into pDstPixMap.
+ *			Sizes come from sRect; planes 1 and 2 use half the width and height of plane 0.
+ *			An exact 2:1 reduction takes the half-average kernel matching the source width
+ *			alignment, every other ratio the general-ratio kernels.
+ * \param	iType		not used
+ * \param	pSrcPixMap	source picture
+ * \param	pDstPixMap	destination picture, must be smaller than the source in both dimensions
+ * \return	RET_INVALIDPARAM if the destination is not smaller in width and height, else RET_SUCCESS
+ */
+EResult CDownsampling::Process(int32_t iType, SPixMap *pSrcPixMap, SPixMap *pDstPixMap)
+{
+	const int32_t iSrcWidthY  = pSrcPixMap->sRect.iRectWidth;
+	const int32_t iSrcHeightY = pSrcPixMap->sRect.iRectHeight;
+	const int32_t iDstWidthY  = pDstPixMap->sRect.iRectWidth;
+	const int32_t iDstHeightY = pDstPixMap->sRect.iRectHeight;
+
+	if (iSrcWidthY <= iDstWidthY || iSrcHeightY <= iDstHeightY)
+		return RET_INVALIDPARAM;
+
+	// true when the destination is exactly half the source in both directions
+	const bool bDyadic = (iSrcWidthY >> 1) == iDstWidthY && (iSrcHeightY >> 1) == iDstHeightY;
+
+	for (int32_t iPlane = 0; iPlane < 3; iPlane++) {
+		// plane 0 is luma at full size, planes 1 and 2 are chroma at half size
+		const int32_t iShift = (iPlane == 0) ? 0 : 1;
+		const int32_t iSrcW = iSrcWidthY >> iShift, iSrcH = iSrcHeightY >> iShift;
+		uint8_t *pDst = (uint8_t*)pDstPixMap->pPixel[iPlane];
+		uint8_t *pSrc = (uint8_t*)pSrcPixMap->pPixel[iPlane];
+		const int32_t iDstStride = pDstPixMap->iStride[iPlane];
+		const int32_t iSrcStride = pSrcPixMap->iStride[iPlane];
+
+		if (bDyadic) {
+			m_pfDownsample.pfHalfAverage[GetAlignedIndex(iSrcW)](pDst, iDstStride, pSrc, iSrcStride, iSrcW, iSrcH);
+		} else {
+			PGeneralDownsampleFunc pfGeneral = (iPlane == 0)
+				? m_pfDownsample.pfGeneralRatioLuma : m_pfDownsample.pfGeneralRatioChroma;
+			pfGeneral(pDst, iDstStride, iDstWidthY >> iShift, iDstHeightY >> iShift, pSrc, iSrcStride, iSrcW, iSrcH);
+		}
+	}
+	return RET_SUCCESS;
+}
+
+/*! \brief	Pick the pfHalfAverage slot for a source width: 0 when kiSrcWidth is a
+ *			multiple of 32, 1 for other multiples of 16, 2 for other multiples of 8,
+ *			3 for every other width. */
+int32_t CDownsampling::GetAlignedIndex( const int32_t kiSrcWidth )
+{
+	if ( (kiSrcWidth & 0x1f) == 0 )
+		return 0;
+	if ( (kiSrcWidth & 0x0f) == 0 )
+		return 1;
+	if ( (kiSrcWidth & 0x07) == 0 )
+		return 2;
+	return 3;
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/processing/src/downsample/downsample.h
@@ -1,0 +1,126 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  downsample.h
+ *
+ * \brief	    :  downsample class of wels video processor class
+ *
+ * \date        :  2011/03/33
+ *
+ * \description :  1. rewrite the package code of downsample class  
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_DOWNSAMPLE_H
+#define _WELSVP_DOWNSAMPLE_H
+
+#include "../common/util.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+typedef void (HalveDownsampleFunc)(	uint8_t* pDst, const int32_t kiDstStride,
+								   uint8_t* pSrc, const int32_t kiSrcStride,
+								   const int32_t kiSrcWidth, const int32_t kiSrcHeight );	// 2:1 kernel: no destination size is passed
+// kernel for other ratios: takes the destination size as well as the source size
+typedef void (GeneralDownsampleFunc)(uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
+									 uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight );
+// pointer types stored in SDownsampleFuncs
+typedef HalveDownsampleFunc		*PHalveDownsampleFunc;
+typedef GeneralDownsampleFunc	*PGeneralDownsampleFunc;
+// the *_c implementations, installed first by CDownsampling::InitDownsampleFuncs
+HalveDownsampleFunc   DyadicBilinearDownsampler_c;
+GeneralDownsampleFunc GeneralBilinearFastDownsampler_c;
+GeneralDownsampleFunc GeneralBilinearAccurateDownsampler_c;
+
+typedef struct {	// kernel table filled by CDownsampling::InitDownsampleFuncs
+	// index = alignment of the source width: 0 = multiple of 32; 1 = multiple of 16; 2 = multiple of 8; 3 = any other width
+	PHalveDownsampleFunc			pfHalfAverage[4];
+	PGeneralDownsampleFunc		pfGeneralRatioLuma;	// non-2:1 ratios, plane 0
+	PGeneralDownsampleFunc		pfGeneralRatioChroma;	// non-2:1 ratios, planes 1 and 2
+}SDownsampleFuncs;
+
+
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+// used when the source width is a multiple of 8 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx8_sse;
+// source width is a multiple of 16 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx16_sse;
+// source width is a multiple of 32 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_sse;
+// used when the source width is a multiple of 16 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx16_ssse3;
+// source width is a multiple of 32 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_ssse3;
+// source width is a multiple of 16 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx16_sse4;
+// source width is a multiple of 32 pixels
+HalveDownsampleFunc		DyadicBilinearDownsamplerWidthx32_sse4;
+// wrappers with the GeneralDownsampleFunc signature: they compute the scale factors and call the kernels below
+GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
+GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
+// kernels that additionally take the precomputed fixed-point factors kuiScaleX / kuiScaleY
+void GeneralBilinearFastDownsampler_sse2( uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
+	uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
+	const uint32_t kuiScaleX, const uint32_t kuiScaleY );
+void GeneralBilinearAccurateDownsampler_sse2( uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight,
+	uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
+	const uint32_t kuiScaleX, const uint32_t kuiScaleY );
+WELSVP_EXTERN_C_END
+#endif
+
+
+
+
+class CDownsampling : public IStrategy	// strategy that scales a picture down into a smaller one
+{			  
+public:
+	CDownsampling(int32_t iCpuFlag);	// iCpuFlag: WELS_CPU_* bits used to pick the kernels
+	~CDownsampling();
+	// returns RET_INVALIDPARAM unless pDst is smaller than pSrc in both width and height
+	EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *pDst);
+
+private:
+	void InitDownsampleFuncs(SDownsampleFuncs &sDownsampleFunc, int32_t iCpuFlag);
+	// slot of pfHalfAverage for a source width: 0 = x32, 1 = x16, 2 = x8, 3 = other
+	int32_t GetAlignedIndex( const int32_t kiSrcWidth );
+
+private:
+	SDownsampleFuncs m_pfDownsample;	// kernel table filled by InitDownsampleFuncs
+	int32_t  m_iCPUFlag;	// copy of the constructor's iCpuFlag
+};	
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/processing/src/downsample/downsamplefuncs.cpp
@@ -1,0 +1,241 @@
+/*!
+ * \copy
+ *     Copyright (c)  2008-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  downsample_yuv.c
+ *
+ *  Abstract
+ *      Implementation for source yuv data downsampling used before spatial encoding.
+ *
+ *  History
+ *      10/24/2008 Created
+ *
+ *****************************************************************************/
+
+#include "../common/typedef.h"
+#include "../common/util.h"
+#include "downsample.h"
+
+
+WELSVP_NAMESPACE_BEGIN
+
+
+/*!
+ * \brief	Halve a plane in both directions: every destination sample is the rounded
+ *			mean of the two rounded row means of a 2x2 source block.
+ * \param	pDst, kiDstStride	destination plane and its row stride
+ * \param	pSrc, kiSrcStride	source plane and its row stride
+ * \param	kiSrcWidth, kiSrcHeight	source size; (width >> 1) x (height >> 1) samples are written
+ */
+void DyadicBilinearDownsampler_c( uint8_t* pDst, const int32_t kiDstStride,
+						  uint8_t* pSrc, const int32_t kiSrcStride,
+						  const int32_t kiSrcWidth, const int32_t kiSrcHeight )
+{
+	const int32_t kiDstWidth  = kiSrcWidth >> 1;
+	const int32_t kiDstHeight = kiSrcHeight >> 1;
+	const uint8_t *pTop = pSrc;	// upper row of the current pair of source rows
+	for (int32_t iRow = 0; iRow < kiDstHeight; iRow++) {
+		const uint8_t *pBottom = pTop + kiSrcStride;
+		for (int32_t iCol = 0; iCol < kiDstWidth; iCol++) {
+			const int32_t kiTopMean    = (pTop[2 * iCol] + pTop[2 * iCol + 1] + 1) >> 1;
+			const int32_t kiBottomMean = (pBottom[2 * iCol] + pBottom[2 * iCol + 1] + 1) >> 1;
+			pDst[iCol] = (uint8_t)((kiTopMean + kiBottomMean + 1) >> 1);
+		}
+		pDst += kiDstStride;
+		pTop += kiSrcStride << 1;
+	}
+}
+// Bilinear rescale of one plane from kiSrcWidth x kiSrcHeight to kiDstWidth x kiDstHeight with 32-bit fixed-point weights.
+void GeneralBilinearFastDownsampler_c(uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, 
+								uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight )
+{
+	const uint32_t kuiScaleBitWidth = 16, kuiScaleBitHeight = 15;	// fractional bits of the horizontal / vertical position
+	const uint32_t kuiScaleWidth = (1 << kuiScaleBitWidth), kuiScaleHeight = (1 << kuiScaleBitHeight);
+	int32_t fScalex = (int32_t)((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiScaleWidth);	// source step per destination column, fixed point
+	int32_t fScaley = (int32_t)((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiScaleHeight);	// source step per destination row, fixed point
+	uint32_t x;
+	int32_t iYInverse, iXInverse;
+
+	uint8_t* pByDst = pDst;
+	uint8_t* pByLineDst = pDst;
+	// the vertical source position starts at one half (in fixed point); all rows but the last are interpolated
+	iYInverse = 1 << (kuiScaleBitHeight - 1);
+	for(int32_t i = 0; i < kiDstHeight - 1; i++)
+	{
+		int32_t iYy = iYInverse >> kuiScaleBitHeight;	// integer source row
+		int32_t fv = iYInverse & (kuiScaleHeight - 1);	// fractional part of the row position
+
+		uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
+
+		pByDst = pByLineDst;
+		iXInverse = 1 << (kuiScaleBitWidth - 1);	// the horizontal position starts at one half as well
+		for(int32_t j = 0; j < kiDstWidth - 1; j++)
+		{
+			int32_t iXx = iXInverse >> kuiScaleBitWidth;	// integer source column
+			int32_t iFu = iXInverse & (kuiScaleWidth - 1);	// fractional part of the column position
+
+			uint8_t* pByCurrent = pBySrc + iXx;
+			uint8_t a, b, c, d;
+			// a b / c d: the 2x2 source neighbourhood at (iXx, iYy)
+			a = *pByCurrent;
+			b = *(pByCurrent + 1 );
+			c = *(pByCurrent + kiSrcStride);
+			d = *(pByCurrent + kiSrcStride + 1 );
+			// each weight is the product of two fractions, reduced by kuiScaleBitWidth bits before it scales the sample
+			x  = (((uint32_t)(kuiScaleWidth - 1 - iFu))*(kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * a;
+			x += (((uint32_t)(iFu))*(kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * b;
+			x += (((uint32_t)(kuiScaleWidth - 1 - iFu))*(fv) >> kuiScaleBitWidth) * c;
+			x += (((uint32_t)(iFu))*(fv) >> kuiScaleBitWidth) * d;
+			x >>= (kuiScaleBitHeight - 1);	// drop all fractional bits but one ...
+			x += 1;	// ... add one and ...
+			x >>= 1;	// ... drop the last bit: rounds to nearest
+			//x = (((__int64)(SCALE_BIG - 1 - iFu))*(SCALE_BIG - 1 - fv)*a + ((__int64)iFu)*(SCALE_BIG - 1 -fv)*b + ((__int64)(SCALE_BIG - 1 -iFu))*fv*c + 
+			//		 ((__int64)iFu)*fv*d + (1 << (2*SCALE_BIT_BIG-1)) ) >> (2*SCALE_BIT_BIG);
+			x = WELS_CLAMP(x, 0, 255);
+			*pByDst++ = (uint8_t)x;
+
+			iXInverse += fScalex;
+		}
+		*pByDst = *(pBySrc + (iXInverse >> kuiScaleBitWidth));	// last column: copied from the source row without interpolation
+		pByLineDst += kiDstStride;
+		iYInverse += fScaley;
+	}
+
+	// last row: every sample is copied from the source row without interpolation
+	{
+		int32_t iYy = iYInverse >> kuiScaleBitHeight;
+		uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
+
+		pByDst = pByLineDst;
+		iXInverse = 1 << (kuiScaleBitWidth - 1);
+		for(int32_t j = 0; j < kiDstWidth; j++)
+		{
+			int32_t iXx = iXInverse >> kuiScaleBitWidth;
+			*pByDst++ = *(pBySrc + iXx);
+
+			iXInverse += fScalex;
+		}
+	}
+}
+// Bilinear rescale of one plane from kiSrcWidth x kiSrcHeight to kiDstWidth x kiDstHeight; the blend is accumulated in 64 bits.
+void GeneralBilinearAccurateDownsampler_c(uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, 
+									uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight )
+{
+	const int32_t kiScaleBit = 15;	// fractional bits of both source positions
+	const int32_t kiScale = (1 << kiScaleBit);
+	int32_t iScalex = (int32_t)((float_t)kiSrcWidth / (float_t)kiDstWidth * kiScale);	// source step per destination column, fixed point
+	int32_t iScaley = (int32_t)((float_t)kiSrcHeight / (float_t)kiDstHeight * kiScale);	// source step per destination row, fixed point
+	int64_t x;
+	int32_t iYInverse, iXInverse;
+
+	uint8_t* pByDst = pDst;
+	uint8_t* pByLineDst = pDst;
+	// the vertical source position starts at one half (in fixed point); all rows but the last are interpolated
+	iYInverse = 1 << (kiScaleBit - 1);
+	for(int32_t i = 0; i < kiDstHeight - 1; i++)
+	{
+		int32_t iYy = iYInverse >> kiScaleBit;	// integer source row
+		int32_t iFv = iYInverse & (kiScale - 1);	// fractional part of the row position
+
+		uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
+
+		pByDst = pByLineDst;
+		iXInverse = 1 << (kiScaleBit - 1);	// the horizontal position starts at one half as well
+		for(int32_t j = 0; j < kiDstWidth - 1; j++)
+		{
+			int32_t iXx = iXInverse >> kiScaleBit;	// integer source column
+			int32_t iFu = iXInverse & (kiScale - 1);	// fractional part of the column position
+
+			uint8_t* pByCurrent = pBySrc + iXx;
+			uint8_t a, b, c, d;
+			// a b / c d: the 2x2 source neighbourhood at (iXx, iYy)
+			a = *pByCurrent;
+			b = *(pByCurrent + 1 );
+			c = *(pByCurrent + kiSrcStride);
+			d = *(pByCurrent + kiSrcStride + 1 );
+			// weighted sum with 2*kiScaleBit fractional bits; the added constant is one half, so the shift rounds to nearest
+			x = (((int64_t)(kiScale - 1 - iFu))*(kiScale - 1 - iFv)*a + ((int64_t)iFu)*(kiScale - 1 -iFv)*b + ((int64_t)(kiScale - 1 -iFu))*iFv*c + 
+				((int64_t)iFu)*iFv*d + (int64_t)(1 << (2*kiScaleBit-1)) ) >> (2*kiScaleBit);
+			x = WELS_CLAMP(x, 0, 255);
+			*pByDst++ = (uint8_t)x;
+
+			iXInverse += iScalex;
+		}
+		*pByDst = *(pBySrc + (iXInverse >> kiScaleBit));	// last column: copied from the source row without interpolation
+		pByLineDst += kiDstStride;
+		iYInverse += iScaley;
+	}
+
+	// last row: every sample is copied from the source row without interpolation
+	{
+		int32_t iYy = iYInverse >> kiScaleBit;
+		uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
+
+		pByDst = pByLineDst;
+		iXInverse = 1 << (kiScaleBit - 1);
+		for(int32_t j = 0; j < kiDstWidth; j++)
+		{
+			int32_t iXx = iXInverse >> kiScaleBit;
+			*pByDst++ = *(pBySrc + iXx);
+
+			iXInverse += iScalex;
+		}
+	}
+}
+
+
+#ifdef X86_ASM
+// Computes the source/destination step in fixed point (16 fractional bits horizontally,
+// 15 vertically) and forwards everything to GeneralBilinearFastDownsampler_sse2.
+void GeneralBilinearFastDownsamplerWrap_sse2(uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, 
+						uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight)
+{
+	const uint32_t kuiOneX = 1u << 16, kuiOneY = 1u << 15;
+	const uint32_t kuiStepX = (uint32_t)((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiOneX);
+	const uint32_t kuiStepY = (uint32_t)((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiOneY);
+
+	GeneralBilinearFastDownsampler_sse2(pDst, kiDstStride, kiDstWidth, kiDstHeight,
+		pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, kuiStepX, kuiStepY);
+}
+
+// Computes the source/destination step with 15 fractional bits in both directions
+// and forwards everything to GeneralBilinearAccurateDownsampler_sse2.
+void GeneralBilinearAccurateDownsamplerWrap_sse2(uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, const int32_t kiDstHeight, 
+									uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight )
+{
+	const uint32_t kuiOne = 1u << 15;
+	const uint32_t kuiStepX = (uint32_t)((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiOne);
+	const uint32_t kuiStepY = (uint32_t)((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiOne);
+
+	GeneralBilinearAccurateDownsampler_sse2(pDst, kiDstStride, kiDstWidth, kiDstHeight,
+		pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, kuiStepX, kuiStepY);
+}
+#endif //X86_ASM
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/processing/src/imagerotate/imagerotate.cpp
@@ -1,0 +1,105 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "imagerotate.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Tags the strategy as METHOD_IMAGE_ROTATE, keeps the CPU flags and fills the rotation kernel table.
+CImageRotating::CImageRotating(int32_t iCpuFlag) {
+	m_eMethod  = METHOD_IMAGE_ROTATE;
+	m_iCPUFlag = iCpuFlag;
+	WelsMemset(&m_pfRotateImage, 0, sizeof(m_pfRotateImage));	// clear the table before it is filled
+	InitImageRotateFuncs(m_pfRotateImage, iCpuFlag);
+}
+
+// Empty: the destructor performs no cleanup of its own.
+CImageRotating::~CImageRotating()
+{}
+
+// Installs the *_c rotation kernels in the table; iCpuFlag is not consulted.
+void CImageRotating::InitImageRotateFuncs(SImageRotateFuncs &sImageRotateFuncs, int32_t iCpuFlag) {
+	sImageRotateFuncs.pfImageRotate270D = ImageRotate270D_c;
+	sImageRotateFuncs.pfImageRotate180D = ImageRotate180D_c;
+	sImageRotateFuncs.pfImageRotate90D  = ImageRotate90D_c;
+}
+// Rotate one plane with the kernel selected by iType (90, 180 or 270).
+// Returns RET_NOTSUPPORTED, without calling a kernel, for any other iType.
+// All other arguments are passed to the kernel unchanged.
+EResult CImageRotating::ProcessImageRotate(int32_t iType, uint8_t *pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t *pDst)
+{
+	switch (iType) {
+	case 90:
+		m_pfRotateImage.pfImageRotate90D(pSrc, uiBytesPerPixel, iWidth, iHeight, pDst);
+		break;
+	case 180:
+		m_pfRotateImage.pfImageRotate180D(pSrc, uiBytesPerPixel, iWidth, iHeight, pDst);
+		break;
+	case 270:
+		m_pfRotateImage.pfImageRotate270D(pSrc, uiBytesPerPixel, iWidth, iHeight, pDst);
+		break;
+	default:
+		return RET_NOTSUPPORTED;
+	}
+	return RET_SUCCESS;
+}
+
+/*!
+ * \brief	Rotate pSrc into pDst as selected by iType. RGBA, BGRA, ABGR and ARGB pictures
+ *			are rotated as one plane, I420 as three planes (planes 1 and 2 at half size).
+ * \return	result of the (last) plane rotation, or RET_NOTSUPPORTED for other formats
+ * NOTE(review): iSizeInBits*8 is passed as uiBytesPerPixel; a bits-to-bytes conversion
+ *			would divide by 8 - confirm the unit against SPixMap and the kernels.
+ */
+EResult CImageRotating::Process(int32_t iType, SPixMap *pSrc, SPixMap *pDst)
+{
+	const bool bSinglePlane = (pSrc->eFormat == VIDEO_FORMAT_RGBA) || (pSrc->eFormat == VIDEO_FORMAT_BGRA) ||
+		(pSrc->eFormat == VIDEO_FORMAT_ABGR) || (pSrc->eFormat == VIDEO_FORMAT_ARGB);
+
+	if (bSinglePlane)
+		return ProcessImageRotate(iType, (uint8_t *)pSrc->pPixel[0], pSrc->iSizeInBits*8,
+			pSrc->sRect.iRectWidth, pSrc->sRect.iRectHeight, (uint8_t *)pDst->pPixel[0]);
+	if (pSrc->eFormat != VIDEO_FORMAT_I420)
+		return RET_NOTSUPPORTED;
+
+	// I420: the status of the first two plane rotations is dropped,
+	// only the result of the third one is returned
+	ProcessImageRotate(iType, (uint8_t *)pSrc->pPixel[0], pSrc->iSizeInBits*8, pSrc->sRect.iRectWidth, pSrc->sRect.iRectHeight, (uint8_t *)pDst->pPixel[0]);
+	ProcessImageRotate(iType, (uint8_t *)pSrc->pPixel[1], pSrc->iSizeInBits*8, (pSrc->sRect.iRectWidth >> 1), (pSrc->sRect.iRectHeight >> 1), (uint8_t *)pDst->pPixel[1]);
+	return ProcessImageRotate(iType, (uint8_t *)pSrc->pPixel[2], pSrc->iSizeInBits*8, (pSrc->sRect.iRectWidth >> 1), (pSrc->sRect.iRectHeight >> 1), (uint8_t *)pDst->pPixel[2]);
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/processing/src/imagerotate/imagerotate.h
@@ -1,0 +1,84 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  imagerotate.h
+ *
+ * \brief	    :  image rotate class of wels video processor class
+ *
+ * \date        :  2011/04/06
+ *
+ * \description :  
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_IMAGEROTATE_H
+#define _WELSVP_IMAGEROTATE_H
+
+#include "../common/util.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+/*!
+ * \brief	signature shared by all rotation kernels
+ *
+ * \param	pSrc		source pixels, read as iHeight rows of iWidth pixels
+ * \param	uiBytesPerPixel	number of bytes making up one pixel
+ * \param	iWidth		source width in pixels
+ * \param	iHeight		source height in pixels
+ * \param	pDst		destination pixels, written by the kernel
+ */
+typedef void (ImageRotateFunc)( uint8_t *pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t *pDst);
+
+// pointer-to-kernel type stored in SImageRotateFuncs
+typedef ImageRotateFunc		*ImageRotateFuncPtr;
+
+// plain C kernels, one per supported angle (in degrees)
+ImageRotateFunc   ImageRotate90D_c;
+ImageRotateFunc   ImageRotate180D_c;
+ImageRotateFunc   ImageRotate270D_c;
+
+/*!
+ * \brief	table of rotation kernels, one entry per supported angle
+ */
+typedef struct {
+	ImageRotateFuncPtr		pfImageRotate90D;	// kernel used for a 90 degree rotation
+	ImageRotateFuncPtr		pfImageRotate180D;	// kernel used for a 180 degree rotation
+	ImageRotateFuncPtr		pfImageRotate270D;	// kernel used for a 270 degree rotation
+}SImageRotateFuncs;
+
+/*!
+ * \brief	image rotation strategy: rotates a picture by 90, 180 or 270 degrees
+ */
+class CImageRotating : public IStrategy
+{			  
+public:
+	CImageRotating(int32_t iCpuFlag);
+	~CImageRotating();
+
+	// Rotates pSrc into pDst; iType is the rotation angle in degrees (90, 180 or 270).
+	EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *pDst);
+
+private:
+	// Fills pf with the rotation kernels to use.
+	void InitImageRotateFuncs(SImageRotateFuncs &pf, int32_t iCpuFlag);
+	// Rotates a single plane; returns RET_NOTSUPPORTED for an angle other than 90/180/270.
+	EResult ProcessImageRotate(int32_t iType, uint8_t *pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t *pDst);
+
+private:
+	SImageRotateFuncs m_pfRotateImage;	// kernels invoked by ProcessImageRotate
+	int32_t          m_iCPUFlag;
+};	
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/processing/src/imagerotate/imagerotatefuncs.cpp
@@ -1,0 +1,75 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  imagerotatefuncs.cpp
+ *
+ *  Created on 11-2-21.
+ *
+ */
+
+#include "imagerotate.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+/*!
+ * \brief	rotate a tightly packed image by 90 degrees
+ *
+ * Source pixel (row, col) lands in destination row `col`, column `iHeight-1-row`;
+ * the destination is therefore iHeight pixels wide and iWidth pixels high
+ * (a clockwise turn when row 0 is the top of the picture).
+ *
+ * \param	pSrc		source pixels, iHeight rows of iWidth pixels
+ * \param	uiBytesPerPixel	bytes per pixel, copied together
+ * \param	iWidth		source width in pixels
+ * \param	iHeight		source height in pixels
+ * \param	pDst		destination pixels
+ */
+void ImageRotate90D_c( uint8_t *pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t *pDst)
+{
+	for (uint32_t uiRow = 0; uiRow < iHeight; ++uiRow)
+	{
+		const uint32_t uiDstCol = iHeight - 1 - uiRow;
+
+		for (uint32_t uiCol = 0; uiCol < iWidth; ++uiCol)
+		{
+			const uint32_t uiSrcByte = (iWidth * uiRow + uiCol) * uiBytesPerPixel;
+			const uint32_t uiDstByte = (uiCol * iHeight + uiDstCol) * uiBytesPerPixel;
+
+			for (uint32_t uiByte = 0; uiByte < uiBytesPerPixel; ++uiByte)
+				pDst[uiDstByte + uiByte] = pSrc[uiSrcByte + uiByte];
+		}
+	}
+}
+/*!
+ * \brief	rotate a tightly packed image by 180 degrees
+ *
+ * Source pixel (row, col) lands at (iHeight-1-row, iWidth-1-col); the
+ * destination keeps the source dimensions. Bytes inside a pixel keep their order.
+ *
+ * \param	pSrc		source pixels, iHeight rows of iWidth pixels
+ * \param	uiBytesPerPixel	bytes per pixel, copied together
+ * \param	iWidth		width in pixels
+ * \param	iHeight		height in pixels
+ * \param	pDst		destination pixels
+ */
+void ImageRotate180D_c( uint8_t *pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t *pDst)
+{
+	for (uint32_t uiRow = 0; uiRow < iHeight; ++uiRow)
+	{
+		const uint32_t uiDstRow = iHeight - 1 - uiRow;
+
+		for (uint32_t uiCol = 0; uiCol < iWidth; ++uiCol)
+		{
+			const uint32_t uiSrcByte = (iWidth * uiRow + uiCol) * uiBytesPerPixel;
+			const uint32_t uiDstByte = (uiDstRow * iWidth + iWidth - 1 - uiCol) * uiBytesPerPixel;
+
+			for (uint32_t uiByte = 0; uiByte < uiBytesPerPixel; ++uiByte)
+				pDst[uiDstByte + uiByte] = pSrc[uiSrcByte + uiByte];
+		}
+	}
+}
+/*!
+ * \brief	rotate a tightly packed image by 270 degrees
+ *
+ * Source pixel (row, col) lands in destination row `iWidth-1-col`, column `row`;
+ * the destination is therefore iHeight pixels wide and iWidth pixels high
+ * (a counter-clockwise quarter turn when row 0 is the top of the picture).
+ *
+ * \param	pSrc		source pixels, iHeight rows of iWidth pixels
+ * \param	uiBytesPerPixel	bytes per pixel, copied together
+ * \param	iWidth		source width in pixels
+ * \param	iHeight		source height in pixels
+ * \param	pDst		destination pixels
+ */
+void ImageRotate270D_c( uint8_t *pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t *pDst)
+{
+	// the outer loop walks source columns, the inner loop source rows
+	for (uint32_t uiCol = 0; uiCol < iWidth; ++uiCol)
+	{
+		const uint32_t uiDstRow = iWidth - 1 - uiCol;
+
+		for (uint32_t uiRow = 0; uiRow < iHeight; ++uiRow)
+		{
+			const uint32_t uiSrcByte = (iWidth * uiRow + uiCol) * uiBytesPerPixel;
+			const uint32_t uiDstByte = (uiDstRow * iHeight + uiRow) * uiBytesPerPixel;
+
+			for (uint32_t uiByte = 0; uiByte < uiBytesPerPixel; ++uiByte)
+				pDst[uiDstByte + uiByte] = pSrc[uiSrcByte + uiByte];
+		}
+	}
+}
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/processing/src/scenechangedetection/SceneChangeDetection.cpp
@@ -1,0 +1,146 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "SceneChangeDetection.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+// An 8x8 block whose SAD against the reference picture exceeds this value counts as a high-motion block.
+#define HIGH_MOTION_BLOCK_THRESHOLD 320
+// Fraction of the picture's 8x8 blocks that must be high-motion for a scene change to be reported.
+#define SCENE_CHANGE_MOTION_RATIO	0.85f
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*!
+ * \brief	set up a scene change detector
+ *
+ * Clears the cached result and selects the SAD kernel matching the CPU flags.
+ *
+ * \param	iCpuFlag	CPU feature flags used to choose the SAD implementation
+ */
+CSceneChangeDetection::CSceneChangeDetection(int32_t iCpuFlag)
+	: m_pfSad(NULL)
+	, m_iCpuFlag(iCpuFlag)
+{
+	m_eMethod = METHOD_SCENE_CHANGE_DETECTION;
+
+	// start from an all-zero result (no scene change reported yet)
+	WelsMemset( &m_sSceneChangeParam, 0, sizeof(m_sSceneChangeParam) );
+
+	InitSadFuncs(m_pfSad, m_iCpuFlag);
+}
+
+// Nothing to release: the destructor body is empty.
+CSceneChangeDetection::~CSceneChangeDetection()
+{	
+}
+
+/*!
+ * \brief	decide whether the current picture starts a new scene
+ *
+ * The luma plane is split into 8x8 blocks (partial blocks at the right and bottom
+ * edges are ignored). Each block is compared with the co-located block of the
+ * reference picture; a block whose SAD exceeds HIGH_MOTION_BLOCK_THRESHOLD counts
+ * as high-motion. A scene change is flagged when the number of high-motion blocks
+ * reaches SCENE_CHANGE_MOTION_RATIO of all blocks. The verdict is stored in
+ * m_sSceneChangeParam and read back through Get().
+ *
+ * \param	iType		not used by this method
+ * \param	pSrcPixMap	current picture; rectangle size, pPixel[0] and iStride[0] are read
+ * \param	pRefPixMap	reference picture; pPixel[0] and iStride[0] are read
+ *
+ * \return	RET_SUCCESS, or RET_INVALIDPARAM when a picture or its luma plane is null
+ *		(the stored result is left untouched in that case)
+ */
+EResult CSceneChangeDetection::Process(int32_t iType, SPixMap *pSrcPixMap, SPixMap *pRefPixMap)
+{
+	if (pSrcPixMap == NULL || pRefPixMap == NULL)
+	{
+		return RET_INVALIDPARAM;
+	}
+
+	uint8_t *pRefY = (uint8_t *)pRefPixMap->pPixel[0];
+	uint8_t *pCurY = (uint8_t *)pSrcPixMap->pPixel[0];
+	if (pRefY == NULL || pCurY == NULL)
+	{
+		return RET_INVALIDPARAM;
+	}
+
+	const int32_t iWidth          = pSrcPixMap->sRect.iRectWidth;
+	const int32_t iHeight         = pSrcPixMap->sRect.iRectHeight;
+	const int32_t iBlock8x8Width  = iWidth  >> 3;
+	const int32_t iBlock8x8Height = iHeight >> 3;
+	const int32_t iBlock8x8Num    = iBlock8x8Width * iBlock8x8Height;
+	// NOTE(review): with no complete 8x8 block the threshold evaluates to 0 and the
+	// picture is always flagged -- confirm whether that is intended.
+	const int32_t iSceneChangeThreshold = WelsStaticCast(int32_t, SCENE_CHANGE_MOTION_RATIO * iBlock8x8Num + 0.5f + PESN);
+
+	const int32_t iRefStride = pRefPixMap->iStride[0];
+	const int32_t iCurStride = pSrcPixMap->iStride[0];
+	// distance between two consecutive rows of 8x8 blocks
+	const int32_t iRefRowStride = pRefPixMap->iStride[0] << 3;
+	const int32_t iCurRowStride = pSrcPixMap->iStride[0] << 3;
+
+	int32_t iMotionBlockNum = 0;
+
+	m_sSceneChangeParam.bSceneChangeFlag = 0;
+
+	for (int32_t j = 0; j < iBlock8x8Height; j ++ )
+	{
+		uint8_t *pRefTmp = pRefY;
+		uint8_t *pCurTmp = pCurY;
+
+		for (int32_t i = 0; i < iBlock8x8Width; i++ )
+		{
+			const int32_t iBlockSad = m_pfSad(pRefTmp, iRefStride, pCurTmp, iCurStride);
+
+			iMotionBlockNum += (iBlockSad > HIGH_MOTION_BLOCK_THRESHOLD);
+
+			pRefTmp += 8;
+			pCurTmp += 8;
+		}
+
+		pRefY += iRefRowStride;
+		pCurY += iCurRowStride;
+	}
+
+	if ( iMotionBlockNum >= iSceneChangeThreshold )
+	{
+		m_sSceneChangeParam.bSceneChangeFlag = 1;
+	}
+
+	return RET_SUCCESS;
+}
+
+
+/*!
+ * \brief	copy the result of the last Process() call to the caller
+ *
+ * \param	iType	not used by this method
+ * \param	pParam	destination, written as an SSceneChangeResult
+ *
+ * \return	RET_SUCCESS, or RET_INVALIDPARAM when pParam is null
+ */
+EResult CSceneChangeDetection::Get(int32_t iType, void *pParam)
+{
+	SSceneChangeResult *pResult = (SSceneChangeResult *)pParam;
+
+	if (pResult != NULL)
+	{
+		*pResult = m_sSceneChangeParam;
+		return RET_SUCCESS;
+	}
+
+	return RET_INVALIDPARAM;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+
+/*!
+ * \brief	select the 8x8 SAD implementation
+ *
+ * \param	pfSad		receives the chosen kernel
+ * \param	iCpuFlag	CPU feature flags; only consulted when X86_ASM is defined
+ */
+void CSceneChangeDetection::InitSadFuncs(SadFuncPtr &pfSad,  int32_t iCpuFlag)
+{
+#ifdef X86_ASM
+	// prefer the assembly kernel when the CPU reports SSE2
+	if ( iCpuFlag & WELS_CPU_SSE2 )
+	{
+		pfSad = WelsSampleSad8x8_sse21;
+		return;
+	}
+#endif
+
+	// portable C fallback
+	pfSad = WelsSampleSad8x8_c;
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/processing/src/scenechangedetection/SceneChangeDetection.h
@@ -1,0 +1,73 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+* \file	        :  SceneChangeDetection.h
+*
+* \brief	    :  scene change detection class of wels video processor class
+*
+* \date         :  2011/03/14
+*
+* \description  :  1. rewrite the package code of scene change detection class  
+*
+*************************************************************************************
+*/
+
+#ifndef _WELSVP_SCENECHANGEDETECTION_H
+#define _WELSVP_SCENECHANGEDETECTION_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+#include "SceneChangeDetectionCommon.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+/*!
+ * \brief	scene change detection strategy based on 8x8 block SAD against a reference picture
+ */
+class CSceneChangeDetection : public IStrategy
+{			  
+public:
+	CSceneChangeDetection(int32_t iCpuFlag);
+	~CSceneChangeDetection();
+
+	// Compares pSrc with pRef and stores the verdict for a later Get().
+	EResult Process(int32_t iType, SPixMap *pSrc, SPixMap *pRef);
+	// Copies the stored verdict into pParam (an SSceneChangeResult).
+	EResult Get(int32_t iType, void *pParam);
+
+private:
+	// Picks the SAD kernel according to the CPU flags.
+	void InitSadFuncs(SadFuncPtr &pfSadFunc, int32_t iCpuFlag);
+
+private:
+	SadFuncPtr m_pfSad;				// 8x8 SAD kernel in use
+	int32_t    m_iCpuFlag;				// CPU feature flags given at construction
+	SSceneChangeResult m_sSceneChangeParam;		// verdict of the last Process() call
+};	
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/processing/src/scenechangedetection/SceneChangeDetectionCommon.cpp
@@ -1,0 +1,62 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "SceneChangeDetectionCommon.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+/*!
+ * \brief	sum of absolute differences between two 8x8 blocks
+ *
+ * \param	pSrcY		top-left sample of the first block
+ * \param	iSrcStrideY	distance between two rows of the first block
+ * \param	pRefY		top-left sample of the second block
+ * \param	iRefStrideY	distance between two rows of the second block
+ *
+ * \return	the accumulated absolute difference over the 64 sample pairs
+ */
+int32_t WelsSampleSad8x8_c( uint8_t * pSrcY, int32_t iSrcStrideY, uint8_t * pRefY, int32_t iRefStrideY )
+{
+	int32_t iSadSum = 0;
+	uint8_t* pRowA = pSrcY;
+	uint8_t* pRowB = pRefY;
+
+	for (int32_t iRow = 0; iRow < 8; ++iRow)
+	{
+		for (int32_t iCol = 0; iCol < 8; ++iCol)
+		{
+			iSadSum += WELS_ABS( ( pRowA[iCol] - pRowB[iCol] ) );
+		}
+
+		// step both blocks down one row
+		pRowA += iSrcStrideY;
+		pRowB += iRefStrideY;
+	}
+
+	return iSadSum;
+} 
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/processing/src/scenechangedetection/SceneChangeDetectionCommon.h
@@ -1,0 +1,65 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	        :  SceneChangeDetectionCommon.h
+ *
+ * \brief	    :  scene change detection class of wels video processor class
+ *
+ * \date         :  2011/03/14
+ *
+ * \description  :  1. rewrite the package code of scene change detection class  
+ *
+ */
+
+#ifndef _WELSVP_SCENECHANGEDETECTIONCOMMON_H
+#define _WELSVP_SCENECHANGEDETECTIONCOMMON_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+/*!
+ * \brief	signature of a block SAD kernel
+ *
+ * \param	pSrcY		top-left sample of the first block
+ * \param	iSrcStrideY	row stride of the first block
+ * \param	pRefY		top-left sample of the second block
+ * \param	iRefStrideY	row stride of the second block
+ *
+ * \return	sum of absolute differences between the two blocks
+ */
+typedef  int32_t (SadFunc) ( uint8_t * pSrcY, int32_t iSrcStrideY, uint8_t * pRefY, int32_t iRefStrideY );
+
+// pointer-to-kernel type used to store the selected implementation
+typedef SadFunc  * SadFuncPtr;
+
+// portable C implementation for 8x8 blocks
+SadFunc      WelsSampleSad8x8_c;
+
+#ifdef X86_ASM
+// 8x8 kernel declared with C linkage; selected when the CPU reports SSE2
+WELSVP_EXTERN_C_BEGIN
+SadFunc      WelsSampleSad8x8_sse21;
+WELSVP_EXTERN_C_END
+#endif
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/processing/src/testbed/WelsVideoProcessor.cpp
@@ -1,0 +1,464 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+// WelsVideoProcessor.cpp : Defines the entry point for the console application.
+//
+
+#include <tchar.h>
+#include "stdafx.h"
+#include "wels_process.h"
+
+//////////////////////////////////////////////////////////////////////////
+// Everything the test bed needs for one run: the two files, the two pictures
+// and the table of processing methods to apply.
+typedef struct
+{
+	FILE    *srcfile;	// input file, opened for binary reading by the -i option
+	FILE    *dstfile;	// output file, opened for binary writing by the -o option
+	vPixMap  src;		// picture read from srcfile
+	vPixMap  dst;		// picture written to dstfile
+	vMethods methods[vMethods_Mask];	// indexed by method id; an enabled slot holds its own id
+} VpConfigure;
+//////////////////////////////////////////////////////////////////////////
+
+/*!
+ * \brief	print the command line usage, preceded by an error message or a banner
+ *
+ * \param	strAppName	program name inserted into the usage lines
+ * \param	strError	error text to print first; when NULL a banner is printed instead
+ */
+void PrintHelp(TCHAR *strAppName, TCHAR *strError)
+{
+	if (strError)
+	{
+		_tprintf(_T("Error: %s\n"), strError);
+	} 
+	else 
+	{
+		_tprintf(_T("Welsvp Sample Console\n"));
+	}
+
+	_tprintf(_T("Usage1: %s [Options] -i InputFile -o OutputFile -w 640 -h 480\n"), strAppName);
+	_tprintf(_T("Options: \n"));
+
+	// source picture options
+	_tprintf(_T("   [-sx  x]       - cropX  of src video (def: 0)\n"));
+	_tprintf(_T("   [-sy  y]       - cropY  of src video (def: 0)\n"));
+	_tprintf(_T("   [-sw  width]   - cropW  of src video (def: width)\n"));
+	_tprintf(_T("   [-sh  height]  - cropH  of src video (def: height)\n"));
+	_tprintf(_T("   [-scc format]  - format (FourCC) of src video (def: support yv12|yuy2|rgb3|rgb4)\n"));
+
+	// destination picture options
+	_tprintf(_T("   [-dx  x]       - cropX  of dst video (def: 0)\n"));
+	_tprintf(_T("   [-dy  y]       - cropY  of dst video (def: 0)\n"));
+	_tprintf(_T("   [-dw  width]   - cropW  of dst video (def: width)\n"));
+	_tprintf(_T("   [-dh  height]  - cropH  of dst video (def: height)\n"));
+	_tprintf(_T("   [-dcc format]  - format (FourCC) of dst video (def: nv12. support nv12|yuy2)\n"));
+
+	// algorithm switches
+	_tprintf(_T("   Video Processing Algorithms\n"));
+	_tprintf(_T("   [-vaa]         - enable video analysis algorithm \n"));
+	_tprintf(_T("   [-bgd]         - enable background detection algorithm \n"));
+	_tprintf(_T("   [-scd]         - enable scene change detection algorithm \n"));
+	_tprintf(_T("   [-denoise]     - enable denoise algorithm \n"));
+	_tprintf(_T("   [-downsample]  - enable downsample algorithm \n"));
+
+	_tprintf(_T("   [-n frames]    - number of frames to VP process\n\n"));
+	_tprintf(_T("\n"));
+
+	// worked example
+	_tprintf(_T("Usage2: %s -sw 640 -sh 480 -scc rgb3 -dw 320 -dh 240 -dcc i420 -denoise -vaa -i in.rgb -o out.yuv\n"), strAppName);
+	_tprintf(_T("\n"));
+} 
+
+/*!
+ * \brief	translate a format name given on the command line into a vVideoFormat
+ *
+ * Matching is case sensitive. "rgb3" and "rgb4", the spellings shown by the usage
+ * text, are accepted as aliases of "rgb24" and "rgb32".
+ *
+ * \param	strInput	format name; may be NULL
+ *
+ * \return	the matching format, or vVideoFormat_I420 when the name is NULL or unknown
+ */
+vVideoFormat Str2FourCC( TCHAR* strInput )
+{
+	static const struct
+	{
+		const TCHAR  *strName;
+		vVideoFormat  eFormat;
+	} kFormatNames[] =
+	{
+		{ _T("yv12"),  vVideoFormat_YV12  },
+		{ _T("i420"),  vVideoFormat_I420  },
+		{ _T("rgb24"), vVideoFormat_RGB24 },
+		{ _T("rgb3"),  vVideoFormat_RGB24 },	// spelling advertised by the help text
+		{ _T("rgb32"), vVideoFormat_RGB32 },
+		{ _T("rgb4"),  vVideoFormat_RGB32 },	// spelling advertised by the help text
+		{ _T("yuy2"),  vVideoFormat_YUY2  },
+		{ _T("nv12"),  vVideoFormat_NV12  },
+	};
+
+	if (strInput)
+	{
+		for (size_t k = 0; k < sizeof(kFormatNames) / sizeof(kFormatNames[0]); k++)
+		{
+			if ( 0 == _tcscmp(strInput, kFormatNames[k].strName) )
+				return kFormatNames[k].eFormat;
+		}
+	}
+
+	return vVideoFormat_I420; // as default
+}
+
+/*!
+ * \brief	read one complete picture from a file into pixmap.pPixel[0]
+ *
+ * The amount of data depends on the format: 3/2 samples per pixel for I420/YV12,
+ * 2 for YUY2, 3 for RGB24 and 4 for RGB32, each sample being nSizeInBits/8 bytes.
+ *
+ * \param	pixmap	destination picture; format, rectangle size and sample size are read
+ * \param	fp	file to read from
+ *
+ * \return	0 when a whole picture was read; 1 on end of file, on a truncated
+ *		picture, on an unsupported format or on invalid arguments
+ */
+int ReadFile(vPixMap &pixmap, FILE *fp)
+{
+	if (fp == NULL || pixmap.pPixel[0] == NULL)
+		return 1;
+
+	const int size        = pixmap.Rect.width * pixmap.Rect.height;
+	const int sampleBytes = pixmap.nSizeInBits / 8;
+	int samples           = 0;
+
+	switch (pixmap.eFormat)
+	{
+	case vVideoFormat_I420:
+	case vVideoFormat_YV12:
+		samples = (3 * size) >> 1;
+		break;
+	case vVideoFormat_YUY2:
+		samples = 2 * size;
+		break;
+	case vVideoFormat_RGB24:
+		samples = 3 * size;
+		break;
+	case vVideoFormat_RGB32:
+		samples = 4 * size;
+		break;
+	default:
+		return 1;
+	}
+
+	if (samples <= 0 || sampleBytes <= 0)
+		return 1;
+
+	// fread returns the number of complete items read; anything short of a full
+	// picture (including a truncated last frame) is reported as a failure.
+	const size_t got = fread(pixmap.pPixel[0], (size_t)sampleBytes, (size_t)samples, fp);
+	return (got == (size_t)samples) ? 0 : 1;
+}
+
+/*!
+ * \brief	write one complete picture from pixmap.pPixel[0] to a file
+ *
+ * The amount of data depends on the format: 3/2 samples per pixel for I420/YV12,
+ * 2 for YUY2, 3 for RGB24 and 4 for RGB32, each sample being nSizeInBits/8 bytes.
+ *
+ * \param	pixmap	source picture; format, rectangle size and sample size are read
+ * \param	fp	file to write to
+ *
+ * \return	0 when the whole picture was written; 1 on a short write, on an
+ *		unsupported format or on invalid arguments
+ */
+int WriteFile(vPixMap &pixmap, FILE *fp)
+{
+	if (fp == NULL || pixmap.pPixel[0] == NULL)
+		return 1;
+
+	const int size        = pixmap.Rect.width * pixmap.Rect.height;
+	const int sampleBytes = pixmap.nSizeInBits / 8;
+	int samples           = 0;
+
+	switch (pixmap.eFormat)
+	{
+	case vVideoFormat_I420:
+	case vVideoFormat_YV12:
+		samples = (3 * size) >> 1;
+		break;
+	case vVideoFormat_YUY2:
+		samples = 2 * size;
+		break;
+	case vVideoFormat_RGB24:
+		samples = 3 * size;
+		break;
+	case vVideoFormat_RGB32:
+		samples = 4 * size;
+		break;
+	default:
+		return 1;
+	}
+
+	if (samples <= 0 || sampleBytes <= 0)
+		return 1;
+
+	// fwrite returns the number of complete items written; a short write
+	// (for instance a full disk) must not be reported as success.
+	const size_t put = fwrite(pixmap.pPixel[0], (size_t)sampleBytes, (size_t)samples, fp);
+	return (put == (size_t)samples) ? 0 : 1;
+}
+
+
+/*!
+ * \brief	allocate the pixel buffer of a picture and set up its strides and plane pointers
+ *
+ * The sample size is set to 8 bits. I420/YV12 get one buffer holding the luma plane
+ * followed by the two chroma planes; YUY2, RGB24 and RGB32 get a single packed plane
+ * of stride * height bytes, and their pPixel[1]/pPixel[2] are set to NULL.
+ * The buffer comes from malloc() and must be released with free().
+ *
+ * \param	pixmap	picture to set up; eFormat and Rect.width/height must be filled in
+ *
+ * \return	0 on success; 1 on an unsupported format, a non-positive size or
+ *		an allocation failure
+ */
+int AllocPixMap(vPixMap &pixmap)
+{
+	pixmap.nSizeInBits = sizeof(unsigned char) * 8;
+
+	if (pixmap.Rect.width <= 0 || pixmap.Rect.height <= 0)
+		return 1;
+
+	const size_t sampleBytes = pixmap.nSizeInBits / 8;
+	const size_t rows        = (size_t)pixmap.Rect.height;
+
+	switch (pixmap.eFormat)
+	{
+	case vVideoFormat_I420:
+	case vVideoFormat_YV12:
+		{
+			pixmap.nStride[0]  = pixmap.Rect.width;
+			pixmap.nStride[1]  = pixmap.nStride[2]  = pixmap.Rect.width / 2;
+
+			// luma plane plus two quarter-size chroma planes in one buffer
+			const size_t lumaBytes = (size_t)pixmap.nStride[0] * rows * sampleBytes;
+			pixmap.pPixel[0]   = malloc(lumaBytes * 3 / 2);
+			if (pixmap.pPixel[0])
+			{
+				pixmap.pPixel[1]   = (unsigned char *)pixmap.pPixel[0] + lumaBytes;
+				pixmap.pPixel[2]   = (unsigned char *)pixmap.pPixel[0] + lumaBytes * 5 / 4;
+			}
+			else
+			{
+				pixmap.pPixel[1]   = pixmap.pPixel[2] = NULL;
+			}
+		}
+		break;
+
+	case vVideoFormat_YUY2:
+		{
+			// the stride already accounts for the 2 bytes per pixel
+			pixmap.nStride[0]  = pixmap.nStride[1]  = pixmap.nStride[2]  = pixmap.Rect.width * 2;
+			pixmap.pPixel[0]   = malloc((size_t)pixmap.nStride[0] * rows * sampleBytes);
+			pixmap.pPixel[1]   = pixmap.pPixel[2] = NULL;
+		}
+		break;
+
+	case vVideoFormat_RGB24:
+		{
+			// the stride already accounts for the 3 bytes per pixel
+			pixmap.nStride[0]  = pixmap.nStride[1]  = pixmap.nStride[2]  = pixmap.Rect.width * 3;
+			pixmap.pPixel[0]   = malloc((size_t)pixmap.nStride[0] * rows * sampleBytes);
+			pixmap.pPixel[1]   = pixmap.pPixel[2] = NULL;
+		}
+		break;
+
+	case vVideoFormat_RGB32:
+		{
+			// the stride already accounts for the 4 bytes per pixel
+			pixmap.nStride[0]  = pixmap.nStride[1]  = pixmap.nStride[2]  = pixmap.Rect.width * 4;
+			pixmap.pPixel[0]   = malloc((size_t)pixmap.nStride[0] * rows * sampleBytes);
+			pixmap.pPixel[1]   = pixmap.pPixel[2] = NULL;
+		}
+		break;
+
+	default:
+		return 1;
+	}
+
+	// malloc reports failure with NULL, so this check is meaningful
+	return (pixmap.pPixel[0]) ? 0 : 1;
+}
+
+/*!
+ * \brief	release the pixel buffer of a picture
+ *
+ * Only pPixel[0] is passed to free(); all three plane pointers are then reset to
+ * NULL. A picture whose pPixel[0] is already NULL is left untouched, so calling
+ * this twice is harmless.
+ *
+ * \param	pixmap	picture whose buffer is released
+ */
+void FreePixMap(vPixMap &pixmap)
+{
+	if (!pixmap.pPixel[0])
+		return;
+
+	free(pixmap.pPixel[0]);
+
+	pixmap.pPixel[2] = NULL;
+	pixmap.pPixel[1] = NULL;
+	pixmap.pPixel[0] = NULL;
+}
+
+/*!
+ * \brief	check the parsed configuration and allocate both pictures
+ *
+ * Both files must be open. A destination size left at 0 takes the source size,
+ * and the colour space conversion method is always enabled.
+ *
+ * \param	strAppName	program name, used when the usage text is printed
+ * \param	cfg		configuration to validate and complete
+ *
+ * \return	0 on success; 1 on failure, after releasing whatever picture buffer
+ *		had been allocated (the files are left open)
+ */
+int InitResource(TCHAR *strAppName, VpConfigure &cfg)
+{
+	bool bReady = false;
+
+	if (0 == cfg.srcfile)
+	{
+		PrintHelp(strAppName, _T("Source file can not found!\n"));
+	}
+	else if (0 == cfg.dstfile)
+	{
+		PrintHelp(strAppName, _T("Destination file name not found"));
+	}
+	else
+	{
+		// destination defaults to the source size
+		if (cfg.dst.Rect.width == 0)
+			cfg.dst.Rect.width = cfg.src.Rect.width;
+		if (cfg.dst.Rect.height == 0)
+			cfg.dst.Rect.height = cfg.src.Rect.height;
+
+		cfg.methods[vMethods_ColorSpaceConvert] = vMethods_ColorSpaceConvert;
+
+		// the destination is only allocated once the source allocation succeeded
+		bReady = (0 == AllocPixMap(cfg.src)) && (0 == AllocPixMap(cfg.dst));
+	}
+
+	if (bReady)
+		return 0;
+
+	FreePixMap(cfg.src);
+	FreePixMap(cfg.dst);
+	return 1;
+}
+
+// Tells whether a command line option must be followed by a value token.
+static bool OptionTakesValue(const TCHAR *strOption)
+{
+	static const TCHAR *const kValueOptions[] =
+	{
+		_T("-i"),  _T("-o"),  _T("-w"),  _T("-h"),
+		_T("-sx"), _T("-sy"), _T("-sw"), _T("-sh"), _T("-scc"),
+		_T("-dx"), _T("-dy"), _T("-dw"), _T("-dh"), _T("-dcc")
+	};
+
+	for (size_t k = 0; k < sizeof(kValueOptions) / sizeof(kValueOptions[0]); k++)
+	{
+		if (0 == _tcscmp(strOption, kValueOptions[k]))
+			return true;
+	}
+	return false;
+}
+
+/*!
+ * \brief	parse the command line into a configuration and allocate its resources
+ *
+ * -w/-h give a size used for whichever of the source and destination sizes was
+ * not set explicitly with -sw/-sh/-dw/-dh. Unknown options are ignored.
+ *
+ * \param	strInput	argument vector; strInput[0] is the program name
+ * \param	nArgNum		number of entries in strInput
+ * \param	cfg		configuration to fill in
+ *
+ * \return	0 on success; 1 when too few arguments were given, when an option
+ *		lacks its value, or when InitResource() fails
+ */
+int ParseCommond(TCHAR* strInput[], int nArgNum, VpConfigure &cfg)
+{
+	if (nArgNum < 9)
+	{
+		PrintHelp(strInput[0], _T("please specify all necessary parameters!"));
+		return 1;
+	}
+
+	int width = 0, height = 0;
+	for (int i = 1; i < nArgNum; i++ )
+	{
+		TCHAR *strOption = strInput[i];
+		if (!strOption)
+			continue;
+
+		// Fetch the value up front so that an option given as the very last
+		// argument can never make us read past the end of the argument list.
+		TCHAR *strValue = NULL;
+		if (OptionTakesValue(strOption))
+		{
+			if (i + 1 >= nArgNum || strInput[i + 1] == NULL)
+			{
+				PrintHelp(strInput[0], _T("option is missing its value!"));
+				return 1;
+			}
+			strValue = strInput[++i];
+		}
+
+		if ( 0 == _tcscmp(strOption, _T("-i")) )
+		{
+			_tfopen_s(&cfg.srcfile, strValue, _T("rb"));
+		}
+		else if (0 == _tcscmp(strOption, _T("-o")))
+		{
+			_tfopen_s(&cfg.dstfile, strValue, _T("wb"));
+		}
+		else if (0 == _tcscmp(strOption, _T("-w")))
+		{
+			_stscanf_s(strValue, _T("%d"), &width);
+		}
+		else if (0 == _tcscmp(strOption, _T("-h")))
+		{
+			_stscanf_s(strValue, _T("%d"), &height);
+		}
+		//-----------------------------------------------------------------------------------
+		// source rectangle: x is the horizontal offset (left), y the vertical one (top)
+		else if (0 == _tcscmp(strOption, _T("-sx")))
+		{
+			_stscanf_s(strValue, _T("%hd"), &cfg.src.Rect.left);
+		}
+		else if (0 == _tcscmp(strOption, _T("-sy")))
+		{
+			_stscanf_s(strValue, _T("%hd"), &cfg.src.Rect.top);
+		}
+		else if (0 == _tcscmp(strOption, _T("-sw")))
+		{
+			_stscanf_s(strValue, _T("%hd"), &cfg.src.Rect.width);
+		}
+		else if (0 == _tcscmp(strOption, _T("-sh")))
+		{
+			_stscanf_s(strValue, _T("%hd"), &cfg.src.Rect.height);
+		}
+		else if (0 == _tcscmp(strOption, _T("-scc")))
+		{
+			cfg.src.eFormat = Str2FourCC( strValue );
+		}
+		//-----------------------------------------------------------------------------------
+		// destination rectangle, same conventions as the source
+		else if (0 == _tcscmp(strOption, _T("-dx")))
+		{
+			_stscanf_s(strValue, _T("%hd"), &cfg.dst.Rect.left);
+		}
+		else if (0 == _tcscmp(strOption, _T("-dy")))
+		{
+			_stscanf_s(strValue, _T("%hd"), &cfg.dst.Rect.top);
+		}
+		else if (0 == _tcscmp(strOption, _T("-dw")))
+		{
+			_stscanf_s(strValue, _T("%hd"), &cfg.dst.Rect.width);
+		}
+		else if (0 == _tcscmp(strOption, _T("-dh")))
+		{
+			_stscanf_s(strValue, _T("%hd"), &cfg.dst.Rect.height);
+		}
+		else if (0 == _tcscmp(strOption, _T("-dcc")))
+		{
+			cfg.dst.eFormat = Str2FourCC( strValue );
+		}
+		//-----------------------------------------------------------------------------------
+		else if (0 == _tcscmp(strOption, _T("-denoise")))
+		{
+			cfg.methods[vMethods_Denoise] = vMethods_Denoise;
+		}
+		else if (0 == _tcscmp(strOption, _T("-scd")))
+		{
+			cfg.methods[vMethods_SceneChangeDetection] = vMethods_SceneChangeDetection;
+		}
+		// -downsample, -vaa, -bgd and -aq are accepted but have no effect,
+		// like any other unrecognised option.
+	}
+
+	if (cfg.src.Rect.width == 0)  cfg.src.Rect.width  = width;
+	if (cfg.src.Rect.height == 0) cfg.src.Rect.height = height;
+	if (cfg.dst.Rect.width == 0)  cfg.dst.Rect.width  = width;
+	if (cfg.dst.Rect.height == 0) cfg.dst.Rect.height = height;
+
+	return InitResource(strInput[0], cfg);
+}
+
+/*!
+ * \brief	entry point of the test bed
+ *
+ * Parses the command line, then for every picture of the input file runs the
+ * colour space conversion followed by the denoise step and writes the result to
+ * the output file. Reading stops at the first picture that cannot be read in full,
+ * which is how the end of the input is detected.
+ *
+ * \return	0 when the run completed; non-zero when the command line was invalid,
+ *		the processor could not be set up, a processing step failed or a
+ *		picture could not be written
+ */
+int _tmain(int argc, _TCHAR* argv[])
+{
+	int   ret           = 0;
+	VpConfigure cfg     = {0};
+	IWelsVpPlugin *pVpp = NULL;
+
+	ret = ParseCommond(argv, argc, cfg);
+	if (ret == 0)
+	{
+		// presumably the plugin reports a set-up failure through ret -- verify against IWelsVpPlugin
+		pVpp = new IWelsVpPlugin(ret);
+	}
+
+	if (pVpp && ret == 0)
+	{
+		vResult vret = vRet_Success;
+		while (1)
+		{
+			if (feof(cfg.srcfile))
+				break;
+
+			// a failed read is the normal way out at the end of the input
+			if (ReadFile(cfg.src, cfg.srcfile))
+				break;
+
+			vret = pVpp->Process(cfg.methods[vMethods_ColorSpaceConvert], &cfg.src, &cfg.dst);
+			if (vret)
+			{
+				ret = 1;
+				break;
+			}
+
+			vret = pVpp->Process(cfg.methods[vMethods_Denoise], &cfg.dst, NULL);
+			if (vret)
+			{
+				ret = 1;
+				break;
+			}
+
+			if (WriteFile(cfg.dst, cfg.dstfile))
+			{
+				ret = 1;
+				break;
+			}
+		}
+	}
+
+	if (pVpp)
+	{
+		delete pVpp;
+		pVpp = NULL;
+	}
+
+	if (cfg.srcfile)
+		fclose(cfg.srcfile);
+	if (cfg.dstfile)
+		fclose(cfg.dstfile);
+
+	FreePixMap(cfg.src);
+	FreePixMap(cfg.dst);
+
+	// report failures to the caller instead of always exiting with 0
+	return ret;
+}
+
--- /dev/null
+++ b/processing/src/testbed/bundleloader.h
@@ -1,0 +1,95 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef WELS_BOUNDLELOAD_H
+#define WELS_BOUNDLELOAD_H
+
+#if defined(MACOS)
+
+#include <dlfcn.h>
+#include <string>
+
+/*!
+ * \brief	Creates a CFBundle object for the bundle located at \a lpBundlePath.
+ *
+ * The bundle's executable is not loaded explicitly here.
+ *
+ * \param	lpBundlePath	C string naming the bundle; NULL is rejected
+ *
+ * \return	the bundle reference (release it with FreeBundle), or NULL when the path
+ *			is NULL or any Core Foundation object could not be created.
+ */
+CFBundleRef LoadBundle(const char* lpBundlePath)
+{
+	if(lpBundlePath == NULL)
+	{
+		return NULL;
+	}
+
+	// 1.build the bundle URL
+	CFStringRef bundlePath = CFStringCreateWithCString(kCFAllocatorSystemDefault, lpBundlePath, CFStringGetSystemEncoding());
+	if(NULL == bundlePath)
+	{
+		return NULL;
+	}
+
+	// NOTE(review): lpBundlePath is interpreted as a URL string here; for plain file-system
+	// paths CFURLCreateWithFileSystemPath may be the better fit - confirm with callers.
+	CFURLRef bundleURL = CFURLCreateWithString(kCFAllocatorSystemDefault, bundlePath, NULL);
+	// the string was created by this function, so it must be released on every path
+	CFRelease(bundlePath);
+	if(NULL == bundleURL)
+	{
+		return NULL;
+	}
+
+	// 2.get bundle ref
+	CFBundleRef bundleRef = CFBundleCreate(kCFAllocatorSystemDefault, bundleURL);
+	CFRelease(bundleURL);
+
+	return bundleRef;
+}
+
+/*!
+ * \brief	Drops the reference held on a bundle obtained from LoadBundle.
+ *
+ * \param	bundleRef	bundle to release; NULL is accepted and ignored
+ *
+ * \return	always TRUE
+ */
+Boolean FreeBundle(CFBundleRef bundleRef)
+{
+	if(NULL == bundleRef)
+	{
+		return TRUE;
+	}
+
+	// only the reference is released; the executable is not unloaded explicitly
+	CFRelease(bundleRef);
+	return TRUE;
+}
+
+/*!
+ * \brief	Looks up an exported function of a bundle by name.
+ *
+ * \param	bundleRef	bundle returned by LoadBundle
+ * \param	lpProcName	C string with the function name
+ *
+ * \return	the function address, or NULL when either argument is NULL, the name
+ *			cannot be converted to a CFString, or the lookup yields nothing.
+ */
+void* GetProcessAddress(CFBundleRef bundleRef, const char* lpProcName)
+{
+	if(NULL == bundleRef || NULL == lpProcName)
+	{
+		return NULL;
+	}
+
+	CFStringRef cfProcName = CFStringCreateWithCString(kCFAllocatorSystemDefault, lpProcName, CFStringGetSystemEncoding());
+	if(NULL == cfProcName)
+	{
+		// CFRelease must not be called with NULL, so bail out before the lookup
+		return NULL;
+	}
+
+	void *processAddress = CFBundleGetFunctionPointerForName(bundleRef, cfProcName);
+	CFRelease(cfProcName);
+	return processAddress;
+}
+#endif
+
+#endif
\ No newline at end of file
--- /dev/null
+++ b/processing/src/testbed/stdafx.cpp
@@ -1,0 +1,8 @@
+// stdafx.cpp : source file that includes just the standard includes
+// WelsVideoProcessor.pch will be the pre-compiled header
+// stdafx.obj will contain the pre-compiled type information
+
+#include "stdafx.h"
+
+// TODO: reference any additional headers you need in STDAFX.H
+// and not in this file
--- /dev/null
+++ b/processing/src/testbed/stdafx.h
@@ -1,0 +1,20 @@
+// stdafx.h : include file for standard system include files,
+// or project specific include files that are used frequently, but
+// are changed infrequently
+//
+
+#ifndef _WELSVP_STDAFX_H
+#define _WELSVP_STDAFX_H
+
+#include "targetver.h"
+
+#if defined (WIN32)
+#include <windows.h>
+#include <tchar.h>
+#else
+#include <string.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+
+#endif
--- /dev/null
+++ b/processing/src/testbed/targetver.h
@@ -1,0 +1,16 @@
+#ifndef _WELSVP_TARGETVER_H
+#define _WELSVP_TARGETVER_H
+
+// The following macros define the minimum required platform.  The minimum required platform
+// is the earliest version of Windows, Internet Explorer etc. that has the necessary features to run 
+// your application.  The macros work by enabling all features available on platform versions up to and 
+// including the version specified.
+
+// Modify the following defines if you have to target a platform prior to the ones specified below.
+// Refer to MSDN for the latest info on corresponding values for different platforms.
+#ifndef _WIN32_WINNT            // Specifies that the minimum required platform is Windows Vista.
+#define _WIN32_WINNT 0x0600     // Change this to the appropriate value to target other versions of Windows.
+#endif
+
+#endif
+
--- /dev/null
+++ b/processing/src/testbed/wels_process.cpp
@@ -1,0 +1,195 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <windows.h>
+#include "wels_process.h"
+#include "bundleloader.h"
+
+// entry API declaration
+typedef vResult (WELSAPI *pfnCreateVpInterface)  (void **, int );
+typedef vResult (WELSAPI *pfnDestroyVpInterface) (void * , int );
+
+////////////////////////////////////////////////////////
+/*!
+ * \brief	Loads the WelsVP processing library with the platform's dynamic loader.
+ *
+ * WIN32 tries "WelsVP.dll" and then "../WelsVP.dll"; MACOS opens "WelsVP.bundle";
+ * UNIX opens "WelsVP.so" with lazy binding.
+ *
+ * \return	an opaque module handle for queryfunc / freelib, or NULL when loading
+ *			failed or no supported platform macro is defined.
+ */
+void *loadlib()
+{
+	// stays NULL on platforms that match none of the branches below, so the
+	// function still compiles there and callers see an ordinary load failure
+	void *shModule = NULL;
+
+#if defined(WIN32)
+	HMODULE hDll = LoadLibraryA("WelsVP.dll");
+	if (hDll == NULL)
+		hDll = LoadLibraryA("../WelsVP.dll");
+	shModule = (void *)hDll;
+
+#elif defined(MACOS)
+	const char WelsVPLib[] = "WelsVP.bundle";
+	shModule = (void *)LoadBundle(WelsVPLib);
+
+#elif defined(UNIX)
+	const char WelsVPLib[] = "WelsVP.so";
+	shModule = dlopen(WelsVPLib, RTLD_LAZY);
+#endif
+
+	return shModule;
+}
+
+/*!
+ * \brief	Releases a module handle previously returned by loadlib.
+ *
+ * \param	lib	opaque module handle; NULL is ignored
+ */
+void freelib(void *lib)
+{
+	if (NULL == lib)
+		return;
+
+#if defined(WIN32)
+	FreeLibrary((HMODULE)lib);
+#elif defined(MACOS)
+	FreeBundle((CFBundleRef)lib);
+#elif defined(UNIX)
+	dlclose(lib);
+#endif
+}
+
+/*!
+ * \brief	Resolves an exported symbol of a module returned by loadlib.
+ *
+ * \param	lib		opaque module handle
+ * \param	name	symbol name
+ *
+ * \return	whatever the platform lookup call returns, or NULL when no supported
+ *			platform macro is defined.
+ */
+void *queryfunc(void *lib, const char *name)
+{
+#if defined(WIN32)
+	return (void *)GetProcAddress((HMODULE)lib, name);
+#elif defined(MACOS)
+	return (void *)GetProcessAddress((CFBundleRef)lib, name);
+#elif defined(UNIX)
+	return (void *)dlsym(lib, name);
+#else
+	return NULL;
+#endif
+}
+
+/*!
+ * \brief	Loads the processing library, resolves its two entry points and asks it
+ *			to create the IWelsVP interface.
+ *
+ * \param	ret	[out] 0 when the interface object was created; 1 when the library
+ *				could not be loaded, an entry point is missing, or no interface
+ *				object was handed back.
+ */
+IWelsVpPlugin::IWelsVpPlugin(int &ret)
+: flag(0)
+, ivp(NULL)
+, hlib(NULL)
+{
+	pfnCreateVpInterface  pCreateVpInterface  = NULL;
+	pfnDestroyVpInterface pDestroyVpInterface = NULL;
+	iface[0] = iface[1] = NULL;
+
+	// assume failure until every step below has succeeded
+	ret = 1;
+
+	hlib  = loadlib();
+	if (!hlib)
+		return;
+
+	pCreateVpInterface  = (pfnCreateVpInterface)  queryfunc(hlib, ("CreateVpInterface"));
+	pDestroyVpInterface = (pfnDestroyVpInterface) queryfunc(hlib, ("DestroyVpInterface"));
+	if (!pCreateVpInterface || !pDestroyVpInterface)
+		return;
+
+	// remembered so the destructor can call the matching destroy entry point
+	iface[0] = (void *) pCreateVpInterface;
+	iface[1] = (void *) pDestroyVpInterface;
+
+	pCreateVpInterface((void **)&ivp, WELSVP_INTERFACE_VERION);
+	// test the created object; the iface array itself can never compare equal to NULL
+	if (!ivp)
+		return;
+
+	ret = 0;
+}
+
+/*!
+ * \brief	Destroys the interface object through the library's destroy entry point
+ *			(when one was created) and then unloads the library.
+ */
+IWelsVpPlugin::~IWelsVpPlugin()
+{
+	if (hlib)
+	{
+		pfnDestroyVpInterface pDestroyVpInterface = (pfnDestroyVpInterface) iface[1];
+		// ivp is still NULL when construction failed before the interface was created
+		if (pDestroyVpInterface && ivp)
+			pDestroyVpInterface((void *)ivp, WELSVP_INTERFACE_VERION);
+
+		// the entry points die with the library, so drop every pointer into it
+		ivp = NULL;
+		iface[0] = iface[1] = NULL;
+
+		freelib(hlib);
+		hlib = NULL;
+	}
+}
+
+/*!
+ * \brief	Forwards Init to the loaded interface.
+ *
+ * \param	nType	method identifier; must be > 0
+ * \param	pCfg	configuration passed through unchanged
+ *
+ * \return	the interface's result, or vRet_NotSupport when the plugin is not
+ *			usable or nType is not positive.
+ */
+vResult IWelsVpPlugin::Init (int nType, void *pCfg)
+{
+	// hlib may be set while ivp is still NULL (entry point lookup or creation failed)
+	if (!hlib || !ivp || nType <= 0)
+		return vRet_NotSupport;
+	return ivp->Init(nType, pCfg);
+}
+
+/*!
+ * \brief	Forwards Uninit to the loaded interface.
+ *
+ * \param	nType	method identifier; must be > 0
+ *
+ * \return	the interface's result, or vRet_NotSupport when the plugin is not
+ *			usable or nType is not positive.
+ */
+vResult IWelsVpPlugin::Uninit (int nType)
+{
+	// hlib may be set while ivp is still NULL (entry point lookup or creation failed)
+	if (!hlib || !ivp || nType <= 0)
+		return vRet_NotSupport;
+	return ivp->Uninit(nType);
+}
+
+/*!
+ * \brief	Forwards Flush to the loaded interface.
+ *
+ * \param	nType	method identifier; must be > 0
+ *
+ * \return	the interface's result, or vRet_NotSupport when the plugin is not
+ *			usable or nType is not positive.
+ */
+vResult IWelsVpPlugin::Flush (int nType)
+{
+	// hlib may be set while ivp is still NULL (entry point lookup or creation failed)
+	if (!hlib || !ivp || nType <= 0)
+		return vRet_NotSupport;
+	return ivp->Flush(nType);
+}
+
+/*!
+ * \brief	Forwards Process to the loaded interface.
+ *
+ * \param	nType	method identifier; must be > 0
+ * \param	src		source pixel map, passed through unchanged
+ * \param	dst		destination pixel map, passed through unchanged
+ *
+ * \return	the interface's result, or vRet_NotSupport when the plugin is not
+ *			usable or nType is not positive.
+ */
+vResult IWelsVpPlugin::Process (int nType, vPixMap *src, vPixMap *dst)
+{
+	// hlib may be set while ivp is still NULL (entry point lookup or creation failed)
+	if (!hlib || !ivp || nType <= 0)
+		return vRet_NotSupport;
+	return ivp->Process(nType, src, dst);
+}
+
+/*!
+ * \brief	Forwards Get to the loaded interface.
+ *
+ * \param	nType	method identifier; must be > 0
+ * \param	pParam	parameter block, passed through unchanged
+ *
+ * \return	the interface's result, or vRet_NotSupport when the plugin is not
+ *			usable or nType is not positive.
+ */
+vResult IWelsVpPlugin::Get (int nType, void *pParam)
+{
+	// hlib may be set while ivp is still NULL (entry point lookup or creation failed)
+	if (!hlib || !ivp || nType <= 0)
+		return vRet_NotSupport;
+	return ivp->Get(nType, pParam);
+}
+
+/*!
+ * \brief	Forwards Set to the loaded interface.
+ *
+ * \param	nType	method identifier; must be > 0
+ * \param	pParam	parameter block, passed through unchanged
+ *
+ * \return	the interface's result, or vRet_NotSupport when the plugin is not
+ *			usable or nType is not positive.
+ */
+vResult IWelsVpPlugin::Set (int nType, void *pParam)
+{
+	// hlib may be set while ivp is still NULL (entry point lookup or creation failed)
+	if (!hlib || !ivp || nType <= 0)
+		return vRet_NotSupport;
+	return ivp->Set(nType, pParam);
+}
+
+/*!
+ * \brief	Forwards SpecialFeature to the loaded interface.
+ *
+ * \param	nType	method identifier; must be > 0
+ * \param	pIn		input block, passed through unchanged
+ * \param	pOut	output block, passed through unchanged
+ *
+ * \return	the interface's result, or vRet_NotSupport when the plugin is not
+ *			usable or nType is not positive.
+ */
+vResult IWelsVpPlugin::SpecialFeature (int nType, void *pIn, void *pOut)
+{
+	// hlib may be set while ivp is still NULL (entry point lookup or creation failed)
+	if (!hlib || !ivp || nType <= 0)
+		return vRet_NotSupport;
+	return ivp->SpecialFeature(nType, pIn, pOut);
+}
\ No newline at end of file
--- /dev/null
+++ b/processing/src/testbed/wels_process.h
@@ -1,0 +1,79 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	wels_process.h
+ *
+ * \brief	interface of video pre-process plugins
+ *
+ * \date	03/21/2011
+ *
+ * \description : this class is designed as a single interface that unifies the video
+ *                pre-processing implementations, such as denoise, colorspace conversion, etc.
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELS_PREPROCESS_H
+#define WELS_PREPROCESS_H
+
+#include "../../interface/IWelsVP.h"
+
+/*!
+ * \brief	Wrapper around the dynamically loaded video-processing library.
+ *
+ * The constructor loads the library and creates its IWelsVP interface; the destructor
+ * destroys the interface and unloads the library. Every processing call is forwarded
+ * to the interface and yields vRet_NotSupport when forwarding is not possible.
+ */
+class IWelsVpPlugin
+{
+public:
+	/* ret receives 0 on success and 1 when the plugin could not be set up */
+	IWelsVpPlugin(int &ret);
+	~IWelsVpPlugin();
+
+	enum
+	{
+		STATE_BEFOREENC = 0, /* before picture encoding */
+		STATE_AFTERENC       /* after picture encoded */
+	};
+
+public:
+	/* thin forwarders to the IWelsVP method of the same name */
+	vResult Init    (int nType, void *pCfg); 
+	vResult Uninit  (int nType);
+	vResult Flush   (int nType);
+	vResult Process (int nType, vPixMap *src, vPixMap *dst); 
+	vResult Get     (int nType, void *pParam); 
+	vResult Set     (int nType, void *pParam); 
+	vResult SpecialFeature (int nType, void *pIn, void *pOut);
+
+	/* user-defined flag; only stored and handed back by this class */
+	void SetFlag(int a)   { flag = a; }
+	void GetFlag(int &a)  { a = flag; }
+
+private:
+	/* The object owns hlib and ivp, so a copy would destroy the interface and unload
+	 * the library twice. Copying is therefore declared but intentionally not defined. */
+	IWelsVpPlugin(const IWelsVpPlugin &);
+	IWelsVpPlugin &operator=(const IWelsVpPlugin &);
+
+private:
+	int      flag;      /* value kept for SetFlag / GetFlag */
+	IWelsVP  *ivp;      /* interface object created by the library, NULL if unavailable */
+	void     *hlib;     /* module handle from loadlib(), NULL if loading failed */
+	void     *iface[2]; /* [0] create entry point, [1] destroy entry point */
+};
+
+#endif
\ No newline at end of file
--- /dev/null
+++ b/processing/src/vaacalc/vaacalcfuncs.cpp
@@ -1,0 +1,655 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "../common/typedef.h"
+#include "../common/util.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+/*!
+ * \brief	Per-macroblock SAD, pixel sum, sum of squares and sum of squared differences.
+ *
+ * The picture is walked as (iPicWidth >> 4) x (iPicHeight >> 4) macroblocks of 16x16
+ * pixels in raster order, each split into four 8x8 blocks visited in the order
+ * top-left, top-right, bottom-left, bottom-right.
+ *
+ * \param	pCurData		current picture
+ * \param	pRefData		reference picture
+ * \param	iPicWidth		picture width in pixels
+ * \param	iPicHeight		picture height in pixels
+ * \param	iPicStride		distance between two rows, used for both pictures
+ * \param	pFrameSad		[out] sum of all 8x8 SAD values
+ * \param	pSad8x8			[out] SAD per 8x8 block, four entries per macroblock
+ * \param	pSum16x16		[out] sum of the current pixels per macroblock
+ * \param	psqsum16x16		[out] sum of the squared current pixels per macroblock
+ * \param	psqdiff16x16	[out] sum of the squared differences per macroblock
+ */
+void VAACalcSadSsd_c(uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride, 
+						int32_t *pFrameSad, int32_t *pSad8x8, int32_t *pSum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+{
+	const int32_t iMbWidth   = (iPicWidth >> 4);
+	const int32_t iMbHeight  = (iPicHeight >> 4);
+	const int32_t iStrideX8  = iPicStride << 3;
+	const int32_t iStrideX16 = iPicStride << 4;
+	uint8_t *pRefRow = pRefData;
+	uint8_t *pCurRow = pCurData;
+	int32_t iMbIndex = 0;
+
+	*pFrameSad = 0;
+	for (int32_t i = 0; i < iMbHeight; i ++)
+	{
+		// Every macroblock row restarts from its own row pointer, so a width that is
+		// not a multiple of 16 can no longer shift the rows that follow.
+		uint8_t *pRefMb = pRefRow;
+		uint8_t *pCurMb = pCurRow;
+
+		for (int32_t j = 0; j < iMbWidth; j ++)
+		{
+			pSum16x16[iMbIndex] = 0;
+			psqsum16x16[iMbIndex] = 0;
+			psqdiff16x16[iMbIndex] = 0;
+
+			for (int32_t iBlk = 0; iBlk < 4; iBlk ++)
+			{
+				// bit 1 selects the lower half, bit 0 the right half of the macroblock
+				const int32_t iOffset = (iBlk >> 1) * iStrideX8 + ((iBlk & 1) << 3);
+				uint8_t *pCurLine = pCurMb + iOffset;
+				uint8_t *pRefLine = pRefMb + iOffset;
+				int32_t iSad = 0, iSqDiff = 0, iSum = 0, iSqSum = 0;
+
+				for (int32_t k = 0; k < 8; k ++)
+				{
+					for (int32_t l = 0; l < 8; l ++)
+					{
+						const int32_t iDiff = WELS_ABS(pCurLine[l] - pRefLine[l]);
+						iSad    += iDiff;
+						iSqDiff += iDiff * iDiff;
+						iSum    += pCurLine[l];
+						iSqSum  += pCurLine[l] * pCurLine[l];
+					}
+					pCurLine += iPicStride;
+					pRefLine += iPicStride;
+				}
+
+				*pFrameSad += iSad;
+				pSad8x8[(iMbIndex << 2) + iBlk] = iSad;
+				pSum16x16[iMbIndex] += iSum;
+				psqsum16x16[iMbIndex] += iSqSum;
+				psqdiff16x16[iMbIndex] += iSqDiff;
+			}
+
+			pRefMb += 16;
+			pCurMb += 16;
+			++iMbIndex;
+		}
+		pRefRow += iStrideX16;
+		pCurRow += iStrideX16;
+	}
+}
+/*!
+ * \brief	Per-macroblock SAD, pixel sum and sum of squares.
+ *
+ * The picture is walked as (iPicWidth >> 4) x (iPicHeight >> 4) macroblocks of 16x16
+ * pixels in raster order, each split into four 8x8 blocks visited in the order
+ * top-left, top-right, bottom-left, bottom-right.
+ *
+ * \param	pCurData		current picture
+ * \param	pRefData		reference picture
+ * \param	iPicWidth		picture width in pixels
+ * \param	iPicHeight		picture height in pixels
+ * \param	iPicStride		distance between two rows, used for both pictures
+ * \param	pFrameSad		[out] sum of all 8x8 SAD values
+ * \param	pSad8x8			[out] SAD per 8x8 block, four entries per macroblock
+ * \param	pSum16x16		[out] sum of the current pixels per macroblock
+ * \param	psqsum16x16		[out] sum of the squared current pixels per macroblock
+ */
+void VAACalcSadVar_c(uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride, 
+						int32_t *pFrameSad, int32_t *pSad8x8, int32_t *pSum16x16, int32_t *psqsum16x16)
+{
+	const int32_t iMbWidth   = (iPicWidth >> 4);
+	const int32_t iMbHeight  = (iPicHeight >> 4);
+	const int32_t iStrideX8  = iPicStride << 3;
+	const int32_t iStrideX16 = iPicStride << 4;
+	uint8_t *pRefRow = pRefData;
+	uint8_t *pCurRow = pCurData;
+	int32_t iMbIndex = 0;
+
+	*pFrameSad = 0;
+	for (int32_t i = 0; i < iMbHeight; i ++)
+	{
+		// Every macroblock row restarts from its own row pointer, so a width that is
+		// not a multiple of 16 can no longer shift the rows that follow.
+		uint8_t *pRefMb = pRefRow;
+		uint8_t *pCurMb = pCurRow;
+
+		for (int32_t j = 0; j < iMbWidth; j ++)
+		{
+			pSum16x16[iMbIndex] = 0;
+			psqsum16x16[iMbIndex] = 0;
+
+			for (int32_t iBlk = 0; iBlk < 4; iBlk ++)
+			{
+				// bit 1 selects the lower half, bit 0 the right half of the macroblock
+				const int32_t iOffset = (iBlk >> 1) * iStrideX8 + ((iBlk & 1) << 3);
+				uint8_t *pCurLine = pCurMb + iOffset;
+				uint8_t *pRefLine = pRefMb + iOffset;
+				int32_t iSad = 0, iSum = 0, iSqSum = 0;
+
+				for (int32_t k = 0; k < 8; k ++)
+				{
+					for (int32_t l = 0; l < 8; l ++)
+					{
+						const int32_t iDiff = WELS_ABS(pCurLine[l] - pRefLine[l]);
+						iSad   += iDiff;
+						iSum   += pCurLine[l];
+						iSqSum += pCurLine[l] * pCurLine[l];
+					}
+					pCurLine += iPicStride;
+					pRefLine += iPicStride;
+				}
+
+				*pFrameSad += iSad;
+				pSad8x8[(iMbIndex << 2) + iBlk] = iSad;
+				pSum16x16[iMbIndex] += iSum;
+				psqsum16x16[iMbIndex] += iSqSum;
+			}
+
+			pRefMb += 16;
+			pCurMb += 16;
+			++iMbIndex;
+		}
+		pRefRow += iStrideX16;
+		pCurRow += iStrideX16;
+	}
+}
+
+
+/*!
+ * \brief	SAD between the current and the reference picture per 8x8 block.
+ *
+ * The picture is walked as (iPicWidth >> 4) x (iPicHeight >> 4) macroblocks of 16x16
+ * pixels in raster order, each split into four 8x8 blocks visited in the order
+ * top-left, top-right, bottom-left, bottom-right.
+ *
+ * \param	pCurData	current picture
+ * \param	pRefData	reference picture
+ * \param	iPicWidth	picture width in pixels
+ * \param	iPicHeight	picture height in pixels
+ * \param	iPicStride	distance between two rows, used for both pictures
+ * \param	pFrameSad	[out] sum of all 8x8 SAD values
+ * \param	pSad8x8		[out] SAD per 8x8 block, four entries per macroblock
+ */
+void VAACalcSad_c(uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride, 
+						int32_t *pFrameSad, int32_t *pSad8x8)
+{
+	const int32_t iMbWidth   = (iPicWidth >> 4);
+	const int32_t iMbHeight  = (iPicHeight >> 4);
+	const int32_t iStrideX8  = iPicStride << 3;
+	const int32_t iStrideX16 = iPicStride << 4;
+	uint8_t *pRefRow = pRefData;
+	uint8_t *pCurRow = pCurData;
+	int32_t iMbIndex = 0;
+
+	*pFrameSad = 0;
+	for (int32_t i = 0; i < iMbHeight; i ++)
+	{
+		// Every macroblock row restarts from its own row pointer, so a width that is
+		// not a multiple of 16 can no longer shift the rows that follow.
+		uint8_t *pRefMb = pRefRow;
+		uint8_t *pCurMb = pCurRow;
+
+		for (int32_t j = 0; j < iMbWidth; j ++)
+		{
+			for (int32_t iBlk = 0; iBlk < 4; iBlk ++)
+			{
+				// bit 1 selects the lower half, bit 0 the right half of the macroblock
+				const int32_t iOffset = (iBlk >> 1) * iStrideX8 + ((iBlk & 1) << 3);
+				uint8_t *pCurLine = pCurMb + iOffset;
+				uint8_t *pRefLine = pRefMb + iOffset;
+				int32_t iSad = 0;
+
+				for (int32_t k = 0; k < 8; k ++)
+				{
+					for (int32_t l = 0; l < 8; l ++)
+					{
+						iSad += WELS_ABS(pCurLine[l] - pRefLine[l]);
+					}
+					pCurLine += iPicStride;
+					pRefLine += iPicStride;
+				}
+
+				*pFrameSad += iSad;
+				pSad8x8[(iMbIndex << 2) + iBlk] = iSad;
+			}
+
+			pRefMb += 16;
+			pCurMb += 16;
+			++iMbIndex;
+		}
+		pRefRow += iStrideX16;
+		pCurRow += iStrideX16;
+	}
+}
+
+/*!
+ * \brief	Per-macroblock SAD / sum / sum of squares / squared differences, plus the
+ *			signed difference sum and the maximum absolute difference per 8x8 block.
+ *
+ * The picture is walked as (iPicWidth >> 4) x (iPicHeight >> 4) macroblocks of 16x16
+ * pixels in raster order, each split into four 8x8 blocks visited in the order
+ * top-left, top-right, bottom-left, bottom-right.
+ *
+ * \param	pCurData		current picture
+ * \param	pRefData		reference picture
+ * \param	iPicWidth		picture width in pixels
+ * \param	iPicHeight		picture height in pixels
+ * \param	iPicStride		distance between two rows, used for both pictures
+ * \param	pFrameSad		[out] sum of all 8x8 SAD values
+ * \param	pSad8x8			[out] SAD per 8x8 block, four entries per macroblock
+ * \param	pSum16x16		[out] sum of the current pixels per macroblock
+ * \param	psqsum16x16		[out] sum of the squared current pixels per macroblock
+ * \param	psqdiff16x16	[out] sum of the squared differences per macroblock
+ * \param	pSd8x8			[out] sum of (current - reference) per 8x8 block
+ * \param	pMad8x8			[out] largest absolute difference per 8x8 block
+ */
+void VAACalcSadSsdBgd_c(uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride, 
+							int32_t *pFrameSad, int32_t *pSad8x8, int32_t *pSum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16, int32_t *pSd8x8, uint8_t *pMad8x8)
+
+{
+	const int32_t iMbWidth   = (iPicWidth >> 4);
+	const int32_t iMbHeight  = (iPicHeight >> 4);
+	const int32_t iStrideX8  = iPicStride << 3;
+	const int32_t iStrideX16 = iPicStride << 4;
+	uint8_t *pRefRow = pRefData;
+	uint8_t *pCurRow = pCurData;
+	int32_t iMbIndex = 0;
+
+	*pFrameSad = 0;
+	for (int32_t i = 0; i < iMbHeight; i ++)
+	{
+		// Every macroblock row restarts from its own row pointer, so a width that is
+		// not a multiple of 16 can no longer shift the rows that follow.
+		uint8_t *pRefMb = pRefRow;
+		uint8_t *pCurMb = pCurRow;
+
+		for (int32_t j = 0; j < iMbWidth; j ++)
+		{
+			pSum16x16[iMbIndex] = 0;
+			psqsum16x16[iMbIndex] = 0;
+			psqdiff16x16[iMbIndex] = 0;
+
+			for (int32_t iBlk = 0; iBlk < 4; iBlk ++)
+			{
+				// bit 1 selects the lower half, bit 0 the right half of the macroblock
+				const int32_t iOffset = (iBlk >> 1) * iStrideX8 + ((iBlk & 1) << 3);
+				uint8_t *pCurLine = pCurMb + iOffset;
+				uint8_t *pRefLine = pRefMb + iOffset;
+				int32_t iSd = 0, iMad = 0, iSad = 0, iSqDiff = 0, iSum = 0, iSqSum = 0;
+
+				for (int32_t k = 0; k < 8; k ++)
+				{
+					for (int32_t l = 0; l < 8; l ++)
+					{
+						const int32_t iDiff    = pCurLine[l] - pRefLine[l];
+						const int32_t iAbsDiff = WELS_ABS(iDiff);
+
+						iSd += iDiff;
+						if (iAbsDiff > iMad)
+						{
+							iMad = iAbsDiff;
+						}
+						iSad    += iAbsDiff;
+						iSqDiff += iAbsDiff * iAbsDiff;
+						iSum    += pCurLine[l];
+						iSqSum  += pCurLine[l] * pCurLine[l];
+					}
+					pCurLine += iPicStride;
+					pRefLine += iPicStride;
+				}
+
+				*pFrameSad += iSad;
+				pSad8x8[(iMbIndex << 2) + iBlk] = iSad;
+				pSum16x16[iMbIndex] += iSum;
+				psqsum16x16[iMbIndex] += iSqSum;
+				psqdiff16x16[iMbIndex] += iSqDiff;
+				pSd8x8[(iMbIndex << 2) + iBlk] = iSd;
+				pMad8x8[(iMbIndex << 2) + iBlk] = iMad;
+			}
+
+			pRefMb += 16;
+			pCurMb += 16;
+			++iMbIndex;
+		}
+		pRefRow += iStrideX16;
+		pCurRow += iStrideX16;
+	}
+}
+
+/*!
+ * \brief	SAD, signed difference sum and maximum absolute difference per 8x8 block.
+ *
+ * The picture is walked as (iPicWidth >> 4) x (iPicHeight >> 4) macroblocks of 16x16
+ * pixels in raster order, each split into four 8x8 blocks visited in the order
+ * top-left, top-right, bottom-left, bottom-right.
+ *
+ * \param	pCurData	current picture
+ * \param	pRefData	reference picture
+ * \param	iPicWidth	picture width in pixels
+ * \param	iPicHeight	picture height in pixels
+ * \param	iPicStride	distance between two rows, used for both pictures
+ * \param	pFrameSad	[out] sum of all 8x8 SAD values
+ * \param	pSad8x8		[out] SAD per 8x8 block, four entries per macroblock
+ * \param	pSd8x8		[out] sum of (current - reference) per 8x8 block
+ * \param	pMad8x8		[out] largest absolute difference per 8x8 block
+ */
+void VAACalcSadBgd_c(uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride, 
+						int32_t *pFrameSad, int32_t *pSad8x8, int32_t *pSd8x8, uint8_t *pMad8x8)
+{
+	const int32_t iMbWidth   = (iPicWidth >> 4);
+	const int32_t iMbHeight  = (iPicHeight >> 4);
+	const int32_t iStrideX8  = iPicStride << 3;
+	const int32_t iStrideX16 = iPicStride << 4;
+	uint8_t *pRefRow = pRefData;
+	uint8_t *pCurRow = pCurData;
+	int32_t iMbIndex = 0;
+
+	*pFrameSad = 0;
+	for (int32_t i = 0; i < iMbHeight; i ++)
+	{
+		// Every macroblock row restarts from its own row pointer, so a width that is
+		// not a multiple of 16 can no longer shift the rows that follow.
+		uint8_t *pRefMb = pRefRow;
+		uint8_t *pCurMb = pCurRow;
+
+		for (int32_t j = 0; j < iMbWidth; j ++)
+		{
+			for (int32_t iBlk = 0; iBlk < 4; iBlk ++)
+			{
+				// bit 1 selects the lower half, bit 0 the right half of the macroblock
+				const int32_t iOffset = (iBlk >> 1) * iStrideX8 + ((iBlk & 1) << 3);
+				uint8_t *pCurLine = pCurMb + iOffset;
+				uint8_t *pRefLine = pRefMb + iOffset;
+				int32_t iMad = 0, iSd = 0, iSad = 0;
+
+				for (int32_t k = 0; k < 8; k ++)
+				{
+					for (int32_t l = 0; l < 8; l ++)
+					{
+						const int32_t iDiff    = pCurLine[l] - pRefLine[l];
+						const int32_t iAbsDiff = WELS_ABS(iDiff);
+
+						iSd  += iDiff;
+						iSad += iAbsDiff;
+						if (iAbsDiff > iMad)
+						{
+							iMad = iAbsDiff;
+						}
+					}
+					pCurLine += iPicStride;
+					pRefLine += iPicStride;
+				}
+
+				*pFrameSad += iSad;
+				pSad8x8[(iMbIndex << 2) + iBlk] = iSad;
+				pSd8x8[(iMbIndex << 2) + iBlk] = iSd;
+				pMad8x8[(iMbIndex << 2) + iBlk] = iMad;
+			}
+
+			pRefMb += 16;
+			pCurMb += 16;
+			++iMbIndex;
+		}
+		pRefRow += iStrideX16;
+		pCurRow += iStrideX16;
+	}
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/processing/src/vaacalc/vaacalculation.cpp
@@ -1,0 +1,139 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "vaacalculation.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Builds the VAA statistics strategy and binds the kernels matching iCpuFlag.
+CVAACalculation::CVAACalculation(int32_t iCpuFlag)
+	: m_iCPUFlag(iCpuFlag)
+{
+	m_eMethod = METHOD_VAA_STATISTICS;
+	WelsMemset(&m_sVaaFuncs, 0, sizeof(m_sVaaFuncs));
+	WelsMemset(&m_sCalcParam, 0, sizeof(m_sCalcParam));	// stays zeroed until Set() stores parameters
+	InitVaaFuncs(m_sVaaFuncs, iCpuFlag);
+}
+
+// Nothing to release here: this file never allocates on behalf of the object.
+CVAACalculation::~CVAACalculation()
+{}
+
+// Binds the C kernels first, then swaps in the SSE2 kernels when X86_ASM is built and iCpuFlag has WELS_CPU_SSE2.
+void CVAACalculation::InitVaaFuncs(SVaaFuncs &sVaaFuncs, int32_t iCpuFlag)
+{
+	sVaaFuncs.pfVAACalcSadBgd		= VAACalcSadBgd_c;
+	sVaaFuncs.pfVAACalcSadSsdBgd	= VAACalcSadSsdBgd_c;
+	sVaaFuncs.pfVAACalcSad			= VAACalcSad_c;
+	sVaaFuncs.pfVAACalcSadVar		= VAACalcSadVar_c;
+	sVaaFuncs.pfVAACalcSadSsd		= VAACalcSadSsd_c;
+#ifdef X86_ASM
+	if (WELS_CPU_SSE2 == (iCpuFlag & WELS_CPU_SSE2)) {
+		sVaaFuncs.pfVAACalcSadBgd		= VAACalcSadBgd_sse2;
+		sVaaFuncs.pfVAACalcSadSsdBgd	= VAACalcSadSsdBgd_sse2;
+		sVaaFuncs.pfVAACalcSad			= VAACalcSad_sse2;
+		sVaaFuncs.pfVAACalcSadVar		= VAACalcSadVar_sse2;
+		sVaaFuncs.pfVAACalcSadSsd		= VAACalcSadSsd_sse2;
+	}
+#endif//X86_ASM
+}
+
+// Computes VAA statistics for the source luma plane against the reference luma
+// plane and writes them into the result structure registered through Set().
+// iType is not used. Returns RET_INVALIDPARAM when a pix map, a luma plane
+// pointer or the result structure is NULL; returns RET_SUCCESS otherwise.
+EResult CVAACalculation::Process(int32_t iType, SPixMap *pSrcPixMap, SPixMap *pRefPixMap)
+{
+	// Check everything that gets dereferenced below. pCalcResult is still NULL
+	// when Process() runs before a successful Set(), because the constructor
+	// zero-fills m_sCalcParam.
+	SVAACalcResult *pResult = m_sCalcParam.pCalcResult;
+	if (pSrcPixMap == NULL || pRefPixMap == NULL || pResult == NULL) {
+		return RET_INVALIDPARAM;
+	}
+
+	uint8_t *pCurData	= (uint8_t *)pSrcPixMap->pPixel[0];
+	uint8_t *pRefData	= (uint8_t *)pRefPixMap->pPixel[0];
+	if (pCurData == NULL || pRefData == NULL) {
+		return RET_INVALIDPARAM;
+	}
+
+	// Dimensions and stride are read from the source map only.
+	const int32_t iPicWidth		= pSrcPixMap->sRect.iRectWidth;
+	const int32_t iPicHeight	= pSrcPixMap->sRect.iRectHeight;
+	const int32_t iPicStride	= pSrcPixMap->iStride[0];
+
+	pResult->pCurY = pCurData;
+	pResult->pRefY = pRefData;
+
+	// iCalcBgd picks the *Bgd kernels; otherwise iCalcSsd wins over iCalcVar.
+	if (m_sCalcParam.iCalcBgd) {
+		if (m_sCalcParam.iCalcSsd) {
+			m_sVaaFuncs.pfVAACalcSadSsdBgd(pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+				(int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16, pResult->pSsd16x16,
+				(int32_t*)pResult->pSumOfDiff8x8, (uint8_t*)pResult->pMad8x8);
+		} else {
+			m_sVaaFuncs.pfVAACalcSadBgd(pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+				(int32_t*)(pResult->pSad8x8), (int32_t*)(pResult->pSumOfDiff8x8), (uint8_t*)pResult->pMad8x8);
+		}
+	} else if (m_sCalcParam.iCalcSsd) {
+		m_sVaaFuncs.pfVAACalcSadSsd(pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+			(int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16, pResult->pSsd16x16);
+	} else if (m_sCalcParam.iCalcVar) {
+		m_sVaaFuncs.pfVAACalcSadVar(pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+			(int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16);
+	} else {
+		m_sVaaFuncs.pfVAACalcSad(pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+			(int32_t*)pResult->pSad8x8);
+	}
+
+	return RET_SUCCESS;
+}
+
+// Stores a copy of the SVAACalcParam that pParam points to; iType is not used.
+// Returns RET_INVALIDPARAM when pParam or its pCalcResult member is NULL.
+EResult CVAACalculation::Set(int32_t iType, void *pParam)
+{
+	SVAACalcParam *pCalcParam = (SVAACalcParam *)pParam;
+	if (pCalcParam != NULL && pCalcParam->pCalcResult != NULL) {
+		m_sCalcParam = *pCalcParam;
+		return RET_SUCCESS;
+	}
+	return RET_INVALIDPARAM;
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/processing/src/vaacalc/vaacalculation.h
@@ -1,0 +1,122 @@
+/*!
+ * \copy
+ *     Copyright (c)  2011-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file	    :  vaacalculation.h
+ *
+ * \brief	    :  VAA calculation class of the Wels video processor
+ *
+ * \date        :  2011/03/18
+ *
+ * \description :  1. rewrite the package code of the VAA calculation class
+ *
+ *************************************************************************************
+ */
+
+#ifndef _WELSVP_VAACALCULATION_H
+#define _WELSVP_VAACALCULATION_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+// SAD kernel that also fills the pSd8x8 and pMad8x8 per-8x8 outputs.
+typedef void VAACalcSadBgdFunc(uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight,
+		int32_t iPicStride, int32_t *pFrameSad, int32_t *pSad8x8, int32_t *pSd8x8, uint8_t *pMad8x8);
+// SAD kernel with both the 16x16 sum / sum-of-squares / SSD outputs and the pSd8x8 / pMad8x8 outputs.
+typedef void VAACalcSadSsdBgdFunc(uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight,
+		int32_t iPicStride, int32_t *pFrameSad, int32_t *pSad8x8, int32_t *pSum16x16, int32_t *pSumSquare16x16, int32_t *pSsd16x16, int32_t *pSd8x8, uint8_t *pMad8x8);
+// Plain SAD kernel: outputs are pFrameSad and pSad8x8 only.
+typedef void VAACalcSadFunc(uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight,
+		int32_t iPicStride, int32_t *pFrameSad, int32_t *pSad8x8);
+// SAD kernel that additionally outputs pSum16x16 and pSumSquare16x16.
+typedef void VAACalcSadVarFunc(uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight,
+		int32_t iPicStride, int32_t *pFrameSad, int32_t *pSad8x8, int32_t *pSum16x16, int32_t *pSumSquare16x16);
+// Same outputs as VAACalcSadVarFunc plus pSsd16x16.
+typedef void VAACalcSadSsdFunc(uint8_t *pCurData, uint8_t *pRefData, int32_t iPicWidth, int32_t iPicHeight,
+		int32_t iPicStride, int32_t *pFrameSad, int32_t *pSad8x8, int32_t *pSum16x16, int32_t *pSumSquare16x16, int32_t *pSsd16x16);
+
+
+typedef VAACalcSadBgdFunc		*PVAACalcSadBgdFunc;		// pointer forms of the kernel types, used as SVaaFuncs members
+typedef VAACalcSadSsdBgdFunc	*PVAACalcSadSsdBgdFunc;
+typedef VAACalcSadFunc			*PVAACalcSadFunc;
+typedef VAACalcSadVarFunc		*PVAACalcSadVarFunc;
+typedef VAACalcSadSsdFunc		*PVAACalcSadSsdFunc;
+
+// Dispatch table of VAA kernels; CVAACalculation::InitVaaFuncs() fills every slot.
+typedef struct TagVaaFuncs {
+	PVAACalcSadBgdFunc		pfVAACalcSadBgd;
+	PVAACalcSadSsdBgdFunc	pfVAACalcSadSsdBgd;
+	PVAACalcSadFunc			pfVAACalcSad;
+	PVAACalcSadVarFunc		pfVAACalcSadVar;
+	PVAACalcSadSsdFunc		pfVAACalcSadSsd;
+} SVaaFuncs;
+
+
+// C kernels, one per kernel signature; always declared.
+VAACalcSadBgdFunc		VAACalcSadBgd_c;
+VAACalcSadSsdBgdFunc	VAACalcSadSsdBgd_c;
+VAACalcSadFunc			VAACalcSad_c;
+VAACalcSadVarFunc		VAACalcSadVar_c;
+VAACalcSadSsdFunc		VAACalcSadSsd_c;
+// SSE2 kernels, declared only when X86_ASM is defined.
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+VAACalcSadBgdFunc		VAACalcSadBgd_sse2;
+VAACalcSadSsdBgdFunc	VAACalcSadSsdBgd_sse2;
+VAACalcSadFunc			VAACalcSad_sse2;
+VAACalcSadVarFunc		VAACalcSadVar_sse2;
+VAACalcSadSsdFunc		VAACalcSadSsd_sse2;
+WELSVP_EXTERN_C_END
+#endif // X86_ASM
+
+// IStrategy implementation that runs the VAA kernels on a source/reference picture pair.
+class CVAACalculation : public IStrategy {
+public:
+	CVAACalculation(int32_t iCpuFlag);
+	~CVAACalculation();
+
+	// Runs the kernel selected by the parameters last stored with Set().
+	EResult Process(int32_t iType, SPixMap *pCurPixMap, SPixMap *pRefPixMap);
+	// Copies the SVAACalcParam behind pParam; rejects a NULL pParam or pCalcResult.
+	EResult Set(int32_t iType, void *pParam);
+
+private:
+	void InitVaaFuncs(SVaaFuncs &sVaaFunc, int32_t iCpuFlag);
+	SVaaFuncs		m_sVaaFuncs;	// kernel dispatch table
+	int32_t			m_iCPUFlag;		// CPU feature flags passed to the constructor
+	SVAACalcParam	m_sCalcParam;	// parameters copied in by Set()
+};
+
+WELSVP_NAMESPACE_END
+
+#endif