ref: c29cc89c78dcd5cdf66e879f284de40cb0d85a08
dir: /vp9/common/arm/neon/vp9_idct16x16_neon.c/
/* * Copyright (c) 2013 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "./vp9_rtcd.h" #include "vp9/common/vp9_common.h" void vp9_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output, int output_stride); void vp9_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output, int16_t *pass1Output, int16_t skip_adding, uint8_t *dest, int dest_stride); void vp9_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output, int output_stride); void vp9_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output, int16_t *pass1Output, int16_t skip_adding, uint8_t *dest, int dest_stride); /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ extern void vp9_push_neon(int64_t *store); extern void vp9_pop_neon(int64_t *store); void vp9_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, int dest_stride) { int64_t store_reg[8]; int16_t pass1_output[16*16] = {0}; int16_t row_idct_output[16*16] = {0}; // save d8-d15 register values. vp9_push_neon(store_reg); /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. vp9_idct16x16_256_add_neon_pass1(input, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. vp9_idct16x16_256_add_neon_pass2(input+1, row_idct_output, pass1_output, 0, dest, dest_stride); /* Parallel idct on the lower 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. vp9_idct16x16_256_add_neon_pass1(input+8*16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. vp9_idct16x16_256_add_neon_pass2(input+8*16+1, row_idct_output+8, pass1_output, 0, dest, dest_stride); /* Parallel idct on the left 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, row_idct_output, pass1_output, 1, dest, dest_stride); /* Parallel idct on the right 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, row_idct_output+8, pass1_output, 1, dest+8, dest_stride); // restore d8-d15 register values. vp9_pop_neon(store_reg); return; } void vp9_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, int dest_stride) { int64_t store_reg[8]; int16_t pass1_output[16*16] = {0}; int16_t row_idct_output[16*16] = {0}; // save d8-d15 register values. vp9_push_neon(store_reg); /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. vp9_idct16x16_10_add_neon_pass1(input, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. vp9_idct16x16_10_add_neon_pass2(input+1, row_idct_output, pass1_output, 0, dest, dest_stride); /* Skip Parallel idct on the lower 8 rows as they are all 0s */ /* Parallel idct on the left 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. vp9_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. vp9_idct16x16_256_add_neon_pass2(row_idct_output+1, row_idct_output, pass1_output, 1, dest, dest_stride); /* Parallel idct on the right 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. vp9_idct16x16_256_add_neon_pass1(row_idct_output+8*16, pass1_output, 8); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. vp9_idct16x16_256_add_neon_pass2(row_idct_output+8*16+1, row_idct_output+8, pass1_output, 1, dest+8, dest_stride); // restore d8-d15 register values. vp9_pop_neon(store_reg); return; }