Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Staging: add echo cancelation module

This is used by mISDN and Zaptel drivers.

From: Steve Underwood <steveu@coppice.org>
From: David Rowe <david@rowetel.com>
Cc: Tzafrir Cohen <tzafrir.cohen@xorcom.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

authored by

David Rowe and committed by
Greg Kroah-Hartman
10602db8 00b3ed16

+1785
+2
drivers/staging/Kconfig
··· 39 39 40 40 source "drivers/staging/wlan-ng/Kconfig" 41 41 42 + source "drivers/staging/echo/Kconfig" 43 + 42 44 endif # STAGING
+1
drivers/staging/Makefile
··· 8 8 obj-$(CONFIG_USB_IP_COMMON) += usbip/ 9 9 obj-$(CONFIG_W35UND) += winbond/ 10 10 obj-$(CONFIG_PRISM2_USB) += wlan-ng/ 11 + obj-$(CONFIG_ECHO) += echo/
+9
drivers/staging/echo/Kconfig
··· 1 + config ECHO 2 + tristate "Line Echo Canceller support" 3 + default n 4 + ---help--- 5 + This driver provides line echo cancelling support for mISDN and 6 + Zaptel drivers. 7 + 8 + To compile this driver as a module, choose M here. The module 9 + will be called echo.
+1
drivers/staging/echo/Makefile
··· 1 + obj-$(CONFIG_ECHO) += echo.o
+10
drivers/staging/echo/TODO
··· 1 + TODO: 2 + - checkpatch.pl cleanups 3 + - Lindent 4 + - typedef removals 5 + - handle bit_operations.h (merge in or make part of common code?) 6 + - remove proc interface, only use echo.h interface (proc interface is 7 + racy and not correct.) 8 + 9 + Please send patches to Greg Kroah-Hartman <greg@kroah.com> and Cc: Steve 10 + Underwood <steveu@coppice.org> and David Rowe <david@rowetel.com>
+253
drivers/staging/echo/bit_operations.h
/*
 * SpanDSP - a series of DSP components for telephony
 *
 * bit_operations.h - Various bit level operations, such as bit reversal
 *
 * Written by Steve Underwood <steveu@coppice.org>
 *
 * Copyright (C) 2006 Steve Underwood
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: bit_operations.h,v 1.11 2006/11/28 15:37:03 steveu Exp $
 */

/*! \file */

#if !defined(_BIT_OPERATIONS_H_)
#define _BIT_OPERATIONS_H_

#ifdef __cplusplus
extern "C" {
#endif

#if defined(__i386__) || defined(__x86_64__)
/*! \brief Find the bit position of the highest set bit in a word
    \param bits The word to be searched
    \return The bit number of the highest set bit, or -1 if the word is zero. */
static __inline__ int top_bit(unsigned int bits)
{
	int res;

	/*
	 * res is preloaded with -1 (xor + dec) and BSR then overwrites it
	 * with the index of the highest set bit.  The -1 result for a zero
	 * input relies on BSR leaving its destination untouched in that case.
	 * NOTE(review): Intel's manual has historically described the
	 * destination as undefined for a zero source - confirm this is
	 * acceptable for all targeted CPUs.
	 */
	__asm__ (" xorl %[res],%[res];\n"
		 " decl %[res];\n"
		 " bsrl %[bits],%[res]\n"
		 : [res] "=&r" (res)
		 : [bits] "rm" (bits));
	return res;
}
/*- End of function --------------------------------------------------------*/

/*! \brief Find the bit position of the lowest set bit in a word
    \param bits The word to be searched
    \return The bit number of the lowest set bit, or -1 if the word is zero. */
static __inline__ int bottom_bit(unsigned int bits)
{
	int res;

	/* Same scheme as top_bit(), using BSF to scan from the low end. */
	__asm__ (" xorl %[res],%[res];\n"
		 " decl %[res];\n"
		 " bsfl %[bits],%[res]\n"
		 : [res] "=&r" (res)
		 : [bits] "rm" (bits));
	return res;
}
/*- End of function --------------------------------------------------------*/
#else
/*! \brief Find the bit position of the highest set bit in a word
    \param bits The word to be searched
    \return The bit number of the highest set bit, or -1 if the word is zero. */
static __inline__ int top_bit(unsigned int bits)
{
	int i;

	if (bits == 0)
		return -1;

	/* Binary search: each step keeps the upper half if it is non-empty. */
	i = 0;
	if (bits & 0xFFFF0000) {
		bits &= 0xFFFF0000;
		i += 16;
	}
	if (bits & 0xFF00FF00) {
		bits &= 0xFF00FF00;
		i += 8;
	}
	if (bits & 0xF0F0F0F0) {
		bits &= 0xF0F0F0F0;
		i += 4;
	}
	if (bits & 0xCCCCCCCC) {
		bits &= 0xCCCCCCCC;
		i += 2;
	}
	if (bits & 0xAAAAAAAA) {
		bits &= 0xAAAAAAAA;
		i += 1;
	}
	return i;
}
/*- End of function --------------------------------------------------------*/

/*! \brief Find the bit position of the lowest set bit in a word
    \param bits The word to be searched
    \return The bit number of the lowest set bit, or -1 if the word is zero. */
static __inline__ int bottom_bit(unsigned int bits)
{
	int i;

	if (bits == 0)
		return -1;

	/*
	 * Binary search: each step keeps the lower half if it is non-empty.
	 * Start from 31, the highest valid bit index; the five steps can
	 * subtract at most 16+8+4+2+1 = 31, giving 0 for bit 0.  (Starting
	 * from 32 would report every position one too high.)
	 */
	i = 31;
	if (bits & 0x0000FFFF) {
		bits &= 0x0000FFFF;
		i -= 16;
	}
	if (bits & 0x00FF00FF) {
		bits &= 0x00FF00FF;
		i -= 8;
	}
	if (bits & 0x0F0F0F0F) {
		bits &= 0x0F0F0F0F;
		i -= 4;
	}
	if (bits & 0x33333333) {
		bits &= 0x33333333;
		i -= 2;
	}
	if (bits & 0x55555555) {
		bits &= 0x55555555;
		i -= 1;
	}
	return i;
}
/*- End of function --------------------------------------------------------*/
#endif

/*! \brief Bit reverse a byte.
    \param x The byte to be reversed.
    \return The bit reversed version of x. */
static __inline__ uint8_t bit_reverse8(uint8_t x)
{
#if defined(__i386__) || defined(__x86_64__)
	/* If multiply is fast */
	return ((x*0x0802U & 0x22110U) | (x*0x8020U & 0x88440U))*0x10101U >> 16;
#else
	/* If multiply is slow, but we have a barrel shifter */
	x = (x >> 4) | (x << 4);
	x = ((x & 0xCC) >> 2) | ((x & 0x33) << 2);
	return ((x & 0xAA) >> 1) | ((x & 0x55) << 1);
#endif
}
/*- End of function --------------------------------------------------------*/

/*! \brief Bit reverse a 16 bit word.
    \param data The word to be reversed.
    \return The bit reversed version of data. */
uint16_t bit_reverse16(uint16_t data);

/*! \brief Bit reverse a 32 bit word.
    \param data The word to be reversed.
    \return The bit reversed version of data. */
uint32_t bit_reverse32(uint32_t data);

/*! \brief Bit reverse each of the four bytes in a 32 bit word.
    \param data The word to be reversed.
    \return The bit reversed version of data. */
uint32_t bit_reverse_4bytes(uint32_t data);

/*! \brief Find the number of set bits in a 32 bit word.
    \param x The word to be searched.
    \return The number of set bits. */
int one_bits32(uint32_t x);

/*! \brief Create a mask as wide as the number in a 32 bit word.
    \param x The word to be searched.
    \return The mask. */
uint32_t make_mask32(uint32_t x);

/*! \brief Create a mask as wide as the number in a 16 bit word.
    \param x The word to be searched.
    \return The mask. */
uint16_t make_mask16(uint16_t x);

/*! \brief Find the least significant one in a word, and return a word
           with just that bit set.
    \param x The word to be searched.
    \return The word with the single set bit, or 0 if x is zero. */
static __inline__ uint32_t least_significant_one32(uint32_t x)
{
	/*
	 * Two's complement negate in unsigned arithmetic; negating
	 * (int32_t) x would overflow for x == 0x80000000.
	 */
	return x & (~x + 1U);
}
/*- End of function --------------------------------------------------------*/

/*! \brief Find the most significant one in a word, and return a word
           with just that bit set.
    \param x The word to be searched.
    \return The word with the single set bit, or 0 if x is zero. */
static __inline__ uint32_t most_significant_one32(uint32_t x)
{
	/* top_bit(0) is -1, which must never be used as a shift count. */
	if (x == 0)
		return 0;
	/* Shift an unsigned one so that bit 31 does not overflow an int. */
	return (uint32_t) 1 << top_bit(x);
}
/*- End of function --------------------------------------------------------*/

/*! \brief Find the parity of a byte.
    \param x The byte to be checked.
    \return 1 for odd, or 0 for even. */
static __inline__ int parity8(uint8_t x)
{
	/* Fold to a nibble, then look its parity up in the 16 bit table 0x6996. */
	x = (x ^ (x >> 4)) & 0x0F;
	return (0x6996 >> x) & 1;
}
/*- End of function --------------------------------------------------------*/

/*! \brief Find the parity of a 16 bit word.
    \param x The word to be checked.
    \return 1 for odd, or 0 for even. */
static __inline__ int parity16(uint16_t x)
{
	x ^= (x >> 8);
	x = (x ^ (x >> 4)) & 0x0F;
	return (0x6996 >> x) & 1;
}
/*- End of function --------------------------------------------------------*/

/*! \brief Find the parity of a 32 bit word.
    \param x The word to be checked.
    \return 1 for odd, or 0 for even. */
static __inline__ int parity32(uint32_t x)
{
	x ^= (x >> 16);
	x ^= (x >> 8);
	x = (x ^ (x >> 4)) & 0x0F;
	return (0x6996 >> x) & 1;
}
/*- End of function --------------------------------------------------------*/

#ifdef __cplusplus
}
#endif

#endif
/*- End of file ------------------------------------------------------------*/
+632
drivers/staging/echo/echo.c
··· 1 + /* 2 + * SpanDSP - a series of DSP components for telephony 3 + * 4 + * echo.c - A line echo canceller. This code is being developed 5 + * against and partially complies with G168. 6 + * 7 + * Written by Steve Underwood <steveu@coppice.org> 8 + * and David Rowe <david_at_rowetel_dot_com> 9 + * 10 + * Copyright (C) 2001, 2003 Steve Underwood, 2007 David Rowe 11 + * 12 + * Based on a bit from here, a bit from there, eye of toad, ear of 13 + * bat, 15 years of failed attempts by David and a few fried brain 14 + * cells. 15 + * 16 + * All rights reserved. 17 + * 18 + * This program is free software; you can redistribute it and/or modify 19 + * it under the terms of the GNU General Public License version 2, as 20 + * published by the Free Software Foundation. 21 + * 22 + * This program is distributed in the hope that it will be useful, 23 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 + * GNU General Public License for more details. 26 + * 27 + * You should have received a copy of the GNU General Public License 28 + * along with this program; if not, write to the Free Software 29 + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 30 + * 31 + * $Id: echo.c,v 1.20 2006/12/01 18:00:48 steveu Exp $ 32 + */ 33 + 34 + /*! \file */ 35 + 36 + /* Implementation Notes 37 + David Rowe 38 + April 2007 39 + 40 + This code started life as Steve's NLMS algorithm with a tap 41 + rotation algorithm to handle divergence during double talk. I 42 + added a Geigel Double Talk Detector (DTD) [2] and performed some 43 + G168 tests. However I had trouble meeting the G168 requirements, 44 + especially for double talk - there were always cases where my DTD 45 + failed, for example where near end speech was under the 6dB 46 + threshold required for declaring double talk. 47 + 48 + So I tried a two path algorithm [1], which has so far given better 49 + results. 
The original tap rotation/Geigel algorithm is available 50 + in SVN http://svn.rowetel.com/software/oslec/tags/before_16bit. 51 + It's probably possible to make it work if some one wants to put some 52 + serious work into it. 53 + 54 + At present no special treatment is provided for tones, which 55 + generally cause NLMS algorithms to diverge. Initial runs of a 56 + subset of the G168 tests for tones (e.g ./echo_test 6) show the 57 + current algorithm is passing OK, which is kind of surprising. The 58 + full set of tests needs to be performed to confirm this result. 59 + 60 + One other interesting change is that I have managed to get the NLMS 61 + code to work with 16 bit coefficients, rather than the original 32 62 + bit coefficents. This reduces the MIPs and storage required. 63 + I evaulated the 16 bit port using g168_tests.sh and listening tests 64 + on 4 real-world samples. 65 + 66 + I also attempted the implementation of a block based NLMS update 67 + [2] but although this passes g168_tests.sh it didn't converge well 68 + on the real-world samples. I have no idea why, perhaps a scaling 69 + problem. The block based code is also available in SVN 70 + http://svn.rowetel.com/software/oslec/tags/before_16bit. If this 71 + code can be debugged, it will lead to further reduction in MIPS, as 72 + the block update code maps nicely onto DSP instruction sets (it's a 73 + dot product) compared to the current sample-by-sample update. 74 + 75 + Steve also has some nice notes on echo cancellers in echo.h 76 + 77 + 78 + References: 79 + 80 + [1] Ochiai, Areseki, and Ogihara, "Echo Canceller with Two Echo 81 + Path Models", IEEE Transactions on communications, COM-25, 82 + No. 6, June 83 + 1977. 
84 + http://www.rowetel.com/images/echo/dual_path_paper.pdf 85 + 86 + [2] The classic, very useful paper that tells you how to 87 + actually build a real world echo canceller: 88 + Messerschmitt, Hedberg, Cole, Haoui, Winship, "Digital Voice 89 + Echo Canceller with a TMS320020, 90 + http://www.rowetel.com/images/echo/spra129.pdf 91 + 92 + [3] I have written a series of blog posts on this work, here is 93 + Part 1: http://www.rowetel.com/blog/?p=18 94 + 95 + [4] The source code http://svn.rowetel.com/software/oslec/ 96 + 97 + [5] A nice reference on LMS filters: 98 + http://en.wikipedia.org/wiki/Least_mean_squares_filter 99 + 100 + Credits: 101 + 102 + Thanks to Steve Underwood, Jean-Marc Valin, and Ramakrishnan 103 + Muthukrishnan for their suggestions and email discussions. Thanks 104 + also to those people who collected echo samples for me such as 105 + Mark, Pawel, and Pavel. 106 + */ 107 + 108 + #include <linux/kernel.h> /* We're doing kernel work */ 109 + #include <linux/module.h> 110 + #include <linux/kernel.h> 111 + #include <linux/slab.h> 112 + #define malloc(a) kmalloc((a), GFP_KERNEL) 113 + #define free(a) kfree(a) 114 + 115 + #include "bit_operations.h" 116 + #include "echo.h" 117 + 118 + #define MIN_TX_POWER_FOR_ADAPTION 64 119 + #define MIN_RX_POWER_FOR_ADAPTION 64 120 + #define DTD_HANGOVER 600 /* 600 samples, or 75ms */ 121 + #define DC_LOG2BETA 3 /* log2() of DC filter Beta */ 122 + 123 + /*-----------------------------------------------------------------------*\ 124 + FUNCTIONS 125 + \*-----------------------------------------------------------------------*/ 126 + 127 + /* adapting coeffs using the traditional stochastic descent (N)LMS algorithm */ 128 + 129 + 130 + #ifdef __BLACKFIN_ASM__ 131 + static void __inline__ lms_adapt_bg(echo_can_state_t *ec, int clean, int shift) 132 + { 133 + int i, j; 134 + int offset1; 135 + int offset2; 136 + int factor; 137 + int exp; 138 + int16_t *phist; 139 + int n; 140 + 141 + if (shift > 0) 142 + factor = 
clean << shift; 143 + else 144 + factor = clean >> -shift; 145 + 146 + /* Update the FIR taps */ 147 + 148 + offset2 = ec->curr_pos; 149 + offset1 = ec->taps - offset2; 150 + phist = &ec->fir_state_bg.history[offset2]; 151 + 152 + /* st: and en: help us locate the assembler in echo.s */ 153 + 154 + //asm("st:"); 155 + n = ec->taps; 156 + for (i = 0, j = offset2; i < n; i++, j++) 157 + { 158 + exp = *phist++ * factor; 159 + ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15); 160 + } 161 + //asm("en:"); 162 + 163 + /* Note the asm for the inner loop above generated by Blackfin gcc 164 + 4.1.1 is pretty good (note even parallel instructions used): 165 + 166 + R0 = W [P0++] (X); 167 + R0 *= R2; 168 + R0 = R0 + R3 (NS) || 169 + R1 = W [P1] (X) || 170 + nop; 171 + R0 >>>= 15; 172 + R0 = R0 + R1; 173 + W [P1++] = R0; 174 + 175 + A block based update algorithm would be much faster but the 176 + above can't be improved on much. Every instruction saved in 177 + the loop above is 2 MIPs/ch! The for loop above is where the 178 + Blackfin spends most of it's time - about 17 MIPs/ch measured 179 + with speedtest.c with 256 taps (32ms). Write-back and 180 + Write-through cache gave about the same performance. 181 + */ 182 + } 183 + 184 + /* 185 + IDEAS for further optimisation of lms_adapt_bg(): 186 + 187 + 1/ The rounding is quite costly. Could we keep as 32 bit coeffs 188 + then make filter pluck the MS 16-bits of the coeffs when filtering? 189 + However this would lower potential optimisation of filter, as I 190 + think the dual-MAC architecture requires packed 16 bit coeffs. 191 + 192 + 2/ Block based update would be more efficient, as per comments above, 193 + could use dual MAC architecture. 194 + 195 + 3/ Look for same sample Blackfin LMS code, see if we can get dual-MAC 196 + packing. 197 + 198 + 4/ Execute the whole e/c in a block of say 20ms rather than sample 199 + by sample. Processing a few samples every ms is inefficient. 
200 + */ 201 + 202 + #else 203 + static __inline__ void lms_adapt_bg(echo_can_state_t *ec, int clean, int shift) 204 + { 205 + int i; 206 + 207 + int offset1; 208 + int offset2; 209 + int factor; 210 + int exp; 211 + 212 + if (shift > 0) 213 + factor = clean << shift; 214 + else 215 + factor = clean >> -shift; 216 + 217 + /* Update the FIR taps */ 218 + 219 + offset2 = ec->curr_pos; 220 + offset1 = ec->taps - offset2; 221 + 222 + for (i = ec->taps - 1; i >= offset1; i--) 223 + { 224 + exp = (ec->fir_state_bg.history[i - offset1]*factor); 225 + ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15); 226 + } 227 + for ( ; i >= 0; i--) 228 + { 229 + exp = (ec->fir_state_bg.history[i + offset2]*factor); 230 + ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15); 231 + } 232 + } 233 + #endif 234 + 235 + /*- End of function --------------------------------------------------------*/ 236 + 237 + echo_can_state_t *echo_can_create(int len, int adaption_mode) 238 + { 239 + echo_can_state_t *ec; 240 + int i; 241 + int j; 242 + 243 + ec = kmalloc(sizeof(*ec), GFP_KERNEL); 244 + if (ec == NULL) 245 + return NULL; 246 + memset(ec, 0, sizeof(*ec)); 247 + 248 + ec->taps = len; 249 + ec->log2taps = top_bit(len); 250 + ec->curr_pos = ec->taps - 1; 251 + 252 + for (i = 0; i < 2; i++) 253 + { 254 + if ((ec->fir_taps16[i] = (int16_t *) malloc((ec->taps)*sizeof(int16_t))) == NULL) 255 + { 256 + for (j = 0; j < i; j++) 257 + kfree(ec->fir_taps16[j]); 258 + kfree(ec); 259 + return NULL; 260 + } 261 + memset(ec->fir_taps16[i], 0, (ec->taps)*sizeof(int16_t)); 262 + } 263 + 264 + fir16_create(&ec->fir_state, 265 + ec->fir_taps16[0], 266 + ec->taps); 267 + fir16_create(&ec->fir_state_bg, 268 + ec->fir_taps16[1], 269 + ec->taps); 270 + 271 + for(i=0; i<5; i++) { 272 + ec->xvtx[i] = ec->yvtx[i] = ec->xvrx[i] = ec->yvrx[i] = 0; 273 + } 274 + 275 + ec->cng_level = 1000; 276 + echo_can_adaption_mode(ec, adaption_mode); 277 + 278 + ec->snapshot = (int16_t*)malloc(ec->taps*sizeof(int16_t)); 279 + 
memset(ec->snapshot, 0, sizeof(int16_t)*ec->taps); 280 + 281 + ec->cond_met = 0; 282 + ec->Pstates = 0; 283 + ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0; 284 + ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0; 285 + ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0; 286 + ec->Lbgn = ec->Lbgn_acc = 0; 287 + ec->Lbgn_upper = 200; 288 + ec->Lbgn_upper_acc = ec->Lbgn_upper << 13; 289 + 290 + return ec; 291 + } 292 + /*- End of function --------------------------------------------------------*/ 293 + 294 + void echo_can_free(echo_can_state_t *ec) 295 + { 296 + int i; 297 + 298 + fir16_free(&ec->fir_state); 299 + fir16_free(&ec->fir_state_bg); 300 + for (i = 0; i < 2; i++) 301 + kfree(ec->fir_taps16[i]); 302 + kfree(ec->snapshot); 303 + kfree(ec); 304 + } 305 + /*- End of function --------------------------------------------------------*/ 306 + 307 + void echo_can_adaption_mode(echo_can_state_t *ec, int adaption_mode) 308 + { 309 + ec->adaption_mode = adaption_mode; 310 + } 311 + /*- End of function --------------------------------------------------------*/ 312 + 313 + void echo_can_flush(echo_can_state_t *ec) 314 + { 315 + int i; 316 + 317 + ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0; 318 + ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0; 319 + ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0; 320 + 321 + ec->Lbgn = ec->Lbgn_acc = 0; 322 + ec->Lbgn_upper = 200; 323 + ec->Lbgn_upper_acc = ec->Lbgn_upper << 13; 324 + 325 + ec->nonupdate_dwell = 0; 326 + 327 + fir16_flush(&ec->fir_state); 328 + fir16_flush(&ec->fir_state_bg); 329 + ec->fir_state.curr_pos = ec->taps - 1; 330 + ec->fir_state_bg.curr_pos = ec->taps - 1; 331 + for (i = 0; i < 2; i++) 332 + memset(ec->fir_taps16[i], 0, ec->taps*sizeof(int16_t)); 333 + 334 + ec->curr_pos = ec->taps - 1; 335 + ec->Pstates = 0; 336 + } 337 + /*- End of function --------------------------------------------------------*/ 338 + 339 + void echo_can_snapshot(echo_can_state_t *ec) { 340 + 
memcpy(ec->snapshot, ec->fir_taps16[0], ec->taps*sizeof(int16_t)); 341 + } 342 + /*- End of function --------------------------------------------------------*/ 343 + 344 + /* Dual Path Echo Canceller ------------------------------------------------*/ 345 + 346 + int16_t echo_can_update(echo_can_state_t *ec, int16_t tx, int16_t rx) 347 + { 348 + int32_t echo_value; 349 + int clean_bg; 350 + int tmp, tmp1; 351 + 352 + /* Input scaling was found be required to prevent problems when tx 353 + starts clipping. Another possible way to handle this would be the 354 + filter coefficent scaling. */ 355 + 356 + ec->tx = tx; ec->rx = rx; 357 + tx >>=1; 358 + rx >>=1; 359 + 360 + /* 361 + Filter DC, 3dB point is 160Hz (I think), note 32 bit precision required 362 + otherwise values do not track down to 0. Zero at DC, Pole at (1-Beta) 363 + only real axis. Some chip sets (like Si labs) don't need 364 + this, but something like a $10 X100P card does. Any DC really slows 365 + down convergence. 366 + 367 + Note: removes some low frequency from the signal, this reduces 368 + the speech quality when listening to samples through headphones 369 + but may not be obvious through a telephone handset. 370 + 371 + Note that the 3dB frequency in radians is approx Beta, e.g. for 372 + Beta = 2^(-3) = 0.125, 3dB freq is 0.125 rads = 159Hz. 373 + */ 374 + 375 + if (ec->adaption_mode & ECHO_CAN_USE_RX_HPF) { 376 + tmp = rx << 15; 377 + #if 1 378 + /* Make sure the gain of the HPF is 1.0. This can still saturate a little under 379 + impulse conditions, and it might roll to 32768 and need clipping on sustained peak 380 + level signals. However, the scale of such clipping is small, and the error due to 381 + any saturation should not markedly affect the downstream processing. */ 382 + tmp -= (tmp >> 4); 383 + #endif 384 + ec->rx_1 += -(ec->rx_1>>DC_LOG2BETA) + tmp - ec->rx_2; 385 + 386 + /* hard limit filter to prevent clipping. 
Note that at this stage 387 + rx should be limited to +/- 16383 due to right shift above */ 388 + tmp1 = ec->rx_1 >> 15; 389 + if (tmp1 > 16383) tmp1 = 16383; 390 + if (tmp1 < -16383) tmp1 = -16383; 391 + rx = tmp1; 392 + ec->rx_2 = tmp; 393 + } 394 + 395 + /* Block average of power in the filter states. Used for 396 + adaption power calculation. */ 397 + 398 + { 399 + int new, old; 400 + 401 + /* efficient "out with the old and in with the new" algorithm so 402 + we don't have to recalculate over the whole block of 403 + samples. */ 404 + new = (int)tx * (int)tx; 405 + old = (int)ec->fir_state.history[ec->fir_state.curr_pos] * 406 + (int)ec->fir_state.history[ec->fir_state.curr_pos]; 407 + ec->Pstates += ((new - old) + (1<<ec->log2taps)) >> ec->log2taps; 408 + if (ec->Pstates < 0) ec->Pstates = 0; 409 + } 410 + 411 + /* Calculate short term average levels using simple single pole IIRs */ 412 + 413 + ec->Ltxacc += abs(tx) - ec->Ltx; 414 + ec->Ltx = (ec->Ltxacc + (1<<4)) >> 5; 415 + ec->Lrxacc += abs(rx) - ec->Lrx; 416 + ec->Lrx = (ec->Lrxacc + (1<<4)) >> 5; 417 + 418 + /* Foreground filter ---------------------------------------------------*/ 419 + 420 + ec->fir_state.coeffs = ec->fir_taps16[0]; 421 + echo_value = fir16(&ec->fir_state, tx); 422 + ec->clean = rx - echo_value; 423 + ec->Lcleanacc += abs(ec->clean) - ec->Lclean; 424 + ec->Lclean = (ec->Lcleanacc + (1<<4)) >> 5; 425 + 426 + /* Background filter ---------------------------------------------------*/ 427 + 428 + echo_value = fir16(&ec->fir_state_bg, tx); 429 + clean_bg = rx - echo_value; 430 + ec->Lclean_bgacc += abs(clean_bg) - ec->Lclean_bg; 431 + ec->Lclean_bg = (ec->Lclean_bgacc + (1<<4)) >> 5; 432 + 433 + /* Background Filter adaption -----------------------------------------*/ 434 + 435 + /* Almost always adap bg filter, just simple DT and energy 436 + detection to minimise adaption in cases of strong double talk. 437 + However this is not critical for the dual path algorithm. 
438 + */ 439 + ec->factor = 0; 440 + ec->shift = 0; 441 + if ((ec->nonupdate_dwell == 0)) { 442 + int P, logP, shift; 443 + 444 + /* Determine: 445 + 446 + f = Beta * clean_bg_rx/P ------ (1) 447 + 448 + where P is the total power in the filter states. 449 + 450 + The Boffins have shown that if we obey (1) we converge 451 + quickly and avoid instability. 452 + 453 + The correct factor f must be in Q30, as this is the fixed 454 + point format required by the lms_adapt_bg() function, 455 + therefore the scaled version of (1) is: 456 + 457 + (2^30) * f = (2^30) * Beta * clean_bg_rx/P 458 + factor = (2^30) * Beta * clean_bg_rx/P ----- (2) 459 + 460 + We have chosen Beta = 0.25 by experiment, so: 461 + 462 + factor = (2^30) * (2^-2) * clean_bg_rx/P 463 + 464 + (30 - 2 - log2(P)) 465 + factor = clean_bg_rx 2 ----- (3) 466 + 467 + To avoid a divide we approximate log2(P) as top_bit(P), 468 + which returns the position of the highest non-zero bit in 469 + P. This approximation introduces an error as large as a 470 + factor of 2, but the algorithm seems to handle it OK. 471 + 472 + Come to think of it a divide may not be a big deal on a 473 + modern DSP, so its probably worth checking out the cycles 474 + for a divide versus a top_bit() implementation. 475 + */ 476 + 477 + P = MIN_TX_POWER_FOR_ADAPTION + ec->Pstates; 478 + logP = top_bit(P) + ec->log2taps; 479 + shift = 30 - 2 - logP; 480 + ec->shift = shift; 481 + 482 + lms_adapt_bg(ec, clean_bg, shift); 483 + } 484 + 485 + /* very simple DTD to make sure we dont try and adapt with strong 486 + near end speech */ 487 + 488 + ec->adapt = 0; 489 + if ((ec->Lrx > MIN_RX_POWER_FOR_ADAPTION) && (ec->Lrx > ec->Ltx)) 490 + ec->nonupdate_dwell = DTD_HANGOVER; 491 + if (ec->nonupdate_dwell) 492 + ec->nonupdate_dwell--; 493 + 494 + /* Transfer logic ------------------------------------------------------*/ 495 + 496 + /* These conditions are from the dual path paper [1], I messed with 497 + them a bit to improve performance. 
*/ 498 + 499 + if ((ec->adaption_mode & ECHO_CAN_USE_ADAPTION) && 500 + (ec->nonupdate_dwell == 0) && 501 + (8*ec->Lclean_bg < 7*ec->Lclean) /* (ec->Lclean_bg < 0.875*ec->Lclean) */ && 502 + (8*ec->Lclean_bg < ec->Ltx) /* (ec->Lclean_bg < 0.125*ec->Ltx) */ ) 503 + { 504 + if (ec->cond_met == 6) { 505 + /* BG filter has had better results for 6 consecutive samples */ 506 + ec->adapt = 1; 507 + memcpy(ec->fir_taps16[0], ec->fir_taps16[1], ec->taps*sizeof(int16_t)); 508 + } 509 + else 510 + ec->cond_met++; 511 + } 512 + else 513 + ec->cond_met = 0; 514 + 515 + /* Non-Linear Processing ---------------------------------------------------*/ 516 + 517 + ec->clean_nlp = ec->clean; 518 + if (ec->adaption_mode & ECHO_CAN_USE_NLP) 519 + { 520 + /* Non-linear processor - a fancy way to say "zap small signals, to avoid 521 + residual echo due to (uLaw/ALaw) non-linearity in the channel.". */ 522 + 523 + if ((16*ec->Lclean < ec->Ltx)) 524 + { 525 + /* Our e/c has improved echo by at least 24 dB (each factor of 2 is 6dB, 526 + so 2*2*2*2=16 is the same as 6+6+6+6=24dB) */ 527 + if (ec->adaption_mode & ECHO_CAN_USE_CNG) 528 + { 529 + ec->cng_level = ec->Lbgn; 530 + 531 + /* Very elementary comfort noise generation. Just random 532 + numbers rolled off very vaguely Hoth-like. DR: This 533 + noise doesn't sound quite right to me - I suspect there 534 + are some overlfow issues in the filtering as it's too 535 + "crackly". TODO: debug this, maybe just play noise at 536 + high level or look at spectrum. 
537 + */ 538 + 539 + ec->cng_rndnum = 1664525U*ec->cng_rndnum + 1013904223U; 540 + ec->cng_filter = ((ec->cng_rndnum & 0xFFFF) - 32768 + 5*ec->cng_filter) >> 3; 541 + ec->clean_nlp = (ec->cng_filter*ec->cng_level*8) >> 14; 542 + 543 + } 544 + else if (ec->adaption_mode & ECHO_CAN_USE_CLIP) 545 + { 546 + /* This sounds much better than CNG */ 547 + if (ec->clean_nlp > ec->Lbgn) 548 + ec->clean_nlp = ec->Lbgn; 549 + if (ec->clean_nlp < -ec->Lbgn) 550 + ec->clean_nlp = -ec->Lbgn; 551 + } 552 + else 553 + { 554 + /* just mute the residual, doesn't sound very good, used mainly 555 + in G168 tests */ 556 + ec->clean_nlp = 0; 557 + } 558 + } 559 + else { 560 + /* Background noise estimator. I tried a few algorithms 561 + here without much luck. This very simple one seems to 562 + work best, we just average the level using a slow (1 sec 563 + time const) filter if the current level is less than a 564 + (experimentally derived) constant. This means we dont 565 + include high level signals like near end speech. When 566 + combined with CNG or especially CLIP seems to work OK. 567 + */ 568 + if (ec->Lclean < 40) { 569 + ec->Lbgn_acc += abs(ec->clean) - ec->Lbgn; 570 + ec->Lbgn = (ec->Lbgn_acc + (1<<11)) >> 12; 571 + } 572 + } 573 + } 574 + 575 + /* Roll around the taps buffer */ 576 + if (ec->curr_pos <= 0) 577 + ec->curr_pos = ec->taps; 578 + ec->curr_pos--; 579 + 580 + if (ec->adaption_mode & ECHO_CAN_DISABLE) 581 + ec->clean_nlp = rx; 582 + 583 + /* Output scaled back up again to match input scaling */ 584 + 585 + return (int16_t) ec->clean_nlp << 1; 586 + } 587 + 588 + /*- End of function --------------------------------------------------------*/ 589 + 590 + /* This function is seperated from the echo canceller is it is usually called 591 + as part of the tx process. See rx HP (DC blocking) filter above, it's 592 + the same design. 593 + 594 + Some soft phones send speech signals with a lot of low frequency 595 + energy, e.g. down to 20Hz. 
This can make the hybrid non-linear 596 + which causes the echo canceller to fall over. This filter can help 597 + by removing any low frequency before it gets to the tx port of the 598 + hybrid. 599 + 600 + It can also help by removing and DC in the tx signal. DC is bad 601 + for LMS algorithms. 602 + 603 + This is one of the classic DC removal filters, adjusted to provide sufficient 604 + bass rolloff to meet the above requirement to protect hybrids from things that 605 + upset them. The difference between successive samples produces a lousy HPF, and 606 + then a suitably placed pole flattens things out. The final result is a nicely 607 + rolled off bass end. The filtering is implemented with extended fractional 608 + precision, which noise shapes things, giving very clean DC removal. 609 + */ 610 + 611 + int16_t echo_can_hpf_tx(echo_can_state_t *ec, int16_t tx) { 612 + int tmp, tmp1; 613 + 614 + if (ec->adaption_mode & ECHO_CAN_USE_TX_HPF) { 615 + tmp = tx << 15; 616 + #if 1 617 + /* Make sure the gain of the HPF is 1.0. The first can still saturate a little under 618 + impulse conditions, and it might roll to 32768 and need clipping on sustained peak 619 + level signals. However, the scale of such clipping is small, and the error due to 620 + any saturation should not markedly affect the downstream processing. */ 621 + tmp -= (tmp >> 4); 622 + #endif 623 + ec->tx_1 += -(ec->tx_1>>DC_LOG2BETA) + tmp - ec->tx_2; 624 + tmp1 = ec->tx_1 >> 15; 625 + if (tmp1 > 32767) tmp1 = 32767; 626 + if (tmp1 < -32767) tmp1 = -32767; 627 + tx = tmp1; 628 + ec->tx_2 = tmp; 629 + } 630 + 631 + return tx; 632 + }
+220
drivers/staging/echo/echo.h
/*
 * SpanDSP - a series of DSP components for telephony
 *
 * echo.h - A line echo canceller.  This code is being developed
 *          against and partially complies with G168.
 *
 * Written by Steve Underwood <steveu@coppice.org>
 *         and David Rowe <david_at_rowetel_dot_com>
 *
 * Copyright (C) 2001 Steve Underwood and 2007 David Rowe
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: echo.h,v 1.9 2006/10/24 13:45:28 steveu Exp $
 */

#ifndef __ECHO_H
#define __ECHO_H

/*! \page echo_can_page Line echo cancellation for voice

\section echo_can_page_sec_1 What does it do?
This module aims to provide G.168-2002 compliant echo cancellation, to remove
electrical echoes (e.g. from 2-4 wire hybrids) from voice calls.

\section echo_can_page_sec_2 How does it work?
The heart of the echo canceller is an FIR filter. This is adapted to match the
echo impulse response of the telephone line. It must be long enough to
adequately cover the duration of that impulse response. The signal transmitted
to the telephone line is passed through the FIR filter. Once the FIR is
properly adapted, the resulting output is an estimate of the echo signal
received from the line. This is subtracted from the received signal. The result
is an estimate of the signal which originated at the far end of the line, free
from echoes of our own transmitted signal.

The least mean squares (LMS) algorithm is attributed to Widrow and Hoff, and
was introduced in 1960. It is the commonest form of filter adaption used in
things like modem line equalisers and line echo cancellers. There it works very
well.  However, it only works well for signals of constant amplitude. It works
very poorly for things like speech echo cancellation, where the signal level
varies widely.  This is quite easy to fix. If the signal level is normalised -
similar to applying AGC - LMS can work as well for a signal of varying
amplitude as it does for a modem signal. This normalised least mean squares
(NLMS) algorithm is the commonest one used for speech echo cancellation. Many
other algorithms exist - e.g. RLS (essentially the same as Kalman filtering),
FAP, etc. Some perform significantly better than NLMS.  However, factors such
as computational complexity and patents favour the use of NLMS.

A simple refinement to NLMS can improve its performance with speech. NLMS tends
to adapt best to the strongest parts of a signal. If the signal is white noise,
the NLMS algorithm works very well. However, speech has more low frequency than
high frequency content. Pre-whitening (i.e. filtering the signal to flatten its
spectrum) the echo signal improves the adapt rate for speech, and ensures the
final residual signal is not heavily biased towards high frequencies. A very
low complexity filter is adequate for this, so pre-whitening adds little to the
compute requirements of the echo canceller.

An FIR filter adapted using pre-whitened NLMS performs well, provided certain
conditions are met:

    - The transmitted signal has poor self-correlation.
    - There is no signal being generated within the environment being
      cancelled.

The difficulty is that neither of these can be guaranteed.

If the adaption is performed while transmitting noise (or something fairly
noise like, such as voice) the adaption works very well. If the adaption is
performed while transmitting something highly correlative (typically narrow
band energy such as signalling tones or DTMF), the adaption can go seriously
wrong. The reason is there is only one solution for the adaption on a near
random signal - the impulse response of the line. For a repetitive signal,
there are any number of solutions which converge the adaption, and nothing
guides the adaption to choose the generalised one. Allowing an untrained
canceller to converge on this kind of narrowband energy is probably a good
thing, since at least it cancels the tones. Allowing a well converged canceller
to continue converging on such energy is just a way to ruin its generalised
adaption. A narrowband detector is needed, so adaption can be suspended at
appropriate times.

The adaption process is based on trying to eliminate the received signal. When
there is any signal from within the environment being cancelled it may upset
the adaption process. Similarly, if the signal we are transmitting is small,
noise may dominate and disturb the adaption process. If we can ensure that the
adaption is only performed when we are transmitting a significant signal level,
and the environment is not, things will be OK. Clearly, it is easy to tell when
we are sending a significant signal. Telling if the environment is generating
a significant signal, and doing it with sufficient speed that the adaption will
not have diverged too much before we stop it, is a little harder.

The key problem in detecting when the environment is sourcing significant
energy is that we must do this very quickly. Given a reasonably long sample of
the received signal, there are a number of strategies which may be used to
assess whether that signal contains a strong far end component. However, by the
time that assessment is complete the far end signal will have already caused
major mis-convergence in the adaption process. An assessment algorithm is
needed which produces a fairly accurate result from a very short burst of far
end energy.

\section echo_can_page_sec_3 How do I use it?
The echo canceller processes both the transmit and receive streams sample by
sample. The processing function is not declared inline. Cancellation requires
many operations per sample, so the call overhead is only a minor burden.
*/

#include "fir.h"

/* Mask bits for the adaption mode.  They are tested individually against
   echo_can_state_t.adaption_mode, so any combination may be OR-ed together. */
#define ECHO_CAN_USE_ADAPTION       0x01
#define ECHO_CAN_USE_NLP            0x02
#define ECHO_CAN_USE_CNG            0x04
#define ECHO_CAN_USE_CLIP           0x08
#define ECHO_CAN_USE_TX_HPF         0x10
#define ECHO_CAN_USE_RX_HPF         0x20
#define ECHO_CAN_DISABLE            0x40

/*!
    G.168 echo canceller descriptor. This defines the working state for a line
    echo canceller.
*/
typedef struct
{
    int16_t tx,rx;
    int16_t clean;
    /* Output sample; echo_can_update() returns this scaled up by one bit */
    int16_t clean_nlp;

    int  nonupdate_dwell;
    /* Current position in the taps buffer; counts down and wraps to taps - 1 */
    int  curr_pos;
    int  taps;
    int  log2taps;
    /* Bitmask of the ECHO_CAN_* mode bits defined above */
    int  adaption_mode;

    int  cond_met;
    int32_t Pstates;
    int16_t adapt;
    int32_t factor;
    int16_t shift;

    /* Average levels and averaging filter states */
    int Ltxacc, Lrxacc, Lcleanacc, Lclean_bgacc;
    int Ltx, Lrx;
    int Lclean;
    int Lclean_bg;
    int Lbgn, Lbgn_acc, Lbgn_upper, Lbgn_upper_acc;

    /* foreground and background filter states */
    fir16_state_t fir_state;
    fir16_state_t fir_state_bg;
    int16_t *fir_taps16[2];

    /* DC blocking filter states */
    int tx_1, tx_2, rx_1, rx_2;

    /* optional High Pass Filter states */
    int32_t xvtx[5], yvtx[5];
    int32_t xvrx[5], yvrx[5];

    /* Parameters for the optional Hoth noise generator */
    int cng_level;
    int cng_rndnum;
    int cng_filter;

    /* snapshot sample of coeffs used for development */
    int16_t *snapshot;
} echo_can_state_t;

/*! Create a voice echo canceller context.
    \param len The length of the canceller, in samples.
    \param adaption_mode The adaption mode, built from the ECHO_CAN_* mask
           bits (presumably the initial value of the context's adaption_mode
           field - confirm against echo.c).
    \return The new canceller context, or NULL if the canceller could not be created.
*/
echo_can_state_t *echo_can_create(int len, int adaption_mode);

/*! Free a voice echo canceller context.
    \param ec The echo canceller context.
*/
void echo_can_free(echo_can_state_t *ec);

/*! Flush (reinitialise) a voice echo canceller context.
    \param ec The echo canceller context.
*/
void echo_can_flush(echo_can_state_t *ec);

/*! Set the adaption mode of a voice echo canceller context.
    \param ec The echo canceller context.
    \param adaption_mode The mode, built from the ECHO_CAN_* mask bits.
*/
void echo_can_adaption_mode(echo_can_state_t *ec, int adaption_mode);

/*! Snapshot the canceller state for development use.
    NOTE(review): presumably copies the current filter coefficients into
    ec->snapshot - the implementation is not visible here, confirm against
    echo.c.
    \param ec The echo canceller context.
*/
void echo_can_snapshot(echo_can_state_t *ec);

/*! Process a sample through a voice echo canceller.
    \param ec The echo canceller context.
    \param tx The transmitted audio sample.
    \param rx The received audio sample.
    \return The clean (echo cancelled) received sample.
*/
int16_t echo_can_update(echo_can_state_t *ec, int16_t tx, int16_t rx);

/*! Process to high pass filter the tx signal.
    The filter is only applied when ECHO_CAN_USE_TX_HPF is set in the
    adaption mode; otherwise the sample is returned unchanged.
    \param ec The echo canceller context.
    \param tx The transmitted audio sample.
    \return The HP filtered transmit sample, send this to your D/A.
*/
int16_t echo_can_hpf_tx(echo_can_state_t *ec, int16_t tx);

#endif	/* __ECHO_H */
+369
drivers/staging/echo/fir.h
/*
 * SpanDSP - a series of DSP components for telephony
 *
 * fir.h - General telephony FIR routines
 *
 * Written by Steve Underwood <steveu@coppice.org>
 *
 * Copyright (C) 2002 Steve Underwood
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: fir.h,v 1.8 2006/10/24 13:45:28 steveu Exp $
 */

/*! \page fir_page FIR filtering
\section fir_page_sec_1 What does it do?
Provides sample-at-a-time FIR filters with 16 bit integer, 32 bit integer
and (outside the kernel) floating point coefficients.

\section fir_page_sec_2 How does it work?
Each filter keeps its input history in a circular buffer. curr_pos starts at
taps - 1, is decremented after every sample and wraps back to taps - 1, so
coeffs[k] is always applied to the sample received k calls ago. The MMX, SSE2
and Blackfin variants store every sample twice (at curr_pos and at
curr_pos + taps) so that the most recent taps samples are always contiguous
in memory.

This header includes nothing itself apart from mmx.h: the including file must
provide the fixed width integer types, size_t, NULL, malloc(), free() and
memset().
*/

#if !defined(_FIR_H_)
#define _FIR_H_

/*
   Blackfin NOTES & IDEAS:

   A simple dot product function is used to implement the filter.  This performs
   just one MAC/cycle which is inefficient but was easy to implement as a first
   pass.  The current Blackfin code also uses an unrolled form of the filter
   history to avoid 0 length hardware loop issues.  This is wasteful of
   memory.

   Ideas for improvement:

   1/ Rewrite filter for dual MAC inner loop.  The issue here is handling
   history sample offsets that are 16 bit aligned - the dual MAC needs
   32 bit alignment.  There are some good examples in libbfdsp.

   2/ Use the hardware circular buffer facility to halve memory usage.

   3/ Consider using internal memory.

   Using less memory might also improve speed as cache misses will be
   reduced. A drop in MIPs and memory approaching 50% should be
   possible.

   The foreground and background filters currently use a total of
   about 10 MIPs/ch as measured with speedtest.c on a 256 TAP echo
   can.
*/

#if defined(USE_MMX)  ||  defined(USE_SSE2)
#include "mmx.h"
#endif

/*!
    16 bit integer FIR descriptor. This defines the working state for a single
    instance of an FIR filter using 16 bit integer coefficients.
*/
typedef struct
{
    int taps;
    int curr_pos;
    const int16_t *coeffs;
    int16_t *history;
} fir16_state_t;

/*!
    32 bit integer FIR descriptor. This defines the working state for a single
    instance of an FIR filter using 32 bit integer coefficients, and filtering
    16 bit integer data.
*/
typedef struct
{
    int taps;
    int curr_pos;
    const int32_t *coeffs;
    int16_t *history;
} fir32_state_t;

/*!
    Floating point FIR descriptor. This defines the working state for a single
    instance of an FIR filter using floating point coefficients and data.
*/
typedef struct
{
    int taps;
    int curr_pos;
    const float *coeffs;
    float *history;
} fir_float_state_t;

#ifdef __cplusplus
extern "C" {
#endif

/*! Initialise a 16 bit FIR filter and allocate its zeroed history buffer.
    \param fir The descriptor to initialise.
    \param coeffs The filter coefficients (taps entries). Not copied; must
           outlive the filter.
    \param taps The filter length. Must be positive.
    \return The history buffer, or NULL if taps is not positive or the
            allocation failed (fir->history is NULL in that case too).
*/
static __inline__ const int16_t *fir16_create(fir16_state_t *fir,
                                              const int16_t *coeffs,
                                              int taps)
{
    size_t len;

    fir->taps = taps;
    fir->curr_pos = taps - 1;
    fir->coeffs = coeffs;
    fir->history = NULL;
    /* A non-positive length would leave curr_pos at -1 (or below) and make
       the first fir16() call write outside the history buffer. */
    if (taps <= 0)
        return NULL;
#if defined(USE_MMX)  ||  defined(USE_SSE2) || defined(__BLACKFIN_ASM__)
    /* These variants keep two copies of every sample */
    len = 2*(size_t) taps*sizeof(int16_t);
#else
    len = (size_t) taps*sizeof(int16_t);
#endif
    fir->history = malloc(len);
    if (fir->history)
        memset(fir->history, 0, len);
    return fir->history;
}
/*- End of function --------------------------------------------------------*/

/*! Clear the history of a 16 bit FIR filter. Does nothing if the filter has
    no history buffer (failed or freed filter).
    \param fir The filter to flush.
*/
static __inline__ void fir16_flush(fir16_state_t *fir)
{
    if (!fir->history)
        return;
#if defined(USE_MMX)  ||  defined(USE_SSE2) || defined(__BLACKFIN_ASM__)
    memset(fir->history, 0, 2*fir->taps*sizeof(int16_t));
#else
    memset(fir->history, 0, fir->taps*sizeof(int16_t));
#endif
}
/*- End of function --------------------------------------------------------*/

/*! Release the history buffer of a 16 bit FIR filter.
    \param fir The filter to free. fir->history is NULL afterwards, so a
           repeated free or a later flush is harmless.
*/
static __inline__ void fir16_free(fir16_state_t *fir)
{
    free(fir->history);
    fir->history = NULL;
}
/*- End of function --------------------------------------------------------*/

#ifdef __BLACKFIN_ASM__
/* Blackfin dot product of two 16 bit vectors of length len (len >= 1). */
static inline int32_t dot_asm(short *x, short *y, int len)
{
    int dot;

    /* The hardware loop runs len - 1 times; the last MAC is done after it */
    len--;

    __asm__
    (
    "I0 = %1;\n\t"
    "I1 = %2;\n\t"
    "A0 = 0;\n\t"
    "R0.L = W[I0++] || R1.L = W[I1++];\n\t"
    "LOOP dot%= LC0 = %3;\n\t"
    "LOOP_BEGIN dot%=;\n\t"
    "A0 += R0.L * R1.L (IS) || R0.L = W[I0++] || R1.L = W[I1++];\n\t"
    "LOOP_END dot%=;\n\t"
    "A0 += R0.L*R1.L (IS);\n\t"
    "R0 = A0;\n\t"
    "%0 = R0;\n\t"
    : "=&d" (dot)
    : "a" (x), "a" (y), "a" (len)
    : "I0", "I1", "A1", "A0", "R0", "R1"
    );

    return dot;
}
#endif
/*- End of function --------------------------------------------------------*/

/*! Push one sample through a 16 bit FIR filter.
    \param fir The filter state, as set up by fir16_create().
    \param sample The new input sample.
    \return The filter output: the sum of coeffs[k] times the sample from k
            calls ago, shifted right by 15 bits.
    The sum is accumulated in 32 bits; coefficient sets whose total gain
    exceeds the 32 bit range are not protected against.
*/
static __inline__ int16_t fir16(fir16_state_t *fir, int16_t sample)
{
    int32_t y;
#if defined(USE_MMX)
    int i;
    mmx_t *mmx_coeffs;
    mmx_t *mmx_hist;

    fir->history[fir->curr_pos] = sample;
    fir->history[fir->curr_pos + fir->taps] = sample;

    mmx_coeffs = (mmx_t *) fir->coeffs;
    mmx_hist = (mmx_t *) &fir->history[fir->curr_pos];
    i = fir->taps;
    pxor_r2r(mm4, mm4);
    /* 8 samples per iteration, so the filter must be a multiple of 8 long. */
    while (i > 0)
    {
        movq_m2r(mmx_coeffs[0], mm0);
        movq_m2r(mmx_coeffs[1], mm2);
        movq_m2r(mmx_hist[0], mm1);
        movq_m2r(mmx_hist[1], mm3);
        mmx_coeffs += 2;
        mmx_hist += 2;
        pmaddwd_r2r(mm1, mm0);
        pmaddwd_r2r(mm3, mm2);
        paddd_r2r(mm0, mm4);
        paddd_r2r(mm2, mm4);
        i -= 8;
    }
    /* Fold the two 32 bit partial sums in mm4 into one */
    movq_r2r(mm4, mm0);
    psrlq_i2r(32, mm0);
    paddd_r2r(mm0, mm4);
    movd_r2m(mm4, y);
    emms();
#elif defined(USE_SSE2)
    int i;
    xmm_t *xmm_coeffs;
    xmm_t *xmm_hist;

    fir->history[fir->curr_pos] = sample;
    fir->history[fir->curr_pos + fir->taps] = sample;

    xmm_coeffs = (xmm_t *) fir->coeffs;
    xmm_hist = (xmm_t *) &fir->history[fir->curr_pos];
    i = fir->taps;
    pxor_r2r(xmm4, xmm4);
    /* 16 samples per iteration, so the filter must be a multiple of 16 long. */
    while (i > 0)
    {
        movdqu_m2r(xmm_coeffs[0], xmm0);
        movdqu_m2r(xmm_coeffs[1], xmm2);
        movdqu_m2r(xmm_hist[0], xmm1);
        movdqu_m2r(xmm_hist[1], xmm3);
        xmm_coeffs += 2;
        xmm_hist += 2;
        pmaddwd_r2r(xmm1, xmm0);
        pmaddwd_r2r(xmm3, xmm2);
        paddd_r2r(xmm0, xmm4);
        paddd_r2r(xmm2, xmm4);
        i -= 16;
    }
    /* Fold the four 32 bit partial sums in xmm4 into one */
    movdqa_r2r(xmm4, xmm0);
    psrldq_i2r(8, xmm0);
    paddd_r2r(xmm0, xmm4);
    movdqa_r2r(xmm4, xmm0);
    psrldq_i2r(4, xmm0);
    paddd_r2r(xmm0, xmm4);
    movd_r2m(xmm4, y);
#elif defined(__BLACKFIN_ASM__)
    fir->history[fir->curr_pos] = sample;
    fir->history[fir->curr_pos + fir->taps] = sample;
    y = dot_asm((int16_t*)fir->coeffs, &fir->history[fir->curr_pos], fir->taps);
#else
    int i;
    int offset1;
    int offset2;

    fir->history[fir->curr_pos] = sample;

    /* The newest sample sits at curr_pos; older samples follow it and wrap
       round to the start of the buffer. */
    offset2 = fir->curr_pos;
    offset1 = fir->taps - offset2;
    y = 0;
    /* Oldest samples: the part of the history that has wrapped round */
    for (i = fir->taps - 1;  i >= offset1;  i--)
        y += fir->coeffs[i]*fir->history[i - offset1];
    /* Newest samples: from curr_pos to the end of the buffer */
    for (  ;  i >= 0;  i--)
        y += fir->coeffs[i]*fir->history[i + offset2];
#endif
    if (fir->curr_pos <= 0)
        fir->curr_pos = fir->taps;
    fir->curr_pos--;
    return (int16_t) (y >> 15);
}
/*- End of function --------------------------------------------------------*/

/*! Initialise a 32 bit coefficient FIR filter and allocate its zeroed
    history buffer.
    \param fir The descriptor to initialise.
    \param coeffs The filter coefficients (taps entries). Not copied; must
           outlive the filter.
    \param taps The filter length. Must be positive.
    \return The history buffer, or NULL if taps is not positive or the
            allocation failed (fir->history is NULL in that case too).
*/
static __inline__ const int16_t *fir32_create(fir32_state_t *fir,
                                              const int32_t *coeffs,
                                              int taps)
{
    size_t len;

    fir->taps = taps;
    fir->curr_pos = taps - 1;
    fir->coeffs = coeffs;
    fir->history = NULL;
    if (taps <= 0)
        return NULL;
    len = (size_t) taps*sizeof(int16_t);
    fir->history = malloc(len);
    if (fir->history)
        memset(fir->history, 0, len);
    return fir->history;
}
/*- End of function --------------------------------------------------------*/

/*! Clear the history of a 32 bit coefficient FIR filter. Does nothing if the
    filter has no history buffer.
    \param fir The filter to flush.
*/
static __inline__ void fir32_flush(fir32_state_t *fir)
{
    if (!fir->history)
        return;
    memset(fir->history, 0, fir->taps*sizeof(int16_t));
}
/*- End of function --------------------------------------------------------*/

/*! Release the history buffer of a 32 bit coefficient FIR filter.
    \param fir The filter to free. fir->history is NULL afterwards.
*/
static __inline__ void fir32_free(fir32_state_t *fir)
{
    free(fir->history);
    fir->history = NULL;
}
/*- End of function --------------------------------------------------------*/

/*! Push one sample through a 32 bit coefficient FIR filter.
    \param fir The filter state, as set up by fir32_create().
    \param sample The new input sample.
    \return The filter output: the sum of coeffs[k] times the sample from k
            calls ago, shifted right by 15 bits.
*/
static __inline__ int16_t fir32(fir32_state_t *fir, int16_t sample)
{
    int i;
    /* 64 bit accumulator: a single 32 bit coefficient times a 16 bit sample
       can already exceed the range of a 32 bit signed integer. */
    int64_t y;
    int offset1;
    int offset2;

    fir->history[fir->curr_pos] = sample;
    offset2 = fir->curr_pos;
    offset1 = fir->taps - offset2;
    y = 0;
    /* Oldest samples: the part of the history that has wrapped round */
    for (i = fir->taps - 1;  i >= offset1;  i--)
        y += (int64_t) fir->coeffs[i]*fir->history[i - offset1];
    /* Newest samples: from curr_pos to the end of the buffer */
    for (  ;  i >= 0;  i--)
        y += (int64_t) fir->coeffs[i]*fir->history[i + offset2];
    if (fir->curr_pos <= 0)
        fir->curr_pos = fir->taps;
    fir->curr_pos--;
    return (int16_t) (y >> 15);
}
/*- End of function --------------------------------------------------------*/

#ifndef __KERNEL__
/*! Initialise a floating point FIR filter and allocate its zeroed history
    buffer.
    \param fir The descriptor to initialise.
    \param coeffs The filter coefficients (taps entries). Not copied; must
           outlive the filter.
    \param taps The filter length. Must be positive.
    \return The history buffer, or NULL if taps is not positive or the
            allocation failed (fir->history is NULL in that case too).
*/
static __inline__ const float *fir_float_create(fir_float_state_t *fir,
                                                const float *coeffs,
                                                int taps)
{
    size_t len;

    fir->taps = taps;
    fir->curr_pos = taps - 1;
    fir->coeffs = coeffs;
    fir->history = NULL;
    if (taps <= 0)
        return NULL;
    len = (size_t) taps*sizeof(float);
    fir->history = malloc(len);
    if (fir->history)
        memset(fir->history, 0, len);
    return fir->history;
}
/*- End of function --------------------------------------------------------*/

/*! Release the history buffer of a floating point FIR filter.
    \param fir The filter to free. fir->history is NULL afterwards.
*/
static __inline__ void fir_float_free(fir_float_state_t *fir)
{
    free(fir->history);
    fir->history = NULL;
}
/*- End of function --------------------------------------------------------*/

/*! Push one sample through a floating point FIR filter.
    \param fir The filter state, as set up by fir_float_create().
    \param sample The new input sample.
    \return The filter output (sum of coeffs[k] times the sample from k calls
            ago), converted to a 16 bit integer.
*/
static __inline__ int16_t fir_float(fir_float_state_t *fir, int16_t sample)
{
    int i;
    float y;
    int offset1;
    int offset2;

    fir->history[fir->curr_pos] = sample;

    offset2 = fir->curr_pos;
    offset1 = fir->taps - offset2;
    y = 0;
    /* Oldest samples: the part of the history that has wrapped round */
    for (i = fir->taps - 1;  i >= offset1;  i--)
        y += fir->coeffs[i]*fir->history[i - offset1];
    /* Newest samples: from curr_pos to the end of the buffer */
    for (  ;  i >= 0;  i--)
        y += fir->coeffs[i]*fir->history[i + offset2];
    if (fir->curr_pos <= 0)
        fir->curr_pos = fir->taps;
    fir->curr_pos--;
    return  (int16_t) y;
}
/*- End of function --------------------------------------------------------*/
#endif

#ifdef __cplusplus
}
#endif

#endif
/*- End of file ------------------------------------------------------------*/
+288
drivers/staging/echo/mmx.h
··· 1 + /* 2 + * mmx.h 3 + * Copyright (C) 1997-2001 H. Dietz and R. Fisher 4 + * 5 + * This file is part of FFmpeg. 6 + * 7 + * FFmpeg is free software; you can redistribute it and/or 8 + * modify it under the terms of the GNU Lesser General Public 9 + * License as published by the Free Software Foundation; either 10 + * version 2.1 of the License, or (at your option) any later version. 11 + * 12 + * FFmpeg is distributed in the hope that it will be useful, 13 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 + * Lesser General Public License for more details. 16 + * 17 + * You should have received a copy of the GNU Lesser General Public 18 + * License along with FFmpeg; if not, write to the Free Software 19 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 + */ 21 + #ifndef AVCODEC_I386MMX_H 22 + #define AVCODEC_I386MMX_H 23 + 24 + /* 25 + * The type of an value that fits in an MMX register (note that long 26 + * long constant values MUST be suffixed by LL and unsigned long long 27 + * values by ULL, lest they be truncated by the compiler) 28 + */ 29 + 30 + typedef union { 31 + long long q; /* Quadword (64-bit) value */ 32 + unsigned long long uq; /* Unsigned Quadword */ 33 + int d[2]; /* 2 Doubleword (32-bit) values */ 34 + unsigned int ud[2]; /* 2 Unsigned Doubleword */ 35 + short w[4]; /* 4 Word (16-bit) values */ 36 + unsigned short uw[4]; /* 4 Unsigned Word */ 37 + char b[8]; /* 8 Byte (8-bit) values */ 38 + unsigned char ub[8]; /* 8 Unsigned Byte */ 39 + float s[2]; /* Single-precision (32-bit) value */ 40 + } mmx_t; /* On an 8-byte (64-bit) boundary */ 41 + 42 + /* SSE registers */ 43 + typedef union { 44 + char b[16]; 45 + } xmm_t; 46 + 47 + 48 + #define mmx_i2r(op,imm,reg) \ 49 + __asm__ __volatile__ (#op " %0, %%" #reg \ 50 + : /* nothing */ \ 51 + : "i" (imm) ) 52 + 53 + #define mmx_m2r(op,mem,reg) \ 54 + __asm__ __volatile__ 
(#op " %0, %%" #reg \ 55 + : /* nothing */ \ 56 + : "m" (mem)) 57 + 58 + #define mmx_r2m(op,reg,mem) \ 59 + __asm__ __volatile__ (#op " %%" #reg ", %0" \ 60 + : "=m" (mem) \ 61 + : /* nothing */ ) 62 + 63 + #define mmx_r2r(op,regs,regd) \ 64 + __asm__ __volatile__ (#op " %" #regs ", %" #regd) 65 + 66 + 67 + #define emms() __asm__ __volatile__ ("emms") 68 + 69 + #define movd_m2r(var,reg) mmx_m2r (movd, var, reg) 70 + #define movd_r2m(reg,var) mmx_r2m (movd, reg, var) 71 + #define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd) 72 + 73 + #define movq_m2r(var,reg) mmx_m2r (movq, var, reg) 74 + #define movq_r2m(reg,var) mmx_r2m (movq, reg, var) 75 + #define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd) 76 + 77 + #define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg) 78 + #define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd) 79 + #define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg) 80 + #define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd) 81 + 82 + #define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg) 83 + #define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd) 84 + 85 + #define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg) 86 + #define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd) 87 + #define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg) 88 + #define paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd) 89 + #define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg) 90 + #define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd) 91 + 92 + #define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg) 93 + #define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd) 94 + #define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg) 95 + #define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd) 96 + 97 + #define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg) 98 + #define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd) 99 + #define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg) 100 + #define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd) 101 + 
102 + #define pand_m2r(var,reg) mmx_m2r (pand, var, reg) 103 + #define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd) 104 + 105 + #define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg) 106 + #define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd) 107 + 108 + #define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg) 109 + #define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd) 110 + #define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg) 111 + #define pcmpeqd_r2r(regs,regd) mmx_r2r (pcmpeqd, regs, regd) 112 + #define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg) 113 + #define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd) 114 + 115 + #define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg) 116 + #define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd) 117 + #define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg) 118 + #define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd) 119 + #define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg) 120 + #define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd) 121 + 122 + #define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg) 123 + #define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd) 124 + 125 + #define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg) 126 + #define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd) 127 + 128 + #define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg) 129 + #define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd) 130 + 131 + #define por_m2r(var,reg) mmx_m2r (por, var, reg) 132 + #define por_r2r(regs,regd) mmx_r2r (por, regs, regd) 133 + 134 + #define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg) 135 + #define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg) 136 + #define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd) 137 + #define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg) 138 + #define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg) 139 + #define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd) 140 + #define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg) 141 + #define 
psllw_m2r(var,reg) mmx_m2r (psllw, var, reg) 142 + #define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd) 143 + 144 + #define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg) 145 + #define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg) 146 + #define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd) 147 + #define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg) 148 + #define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg) 149 + #define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd) 150 + 151 + #define psrld_i2r(imm,reg) mmx_i2r (psrld, imm, reg) 152 + #define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg) 153 + #define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd) 154 + #define psrlq_i2r(imm,reg) mmx_i2r (psrlq, imm, reg) 155 + #define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg) 156 + #define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd) 157 + #define psrlw_i2r(imm,reg) mmx_i2r (psrlw, imm, reg) 158 + #define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg) 159 + #define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd) 160 + 161 + #define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg) 162 + #define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd) 163 + #define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg) 164 + #define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd) 165 + #define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg) 166 + #define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd) 167 + 168 + #define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg) 169 + #define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd) 170 + #define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg) 171 + #define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd) 172 + 173 + #define psubusb_m2r(var,reg) mmx_m2r (psubusb, var, reg) 174 + #define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd) 175 + #define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg) 176 + #define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd) 177 + 178 + #define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg) 179 + 
#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd) 180 + #define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg) 181 + #define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd) 182 + #define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg) 183 + #define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd) 184 + 185 + #define punpcklbw_m2r(var,reg) mmx_m2r (punpcklbw, var, reg) 186 + #define punpcklbw_r2r(regs,regd) mmx_r2r (punpcklbw, regs, regd) 187 + #define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg) 188 + #define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd) 189 + #define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg) 190 + #define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd) 191 + 192 + #define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg) 193 + #define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd) 194 + 195 + 196 + /* 3DNOW extensions */ 197 + 198 + #define pavgusb_m2r(var,reg) mmx_m2r (pavgusb, var, reg) 199 + #define pavgusb_r2r(regs,regd) mmx_r2r (pavgusb, regs, regd) 200 + 201 + 202 + /* AMD MMX extensions - also available in intel SSE */ 203 + 204 + 205 + #define mmx_m2ri(op,mem,reg,imm) \ 206 + __asm__ __volatile__ (#op " %1, %0, %%" #reg \ 207 + : /* nothing */ \ 208 + : "m" (mem), "i" (imm)) 209 + #define mmx_r2ri(op,regs,regd,imm) \ 210 + __asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \ 211 + : /* nothing */ \ 212 + : "i" (imm) ) 213 + 214 + #define mmx_fetch(mem,hint) \ 215 + __asm__ __volatile__ ("prefetch" #hint " %0" \ 216 + : /* nothing */ \ 217 + : "m" (mem)) 218 + 219 + 220 + #define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg) 221 + 222 + #define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var) 223 + 224 + #define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg) 225 + #define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd) 226 + #define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg) 227 + #define pavgw_r2r(regs,regd) mmx_r2r (pavgw, regs, regd) 228 + 229 + 
#define pextrw_r2r(mmreg,reg,imm)	mmx_r2ri (pextrw, mmreg, reg, imm)

#define pinsrw_r2r(reg,mmreg,imm)	mmx_r2ri (pinsrw, reg, mmreg, imm)

#define pmaxsw_m2r(var,reg)		mmx_m2r (pmaxsw, var, reg)
#define pmaxsw_r2r(regs,regd)		mmx_r2r (pmaxsw, regs, regd)

#define pmaxub_m2r(var,reg)		mmx_m2r (pmaxub, var, reg)
#define pmaxub_r2r(regs,regd)		mmx_r2r (pmaxub, regs, regd)

#define pminsw_m2r(var,reg)		mmx_m2r (pminsw, var, reg)
#define pminsw_r2r(regs,regd)		mmx_r2r (pminsw, regs, regd)

#define pminub_m2r(var,reg)		mmx_m2r (pminub, var, reg)
#define pminub_r2r(regs,regd)		mmx_r2r (pminub, regs, regd)

/*
 * Emit "pmovmskb %<mmreg>, %<reg>".
 * The template previously spelled the mnemonic "movmskps", which is a
 * different instruction from the one this macro is named after.
 * The statement has no operand or clobber list, so a single '%' is a
 * literal register prefix here and the compiler is not told that <reg>
 * is written.
 * NOTE(review): callers must not keep a live value in <reg> across this.
 */
#define pmovmskb(mmreg,reg) \
	__asm__ __volatile__ ("pmovmskb %" #mmreg ", %" #reg)

#define pmulhuw_m2r(var,reg)		mmx_m2r (pmulhuw, var, reg)
#define pmulhuw_r2r(regs,regd)		mmx_r2r (pmulhuw, regs, regd)

/* Prefetch variants: the second argument is appended to "prefetch". */
#define prefetcht0(mem)			mmx_fetch (mem, t0)
#define prefetcht1(mem)			mmx_fetch (mem, t1)
#define prefetcht2(mem)			mmx_fetch (mem, t2)
#define prefetchnta(mem)		mmx_fetch (mem, nta)

#define psadbw_m2r(var,reg)		mmx_m2r (psadbw, var, reg)
#define psadbw_r2r(regs,regd)		mmx_r2r (psadbw, regs, regd)

#define pshufw_m2r(var,reg,imm)		mmx_m2ri(pshufw, var, reg, imm)
#define pshufw_r2r(regs,regd,imm)	mmx_r2ri(pshufw, regs, regd, imm)

#define sfence() __asm__ __volatile__ ("sfence\n\t")

/* SSE2 */
#define pshufhw_m2r(var,reg,imm)	mmx_m2ri(pshufhw, var, reg, imm)
#define pshufhw_r2r(regs,regd,imm)	mmx_r2ri(pshufhw, regs, regd, imm)
#define pshuflw_m2r(var,reg,imm)	mmx_m2ri(pshuflw, var, reg, imm)
#define pshuflw_r2r(regs,regd,imm)	mmx_r2ri(pshuflw, regs, regd, imm)

#define pshufd_r2r(regs,regd,imm)	mmx_r2ri(pshufd, regs, regd, imm)

#define movdqa_m2r(var,reg)		mmx_m2r (movdqa, var, reg)
#define movdqa_r2m(reg,var)		mmx_r2m (movdqa, reg, var)
274 + #define movdqa_r2r(regs,regd) mmx_r2r (movdqa, regs, regd) 275 + #define movdqu_m2r(var,reg) mmx_m2r (movdqu, var, reg) 276 + #define movdqu_r2m(reg,var) mmx_r2m (movdqu, reg, var) 277 + #define movdqu_r2r(regs,regd) mmx_r2r (movdqu, regs, regd) 278 + 279 + #define pmullw_r2m(reg,var) mmx_r2m (pmullw, reg, var) 280 + 281 + #define pslldq_i2r(imm,reg) mmx_i2r (pslldq, imm, reg) 282 + #define psrldq_i2r(imm,reg) mmx_i2r (psrldq, imm, reg) 283 + 284 + #define punpcklqdq_r2r(regs,regd) mmx_r2r (punpcklqdq, regs, regd) 285 + #define punpckhqdq_r2r(regs,regd) mmx_r2r (punpckhqdq, regs, regd) 286 + 287 + 288 + #endif /* AVCODEC_I386MMX_H */