123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599 |
- /* Copyright (C) 2007 Hong Zhiqian */
- /**
- @file kiss_fft_tm.h
- @author Hong Zhiqian
- @brief Various compatibility routines for Speex (TriMedia version)
- */
- /*
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
-
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- - Neither the name of the Xiph.org Foundation nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
- #include "_kiss_fft_guts_tm.h"
- #ifdef TM_ASM
- #include "profile_tm.h"
- #ifdef FIXED_POINT
- #define OVERRIDE_KFBFLY2
- static void kf_bfly2(
- kiss_fft_cpx *Fout,
- const int fstride,
- const kiss_fft_cfg st,
- int m
- )
- {
- register int * restrict Fout2;
- register int * restrict tw1 = (int*)st->twiddles;
- register int i, j;
- register int _inv = !st->inverse;
-
- Fout2 = (int*)Fout + m;
-
- for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride )
- { register int tw_10, ff_10, f2_10;
- ff_10 = ld32x(Fout, i);
- f2_10 = ld32x(Fout2, i);
- tw_10 = ld32(tw1);
-
- if ( _inv )
- { TM_SHR(f2_10, f2_10, 1);
- TM_SHR(ff_10, ff_10, 1);
- }
- TM_MUL(tw_10, tw_10, f2_10);
- TM_SUB(f2_10, ff_10, tw_10);
- TM_ADD(ff_10, ff_10, tw_10);
- st32d(j, Fout2, f2_10);
- st32d(j, Fout, ff_10);
- }
- }
- #define OVERRIDE_KFBFLY4
- static void kf_bfly4(
- kiss_fft_cpx *Fout,
- const int fstride,
- const kiss_fft_cfg st,
- const int m
- )
- {
- register int * restrict tw1;
- register int * restrict tw2;
- register int * restrict tw3;
- register int * restrict Fout1;
- register int * restrict Fout2;
- register int * restrict Fout3;
- register int i, j;
- register int fstride2, fstride3;
- register int _inv = !st->inverse;
- tw3 = tw2 = tw1 = (int*)st->twiddles;
- fstride2 = fstride << 1;
- fstride3 = fstride * 3;
- Fout1 = (int*)Fout + m;
- Fout2 = (int*)Fout + (m << 1);
- Fout3 = (int*)Fout + (m * 3);
- for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2,tw3+=fstride3 )
- { register int sc0, sc1, sc2, sc3, sc4, sc5;
- register int ff0;
- sc0 = ld32x(Fout1,i);
- sc3 = ld32(tw1);
- sc1 = ld32x(Fout2, i);
- sc4 = ld32(tw2);
- sc2 = ld32x(Fout3, i);
- sc5 = ld32(tw3);
- ff0 = ld32x(Fout,i);
-
- if ( _inv )
- {
- TM_ADD(sc0, sc0, 0x00020002);
- TM_ADD(sc1, sc1, 0x00020002);
- TM_ADD(sc2, sc2, 0x00020002);
- TM_ADD(ff0, ff0, 0x00020002);
- TM_SHR(sc0, sc0, 2);
- TM_SHR(sc1, sc1, 2);
- TM_SHR(sc2, sc2, 2);
- TM_SHR(ff0, ff0, 2);
- }
- TM_MUL(sc0, sc0, sc3);
- TM_MUL(sc1, sc1, sc4);
- TM_MUL(sc2, sc2, sc5);
- TM_SUB(sc5, ff0, sc1);
- TM_ADD(ff0, ff0, sc1);
- TM_ADD(sc3, sc0, sc2);
- TM_SUB(sc4, sc0, sc2);
- TM_SUB(sc1, ff0, sc3);
- TM_ADD(ff0, ff0, sc3);
-
- st32d(j, Fout2, sc1);
- st32d(j, Fout, ff0);
- sc5 = funshift2(sc5, sc5);
-
- if ( _inv )
- { TM_ADD(ff0, sc5, sc4);
- TM_SUB(sc1, sc5, sc4);
- } else
- { TM_ADD(sc1, sc5, sc4);
- TM_SUB(ff0, sc5, sc4);
- }
- sc0 = funshift2(sc1, ff0);
- sc2 = funshift2(ff0, sc1);
- st32d(j, Fout1, sc0);
- st32d(j, Fout3, sc2);
- }
- }
- #define OVERRIDE_KFBFLY3
- static void kf_bfly3(
- kiss_fft_cpx *Fout,
- const int fstride,
- const kiss_fft_cfg st,
- int m
- )
- {
- register int * restrict tw1;
- register int * restrict tw2;
- register int * restrict Fout1;
- register int * restrict Fout2;
- register int epi;
- register int i, j;
- register int fstride2;
- register int _inv = !st->inverse;
- tw1 = tw2 = (int*)st->twiddles;
- Fout1 = (int*)Fout + m;
- Fout2 = (int*)Fout + (m << 1);
- epi = tw1[fstride*m];
- epi = pack16lsb(epi,epi);
- fstride2 = fstride << 1;
- for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2 )
- { register int sc0, sc1, sc2, sc3, sc4, sc5;
- register int ff0;
- sc1 = ld32x(Fout1,i);
- sc2 = ld32x(Fout2,i);
- sc3 = ld32(tw1);
- sc4 = ld32(tw2);
- ff0 = ld32x(Fout,i);
-
- if ( _inv )
- {
- TM_DIV(sc1, sc1, 3);
- TM_DIV(sc2, sc2, 3);
- TM_DIV(ff0, ff0, 3);
- }
- TM_MUL(sc1, sc1, sc3);
- TM_MUL(sc2, sc2, sc4);
- TM_ADD(sc3, sc1, sc2);
- TM_SUB(sc0, sc1, sc2);
- TM_SHR(sc4, sc3, 1);
- TM_SUB(sc1, ff0, sc4);
- sc0 = dspidualmul(sc0, epi);
- sc0 = funshift2(sc0, sc0);
- TM_ADD(ff0, ff0, sc3);
- TM_ADD(sc4, sc1, sc0);
- TM_SUB(sc5, sc1, sc0);
- sc1 = funshift2(sc4, sc5);
- sc2 = funshift2(sc5, sc4);
- sc2 = funshift2(sc2, sc2);
- st32d(j, Fout1, sc1);
- st32d(j, Fout, ff0);
- st32d(j, Fout2, sc2);
- }
- }
- #define OVERRIDE_KFBFLY5
- static void kf_bfly5(
- kiss_fft_cpx *Fout,
- const int fstride,
- const kiss_fft_cfg st,
- int m
- )
- {
- register int * restrict tw1;
- register int * restrict tw2;
- register int * restrict tw3;
- register int * restrict tw4;
- register int * restrict Fout1;
- register int * restrict Fout2;
- register int * restrict Fout3;
- register int * restrict Fout4;
- register int fstride2, fstride3, fstride4;
- register int i, j;
- register int yab_msb, yab_lsb, yba_msb, yba_lsb;
- register int _inv = !st->inverse;
- Fout1=(int*)Fout+m;
- Fout2=(int*)Fout+(m<<1);
- Fout3=(int*)Fout+(3 *m);
- Fout4=(int*)Fout+(m<<2);
- tw1 = tw2 = tw3 = tw4 = (int*)st->twiddles;
- i = tw1[fstride*m];
- yab_lsb = tw1[fstride*(m<<1)];
- yab_msb = pack16msb(i, yab_lsb);
- yab_lsb = pack16lsb(i, yab_lsb);
- yba_msb = funshift2(-sex16(yab_msb), yab_msb);
- yba_lsb = funshift2(yab_lsb, yab_lsb);
- fstride2 = fstride << 1;
- fstride3 = fstride * 3;
- fstride4 = fstride << 2;
- for ( i=0,j=0 ; i<m ; ++i,j+=4,tw1+=fstride,tw2+=fstride2,tw3+=fstride3,tw4+=fstride4 )
- { register int sc0, sc1, sc2, sc3, sc4, sc5, sc6;
- register int sc7, sc8, sc9, sc10, sc11, sc12;
- register int ff0, sc78_msb, sc78_lsb, sc90_msb, sc90_lsb;
- sc0 = ld32x(Fout,i);
- sc1 = ld32x(Fout1,i);
- sc2 = ld32x(Fout2,i);
- sc3 = ld32x(Fout3,i);
- sc4 = ld32x(Fout4,i);
- sc5 = ld32(tw1);
- sc6 = ld32(tw2);
- sc7 = ld32(tw3);
- sc8 = ld32(tw4);
- if ( _inv )
- {
- TM_DIV(sc0, sc0, 5);
- TM_DIV(sc1, sc1, 5);
- TM_DIV(sc2, sc2, 5);
- TM_DIV(sc3, sc3, 5);
- TM_DIV(sc4, sc4, 5);
- }
- ff0 = sc0;
- TM_MUL(sc1, sc1, sc5);
- TM_MUL(sc2, sc2, sc6);
- TM_MUL(sc3, sc3, sc7);
- TM_MUL(sc4, sc4, sc8);
- TM_ADD(sc7, sc1, sc4);
- TM_SUB(sc10,sc1, sc4);
- TM_ADD(sc8, sc2, sc3);
- TM_SUB(sc9, sc2, sc3);
- TM_ADD(ff0, ff0, sc7);
- TM_ADD(ff0, ff0, sc8);
- st32d(j, Fout, ff0);
- sc78_msb = pack16msb(sc7,sc8);
- sc78_lsb = pack16lsb(sc7,sc8);
- sc90_msb = pack16msb(sc10,sc9);
- sc90_lsb = pack16lsb(sc10,sc9);
-
- sc5 = pack16lsb( sround(ifir16(sc78_msb,yab_lsb)), sround(ifir16(sc78_lsb,yab_lsb)));
- sc6 = pack16lsb(-sround(ifir16(sc90_lsb,yab_msb)), sround(ifir16(sc90_msb,yab_msb)));
- TM_ADD(sc5, sc5, sc0);
- TM_SUB(sc1, sc5, sc6);
- TM_ADD(sc4, sc5, sc6);
- st32d(j, Fout1, sc1);
- st32d(j, Fout4, sc4);
- sc11 = pack16lsb( sround(ifir16(sc78_msb,yba_lsb)), sround(ifir16(sc78_lsb,yba_lsb)));
- sc12 = pack16lsb(-sround(ifir16(sc90_lsb,yba_msb)), sround(ifir16(sc90_msb,yba_msb)));
- TM_ADD(sc11, sc11, sc0);
- TM_ADD(sc2, sc11, sc12);
- TM_SUB(sc3, sc11, sc12);
- st32d(j, Fout2, sc2);
- st32d(j, Fout3, sc3);
- }
- }
- #define OVERRIDE_KF_BFLY_GENERIC
- static void kf_bfly_generic(
- kiss_fft_cpx * restrict Fout,
- const size_t fstride,
- const kiss_fft_cfg st,
- int m,
- int p
- )
- {
- register int _inv = !st->inverse;
- register int i, j, k, l;
- register int * restrict twiddles = (int*)st->twiddles;
- register int Norig = st->nfft;
- CHECKBUF(scratchbuf,nscratchbuf,p);
- for ( i=0; i<m; ++i )
- { register int sc10;
- for ( j=0,k=i ; j<p ; ++j,k+=m )
- { register int f10;
- f10 = ld32x(Fout,k);
- if ( _inv )
- { TM_DIV(f10, f10, p);
- }
- st32d(j<<2, scratchbuf, f10);
- }
- for ( j=0,k=i,sc10=ld32(scratchbuf) ; j<p ; ++j,k+=m )
- {
- register int twidx = 0;
- register int f10;
- for ( l=1,f10 = sc10 ; l<p ; ++l )
- { register int tw, sc;
- twidx += fstride * k;
- if ( twidx>=Norig )
- { twidx -= Norig;
- }
-
- sc = ld32x(scratchbuf,l);
- tw = ld32x(twiddles,twidx);
-
- TM_MUL(sc, sc, tw);
- TM_ADD(f10, f10, sc);
- }
- st32d(k<<2, Fout, f10);
- }
- }
- }
- #else
- #define OVERRIDE_KFBFLY2
- static void kf_bfly2(
- kiss_fft_cpx * Fout,
- const size_t fstride,
- const kiss_fft_cfg st,
- int m
- )
- {
- register kiss_fft_cpx * restrict Fout2;
- register kiss_fft_cpx * restrict tw1 = st->twiddles;
- Fout2 = Fout + m;
- do
- {
- register kiss_fft_cpx _fout2, _fout, t;
-
- _fout2 = *Fout2;
- _fout = *Fout;
-
- C_MUL ( t, _fout2, *tw1);
- C_SUB (_fout2, _fout, t);
- C_ADD (_fout, _fout, t);
-
- *Fout2 = _fout2;
- *Fout = _fout;
-
- tw1 += fstride;
- ++Fout2;
- ++Fout;
- } while ( --m );
- }
- #define OVERRIDE_KFBFLY4
- static void kf_bfly4(
- kiss_fft_cpx * Fout,
- const int fstride,
- const kiss_fft_cfg st,
- int m
- )
- {
- register kiss_fft_cpx * restrict tw1,* restrict tw2,* restrict tw3;
- register kiss_fft_cpx * restrict Fout1, * restrict Fout2, * restrict Fout3;
- register int _inv = !st->inverse;
-
- tw3 = tw2 = tw1 = st->twiddles;
-
- Fout1 = Fout + m;
- Fout2 = Fout + (m << 1);
- Fout3 = Fout + (m * 3);
- do {
-
- register kiss_fft_cpx _fout;
- register kiss_fft_cpx sc0, sc1, sc2, sc3, sc4, sc5;
-
- _fout = *Fout;
-
- C_MUL( sc0,*Fout1, *tw1);
- C_MUL( sc1,*Fout2, *tw2);
- C_MUL( sc2,*Fout3, *tw3);
- C_SUB( sc5, _fout, sc1);
- C_ADD( _fout, _fout, sc1);
- C_ADD( sc3, sc0, sc2);
- C_SUB( sc4, sc0, sc2);
- C_SUB(*Fout2, _fout, sc3);
- C_ADD( *Fout, _fout, sc3);
- tw1 += fstride;
- tw2 += (fstride << 1);
- tw3 += (fstride * 3);
-
- if ( _inv )
- {
- Fout1->r = sc5.r + sc4.i;
- Fout1->i = sc5.i - sc4.r;
- Fout3->r = sc5.r - sc4.i;
- Fout3->i = sc5.i + sc4.r;
- }
- else
- { Fout1->r = sc5.r - sc4.i;
- Fout1->i = sc5.i + sc4.r;
- Fout3->r = sc5.r + sc4.i;
- Fout3->i = sc5.i - sc4.r;
- }
-
-
- ++Fout; ++Fout1; ++Fout2; ++Fout3;
-
- } while(--m);
- }
- #define OVERRIDE_KFBFLY3
- static void kf_bfly3(
- kiss_fft_cpx * Fout,
- const int fstride,
- const kiss_fft_cfg st,
- int m
- )
- {
- register kiss_fft_cpx * restrict Fout1, * restrict Fout2;
- register kiss_fft_cpx * restrict tw1,* restrict tw2;
- register float epi;
-
- tw1 = tw2 = st->twiddles;
- epi = st->twiddles[fstride*m].i;
- Fout1 = Fout + m;
- Fout2 = Fout + (m << 1);
- do {
-
- register kiss_fft_cpx _fout;
- register kiss_fft_cpx sc0, sc1, sc2, sc3;
-
- _fout = *Fout;
-
- C_MUL( sc1, *Fout1, *tw1);
- C_MUL( sc2, *Fout2, *tw2);
- C_ADD( sc3, sc1, sc2);
- C_SUB( sc0, sc1, sc2);
- tw1 += fstride;
- tw2 += (fstride << 1);
-
- sc1.r = _fout.r - HALF_OF(sc3.r);
- sc1.i = _fout.i - HALF_OF(sc3.i);
-
- C_MULBYSCALAR(sc0, epi);
- C_ADD(*Fout, _fout, sc3);
-
- Fout2->r = sc1.r + sc0.i;
- Fout2->i = sc1.i - sc0.r;
-
- Fout1->r = sc1.i - sc0.i;
- Fout1->i = sc1.r + sc0.r;
-
- ++Fout; ++Fout1; ++Fout2;
-
- } while(--m);
- }
- #define OVERRIDE_KFBFLY5
- static void kf_bfly5(
- kiss_fft_cpx * Fout,
- const size_t fstride,
- const kiss_fft_cfg st,
- int m
- )
- {
- register kiss_fft_cpx * restrict Fout1,* restrict Fout2,* restrict Fout3,* restrict Fout4;
- register int u;
- register kiss_fft_cpx *tw;
- register float yar, yai, ybr, ybi;
- Fout1=Fout+m;
- Fout2=Fout+(m<<1);
- Fout3=Fout+(m*3);
- Fout4=Fout+(m<<2);
- tw = st->twiddles;
- yar = tw[fstride*m].r;
- yai = tw[fstride*m].i;
- ybr = tw[fstride*2*m].r;
- ybi = tw[fstride*2*m].i;
-
- for ( u=0; u<m; ++u )
- {
- register kiss_fft_cpx sc0, sc1, sc2, sc3, sc4, sc5, sc6, sc7, sc8, sc9, sc10, sc11, sc12;
-
- sc0 = *Fout;
-
- C_MUL( sc1,*Fout1, tw[u*fstride]);
- C_MUL( sc2,*Fout2, tw[2*u*fstride]);
- C_MUL( sc3,*Fout3, tw[3*u*fstride]);
- C_MUL( sc4,*Fout4, tw[4*u*fstride]);
-
- C_ADD( sc7, sc1, sc4);
- C_SUB( sc10, sc1, sc4);
- C_ADD( sc8, sc2, sc3);
- C_SUB( sc9, sc2, sc3);
-
- Fout->r = sc0.r + sc7.r + sc8.r;
- Fout->i = sc0.i + sc7.i + sc8.i;
-
- sc5.r = sc0.r + S_MUL(sc7.r,yar) + S_MUL(sc8.r,ybr);
- sc5.i = sc0.i + S_MUL(sc7.i,yar) + S_MUL(sc8.i,ybr);
-
- sc6.r = S_MUL(sc10.i,yai) + S_MUL(sc9.i,ybi);
- sc6.i = -S_MUL(sc10.r,yai) - S_MUL(sc9.r,ybi);
-
- C_SUB(*Fout1,sc5,sc6);
- C_ADD(*Fout4,sc5,sc6);
-
- sc11.r = sc0.r + S_MUL(sc7.r,ybr) + S_MUL(sc8.r,yar);
- sc11.i = sc0.i + S_MUL(sc7.i,ybr) + S_MUL(sc8.i,yar);
- sc12.r = - S_MUL(sc10.i,ybi) + S_MUL(sc9.i,yai);
- sc12.i = S_MUL(sc10.r,ybi) - S_MUL(sc9.r,yai);
- C_ADD(*Fout2,sc11,sc12);
- C_SUB(*Fout3,sc11,sc12);
-
- ++Fout1; ++Fout2; ++Fout3; ++Fout4;
- }
- }
- #endif
- #endif
|