| #include "rar.hpp" |
| |
| // We used "Screaming Fast Galois Field Arithmetic Using Intel SIMD |
| // Instructions" paper by James S. Plank, Kevin M. Greenan |
| // and Ethan L. Miller for fast SSE based multiplication. |
| // Also we are grateful to Artem Drobanov and Bulat Ziganshin |
| // for samples and ideas allowed to make Reed-Solomon codec more efficient. |
| |
| RSCoder16::RSCoder16() |
| { |
| Decoding=false; |
| ND=NR=NE=0; |
| ValidFlags=NULL; |
| MX=NULL; |
| DataLog=NULL; |
| DataLogSize=0; |
| |
| gfInit(); |
| } |
| |
| |
| RSCoder16::~RSCoder16() |
| { |
| delete[] gfExp; |
| delete[] gfLog; |
| delete[] DataLog; |
| delete[] MX; |
| delete[] ValidFlags; |
| } |
| |
| |
| // Initialize logarithms and exponents Galois field tables. |
| void RSCoder16::gfInit() |
| { |
| gfExp=new uint[4*gfSize+1]; |
| gfLog=new uint[gfSize+1]; |
| |
| for (uint L=0,E=1;L<gfSize;L++) |
| { |
| gfLog[E]=L; |
| gfExp[L]=E; |
| gfExp[L+gfSize]=E; // Duplicate the table to avoid gfExp overflow check. |
| E<<=1; |
| if (E>gfSize) |
| E^=0x1100B; // Irreducible field-generator polynomial. |
| } |
| |
| // log(0)+log(x) must be outside of usual log table, so we can set it |
| // to 0 and avoid check for 0 in multiplication parameters. |
| gfLog[0]= 2*gfSize; |
| for (uint I=2*gfSize;I<=4*gfSize;I++) // Results for log(0)+log(x). |
| gfExp[I]=0; |
| } |
| |
| |
| uint RSCoder16::gfAdd(uint a,uint b) // Addition in Galois field. |
| { |
| return a^b; |
| } |
| |
| |
| uint RSCoder16::gfMul(uint a,uint b) // Multiplication in Galois field. |
| { |
| return gfExp[gfLog[a]+gfLog[b]]; |
| } |
| |
| |
| uint RSCoder16::gfInv(uint a) // Inverse element in Galois field. |
| { |
| return a==0 ? 0:gfExp[gfSize-gfLog[a]]; |
| } |
| |
| |
| bool RSCoder16::Init(uint DataCount, uint RecCount, bool *ValidityFlags) |
| { |
| ND = DataCount; |
| NR = RecCount; |
| NE = 0; |
| |
| Decoding=ValidityFlags!=NULL; |
| if (Decoding) |
| { |
| delete[] ValidFlags; |
| ValidFlags=new bool[ND + NR]; |
| |
| for (uint I = 0; I < ND + NR; I++) |
| ValidFlags[I]=ValidityFlags[I]; |
| for (uint I = 0; I < ND; I++) |
| if (!ValidFlags[I]) |
| NE++; |
| uint ValidECC=0; |
| for (uint I = ND; I < ND + NR; I++) |
| if (ValidFlags[I]) |
| ValidECC++; |
| if (NE > ValidECC || NE == 0 || ValidECC == 0) |
| return false; |
| } |
| if (ND + NR > gfSize || NR > ND || ND == 0 || NR == 0) |
| return false; |
| |
| delete[] MX; |
| if (Decoding) |
| { |
| MX=new uint[NE * ND]; |
| MakeDecoderMatrix(); |
| InvertDecoderMatrix(); |
| } |
| else |
| { |
| MX=new uint[NR * ND]; |
| MakeEncoderMatrix(); |
| } |
| return true; |
| } |
| |
| |
| void RSCoder16::MakeEncoderMatrix() |
| { |
| // Create Cauchy encoder generator matrix. Skip trivial "1" diagonal rows, |
| // which would just copy source data to destination. |
| for (uint I = 0; I < NR; I++) |
| for (uint J = 0; J < ND; J++) |
| MX[I * ND + J] = gfInv( gfAdd( (I+ND), J) ); |
| } |
| |
| |
| void RSCoder16::MakeDecoderMatrix() |
| { |
| // Create Cauchy decoder matrix. Skip trivial rows matching valid data |
| // units and containing "1" on main diagonal. Such rows would just copy |
| // source data to destination and they have no real value for us. |
| // Include rows only for broken data units and replace them by first |
| // available valid recovery code rows. |
| for (uint Flag=0, R=ND, Dest=0; Flag < ND; Flag++) |
| if (!ValidFlags[Flag]) // For every broken data unit. |
| { |
| while (!ValidFlags[R]) // Find a valid recovery unit. |
| R++; |
| for (uint J = 0; J < ND; J++) // And place its row to matrix. |
| MX[Dest*ND + J] = gfInv( gfAdd(R,J) ); |
| Dest++; |
| R++; |
| } |
| } |
| |
| |
| // Apply Gauss-Jordan elimination to find inverse of decoder matrix. |
| // We have the square NDxND matrix, but we do not store its trivial |
| // diagonal "1" rows matching valid data, so we work with NExND matrix. |
| // Our original Cauchy matrix does not contain 0, so we skip search |
| // for non-zero pivot. |
| void RSCoder16::InvertDecoderMatrix() |
| { |
| uint *MI=new uint[NE * ND]; // We'll create inverse matrix here. |
| memset(MI, 0, ND * NE * sizeof(*MI)); // Initialize to identity matrix. |
| for (uint Kr = 0, Kf = 0; Kr < NE; Kr++, Kf++) |
| { |
| while (ValidFlags[Kf]) // Skip trivial rows. |
| Kf++; |
| MI[Kr * ND + Kf] = 1; // Set diagonal 1. |
| } |
| |
| // Kr is the number of row in our actual reduced NE x ND matrix, |
| // which does not contain trivial diagonal 1 rows. |
| // Kf is the number of row in full ND x ND matrix with all trivial rows |
| // included. |
| for (uint Kr = 0, Kf = 0; Kf < ND; Kr++, Kf++) // Select pivot row. |
| { |
| while (ValidFlags[Kf] && Kf < ND) |
| { |
| // Here we process trivial diagonal 1 rows matching valid data units. |
| // Their processing can be simplified comparing to usual rows. |
| // In full version of elimination we would set MX[I * ND + Kf] to zero |
| // after MI[..]^=, but we do not need it for matrix inversion. |
| for (uint I = 0; I < NE; I++) |
| MI[I * ND + Kf] ^= MX[I * ND + Kf]; |
| Kf++; |
| } |
| |
| if (Kf == ND) |
| break; |
| |
| uint *MXk = MX + Kr * ND; // k-th row of main matrix. |
| uint *MIk = MI + Kr * ND; // k-th row of inversion matrix. |
| |
| uint PInv = gfInv( MXk[Kf] ); // Pivot inverse. |
| // Divide the pivot row by pivot, so pivot cell contains 1. |
| for (uint I = 0; I < ND; I++) |
| { |
| MXk[I] = gfMul( MXk[I], PInv ); |
| MIk[I] = gfMul( MIk[I], PInv ); |
| } |
| |
| for (uint I = 0; I < NE; I++) |
| if (I != Kr) // For all rows except containing the pivot cell. |
| { |
| // Apply Gaussian elimination Mij -= Mkj * Mik / pivot. |
| // Since pivot is already 1, it is reduced to Mij -= Mkj * Mik. |
| uint *MXi = MX + I * ND; // i-th row of main matrix. |
| uint *MIi = MI + I * ND; // i-th row of inversion matrix. |
| uint Mik = MXi[Kf]; // Cell in pivot position. |
| for (uint J = 0; J < ND; J++) |
| { |
| MXi[J] ^= gfMul(MXk[J] , Mik); |
| MIi[J] ^= gfMul(MIk[J] , Mik); |
| } |
| } |
| } |
| |
| // Copy data to main matrix. |
| for (uint I = 0; I < NE * ND; I++) |
| MX[I] = MI[I]; |
| |
| delete[] MI; |
| } |
| |
| |
| #if 0 |
| // Multiply matrix to data vector. When encoding, it contains data in Data |
| // and stores error correction codes in Out. When decoding it contains |
| // broken data followed by ECC in Data and stores recovered data to Out. |
| // We do not use this function now, everything is moved to UpdateECC. |
| void RSCoder16::Process(const uint *Data, uint *Out) |
| { |
| uint ProcData[gfSize]; |
| |
| for (uint I = 0; I < ND; I++) |
| ProcData[I]=Data[I]; |
| |
| if (Decoding) |
| { |
| // Replace broken data units with first available valid recovery codes. |
| // 'Data' array must contain recovery codes after data. |
| for (uint I=0, R=ND, Dest=0; I < ND; I++) |
| if (!ValidFlags[I]) // For every broken data unit. |
| { |
| while (!ValidFlags[R]) // Find a valid recovery unit. |
| R++; |
| ProcData[I]=Data[R]; |
| R++; |
| } |
| } |
| |
| uint H=Decoding ? NE : NR; |
| for (uint I = 0; I < H; I++) |
| { |
| uint R = 0; // Result of matrix row multiplication to data. |
| |
| uint *MXi=MX + I * ND; |
| for (uint J = 0; J < ND; J++) |
| R ^= gfMul(MXi[J], ProcData[J]); |
| |
| Out[I] = R; |
| } |
| } |
| #endif |
| |
| |
| // We update ECC in blocks by applying every data block to all ECC blocks. |
| // This function applies one data block to one ECC block. |
| void RSCoder16::UpdateECC(uint DataNum, uint ECCNum, const byte *Data, byte *ECC, size_t BlockSize) |
| { |
| if (DataNum==0) // Init ECC data. |
| memset(ECC, 0, BlockSize); |
| |
| bool DirectAccess; |
| #ifdef LITTLE_ENDIAN |
| // We can access data and ECC directly if we have little endian 16 bit uint. |
| DirectAccess=sizeof(ushort)==2; |
| #else |
| DirectAccess=false; |
| #endif |
| |
| #ifdef USE_SSE |
| if (DirectAccess && SSE_UpdateECC(DataNum,ECCNum,Data,ECC,BlockSize)) |
| return; |
| #endif |
| |
| if (ECCNum==0) |
| { |
| if (DataLogSize!=BlockSize) |
| { |
| delete[] DataLog; |
| DataLog=new uint[BlockSize]; |
| DataLogSize=BlockSize; |
| |
| } |
| if (DirectAccess) |
| for (size_t I=0; I<BlockSize; I+=2) |
| DataLog[I] = gfLog[ *(ushort*)(Data+I) ]; |
| else |
| for (size_t I=0; I<BlockSize; I+=2) |
| { |
| uint D=Data[I]+Data[I+1]*256; |
| DataLog[I] = gfLog[ D ]; |
| } |
| } |
| |
| uint ML = gfLog[ MX[ECCNum * ND + DataNum] ]; |
| |
| if (DirectAccess) |
| for (size_t I=0; I<BlockSize; I+=2) |
| *(ushort*)(ECC+I) ^= gfExp[ ML + DataLog[I] ]; |
| else |
| for (size_t I=0; I<BlockSize; I+=2) |
| { |
| uint R=gfExp[ ML + DataLog[I] ]; |
| ECC[I]^=byte(R); |
| ECC[I+1]^=byte(R/256); |
| } |
| } |
| |
| |
| #ifdef USE_SSE |
| // Data and ECC addresses must be properly aligned for SSE. |
| // AVX2 did not provide a noticeable speed gain on i7-6700K here. |
| bool RSCoder16::SSE_UpdateECC(uint DataNum, uint ECCNum, const byte *Data, byte *ECC, size_t BlockSize) |
| { |
| // Check data alignment and SSSE3 support. |
| if ((size_t(Data) & (SSE_ALIGNMENT-1))!=0 || (size_t(ECC) & (SSE_ALIGNMENT-1))!=0 || |
| _SSE_Version<SSE_SSSE3) |
| return false; |
| |
| uint M=MX[ECCNum * ND + DataNum]; |
| |
| // Prepare tables containing products of M and 4, 8, 12, 16 bit length |
| // numbers, which have 4 high bits in 0..15 range and other bits set to 0. |
| // Store high and low bytes of resulting 16 bit product in separate tables. |
| __m128i T0L,T1L,T2L,T3L; // Low byte tables. |
| __m128i T0H,T1H,T2H,T3H; // High byte tables. |
| |
| for (uint I=0;I<16;I++) |
| { |
| ((byte *)&T0L)[I]=gfMul(I,M); |
| ((byte *)&T0H)[I]=gfMul(I,M)>>8; |
| ((byte *)&T1L)[I]=gfMul(I<<4,M); |
| ((byte *)&T1H)[I]=gfMul(I<<4,M)>>8; |
| ((byte *)&T2L)[I]=gfMul(I<<8,M); |
| ((byte *)&T2H)[I]=gfMul(I<<8,M)>>8; |
| ((byte *)&T3L)[I]=gfMul(I<<12,M); |
| ((byte *)&T3H)[I]=gfMul(I<<12,M)>>8; |
| } |
| |
| size_t Pos=0; |
| |
| __m128i LowByteMask=_mm_set1_epi16(0xff); // 00ff00ff...00ff |
| __m128i Low4Mask=_mm_set1_epi8(0xf); // 0f0f0f0f...0f0f |
| __m128i High4Mask=_mm_slli_epi16(Low4Mask,4); // f0f0f0f0...f0f0 |
| |
| for (; Pos+2*sizeof(__m128i)<=BlockSize; Pos+=2*sizeof(__m128i)) |
| { |
| // We process two 128 bit chunks of source data at once. |
| __m128i *D=(__m128i *)(Data+Pos); |
| |
| // Place high bytes of both chunks to one variable and low bytes to |
| // another, so we can use the table lookup multiplication for 16 values |
| // 4 bit length each at once. |
| __m128i HighBytes0=_mm_srli_epi16(D[0],8); |
| __m128i LowBytes0=_mm_and_si128(D[0],LowByteMask); |
| __m128i HighBytes1=_mm_srli_epi16(D[1],8); |
| __m128i LowBytes1=_mm_and_si128(D[1],LowByteMask); |
| __m128i HighBytes=_mm_packus_epi16(HighBytes0,HighBytes1); |
| __m128i LowBytes=_mm_packus_epi16(LowBytes0,LowBytes1); |
| |
| // Multiply bits 0..3 of low bytes. Store low and high product bytes |
| // separately in cumulative sum variables. |
| __m128i LowBytesLow4=_mm_and_si128(LowBytes,Low4Mask); |
| __m128i LowBytesMultSum=_mm_shuffle_epi8(T0L,LowBytesLow4); |
| __m128i HighBytesMultSum=_mm_shuffle_epi8(T0H,LowBytesLow4); |
| |
| // Multiply bits 4..7 of low bytes. Store low and high product bytes separately. |
| __m128i LowBytesHigh4=_mm_and_si128(LowBytes,High4Mask); |
| LowBytesHigh4=_mm_srli_epi16(LowBytesHigh4,4); |
| __m128i LowBytesHigh4MultLow=_mm_shuffle_epi8(T1L,LowBytesHigh4); |
| __m128i LowBytesHigh4MultHigh=_mm_shuffle_epi8(T1H,LowBytesHigh4); |
| |
| // Add new product to existing sum, low and high bytes separately. |
| LowBytesMultSum=_mm_xor_si128(LowBytesMultSum,LowBytesHigh4MultLow); |
| HighBytesMultSum=_mm_xor_si128(HighBytesMultSum,LowBytesHigh4MultHigh); |
| |
| // Multiply bits 0..3 of high bytes. Store low and high product bytes separately. |
| __m128i HighBytesLow4=_mm_and_si128(HighBytes,Low4Mask); |
| __m128i HighBytesLow4MultLow=_mm_shuffle_epi8(T2L,HighBytesLow4); |
| __m128i HighBytesLow4MultHigh=_mm_shuffle_epi8(T2H,HighBytesLow4); |
| |
| // Add new product to existing sum, low and high bytes separately. |
| LowBytesMultSum=_mm_xor_si128(LowBytesMultSum,HighBytesLow4MultLow); |
| HighBytesMultSum=_mm_xor_si128(HighBytesMultSum,HighBytesLow4MultHigh); |
| |
| // Multiply bits 4..7 of high bytes. Store low and high product bytes separately. |
| __m128i HighBytesHigh4=_mm_and_si128(HighBytes,High4Mask); |
| HighBytesHigh4=_mm_srli_epi16(HighBytesHigh4,4); |
| __m128i HighBytesHigh4MultLow=_mm_shuffle_epi8(T3L,HighBytesHigh4); |
| __m128i HighBytesHigh4MultHigh=_mm_shuffle_epi8(T3H,HighBytesHigh4); |
| |
| // Add new product to existing sum, low and high bytes separately. |
| LowBytesMultSum=_mm_xor_si128(LowBytesMultSum,HighBytesHigh4MultLow); |
| HighBytesMultSum=_mm_xor_si128(HighBytesMultSum,HighBytesHigh4MultHigh); |
| |
| // Combine separate low and high cumulative sum bytes to 16-bit words. |
| __m128i HighBytesHigh4Mult0=_mm_unpacklo_epi8(LowBytesMultSum,HighBytesMultSum); |
| __m128i HighBytesHigh4Mult1=_mm_unpackhi_epi8(LowBytesMultSum,HighBytesMultSum); |
| |
| // Add result to ECC. |
| __m128i *StoreECC=(__m128i *)(ECC+Pos); |
| |
| StoreECC[0]=_mm_xor_si128(StoreECC[0],HighBytesHigh4Mult0); |
| StoreECC[1]=_mm_xor_si128(StoreECC[1],HighBytesHigh4Mult1); |
| } |
| |
| // If we have non 128 bit aligned data in the end of block, process them |
| // in a usual way. We cannot do the same in the beginning of block, |
| // because Data and ECC can have different alignment offsets. |
| for (; Pos<BlockSize; Pos+=2) |
| *(ushort*)(ECC+Pos) ^= gfMul( M, *(ushort*)(Data+Pos) ); |
| |
| return true; |
| } |
| #endif |