diff options
Diffstat (limited to 'doc/snow.txt')
-rw-r--r-- | doc/snow.txt | 638 |
1 files changed, 638 insertions, 0 deletions
diff --git a/doc/snow.txt b/doc/snow.txt new file mode 100644 index 0000000000..080a33b38f --- /dev/null +++ b/doc/snow.txt @@ -0,0 +1,638 @@ +============================================= +Snow Video Codec Specification Draft 20080110 +============================================= + +Introduction: +============= +This specification describes the Snow bitstream syntax and semantics as +well as the formal Snow decoding process. + +The decoding process is described precisely and any compliant decoder +MUST produce the exact same output for a spec-conformant Snow stream. +For encoding, though, any process which generates a stream compliant to +the syntactical and semantic requirements and which is decodable by +the process described in this spec shall be considered a conformant +Snow encoder. + +Definitions: +============ + +MUST the specific part must be done to conform to this standard +SHOULD it is recommended to be done that way, but not strictly required + +ilog2(x) is the rounded down logarithm of x with basis 2 +ilog2(0) = 0 + +Type definitions: +================= + +b 1-bit range coded +u unsigned scalar value range coded +s signed scalar value range coded + + +Bitstream syntax: +================= + +frame: + header + prediction + residual + +header: + keyframe b MID_STATE + if(keyframe || always_reset) + reset_contexts + if(keyframe){ + version u header_state + always_reset b header_state + temporal_decomposition_type u header_state + temporal_decomposition_count u header_state + spatial_decomposition_count u header_state + colorspace_type u header_state + if (nb_planes > 2) { + chroma_h_shift u header_state + chroma_v_shift u header_state + } + spatial_scalability b header_state + max_ref_frames-1 u header_state + qlogs + } + if(!keyframe){ + update_mc b header_state + if(update_mc){ + for(plane=0; plane<nb_plane_types; plane++){ + diag_mc b header_state + htaps/2-1 u header_state + for(i= p->htaps/2; i; i--) + |hcoeff[i]| u header_state + } + } + update_qlogs b header_state + if(update_qlogs){ + spatial_decomposition_count u header_state + qlogs + } + } + + spatial_decomposition_type s header_state + qlog s header_state + mv_scale s header_state + qbias s header_state + block_max_depth s header_state + +qlogs: + for(plane=0; plane<nb_plane_types; plane++){ + quant_table[plane][0][0] s header_state + for(level=0; level < spatial_decomposition_count; level++){ + quant_table[plane][level][1]s header_state + quant_table[plane][level][3]s header_state + } + } + +reset_contexts + *_state[*]= MID_STATE + +prediction: + for(y=0; y<block_count_vertical; y++) + for(x=0; x<block_count_horizontal; x++) + block(0) + +block(level): + mvx_diff=mvy_diff=y_diff=cb_diff=cr_diff=0 + if(keyframe){ + intra=1 + }else{ + if(level!=max_block_depth){ + s_context= 2*left->level + 2*top->level + topleft->level + topright->level + leaf b block_state[4 + s_context] + } + if(level==max_block_depth || leaf){ + intra b block_state[1 + left->intra + top->intra] + if(intra){ + y_diff s block_state[32] + cb_diff s block_state[64] + cr_diff s block_state[96] + }else{ + ref_context= ilog2(2*left->ref) + ilog2(2*top->ref) + if(ref_frames > 1) + ref u block_state[128 + 1024 + 32*ref_context] + mx_context= ilog2(2*abs(left->mx - top->mx)) + my_context= ilog2(2*abs(left->my - top->my)) + mvx_diff s block_state[128 + 32*(mx_context + 16*!!ref)] + mvy_diff s block_state[128 + 32*(my_context + 16*!!ref)] + } + }else{ + block(level+1) + block(level+1) + block(level+1) + block(level+1) + } + } + + +residual: + residual2(luma) + if (nb_planes > 2) { + residual2(chroma_cr) + residual2(chroma_cb) + } + +residual2: + for(level=0; level<spatial_decomposition_count; level++){ + if(level==0) + subband(LL, 0) + subband(HL, level) + subband(LH, level) + subband(HH, level) + } + +subband: + FIXME + +nb_plane_types = gray ? 1 : 2; + +Tag description: +---------------- + +version + 0 + this MUST NOT change within a bitstream + +always_reset + if 1 then the range coder contexts will be reset after each frame + +temporal_decomposition_type + 0 + +temporal_decomposition_count + 0 + +spatial_decomposition_count + FIXME + +colorspace_type + 0 unspecified YcbCr + 1 Gray + 2 Gray + Alpha + 3 GBR + 4 GBRA + this MUST NOT change within a bitstream + +chroma_h_shift + log2(luma.width / chroma.width) + this MUST NOT change within a bitstream + +chroma_v_shift + log2(luma.height / chroma.height) + this MUST NOT change within a bitstream + +spatial_scalability + 0 + +max_ref_frames + maximum number of reference frames + this MUST NOT change within a bitstream + +update_mc + indicates that motion compensation filter parameters are stored in the + header + +diag_mc + flag to enable faster diagonal interpolation + this SHOULD be 1 unless it turns out to be covered by a valid patent + +htaps + number of half pel interpolation filter taps, MUST be even, >0 and <10 + +hcoeff + half pel interpolation filter coefficients, hcoeff[0] are the 2 middle + coefficients [1] are the next outer ones and so on, resulting in a filter + like: ...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... + the sign of the coefficients is not explicitly stored but alternates + after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... + hcoeff[0] is not explicitly stored but found by subtracting the sum + of all stored coefficients with signs from 32 + hcoeff[0]= 32 - hcoeff[1] - hcoeff[2] - ... + a good choice for hcoeff and htaps is + htaps= 6 + hcoeff={40,-10,2} + an alternative which requires more computations at both encoder and + decoder side and may or may not be better is + htaps= 8 + hcoeff={42,-14,6,-2} + + +ref_frames + minimum of the number of available reference frames and max_ref_frames + for example the first frame after a key frame always has ref_frames=1 + +spatial_decomposition_type + wavelet type + 0 is a 9/7 symmetric compact integer wavelet + 1 is a 5/3 symmetric compact integer wavelet + others are reserved + stored as delta from last, last is reset to 0 if always_reset || keyframe + +qlog + quality (logarthmic quantizer scale) + stored as delta from last, last is reset to 0 if always_reset || keyframe + +mv_scale + stored as delta from last, last is reset to 0 if always_reset || keyframe + FIXME check that everything works fine if this changes between frames + +qbias + dequantization bias + stored as delta from last, last is reset to 0 if always_reset || keyframe + +block_max_depth + maximum depth of the block tree + stored as delta from last, last is reset to 0 if always_reset || keyframe + +quant_table + quantiztation table + + +Highlevel bitstream structure: +============================= + -------------------------------------------- +| Header | + -------------------------------------------- +| ------------------------------------ | +| | Block0 | | +| | split? | | +| | yes no | | +| | ......... intra? | | +| | : Block01 : yes no | | +| | : Block02 : ....... .......... | | +| | : Block03 : : y DC : : ref index: | | +| | : Block04 : : cb DC : : motion x : | | +| | ......... : cr DC : : motion y : | | +| | ....... .......... | | +| ------------------------------------ | +| ------------------------------------ | +| | Block1 | | +| ... | + -------------------------------------------- +| ------------ ------------ ------------ | +|| Y subbands | | Cb subbands| | Cr subbands|| +|| --- --- | | --- --- | | --- --- || +|| |LL0||HL0| | | |LL0||HL0| | | |LL0||HL0| || +|| --- --- | | --- --- | | --- --- || +|| --- --- | | --- --- | | --- --- || +|| |LH0||HH0| | | |LH0||HH0| | | |LH0||HH0| || +|| --- --- | | --- --- | | --- --- || +|| --- --- | | --- --- | | --- --- || +|| |HL1||LH1| | | |HL1||LH1| | | |HL1||LH1| || +|| --- --- | | --- --- | | --- --- || +|| --- --- | | --- --- | | --- --- || +|| |HH1||HL2| | | |HH1||HL2| | | |HH1||HL2| || +|| ... | | ... | | ... || +| ------------ ------------ ------------ | + -------------------------------------------- + +Decoding process: +================= + + ------------ + | | + | Subbands | + ------------ | | + | | ------------ + | Intra DC | | + | | LL0 subband prediction + ------------ | + \ Dequantizaton + ------------------- \ | +| Reference frames | \ IDWT +| ------- ------- | Motion \ | +||Frame 0| |Frame 1|| Compensation . OBMC v ------- +| ------- ------- | --------------. \------> + --->|Frame n|-->output +| ------- ------- | ------- +||Frame 2| |Frame 3||<----------------------------------/ +| ... | + ------------------- + + +Range Coder: +============ + +Binary Range Coder: +------------------- +The implemented range coder is an adapted version based upon "Range encoding: +an algorithm for removing redundancy from a digitised message." by G. N. N. +Martin. +The symbols encoded by the Snow range coder are bits (0|1). The +associated probabilities are not fix but change depending on the symbol mix +seen so far. + + +bit seen | new state +---------+----------------------------------------------- + 0 | 256 - state_transition_table[256 - old_state]; + 1 | state_transition_table[ old_state]; + +state_transition_table = { + 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, + 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, + 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, + 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, +104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, +119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, +134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, +150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, +165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, +180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, +195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, +210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, +226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, +241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0}; + +FIXME + + +Range Coding of integers: +------------------------- +FIXME + + +Neighboring Blocks: +=================== +left and top are set to the respective blocks unless they are outside of +the image in which case they are set to the Null block + +top-left is set to the top left block unless it is outside of the image in +which case it is set to the left block + +if this block has no larger parent block or it is at the left side of its +parent block and the top right block is not outside of the image then the +top right block is used for top-right else the top-left block is used + +Null block +y,cb,cr are 128 +level, ref, mx and my are 0 + + +Motion Vector Prediction: +========================= +1. the motion vectors of all the neighboring blocks are scaled to +compensate for the difference of reference frames + +scaled_mv= (mv * (256 * (current_reference+1) / (mv.reference+1)) + 128)>>8 + +2. the median of the scaled left, top and top-right vectors is used as +motion vector prediction + +3. the used motion vector is the sum of the predictor and + (mvx_diff, mvy_diff)*mv_scale + + +Intra DC Predicton: +====================== +the luma and chroma values of the left block are used as predictors + +the used luma and chroma is the sum of the predictor and y_diff, cb_diff, cr_diff +to reverse this in the decoder apply the following: +block[y][x].dc[0] = block[y][x-1].dc[0] + y_diff; +block[y][x].dc[1] = block[y][x-1].dc[1] + cb_diff; +block[y][x].dc[2] = block[y][x-1].dc[2] + cr_diff; +block[*][-1].dc[*]= 128; + + +Motion Compensation: +==================== + +Halfpel interpolation: +---------------------- +halfpel interpolation is done by convolution with the halfpel filter stored +in the header: + +horizontal halfpel samples are found by +H1[y][x] = hcoeff[0]*(F[y][x ] + F[y][x+1]) + + hcoeff[1]*(F[y][x-1] + F[y][x+2]) + + hcoeff[2]*(F[y][x-2] + F[y][x+3]) + + ... +h1[y][x] = (H1[y][x] + 32)>>6; + +vertical halfpel samples are found by +H2[y][x] = hcoeff[0]*(F[y ][x] + F[y+1][x]) + + hcoeff[1]*(F[y-1][x] + F[y+2][x]) + + ... +h2[y][x] = (H2[y][x] + 32)>>6; + +vertical+horizontal halfpel samples are found by +H3[y][x] = hcoeff[0]*(H2[y][x ] + H2[y][x+1]) + + hcoeff[1]*(H2[y][x-1] + H2[y][x+2]) + + ... +H3[y][x] = hcoeff[0]*(H1[y ][x] + H1[y+1][x]) + + hcoeff[1]*(H1[y+1][x] + H1[y+2][x]) + + ... +h3[y][x] = (H3[y][x] + 2048)>>12; + + + F H1 F + | | | + | | | + | | | + F H1 F + | | | + | | | + | | | + F-------F-------F-> H1<-F-------F-------F + v v v + H2 H3 H2 + ^ ^ ^ + F-------F-------F-> H1<-F-------F-------F + | | | + | | | + | | | + F H1 F + | | | + | | | + | | | + F H1 F + + +unavailable fullpel samples (outside the picture for example) shall be equal +to the closest available fullpel sample + + +Smaller pel interpolation: +-------------------------- +if diag_mc is set then points which lie on a line between 2 vertically, +horiziontally or diagonally adjacent halfpel points shall be interpolated +linearls with rounding to nearest and halfway values rounded up. +points which lie on 2 diagonals at the same time should only use the one +diagonal not containing the fullpel point + + + + F-->O---q---O<--h1->O---q---O<--F + v \ / v \ / v + O O O O O O O + | / | \ | + q q q q q + | / | \ | + O O O O O O O + ^ / \ ^ / \ ^ + h2-->O---q---O<--h3->O---q---O<--h2 + v \ / v \ / v + O O O O O O O + | \ | / | + q q q q q + | \ | / | + O O O O O O O + ^ / \ ^ / \ ^ + F-->O---q---O<--h1->O---q---O<--F + + + +the remaining points shall be bilinearly interpolated from the +up to 4 surrounding halfpel and fullpel points, again rounding should be to +nearest and halfway values rounded up + +compliant Snow decoders MUST support 1-1/8 pel luma and 1/2-1/16 pel chroma +interpolation at least + + +Overlapped block motion compensation: +------------------------------------- +FIXME + +LL band prediction: +=================== +Each sample in the LL0 subband is predicted by the median of the left, top and +left+top-topleft samples, samples outside the subband shall be considered to +be 0. To reverse this prediction in the decoder apply the following. +for(y=0; y<height; y++){ + for(x=0; x<width; x++){ + sample[y][x] += median(sample[y-1][x], + sample[y][x-1], + sample[y-1][x]+sample[y][x-1]-sample[y-1][x-1]); + } +} +sample[-1][*]=sample[*][-1]= 0; +width,height here are the width and height of the LL0 subband not of the final +video + + +Dequantizaton: +============== +FIXME + +Wavelet Transform: +================== + +Snow supports 2 wavelet transforms, the symmetric biorthogonal 5/3 integer +transform and a integer approximation of the symmetric biorthogonal 9/7 +daubechies wavelet. + +2D IDWT (inverse discrete wavelet transform) +-------------------------------------------- +The 2D IDWT applies a 2D filter recursively, each time combining the +4 lowest frequency subbands into a single subband until only 1 subband +remains. +The 2D filter is done by first applying a 1D filter in the vertical direction +and then applying it in the horizontal one. + --------------- --------------- --------------- --------------- +|LL0|HL0| | | | | | | | | | | | +|---+---| HL1 | | L0|H0 | HL1 | | LL1 | HL1 | | | | +|LH0|HH0| | | | | | | | | | | | +|-------+-------|->|-------+-------|->|-------+-------|->| L1 | H1 |->... +| | | | | | | | | | | | +| LH1 | HH1 | | LH1 | HH1 | | LH1 | HH1 | | | | +| | | | | | | | | | | | + --------------- --------------- --------------- --------------- + + +1D Filter: +---------- +1. interleave the samples of the low and high frequency subbands like +s={L0, H0, L1, H1, L2, H2, L3, H3, ... } +note, this can end with a L or a H, the number of elements shall be w +s[-1] shall be considered equivalent to s[1 ] +s[w ] shall be considered equivalent to s[w-2] + +2. perform the lifting steps in order as described below + +5/3 Integer filter: +1. s[i] -= (s[i-1] + s[i+1] + 2)>>2; for all even i < w +2. s[i] += (s[i-1] + s[i+1] )>>1; for all odd i < w + +\ | /|\ | /|\ | /|\ | /|\ + \|/ | \|/ | \|/ | \|/ | + + | + | + | + | -1/4 + /|\ | /|\ | /|\ | /|\ | +/ | \|/ | \|/ | \|/ | \|/ + | + | + | + | + +1/2 + + +Snow's 9/7 Integer filter: +1. s[i] -= (3*(s[i-1] + s[i+1]) + 4)>>3; for all even i < w +2. s[i] -= s[i-1] + s[i+1] ; for all odd i < w +3. s[i] += ( s[i-1] + s[i+1] + 4*s[i] + 8)>>4; for all even i < w +4. s[i] += (3*(s[i-1] + s[i+1]) )>>1; for all odd i < w + +\ | /|\ | /|\ | /|\ | /|\ + \|/ | \|/ | \|/ | \|/ | + + | + | + | + | -3/8 + /|\ | /|\ | /|\ | /|\ | +/ | \|/ | \|/ | \|/ | \|/ + (| + (| + (| + (| + -1 +\ + /|\ + /|\ + /|\ + /|\ +1/4 + \|/ | \|/ | \|/ | \|/ | + + | + | + | + | +1/16 + /|\ | /|\ | /|\ | /|\ | +/ | \|/ | \|/ | \|/ | \|/ + | + | + | + | + +3/2 + +optimization tips: +following are exactly identical +(3a)>>1 == a + (a>>1) +(a + 4b + 8)>>4 == ((a>>2) + b + 2)>>2 + +16bit implementation note: +The IDWT can be implemented with 16bits, but this requires some care to +prevent overflows, the following list, lists the minimum number of bits needed +for some terms +1. lifting step +A= s[i-1] + s[i+1] 16bit +3*A + 4 18bit +A + (A>>1) + 2 17bit + +3. lifting step +s[i-1] + s[i+1] 17bit + +4. lifiting step +3*(s[i-1] + s[i+1]) 17bit + + +TODO: +===== +Important: +finetune initial contexts +flip wavelet? +try to use the wavelet transformed predicted image (motion compensated image) as context for coding the residual coefficients +try the MV length as context for coding the residual coefficients +use extradata for stuff which is in the keyframes now? +the MV median predictor is patented IIRC +implement per picture halfpel interpolation +try different range coder state transition tables for different contexts + +Not Important: +compare the 6 tap and 8 tap hpel filters (psnr/bitrate and subjective quality) +spatial_scalability b vs u (!= 0 breaks syntax anyway so we can add a u later) + + +Credits: +======== +Michael Niedermayer +Loren Merritt + + +Copyright: +========== +GPL + GFDL + whatever is needed to make this a RFC |