AAC encoder: Extensive improvements

This finalizes merging of the work in the patches in ticket #2686. Improvements to twoloop and RC logic are extensive. The non-exhaustive list of twoloop improvments includes: - Tweaks to distortion limits on the RD optimization phase of twoloop - Deeper search in twoloop - PNS information marking to let twoloop decide when to use it (turned out having the decision made separately wasn't working) - Tonal band detection and priorization - Better band energy conservation rules - Strict hole avoidance For rate control: - Use psymodel's bit allocation to allow proper use of the bit reservoir. Don't work against the bit reservoir by moving lambda in the opposite direction when psymodel decides to allocate more/less bits to a frame. - Retry the encode if the effective rate lies outside a reasonable margin of psymodel's allocation or the selected ABR. - Log average lambda at the end. Useful info for everyone, but especially for tuning of the various encoder constants that relate to lambda feedback. Psy: - Do not apply lowpass with a FIR filter, instead just let the coder zero bands above the cutoff. The FIR filter induces group delay, and while zeroing bands causes ripple, it's lost in the quantization noise. - Experimental VBR bit allocation code - Tweak automatic lowpass filter threshold to maximize audio bandwidth at all bitrates while still providing acceptable, stable quality. I/S: - Phase decision fixes. Unrelated to #2686, but the bugs only surfaced when the merge was finalized. Measure I/S band energy accounting for phase, and prevent I/S and M/S from being applied both. PNS: - Avoid marking short bands with PNS when they're part of a window group in which there's a large variation of energy from one window to the next. PNS can't preserve those and the effect is extremely noticeable. M/S: - Implement BMLD protection similar to the specified in ISO-IEC/13818:7-2003, Appendix C Section 6.1. Since M/S decision doesn't conform to section 6.1, a different method had to be implemented, but should provide equivalent protection. - Move the decision logic closer to the method specified in ISO-IEC/13818:7-2003, Appendix C Section 6.1. Specifically, make sure M/S needs less bits than dual stereo. - Don't apply M/S in bands that are using I/S Now, this of course needed adjustments in the compare targets and fuzz factors of the AAC encoder's fate tests, but if wondering why the targets go up (more distortion), consider the previous coder was using too many bits on LF content (far more than required by psy), and thus those signals will now be more distorted, not less. The extra distortion isn't audible though, I carried extensive ABX testing to make sure. A very similar patch was also extensively tested by Kamendo2 in the context of #2686.
2015-10-11 17:29:50 -03:00 · 2015-10-11 17:29:50 -03:00 · 01ecb7172b
commit 01ecb7172b
parent 624057df3f
18 changed files with 1277 additions and 262 deletions
--- a/1
+++ b/1
@ -18,6 +18,7 @@ version <next>:
 - ffplay dynamic volume control
 - displace filter
 - selectivecolor filter
+- extensive native AAC encoder improvements


 version 2.8:
--- a/libavcodec/aac.h
+++ b/libavcodec/aac.h
@ -252,6 +252,7 @@ typedef struct SingleChannelElement {
    INTFLOAT sf[120];                               ///< scalefactors
    int sf_idx[128];                                ///< scalefactor indices (used by encoder)
    uint8_t zeroes[128];                            ///< band is not coded (used by encoder)
+    uint8_t can_pns[128];                           ///< band is allowed to PNS (informative)
    float  is_ener[128];                            ///< Intensity stereo pos (used by encoder)
    float pns_ener[128];                            ///< Noise energy values (used by encoder)
    DECLARE_ALIGNED(32, INTFLOAT, pcoeffs)[1024];   ///< coefficients for IMDCT, pristine
--- a/libavcodec/aaccoder.c
+++ b/libavcodec/aaccoder.c
@ -33,7 +33,9 @@
 #include "libavutil/libm.h" // brought forward to work around cygwin header breakage

 #include <float.h>
+
 #include "libavutil/mathematics.h"
+#include "mathops.h"
 #include "avcodec.h"
 #include "put_bits.h"
 #include "aac.h"
@ -50,9 +52,6 @@

 #include "libavcodec/aaccoder_twoloop.h"

-/** Frequency in Hz for lower limit of noise substitution **/
-#define NOISE_LOW_LIMIT 4000
-
 /* Parameter of f(x) = a*(lambda/100), defines the maximum fourier spread
 * beyond which no PNS is used (since the SFBs contain tone rather than noise) */
 #define NOISE_SPREAD_THRESHOLD 0.5073f
@ -124,7 +123,7 @@ static void encode_window_bands_info(AACEncContext *s, SingleChannelElement *sce
                    rd += quantize_band_cost(s, &sce->coeffs[start + w*128],
                                             &s->scoefs[start + w*128], size,
                                             sce->sf_idx[(win+w)*16+swb], aac_cb_out_map[cb],
-                                             lambda / band->threshold, INFINITY, NULL, 0);
+                                             lambda / band->threshold, INFINITY, NULL, NULL, 0);
                }
                cost_stay_here = path[swb][cb].cost + rd;
                cost_get_here  = minrd              + rd + run_bits + 4;
@ -335,7 +334,7 @@ static void search_for_quantizers_anmr(AVCodecContext *avctx, AACEncContext *s,
                    for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                        FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
                        dist += quantize_band_cost(s, coefs + w2*128, s->scoefs + start + w2*128, sce->ics.swb_sizes[g],
-                                                   q + q0, cb, lambda / band->threshold, INFINITY, NULL, 0);
+                                                   q + q0, cb, lambda / band->threshold, INFINITY, NULL, NULL, 0);
                    }
                    minrd = FFMIN(minrd, dist);

@ -499,7 +498,7 @@ static void search_for_quantizers_faac(AVCodecContext *avctx, AACEncContext *s,
                                               ESC_BT,
                                               lambda,
                                               INFINITY,
-                                               &b,
+                                               &b, NULL,
                                               0);
                    dist -= b;
                }
@ -588,12 +587,36 @@ static void search_for_pns(AACEncContext *s, AVCodecContext *avctx, SingleChanne
 {
    FFPsyBand *band;
    int w, g, w2, i;
+    int wlen = 1024 / sce->ics.num_windows;
+    int bandwidth, cutoff;
    float *PNS = &s->scoefs[0*128], *PNS34 = &s->scoefs[1*128];
    float *NOR34 = &s->scoefs[3*128];
    const float lambda = s->lambda;
-    const float freq_mult = avctx->sample_rate/(1024.0f/sce->ics.num_windows)/2.0f;
+    const float freq_mult = avctx->sample_rate*0.5f/wlen;
    const float thr_mult = NOISE_LAMBDA_REPLACE*(100.0f/lambda);
-    const float spread_threshold = NOISE_SPREAD_THRESHOLD*FFMAX(0.5f, lambda/100.f);
+    const float spread_threshold = FFMIN(0.75f, NOISE_SPREAD_THRESHOLD*FFMAX(0.5f, lambda/100.f));
+    const float dist_bias = av_clipf(4.f * 120 / lambda, 0.25f, 4.0f);
+    const float pns_transient_energy_r = FFMIN(0.7f, lambda / 140.f);
+
+    int refbits = avctx->bit_rate * 1024.0 / avctx->sample_rate
+        / ((avctx->flags & CODEC_FLAG_QSCALE) ? 2.0f : avctx->channels)
+        * (lambda / 120.f);
+
+    /** Keep this in sync with twoloop's cutoff selection */
+    float rate_bandwidth_multiplier = 1.5f;
+    int frame_bit_rate = (avctx->flags & CODEC_FLAG_QSCALE)
+        ? (refbits * rate_bandwidth_multiplier * avctx->sample_rate / 1024)
+        : (avctx->bit_rate / avctx->channels);
+
+    frame_bit_rate *= 1.15f;
+
+    if (avctx->cutoff > 0) {
+        bandwidth = avctx->cutoff;
+    } else {
+        bandwidth = FFMAX(3000, AAC_CUTOFF_FROM_BITRATE(frame_bit_rate, 1, avctx->sample_rate));
+    }
+
+    cutoff = bandwidth * 2 * wlen / avctx->sample_rate;

    memcpy(sce->band_alt, sce->band_type, sizeof(sce->band_type));
    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
@ -602,32 +625,44 @@ static void search_for_pns(AACEncContext *s, AVCodecContext *avctx, SingleChanne
            int noise_sfi;
            float dist1 = 0.0f, dist2 = 0.0f, noise_amp;
            float pns_energy = 0.0f, pns_tgt_energy, energy_ratio, dist_thresh;
-            float sfb_energy = 0.0f, threshold = 0.0f, spread = 0.0f;
+            float sfb_energy = 0.0f, threshold = 0.0f, spread = 2.0f;
+            float min_energy = -1.0f, max_energy = 0.0f;
            const int start = wstart+sce->ics.swb_offset[g];
            const float freq = (start-wstart)*freq_mult;
            const float freq_boost = FFMAX(0.88f*freq/NOISE_LOW_LIMIT, 1.0f);
-            if (freq < NOISE_LOW_LIMIT || avctx->cutoff && freq >= avctx->cutoff)
+            if (freq < NOISE_LOW_LIMIT || (start-wstart) >= cutoff)
                continue;
            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
                sfb_energy += band->energy;
-                spread     += band->spread;
+                spread     = FFMIN(spread, band->spread);
                threshold  += band->threshold;
+                if (!w2) {
+                    min_energy = max_energy = band->energy;
+                } else {
+                    min_energy = FFMIN(min_energy, band->energy);
+                    max_energy = FFMAX(max_energy, band->energy);
+                }
            }

            /* Ramps down at ~8000Hz and loosens the dist threshold */
-            dist_thresh = FFMIN(2.5f*NOISE_LOW_LIMIT/freq, 2.5f);
+            dist_thresh = av_clipf(2.5f*NOISE_LOW_LIMIT/freq, 0.5f, 2.5f) * dist_bias;

-            /* zero and energy close to threshold usually means hole avoidance,
-             * we do want to remain avoiding holes with PNS
+            /* PNS is acceptable when all of these are true:
+             * 1. high spread energy (noise-like band)
+             * 2. near-threshold energy (high PE means the random nature of PNS content will be noticed)
+             * 3. on short window groups, all windows have similar energy (variations in energy would be destroyed by PNS)
+             *
+             * At this stage, point 2 is relaxed for zeroed bands near the noise threshold (hole avoidance is more important)
             */
            if (((sce->zeroes[w*16+g] || !sce->band_alt[w*16+g]) && sfb_energy < threshold*sqrtf(1.5f/freq_boost)) || spread < spread_threshold ||
-                (sce->band_alt[w*16+g] && sfb_energy > threshold*thr_mult*freq_boost)) {
+                (!sce->zeroes[w*16+g] && sce->band_alt[w*16+g] && sfb_energy > threshold*thr_mult*freq_boost) ||
+                min_energy < pns_transient_energy_r * max_energy ) {
                sce->pns_ener[w*16+g] = sfb_energy;
                continue;
            }

-            pns_tgt_energy = sfb_energy*spread*spread/sce->ics.group_len[w];
+            pns_tgt_energy = sfb_energy*FFMIN(1.0f, spread*spread);
            noise_sfi = av_clip(roundf(log2f(pns_tgt_energy)*2), -100, 155); /* Quantize */
            noise_amp = -ff_aac_pow2sf_tab[noise_sfi + POW_SF2_ZERO];    /* Dequantize */
            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
@ -648,13 +683,18 @@ static void search_for_pns(AACEncContext *s, AVCodecContext *avctx, SingleChanne
                                            sce->ics.swb_sizes[g],
                                            sce->sf_idx[(w+w2)*16+g],
                                            sce->band_alt[(w+w2)*16+g],
-                                            lambda/band->threshold, INFINITY, NULL, 0);
-                /* Estimate rd on average as 9 bits for CB and sf + spread energy * lambda/thr */
-                dist2 += 9+band->energy/(band->spread*band->spread)*lambda/band->threshold;
+                                            lambda/band->threshold, INFINITY, NULL, NULL, 0);
+                /* Estimate rd on average as 5 bits for SF, 4 for the CB, plus spread energy * lambda/thr */
+                dist2 += band->energy/(band->spread*band->spread)*lambda*dist_thresh/band->threshold;
+            }
+            if (g && sce->sf_idx[(w+w2)*16+g-1] == NOISE_BT) {
+                dist2 += 5;
+            } else {
+                dist2 += 9;
            }
            energy_ratio = pns_tgt_energy/pns_energy; /* Compensates for quantization error */
            sce->pns_ener[w*16+g] = energy_ratio*pns_tgt_energy;
-            if (energy_ratio > 0.85f && energy_ratio < 1.25f && (sce->zeroes[w*16+g] || !sce->band_alt[w*16+g] || dist2*dist_thresh < dist1)) {
+            if (sce->zeroes[w*16+g] || !sce->band_alt[w*16+g] || (energy_ratio > 0.85f && energy_ratio < 1.25f && dist2 < dist1)) {
                sce->band_type[w*16+g] = NOISE_BT;
                sce->zeroes[w*16+g] = 0;
            }
@ -662,62 +702,203 @@ static void search_for_pns(AACEncContext *s, AVCodecContext *avctx, SingleChanne
    }
 }

+static void mark_pns(AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce)
+{
+    FFPsyBand *band;
+    int w, g, w2;
+    int wlen = 1024 / sce->ics.num_windows;
+    int bandwidth, cutoff;
+    const float lambda = s->lambda;
+    const float freq_mult = avctx->sample_rate*0.5f/wlen;
+    const float spread_threshold = FFMIN(0.75f, NOISE_SPREAD_THRESHOLD*FFMAX(0.5f, lambda/100.f));
+    const float pns_transient_energy_r = FFMIN(0.7f, lambda / 140.f);
+
+    int refbits = avctx->bit_rate * 1024.0 / avctx->sample_rate
+        / ((avctx->flags & CODEC_FLAG_QSCALE) ? 2.0f : avctx->channels)
+        * (lambda / 120.f);
+
+    /** Keep this in sync with twoloop's cutoff selection */
+    float rate_bandwidth_multiplier = 1.5f;
+    int frame_bit_rate = (avctx->flags & CODEC_FLAG_QSCALE)
+        ? (refbits * rate_bandwidth_multiplier * avctx->sample_rate / 1024)
+        : (avctx->bit_rate / avctx->channels);
+
+    frame_bit_rate *= 1.15f;
+
+    if (avctx->cutoff > 0) {
+        bandwidth = avctx->cutoff;
+    } else {
+        bandwidth = FFMAX(3000, AAC_CUTOFF_FROM_BITRATE(frame_bit_rate, 1, avctx->sample_rate));
+    }
+
+    cutoff = bandwidth * 2 * wlen / avctx->sample_rate;
+
+    memcpy(sce->band_alt, sce->band_type, sizeof(sce->band_type));
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            float sfb_energy = 0.0f, threshold = 0.0f, spread = 2.0f;
+            float min_energy = -1.0f, max_energy = 0.0f;
+            const int start = sce->ics.swb_offset[g];
+            const float freq = start*freq_mult;
+            const float freq_boost = FFMAX(0.88f*freq/NOISE_LOW_LIMIT, 1.0f);
+            if (freq < NOISE_LOW_LIMIT || start >= cutoff) {
+                sce->can_pns[w*16+g] = 0;
+                continue;
+            }
+            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                sfb_energy += band->energy;
+                spread     = FFMIN(spread, band->spread);
+                threshold  += band->threshold;
+                if (!w2) {
+                    min_energy = max_energy = band->energy;
+                } else {
+                    min_energy = FFMIN(min_energy, band->energy);
+                    max_energy = FFMAX(max_energy, band->energy);
+                }
+            }
+
+            /* PNS is acceptable when all of these are true:
+             * 1. high spread energy (noise-like band)
+             * 2. near-threshold energy (high PE means the random nature of PNS content will be noticed)
+             * 3. on short window groups, all windows have similar energy (variations in energy would be destroyed by PNS)
+             */
+            sce->pns_ener[w*16+g] = sfb_energy;
+            if (sfb_energy < threshold*sqrtf(1.5f/freq_boost) || spread < spread_threshold || min_energy < pns_transient_energy_r * max_energy) {
+                sce->can_pns[w*16+g] = 0;
+            } else {
+                sce->can_pns[w*16+g] = 1;
+            }
+        }
+    }
+}
+
 static void search_for_ms(AACEncContext *s, ChannelElement *cpe)
 {
-    int start = 0, i, w, w2, g;
+    int start = 0, i, w, w2, g, sid_sf_boost;
    float M[128], S[128];
    float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
    const float lambda = s->lambda;
+    const float mslambda = FFMIN(1.0f, lambda / 120.f);
    SingleChannelElement *sce0 = &cpe->ch[0];
    SingleChannelElement *sce1 = &cpe->ch[1];
    if (!cpe->common_window)
        return;
    for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
+        int min_sf_idx_mid = SCALE_MAX_POS;
+        int min_sf_idx_side = SCALE_MAX_POS;
+        for (g = 0; g < sce0->ics.num_swb; g++) {
+            if (!sce0->zeroes[w*16+g] && sce0->band_type[w*16+g] < RESERVED_BT)
+                min_sf_idx_mid = FFMIN(min_sf_idx_mid, sce0->sf_idx[w*16+g]);
+            if (!sce1->zeroes[w*16+g] && sce1->band_type[w*16+g] < RESERVED_BT)
+                min_sf_idx_side = FFMIN(min_sf_idx_side, sce1->sf_idx[w*16+g]);
+        }
+
        start = 0;
        for (g = 0;  g < sce0->ics.num_swb; g++) {
+            float bmax = bval2bmax(g * 17.0f / sce0->ics.num_swb) / 0.0045f;
+            cpe->ms_mask[w*16+g] = 0;
            if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
-                float dist1 = 0.0f, dist2 = 0.0f;
+                float Mmax = 0.0f, Smax = 0.0f;
+
+                /* Must compute mid/side SF and book for the whole window group */
                for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
-                    FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
-                    FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
-                    float minthr = FFMIN(band0->threshold, band1->threshold);
-                    float maxthr = FFMAX(band0->threshold, band1->threshold);
                    for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
                        M[i] = (sce0->coeffs[start+(w+w2)*128+i]
                              + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
                        S[i] =  M[i]
                              - sce1->coeffs[start+(w+w2)*128+i];
                    }
-                    abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
-                    abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
-                    abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
-                    abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
-                    dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128],
-                                                L34,
-                                                sce0->ics.swb_sizes[g],
-                                                sce0->sf_idx[(w+w2)*16+g],
-                                                sce0->band_type[(w+w2)*16+g],
-                                                lambda / band0->threshold, INFINITY, NULL, 0);
-                    dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
-                                                R34,
-                                                sce1->ics.swb_sizes[g],
-                                                sce1->sf_idx[(w+w2)*16+g],
-                                                sce1->band_type[(w+w2)*16+g],
-                                                lambda / band1->threshold, INFINITY, NULL, 0);
-                    dist2 += quantize_band_cost(s, M,
-                                                M34,
-                                                sce0->ics.swb_sizes[g],
-                                                sce0->sf_idx[(w+w2)*16+g],
-                                                sce0->band_type[(w+w2)*16+g],
-                                                lambda / maxthr, INFINITY, NULL, 0);
-                    dist2 += quantize_band_cost(s, S,
-                                                S34,
-                                                sce1->ics.swb_sizes[g],
-                                                sce1->sf_idx[(w+w2)*16+g],
-                                                sce1->band_type[(w+w2)*16+g],
-                                                lambda / minthr, INFINITY, NULL, 0);
+                    abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]);
+                    abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]);
+                    for (i = 0; i < sce0->ics.swb_sizes[g]; i++ ) {
+                        Mmax = FFMAX(Mmax, M34[i]);
+                        Smax = FFMAX(Smax, S34[i]);
+                    }
+                }
+
+                for (sid_sf_boost = 0; sid_sf_boost < 4; sid_sf_boost++) {
+                    float dist1 = 0.0f, dist2 = 0.0f;
+                    int B0 = 0, B1 = 0;
+                    int minidx;
+                    int mididx, sididx;
+                    int midcb, sidcb;
+
+                    minidx = FFMIN(sce0->sf_idx[w*16+g], sce1->sf_idx[w*16+g]);
+                    mididx = av_clip(minidx, min_sf_idx_mid, min_sf_idx_mid + SCALE_MAX_DIFF);
+                    sididx = av_clip(minidx - sid_sf_boost * 3, min_sf_idx_side, min_sf_idx_side + SCALE_MAX_DIFF);
+                    midcb = find_min_book(Mmax, mididx);
+                    sidcb = find_min_book(Smax, sididx);
+
+                    if ((mididx > minidx) || (sididx > minidx)) {
+                        /* scalefactor range violation, bad stuff, will decrease quality unacceptably */
+                        continue;
+                    }
+
+                    /* No CB can be zero */
+                    midcb = FFMAX(1,midcb);
+                    sidcb = FFMAX(1,sidcb);
+
+                    for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
+                        FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
+                        FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
+                        float minthr = FFMIN(band0->threshold, band1->threshold);
+                        int b1,b2,b3,b4;
+                        for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
+                            M[i] = (sce0->coeffs[start+(w+w2)*128+i]
+                                  + sce1->coeffs[start+(w+w2)*128+i]) * 0.5;
+                            S[i] =  M[i]
+                                  - sce1->coeffs[start+(w+w2)*128+i];
+                        }
+
+                        abs_pow34_v(L34, sce0->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(R34, sce1->coeffs+start+(w+w2)*128, sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
+                        abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
+                        dist1 += quantize_band_cost(s, &sce0->coeffs[start + (w+w2)*128],
+                                                    L34,
+                                                    sce0->ics.swb_sizes[g],
+                                                    sce0->sf_idx[(w+w2)*16+g],
+                                                    sce0->band_type[(w+w2)*16+g],
+                                                    lambda / band0->threshold, INFINITY, &b1, NULL, 0);
+                        dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
+                                                    R34,
+                                                    sce1->ics.swb_sizes[g],
+                                                    sce1->sf_idx[(w+w2)*16+g],
+                                                    sce1->band_type[(w+w2)*16+g],
+                                                    lambda / band1->threshold, INFINITY, &b2, NULL, 0);
+                        dist2 += quantize_band_cost(s, M,
+                                                    M34,
+                                                    sce0->ics.swb_sizes[g],
+                                                    sce0->sf_idx[(w+w2)*16+g],
+                                                    sce0->band_type[(w+w2)*16+g],
+                                                    lambda / minthr, INFINITY, &b3, NULL, 0);
+                        dist2 += quantize_band_cost(s, S,
+                                                    S34,
+                                                    sce1->ics.swb_sizes[g],
+                                                    sce1->sf_idx[(w+w2)*16+g],
+                                                    sce1->band_type[(w+w2)*16+g],
+                                                    mslambda / (minthr * bmax), INFINITY, &b4, NULL, 0);
+                        B0 += b1+b2;
+                        B1 += b3+b4;
+                        dist1 -= B0;
+                        dist2 -= B1;
+                    }
+                    cpe->ms_mask[w*16+g] = dist2 <= dist1 && B1 < B0;
+                    if (cpe->ms_mask[w*16+g]) {
+                        /* Setting the M/S mask is useful with I/S, but only the flag */
+                        if (!cpe->is_mask[w*16+g]) {
+                            sce0->sf_idx[w*16+g] = mididx;
+                            sce1->sf_idx[w*16+g] = sididx;
+                            sce0->band_type[w*16+g] = midcb;
+                            sce1->band_type[w*16+g] = sidcb;
+                        }
+                        break;
+                    } else if (B1 > B0) {
+                        /* More boost won't fix this */
+                        break;
+                    }
                }
-                cpe->ms_mask[w*16+g] = dist2 < dist1;
            }
            start += sce0->ics.swb_sizes[g];
        }
@ -736,6 +917,7 @@ AACCoefficientsEncoder ff_aac_coders[AAC_CODER_NB] = {
        ff_aac_apply_tns,
        set_special_band_scalefactors,
        search_for_pns,
+        mark_pns,
        ff_aac_search_for_tns,
        search_for_ms,
        ff_aac_search_for_is,
@ -752,6 +934,7 @@ AACCoefficientsEncoder ff_aac_coders[AAC_CODER_NB] = {
        ff_aac_apply_tns,
        set_special_band_scalefactors,
        search_for_pns,
+        mark_pns,
        ff_aac_search_for_tns,
        search_for_ms,
        ff_aac_search_for_is,
@ -768,6 +951,7 @@ AACCoefficientsEncoder ff_aac_coders[AAC_CODER_NB] = {
        ff_aac_apply_tns,
        set_special_band_scalefactors,
        search_for_pns,
+        mark_pns,
        ff_aac_search_for_tns,
        search_for_ms,
        ff_aac_search_for_is,
@ -784,6 +968,7 @@ AACCoefficientsEncoder ff_aac_coders[AAC_CODER_NB] = {
        ff_aac_apply_tns,
        set_special_band_scalefactors,
        search_for_pns,
+        mark_pns,
        ff_aac_search_for_tns,
        search_for_ms,
        ff_aac_search_for_is,
--- a/libavcodec/aaccoder_trellis.h
+++ b/libavcodec/aaccoder_trellis.h
@ -129,7 +129,7 @@ static void codebook_trellis_rate(AACEncContext *s, SingleChannelElement *sce,
                                               &s->scoefs[start + w*128], size,
                                               sce->sf_idx[win*16+swb],
                                               aac_cb_out_map[cb],
-                                               0, INFINITY, NULL, 0);
+                                               0, INFINITY, NULL, NULL, 0);
                }
                cost_stay_here = path[swb][cb].cost + bits;
                cost_get_here  = minbits            + bits + run_bits + 4;
--- a/libavcodec/aaccoder_twoloop.h
+++ b/libavcodec/aaccoder_twoloop.h
@ -22,7 +22,7 @@
 /**
 * @file
 * AAC encoder twoloop coder
- * @author Konstantin Shishkov
+ * @author Konstantin Shishkov, Claudio Freire
 */

 /**
@ -34,6 +34,7 @@
 *  - abs_pow34_v
 *  - find_max_val
 *  - find_min_book
+ *  - find_form_factor
 */

 #ifndef AVCODEC_AACCODER_TWOLOOP_H
@ -41,6 +42,7 @@

 #include <float.h>
 #include "libavutil/mathematics.h"
+#include "mathops.h"
 #include "avcodec.h"
 #include "put_bits.h"
 #include "aac.h"
@ -49,6 +51,20 @@
 #include "aacenctab.h"
 #include "aac_tablegen_decl.h"

+/** Frequency in Hz for lower limit of noise substitution **/
+#define NOISE_LOW_LIMIT 4000
+
+#define sclip(x) av_clip(x,60,218)
+
+
+static av_always_inline int ff_pns_bits(const SingleChannelElement *sce, int w, int g)
+{
+    if (!g || !sce->zeroes[w*16+g-1] || !sce->can_pns[w*16+g-1]) {
+        return 9;
+    } else {
+        return 5;
+    }
+}

 /**
 * two-loop quantizers search taken from ISO 13818-7 Appendix C
@ -58,51 +74,219 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
                                          SingleChannelElement *sce,
                                          const float lambda)
 {
-    int start = 0, i, w, w2, g;
-    int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels * (lambda / 120.f);
-    float dists[128] = { 0 }, uplims[128] = { 0 };
-    float maxvals[128];
-    int fflag, minscaler;
-    int its  = 0;
-    int allz = 0;
-    float minthr = INFINITY;
+    int start = 0, i, w, w2, g, recomprd;
+    int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate
+        / ((avctx->flags & CODEC_FLAG_QSCALE) ? 2.0f : avctx->channels)
+        * (lambda / 120.f);
+    int refbits = destbits;
+    int toomanybits, toofewbits;
+    char nzs[128];
+    int maxsf[128];
+    float dists[128] = { 0 }, qenergies[128] = { 0 }, uplims[128], euplims[128], energies[128];
+    float maxvals[128], spread_thr_r[128];
+    float min_spread_thr_r, max_spread_thr_r;

-    // for values above this the decoder might end up in an endless loop
-    // due to always having more bits than what can be encoded.
+    /**
+     * rdlambda controls the maximum tolerated distortion. Twoloop
+     * will keep iterating until it fails to lower it or it reaches
+     * ulimit * rdlambda. Keeping it low increases quality on difficult
+     * signals, but lower it too much, and bits will be taken from weak
+     * signals, creating "holes". A balance is necesary.
+     * rdmax and rdmin specify the relative deviation from rdlambda
+     * allowed for tonality compensation
+     */
+    float rdlambda = av_clipf(2.0f * 120.f / lambda, 0.0625f, 16.0f);
+    const float nzslope = 1.5f;
+    float rdmin = 0.03125f;
+    float rdmax = 1.0f;
+
+    /**
+     * sfoffs controls an offset of optmium allocation that will be
+     * applied based on lambda. Keep it real and modest, the loop
+     * will take care of the rest, this just accelerates convergence
+     */
+    float sfoffs = av_clipf(log2f(120.0f / lambda) * 4.0f, -5, 10);
+
+    int fflag, minscaler, maxscaler, nminscaler, minrdsf;
+    int its  = 0;
+    int maxits = 30;
+    int allz = 0;
+    int tbits;
+    int cutoff = 1024;
+    int pns_start_pos;
+
+    /**
+     * zeroscale controls a multiplier of the threshold, if band energy
+     * is below this, a zero is forced. Keep it lower than 1, unless
+     * low lambda is used, because energy < threshold doesn't mean there's
+     * no audible signal outright, it's just energy. Also make it rise
+     * slower than rdlambda, as rdscale has due compensation with
+     * noisy band depriorization below, whereas zeroing logic is rather dumb
+     */
+    float zeroscale;
+    if (lambda > 120.f) {
+        zeroscale = av_clipf(powf(120.f / lambda, 0.25f), 0.0625f, 1.0f);
+    } else {
+        zeroscale = 1.f;
+    }
+
+    if (s->psy.bitres.alloc >= 0) {
+        /**
+         * Psy granted us extra bits to use, from the reservoire
+         * adjust for lambda except what psy already did
+         */
+        destbits = s->psy.bitres.alloc
+            * (lambda / (avctx->global_quality ? avctx->global_quality : 120));
+    }
+
+    if (avctx->flags & CODEC_FLAG_QSCALE) {
+        /**
+         * Constant Q-scale doesn't compensate MS coding on its own
+         * No need to be overly precise, this only controls RD
+         * adjustment CB limits when going overboard
+         */
+        if (s->options.stereo_mode && s->cur_type == TYPE_CPE)
+            destbits *= 2;
+
+        /**
+         * When using a constant Q-scale, don't adjust bits, just use RD
+         * Don't let it go overboard, though... 8x psy target is enough
+         */
+        toomanybits = 5800;
+        toofewbits = destbits / 16;
+
+        /** Don't offset scalers, just RD */
+        sfoffs = sce->ics.num_windows - 1;
+        rdlambda = sqrtf(rdlambda);
+
+        /** search further */
+        maxits *= 2;
+    } else {
+        /** When using ABR, be strict */
+        toomanybits = destbits + destbits/16;
+        toofewbits = destbits - destbits/4;
+
+        sfoffs = 0;
+        rdlambda = sqrtf(rdlambda);
+    }
+
+    /** and zero out above cutoff frequency */
+    {
+        int wlen = 1024 / sce->ics.num_windows;
+        int bandwidth;
+
+        /**
+         * Scale, psy gives us constant quality, this LP only scales
+         * bitrate by lambda, so we save bits on subjectively unimportant HF
+         * rather than increase quantization noise. Adjust nominal bitrate
+         * to effective bitrate according to encoding parameters,
+         * AAC_CUTOFF_FROM_BITRATE is calibrated for effective bitrate.
+         */
+        float rate_bandwidth_multiplier = 1.5f;
+        int frame_bit_rate = (avctx->flags & CODEC_FLAG_QSCALE)
+            ? (refbits * rate_bandwidth_multiplier * avctx->sample_rate / 1024)
+            : (avctx->bit_rate / avctx->channels);
+
+        /** Compensate for extensions that increase efficiency */
+        if (s->options.pns || s->options.intensity_stereo)
+            frame_bit_rate *= 1.15f;
+
+        if (avctx->cutoff > 0) {
+            bandwidth = avctx->cutoff;
+        } else {
+            bandwidth = FFMAX(3000, AAC_CUTOFF_FROM_BITRATE(frame_bit_rate, 1, avctx->sample_rate));
+        }
+
+        cutoff = bandwidth * 2 * wlen / avctx->sample_rate;
+        pns_start_pos = NOISE_LOW_LIMIT * 2 * wlen / avctx->sample_rate;
+    }
+
+    /**
+     * for values above this the decoder might end up in an endless loop
+     * due to always having more bits than what can be encoded.
+     */
    destbits = FFMIN(destbits, 5800);
-    //XXX: some heuristic to determine initial quantizers will reduce search time
-    //determine zero bands and upper limits
+    toomanybits = FFMIN(toomanybits, 5800);
+    toofewbits = FFMIN(toofewbits, 5800);
+    /**
+     * XXX: some heuristic to determine initial quantizers will reduce search time
+     * determine zero bands and upper distortion limits
+     */
+    min_spread_thr_r = -1;
+    max_spread_thr_r = -1;
    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
-        for (g = 0;  g < sce->ics.num_swb; g++) {
+        for (g = start = 0;  g < sce->ics.num_swb; start += sce->ics.swb_sizes[g++]) {
            int nz = 0;
-            float uplim = 0.0f, energy = 0.0f;
+            float uplim = 0.0f, energy = 0.0f, spread = 0.0f;
            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
-                uplim  += band->threshold;
-                energy += band->energy;
-                if (band->energy <= band->threshold || band->threshold == 0.0f) {
+                if (start >= cutoff || band->energy <= (band->threshold * zeroscale) || band->threshold == 0.0f) {
                    sce->zeroes[(w+w2)*16+g] = 1;
                    continue;
                }
                nz = 1;
            }
-            uplims[w*16+g] = uplim *512;
+            if (!nz) {
+                uplim = 0.0f;
+            } else {
+                nz = 0;
+                for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                    FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
+                    if (band->energy <= (band->threshold * zeroscale) || band->threshold == 0.0f)
+                        continue;
+                    uplim += band->threshold;
+                    energy += band->energy;
+                    spread += band->spread;
+                    nz++;
+                }
+            }
+            uplims[w*16+g] = uplim;
+            energies[w*16+g] = energy;
+            nzs[w*16+g] = nz;
            sce->zeroes[w*16+g] = !nz;
-            if (nz)
-                minthr = FFMIN(minthr, uplim);
            allz |= nz;
+            if (nz) {
+                spread_thr_r[w*16+g] = energy * nz / (uplim * spread);
+                if (min_spread_thr_r < 0) {
+                    min_spread_thr_r = max_spread_thr_r = spread_thr_r[w*16+g];
+                } else {
+                    min_spread_thr_r = FFMIN(min_spread_thr_r, spread_thr_r[w*16+g]);
+                    max_spread_thr_r = FFMAX(max_spread_thr_r, spread_thr_r[w*16+g]);
+                }
+            }
        }
    }
+
+    /** Compute initial scalers */
+    minscaler = 65535;
    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
        for (g = 0;  g < sce->ics.num_swb; g++) {
            if (sce->zeroes[w*16+g]) {
                sce->sf_idx[w*16+g] = SCALE_ONE_POS;
                continue;
            }
-            sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
+            /**
+             * log2f-to-distortion ratio is, technically, 2 (1.5db = 4, but it's power vs level so it's 2).
+             * But, as offsets are applied, low-frequency signals are too sensitive to the induced distortion,
+             * so we make scaling more conservative by choosing a lower log2f-to-distortion ratio, and thus
+             * more robust.
+             */
+            sce->sf_idx[w*16+g] = av_clip(
+                SCALE_ONE_POS
+                    + 1.75*log2f(FFMAX(0.00125f,uplims[w*16+g]) / sce->ics.swb_sizes[g])
+                    + sfoffs,
+                60, SCALE_MAX_POS);
+            minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
        }
    }

+    /** Clip */
+    minscaler = av_clip(minscaler, SCALE_ONE_POS - SCALE_DIV_512, SCALE_MAX_POS - SCALE_DIV_512);
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w])
+        for (g = 0;  g < sce->ics.num_swb; g++)
+            if (!sce->zeroes[w*16+g])
+                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF - 1);
+
    if (!allz)
        return;
    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
@ -116,15 +300,66 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
        }
    }

+    /**
+     * Scale uplims to match rate distortion to quality
+     * bu applying noisy band depriorization and tonal band priorization.
+     * Maxval-energy ratio gives us an idea of how noisy/tonal the band is.
+     * If maxval^2 ~ energy, then that band is mostly noise, and we can relax
+     * rate distortion requirements.
+     */
+    memcpy(euplims, uplims, sizeof(euplims));
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        /** psy already priorizes transients to some extent */
+        float de_psy_factor = (sce->ics.num_windows > 1) ? 8.0f / sce->ics.group_len[w] : 1.0f;
+        start = w*128;
+        for (g = 0;  g < sce->ics.num_swb; g++) {
+            if (nzs[g] > 0) {
+                float cleanup_factor = ff_sqrf(av_clipf(start / (cutoff * 0.75f), 1.0f, 2.0f));
+                float energy2uplim = find_form_factor(
+                    sce->ics.group_len[w], sce->ics.swb_sizes[g],
+                    uplims[w*16+g] / (nzs[g] * sce->ics.swb_sizes[w]),
+                    sce->coeffs + start,
+                    nzslope * cleanup_factor);
+                energy2uplim *= de_psy_factor;
+                if (!(avctx->flags & CODEC_FLAG_QSCALE)) {
+                    /** In ABR, we need to priorize less and let rate control do its thing */
+                    energy2uplim = sqrtf(energy2uplim);
+                }
+                energy2uplim = FFMAX(0.015625f, FFMIN(1.0f, energy2uplim));
+                uplims[w*16+g] *= av_clipf(rdlambda * energy2uplim, rdmin, rdmax)
+                                  * sce->ics.group_len[w];
+
+                energy2uplim = find_form_factor(
+                    sce->ics.group_len[w], sce->ics.swb_sizes[g],
+                    uplims[w*16+g] / (nzs[g] * sce->ics.swb_sizes[w]),
+                    sce->coeffs + start,
+                    2.0f);
+                energy2uplim *= de_psy_factor;
+                if (!(avctx->flags & CODEC_FLAG_QSCALE)) {
+                    /** In ABR, we need to priorize less and let rate control do its thing */
+                    energy2uplim = sqrtf(energy2uplim);
+                }
+                energy2uplim = FFMAX(0.015625f, FFMIN(1.0f, energy2uplim));
+                euplims[w*16+g] *= av_clipf(rdlambda * energy2uplim * sce->ics.group_len[w],
+                    0.5f, 1.0f);
+            }
+            start += sce->ics.swb_sizes[g];
+        }
+    }
+
+    for (i = 0; i < sizeof(maxsf) / sizeof(maxsf[0]); ++i)
+        maxsf[i] = SCALE_MAX_POS;
+
    //perform two-loop search
    //outer loop - improve quality
    do {
-        int tbits, qstep;
-        minscaler = sce->sf_idx[0];
        //inner loop - quantize spectrum to fit into given number of bits
-        qstep = its ? 1 : 32;
+        int overdist;
+        int qstep = its ? 1 : 32;
        do {
            int prev = -1;
+            int changed = 0;
+            recomprd = 0;
            tbits = 0;
            for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
                start = w*128;
@ -134,15 +369,20 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
                    int bits = 0;
                    int cb;
                    float dist = 0.0f;
+                    float qenergy = 0.0f;

                    if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
                        start += sce->ics.swb_sizes[g];
+                        if (sce->can_pns[w*16+g]) {
+                            /** PNS isn't free */
+                            tbits += ff_pns_bits(sce, w, g);
+                        }
                        continue;
                    }
-                    minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
                    cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
                    for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
                        int b;
+                        float sqenergy;
                        dist += quantize_band_cost(s, coefs + w2*128,
                                                   scaled + w2*128,
                                                   sce->ics.swb_sizes[g],
@ -150,54 +390,309 @@ static void search_for_quantizers_twoloop(AVCodecContext *avctx,
                                                   cb,
                                                   1.0f,
                                                   INFINITY,
-                                                   &b,
+                                                   &b, &sqenergy,
                                                   0);
                        bits += b;
+                        qenergy += sqenergy;
                    }
                    dists[w*16+g] = dist - bits;
+                    qenergies[w*16+g] = qenergy;
                    if (prev != -1) {
-                        bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
+                        int sfdiff = sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO;
+                        av_assert1(sfdiff >= 0 && sfdiff <= 2*SCALE_MAX_DIFF);
+                        bits += ff_aac_scalefactor_bits[sfdiff];
                    }
                    tbits += bits;
                    start += sce->ics.swb_sizes[g];
                    prev = sce->sf_idx[w*16+g];
                }
            }
-            if (tbits > destbits) {
-                for (i = 0; i < 128; i++)
-                    if (sce->sf_idx[i] < 218 - qstep)
-                        sce->sf_idx[i] += qstep;
-            } else {
-                for (i = 0; i < 128; i++)
-                    if (sce->sf_idx[i] > 60 - qstep)
-                        sce->sf_idx[i] -= qstep;
+            if (tbits > toomanybits) {
+                recomprd = 1;
+                for (i = 0; i < 128; i++) {
+                    if (sce->sf_idx[i] < (SCALE_MAX_POS - SCALE_DIV_512)) {
+                        int maxsf_i = (tbits > 5800) ? SCALE_MAX_POS : maxsf[i];
+                        int new_sf = FFMIN(maxsf_i, sce->sf_idx[i] + qstep);
+                        if (new_sf != sce->sf_idx[i]) {
+                            sce->sf_idx[i] = new_sf;
+                            changed = 1;
+                        }
+                    }
+                }
+            } else if (tbits < toofewbits) {
+                recomprd = 1;
+                for (i = 0; i < 128; i++) {
+                    if (sce->sf_idx[i] > SCALE_ONE_POS) {
+                        int new_sf = FFMAX(SCALE_ONE_POS, sce->sf_idx[i] - qstep);
+                        if (new_sf != sce->sf_idx[i]) {
+                            sce->sf_idx[i] = new_sf;
+                            changed = 1;
+                        }
+                    }
+                }
            }
            qstep >>= 1;
-            if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
+            if (!qstep && tbits > toomanybits && sce->sf_idx[0] < 217 && changed)
                qstep = 1;
        } while (qstep);

-        fflag = 0;
-        minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
+        overdist = 1;
+        for (i = 0; i < 2 && (overdist || recomprd); ++i) {
+            if (recomprd) {
+                /** Must recompute distortion */
+                int prev = -1;
+                tbits = 0;
+                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                    start = w*128;
+                    for (g = 0;  g < sce->ics.num_swb; g++) {
+                        const float *coefs = sce->coeffs + start;
+                        const float *scaled = s->scoefs + start;
+                        int bits = 0;
+                        int cb;
+                        float dist = 0.0f;
+                        float qenergy = 0.0f;

+                        if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
+                            start += sce->ics.swb_sizes[g];
+                            if (sce->can_pns[w*16+g]) {
+                                /** PNS isn't free */
+                                tbits += ff_pns_bits(sce, w, g);
+                            }
+                            continue;
+                        }
+                        cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                        for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                            int b;
+                            float sqenergy;
+                            dist += quantize_band_cost(s, coefs + w2*128,
+                                                    scaled + w2*128,
+                                                    sce->ics.swb_sizes[g],
+                                                    sce->sf_idx[w*16+g],
+                                                    cb,
+                                                    1.0f,
+                                                    INFINITY,
+                                                    &b, &sqenergy,
+                                                    0);
+                            bits += b;
+                            qenergy += sqenergy;
+                        }
+                        dists[w*16+g] = dist - bits;
+                        qenergies[w*16+g] = qenergy;
+                        if (prev != -1) {
+                            int sfdiff = sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO;
+                            av_assert1(sfdiff >= 0 && sfdiff <= 2*SCALE_MAX_DIFF);
+                            bits += ff_aac_scalefactor_bits[sfdiff];
+                        }
+                        tbits += bits;
+                        start += sce->ics.swb_sizes[g];
+                        prev = sce->sf_idx[w*16+g];
+                    }
+                }
+            }
+            if (!i && s->options.pns && its > maxits/2) {
+                float maxoverdist = 0.0f;
+                overdist = recomprd = 0;
+                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                    float ovrfactor = 2.f+(maxits-its)*16.f/maxits;
+                    for (g = start = 0;  g < sce->ics.num_swb; start += sce->ics.swb_sizes[g++]) {
+                        if (!sce->zeroes[w*16+g] && dists[w*16+g] > uplims[w*16+g]*ovrfactor) {
+                            float ovrdist = dists[w*16+g] / FFMAX(uplims[w*16+g],euplims[w*16+g]);
+                            maxoverdist = FFMAX(maxoverdist, ovrdist);
+                            overdist++;
+                        }
+                    }
+                }
+                if (overdist) {
+                    /* We have overdistorted bands, trade for zeroes (that can be noise)
+                     * Zero the bands in the lowest 1.25% spread-energy-threshold ranking
+                     */
+                    float minspread = max_spread_thr_r;
+                    float maxspread = min_spread_thr_r;
+                    float zspread;
+                    int zeroable = 0;
+                    int zeroed = 0;
+                    int maxzeroed;
+                    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                        for (g = start = 0;  g < sce->ics.num_swb; start += sce->ics.swb_sizes[g++]) {
+                            if (start >= pns_start_pos && !sce->zeroes[w*16+g] && sce->can_pns[w*16+g]) {
+                                minspread = FFMIN(minspread, spread_thr_r[w*16+g]);
+                                maxspread = FFMAX(maxspread, spread_thr_r[w*16+g]);
+                                zeroable++;
+                            }
+                        }
+                    }
+                    zspread = (maxspread-minspread) * 0.0125f + minspread;
+                    zspread = FFMIN(maxoverdist, zspread);
+                    maxzeroed = zeroable * its / (2 * maxits);
+                    for (g = sce->ics.num_swb-1; g > 0 && zeroed < maxzeroed; g--) {
+                        if (sce->ics.swb_offset[g] < pns_start_pos)
+                            continue;
+                        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                            if (!sce->zeroes[w*16+g] && sce->can_pns[w*16+g] && spread_thr_r[w*16+g] <= zspread) {
+                                sce->zeroes[w*16+g] = 1;
+                                sce->band_type[w*16+g] = 0;
+                                zeroed++;
+                            }
+                        }
+                    }
+                    if (zeroed)
+                        recomprd = 1;
+                } else {
+                    overdist = 0;
+                }
+            }
+        }
+
+        minscaler = SCALE_MAX_POS;
+        maxscaler = 0;
        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+            for (g = 0;  g < sce->ics.num_swb; g++) {
+                if (!sce->zeroes[w*16+g]) {
+                    minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
+                    maxscaler = FFMAX(maxscaler, sce->sf_idx[w*16+g]);
+                }
+            }
+        }
+
+        fflag = 0;
+        minscaler = nminscaler = av_clip(minscaler, SCALE_ONE_POS - SCALE_DIV_512, SCALE_MAX_POS - SCALE_DIV_512);
+        minrdsf = FFMAX3(60, minscaler - 1, maxscaler - SCALE_MAX_DIFF - 1);
+        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+            /** Start with big steps, end up fine-tunning */
+            int depth = (its > maxits/2) ? ((its > maxits*2/3) ? 1 : 3) : 10;
+            int edepth = depth+2;
+            float uplmax = its / (maxits*0.25f) + 1.0f;
+            uplmax *= (tbits > destbits) ? FFMIN(2.0f, tbits / (float)FFMAX(1,destbits)) : 1.0f;
+            start = w * 128;
            for (g = 0; g < sce->ics.num_swb; g++) {
                int prevsc = sce->sf_idx[w*16+g];
-                if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
-                    if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
-                        sce->sf_idx[w*16+g]--;
-                    else //Try to make sure there is some energy in every band
-                        sce->sf_idx[w*16+g]-=2;
+                int minrdsfboost = (sce->ics.num_windows > 1) ? av_clip(g-4, -2, 0) : av_clip(g-16, -4, 0);
+                if (!sce->zeroes[w*16+g]) {
+                    const float *coefs = sce->coeffs + start;
+                    const float *scaled = s->scoefs + start;
+                    int cmb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                    if ((!cmb || dists[w*16+g] > uplims[w*16+g]) && sce->sf_idx[w*16+g] > minrdsf) {
+                        /* Try to make sure there is some energy in every nonzero band
+                         * NOTE: This algorithm must be forcibly imbalanced, pushing harder
+                         *  on holes or more distorted bands at first, otherwise there's
+                         *  no net gain (since the next iteration will offset all bands
+                         *  on the opposite direction to compensate for extra bits)
+                         */
+                        for (i = 0; i < edepth; ++i) {
+                            int cb, bits;
+                            float dist, qenergy;
+                            int mb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1);
+                            cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                            dist = qenergy = 0.f;
+                            bits = 0;
+                            if (!cb) {
+                                maxsf[w*16+g] = FFMIN(sce->sf_idx[w*16+g]-1, maxsf[w*16+g]);
+                            } else if (i >= depth && dists[w*16+g] < euplims[w*16+g]) {
+                                break;
+                            }
+                            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                                int b;
+                                float sqenergy;
+                                dist += quantize_band_cost(s, coefs + w2*128,
+                                                        scaled + w2*128,
+                                                        sce->ics.swb_sizes[g],
+                                                        sce->sf_idx[w*16+g]-1,
+                                                        cb,
+                                                        1.0f,
+                                                        INFINITY,
+                                                        &b, &sqenergy,
+                                                        0);
+                                bits += b;
+                                qenergy += sqenergy;
+                            }
+                            sce->sf_idx[w*16+g]--;
+                            dists[w*16+g] = dist - bits;
+                            qenergies[w*16+g] = qenergy;
+                            if (mb && (sce->sf_idx[w*16+g] < (minrdsf+minrdsfboost) || (
+                                    (dists[w*16+g] < FFMIN(uplmax*uplims[w*16+g], euplims[w*16+g]))
+                                    && (fabsf(qenergies[w*16+g]-energies[w*16+g]) < euplims[w*16+g])
+                                ) )) {
+                                break;
+                            }
+                        }
+                    } else if (tbits > toofewbits && sce->sf_idx[w*16+g] < maxscaler
+                            && (dists[w*16+g] < FFMIN(euplims[w*16+g], uplims[w*16+g]))
+                            && (fabsf(qenergies[w*16+g]-energies[w*16+g]) < euplims[w*16+g])
+                        ) {
+                        /** Um... over target. Save bits for more important stuff. */
+                        for (i = 0; i < depth; ++i) {
+                            int cb, bits;
+                            float dist, qenergy;
+                            cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]+1);
+                            if (cb > 0) {
+                                dist = qenergy = 0.f;
+                                bits = 0;
+                                for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
+                                    int b;
+                                    float sqenergy;
+                                    dist += quantize_band_cost(s, coefs + w2*128,
+                                                            scaled + w2*128,
+                                                            sce->ics.swb_sizes[g],
+                                                            sce->sf_idx[w*16+g]+1,
+                                                            cb,
+                                                            1.0f,
+                                                            INFINITY,
+                                                            &b, &sqenergy,
+                                                            0);
+                                    bits += b;
+                                    qenergy += sqenergy;
+                                }
+                                dist -= bits;
+                                if (dist < FFMIN(euplims[w*16+g], uplims[w*16+g])) {
+                                    sce->sf_idx[w*16+g]++;
+                                    dists[w*16+g] = dist;
+                                    qenergies[w*16+g] = qenergy;
+                                } else {
+                                    break;
+                                }
+                            } else {
+                                maxsf[w*16+g] = FFMIN(sce->sf_idx[w*16+g], maxsf[w*16+g]);
+                                break;
+                            }
+                        }
+                    }
                }
-                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
-                sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
+                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minrdsf, minscaler + SCALE_MAX_DIFF);
+                sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], SCALE_MAX_POS - SCALE_DIV_512);
                if (sce->sf_idx[w*16+g] != prevsc)
                    fflag = 1;
+                nminscaler = FFMIN(nminscaler, sce->sf_idx[w*16+g]);
                sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                start += sce->ics.swb_sizes[g];
+            }
+        }
+        if (nminscaler < minscaler) {
+            /** Drecreased some scalers below minscaler. Must re-clamp. */
+            minscaler = nminscaler;
+            for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+                for (g = 0; g < sce->ics.num_swb; g++) {
+                    sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
+                    sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                }
            }
        }
        its++;
-    } while (fflag && its < 10);
+    } while (fflag && its < maxits);
+
+    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
+        /** Make sure proper codebooks are set */
+        for (g = start = 0; g < sce->ics.num_swb; start += sce->ics.swb_sizes[g++]) {
+            if (!sce->zeroes[w*16+g]) {
+                sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
+                if (sce->band_type[w*16+g] <= 0) {
+                    sce->zeroes[w*16+g] = 1;
+                    sce->band_type[w*16+g] = 0;
+                }
+            } else {
+                sce->band_type[w*16+g] = 0;
+            }
+        }
+    }
 }

 #endif /* AVCODEC_AACCODER_TWOLOOP_H */
--- a/libavcodec/aacenc.c
+++ b/libavcodec/aacenc.c
@ -258,6 +258,8 @@ static void apply_intensity_stereo(ChannelElement *cpe)
                    start += ics->swb_sizes[g];
                    continue;
                }
+                if (cpe->ms_mask[w*16 + g])
+                    p *= -1;
                for (i = 0; i < ics->swb_sizes[g]; i++) {
                    float sum = (cpe->ch[0].coeffs[start+i] + p*cpe->ch[1].coeffs[start+i])*scale;
                    cpe->ch[0].coeffs[start+i] = sum;
@ -279,7 +281,7 @@ static void apply_mid_side_stereo(ChannelElement *cpe)
        for (w2 =  0; w2 < ics->group_len[w]; w2++) {
            int start = (w+w2) * 128;
            for (g = 0; g < ics->num_swb; g++) {
-                if (!cpe->ms_mask[w*16 + g]) {
+                if (!cpe->ms_mask[w*16 + g] && !cpe->is_mask[w*16 + g]) {
                    start += ics->swb_sizes[g];
                    continue;
                }
@ -490,6 +492,7 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
    ChannelElement *cpe;
    SingleChannelElement *sce;
    int i, its, ch, w, chans, tag, start_ch, ret, frame_bits;
+    int target_bits, rate_bits, too_many_bits, too_few_bits;
    int ms_mode = 0, is_mode = 0, tns_mode = 0, pred_mode = 0;
    int chan_el_counter[4];
    FFPsyWindowInfo windows[AAC_MAX_CHANNELS];
@ -583,8 +586,6 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
        return ret;
    frame_bits = its = 0;
    do {
-        int target_bits, too_many_bits, too_few_bits;
-
        init_put_bits(&s->pb, avpkt->data, avpkt->size);

        if ((avctx->frame_number & 0xFF)==1 && !(avctx->flags & AV_CODEC_FLAG_BITEXACT))
@ -618,12 +619,15 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
            s->psy.model->analyze(&s->psy, start_ch, coeffs, wi);
            if (s->psy.bitres.alloc > 0) {
                /* Lambda unused here on purpose, we need to take psy's unscaled allocation */
-                target_bits += s->psy.bitres.alloc;
+                target_bits += s->psy.bitres.alloc
+                    * (s->lambda / (avctx->global_quality ? avctx->global_quality : 120));
                s->psy.bitres.alloc /= chans;
            }
            s->cur_type = tag;
            for (ch = 0; ch < chans; ch++) {
                s->cur_channel = start_ch + ch;
+                if (s->options.pns && s->coder->mark_pns)
+                    s->coder->mark_pns(s, avctx, &cpe->ch[ch]);
                s->coder->search_for_quantizers(avctx, s, &cpe->ch[ch], s->lambda);
            }
            if (chans > 1
@ -680,8 +684,6 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                    s->coder->search_for_ms(s, cpe);
                else if (cpe->common_window)
                    memset(cpe->ms_mask, 1, sizeof(cpe->ms_mask));
-                for (w = 0; w < 128; w++)
-                    cpe->ms_mask[w] = cpe->is_mask[w] ? 0 : cpe->ms_mask[w];
                apply_mid_side_stereo(cpe);
            }
            adjust_frame_information(cpe, chans);
@ -708,23 +710,25 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
        }

        /* rate control stuff
-         * target either the nominal bitrate, or what psy's bit reservoir says to target
-         * whichever is greatest
+         * allow between the nominal bitrate, and what psy's bit reservoir says to target
+         * but drift towards the nominal bitrate always
         */
-
        frame_bits = put_bits_count(&s->pb);
-        target_bits = FFMAX(target_bits, avctx->bit_rate * 1024 / avctx->sample_rate);
-        target_bits = FFMIN(target_bits, 6144 * s->channels - 3);
+        rate_bits = avctx->bit_rate * 1024 / avctx->sample_rate;
+        rate_bits = FFMIN(rate_bits, 6144 * s->channels - 3);
+        too_many_bits = FFMAX(target_bits, rate_bits);
+        too_many_bits = FFMIN(too_many_bits, 6144 * s->channels - 3);
+        too_few_bits = FFMIN(FFMAX(rate_bits - rate_bits/4, target_bits), too_many_bits);

        /* When using ABR, be strict (but only for increasing) */
-        too_many_bits = target_bits + target_bits/2;
-        too_few_bits = target_bits - target_bits/8;
+        too_few_bits = too_few_bits - too_few_bits/8;
+        too_many_bits = too_many_bits + too_many_bits/2;

        if (   its == 0 /* for steady-state Q-scale tracking */
            || (its < 5 && (frame_bits < too_few_bits || frame_bits > too_many_bits))
            || frame_bits >= 6144 * s->channels - 3  )
        {
-            float ratio = ((float)target_bits) / frame_bits;
+            float ratio = ((float)rate_bits) / frame_bits;

            if (frame_bits >= too_few_bits && frame_bits <= too_many_bits) {
                /*
@ -742,7 +746,7 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
            s->lambda = FFMIN(s->lambda * ratio, 65536.f);

            /* Keep iterating if we must reduce and lambda is in the sky */
-            if (s->lambda < 300.f || ratio > 0.9f) {
+            if ((s->lambda < 300.f || ratio > 0.9f) && (s->lambda > 10.f || ratio < 1.1f)) {
                break;
            } else {
                if (is_mode || ms_mode || tns_mode || pred_mode) {
@ -764,6 +768,8 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
    put_bits(&s->pb, 3, TYPE_END);
    flush_put_bits(&s->pb);
    avctx->frame_bits = put_bits_count(&s->pb);
+    s->lambda_sum += s->lambda;
+    s->lambda_count++;

    if (!frame)
        s->last_frame++;
@ -780,6 +786,8 @@ static av_cold int aac_encode_end(AVCodecContext *avctx)
 {
    AACEncContext *s = avctx->priv_data;

+    av_log(avctx, AV_LOG_INFO, "Qavg: %.3f\n", s->lambda_sum / s->lambda_count);
+
    ff_mdct_end(&s->mdct1024);
    ff_mdct_end(&s->mdct128);
    ff_psy_end(&s->psy);
--- a/libavcodec/aacenc.h
+++ b/libavcodec/aacenc.h
@ -66,6 +66,7 @@ typedef struct AACCoefficientsEncoder {
    void (*apply_tns_filt)(struct AACEncContext *s, SingleChannelElement *sce);
    void (*set_special_band_scalefactors)(struct AACEncContext *s, SingleChannelElement *sce);
    void (*search_for_pns)(struct AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce);
+    void (*mark_pns)(struct AACEncContext *s, AVCodecContext *avctx, SingleChannelElement *sce);
    void (*search_for_tns)(struct AACEncContext *s, SingleChannelElement *sce);
    void (*search_for_ms)(struct AACEncContext *s, ChannelElement *cpe);
    void (*search_for_is)(struct AACEncContext *s, AVCodecContext *avctx, ChannelElement *cpe);
@ -100,6 +101,8 @@ typedef struct AACEncContext {
    int last_frame;
    int random_state;
    float lambda;
+    float lambda_sum;                            ///< sum(lambda), for Qvg reporting
+    int lambda_count;                            ///< count(lambda), for Qvg reporting
    enum RawDataBlockType cur_type;              ///< channel group type cur_channel belongs to

    AudioFrameQueue afq;
--- a/libavcodec/aacenc_is.c
+++ b/libavcodec/aacenc_is.c
@ -45,6 +45,11 @@ struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
    float dist1 = 0.0f, dist2 = 0.0f;
    struct AACISError is_error = {0};

+    if (ener01 <= 0 || ener0 <= 0) {
+        is_error.pass = 0;
+        return is_error;
+    }
+
    for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
        FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
        FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
@ -63,15 +68,15 @@ struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
                                    sce0->ics.swb_sizes[g],
                                    sce0->sf_idx[(w+w2)*16+g],
                                    sce0->band_type[(w+w2)*16+g],
-                                    s->lambda / band0->threshold, INFINITY, NULL, 0);
+                                    s->lambda / band0->threshold, INFINITY, NULL, NULL, 0);
        dist1 += quantize_band_cost(s, &R[start + (w+w2)*128], R34,
                                    sce1->ics.swb_sizes[g],
                                    sce1->sf_idx[(w+w2)*16+g],
                                    sce1->band_type[(w+w2)*16+g],
-                                    s->lambda / band1->threshold, INFINITY, NULL, 0);
+                                    s->lambda / band1->threshold, INFINITY, NULL, NULL, 0);
        dist2 += quantize_band_cost(s, IS, I34, sce0->ics.swb_sizes[g],
                                    is_sf_idx, is_band_type,
-                                    s->lambda / minthr, INFINITY, NULL, 0);
+                                    s->lambda / minthr, INFINITY, NULL, NULL, 0);
        for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
            dist_spec_err += (L34[i] - I34[i])*(L34[i] - I34[i]);
            dist_spec_err += (R34[i] - I34[i]*e01_34)*(R34[i] - I34[i]*e01_34);
@ -85,6 +90,7 @@ struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
    is_error.error = fabsf(dist1 - dist2);
    is_error.dist1 = dist1;
    is_error.dist2 = dist2;
+    is_error.ener01 = ener01;

    return is_error;
 }
@ -105,7 +111,7 @@ void ff_aac_search_for_is(AACEncContext *s, AVCodecContext *avctx, ChannelElemen
            if (start*freq_mult > INT_STEREO_LOW_LIMIT*(s->lambda/170.0f) &&
                cpe->ch[0].band_type[w*16+g] != NOISE_BT && !cpe->ch[0].zeroes[w*16+g] &&
                cpe->ch[1].band_type[w*16+g] != NOISE_BT && !cpe->ch[1].zeroes[w*16+g]) {
-                float ener0 = 0.0f, ener1 = 0.0f, ener01 = 0.0f;
+                float ener0 = 0.0f, ener1 = 0.0f, ener01 = 0.0f, ener01p = 0.0f;
                struct AACISError ph_err1, ph_err2, *erf;
                if (sce0->band_type[w*16+g] == NOISE_BT ||
                    sce1->band_type[w*16+g] == NOISE_BT) {
@ -114,23 +120,25 @@ void ff_aac_search_for_is(AACEncContext *s, AVCodecContext *avctx, ChannelElemen
                }
                for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
                    for (i = 0; i < sce0->ics.swb_sizes[g]; i++) {
-                        float coef0 = fabsf(sce0->pcoeffs[start+(w+w2)*128+i]);
-                        float coef1 = fabsf(sce1->pcoeffs[start+(w+w2)*128+i]);
+                        float coef0 = fabsf(sce0->coeffs[start+(w+w2)*128+i]);
+                        float coef1 = fabsf(sce1->coeffs[start+(w+w2)*128+i]);
                        ener0  += coef0*coef0;
                        ener1  += coef1*coef1;
                        ener01 += (coef0 + coef1)*(coef0 + coef1);
+                        ener01p += (coef0 - coef1)*(coef0 - coef1);
                    }
                }
                ph_err1 = ff_aac_is_encoding_err(s, cpe, start, w, g,
-                                                 ener0, ener1, ener01, 0, -1);
+                                                 ener0, ener1, ener01p, 0, -1);
                ph_err2 = ff_aac_is_encoding_err(s, cpe, start, w, g,
                                                 ener0, ener1, ener01, 0, +1);
-                erf = ph_err1.error < ph_err2.error ? &ph_err1 : &ph_err2;
+                erf = (ph_err1.pass && ph_err1.error < ph_err2.error) ? &ph_err1 : &ph_err2;
                if (erf->pass) {
                    cpe->is_mask[w*16+g] = 1;
-                    cpe->ch[0].is_ener[w*16+g] = sqrt(ener0/ener01);
+                    cpe->ms_mask[w*16+g] = 0;
+                    cpe->ch[0].is_ener[w*16+g] = sqrt(ener0 / erf->ener01);
                    cpe->ch[1].is_ener[w*16+g] = ener0/ener1;
-                    cpe->ch[1].band_type[w*16+g] = erf->phase ? INTENSITY_BT : INTENSITY_BT2;
+                    cpe->ch[1].band_type[w*16+g] = (erf->phase > 0) ? INTENSITY_BT : INTENSITY_BT2;
                    count++;
                }
            }
--- a/libavcodec/aacenc_is.h
+++ b/libavcodec/aacenc_is.h
@ -39,6 +39,7 @@ struct AACISError {
    float error; /* fabs(dist1 - dist2)  */
    float dist1; /* From original coeffs */
    float dist2; /* From IS'd coeffs     */
+    float ener01;
 };

 struct AACISError ff_aac_is_encoding_err(AACEncContext *s, ChannelElement *cpe,
--- a/libavcodec/aacenc_pred.c
+++ b/libavcodec/aacenc_pred.c
@ -271,7 +271,7 @@ void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce)
        abs_pow34_v(O34, &sce->coeffs[start_coef], num_coeffs);
        dist1 = quantize_and_encode_band_cost(s, NULL, &sce->coeffs[start_coef], NULL,
                                              O34, num_coeffs, sce->sf_idx[sfb],
-                                              cb_n, s->lambda / band->threshold, INFINITY, &cost1, 0);
+                                              cb_n, s->lambda / band->threshold, INFINITY, &cost1, NULL, 0);
        cost_coeffs += cost1;

        /* Encoded coefficients - needed for #bits, band type and quant. error */
@ -284,7 +284,7 @@ void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce)
            cb_p = cb_n;
        quantize_and_encode_band_cost(s, NULL, SENT, QERR, S34, num_coeffs,
                                      sce->sf_idx[sfb], cb_p, s->lambda / band->threshold, INFINITY,
-                                      &cost2, 0);
+                                      &cost2, NULL, 0);

        /* Reconstructed coefficients - needed for distortion measurements */
        for (i = 0; i < num_coeffs; i++)
@ -296,7 +296,7 @@ void ff_aac_search_for_pred(AACEncContext *s, SingleChannelElement *sce)
            cb_p = cb_n;
        dist2 = quantize_and_encode_band_cost(s, NULL, &sce->prcoeffs[start_coef], NULL,
                                              P34, num_coeffs, sce->sf_idx[sfb],
-                                              cb_p, s->lambda / band->threshold, INFINITY, NULL, 0);
+                                              cb_p, s->lambda / band->threshold, INFINITY, NULL, NULL, 0);
        for (i = 0; i < num_coeffs; i++)
            dist_spec_err += (O34[i] - P34[i])*(O34[i] - P34[i]);
        dist_spec_err *= s->lambda / band->threshold;
--- a/libavcodec/aacenc_quantization.h
+++ b/libavcodec/aacenc_quantization.h
@ -43,7 +43,7 @@ static av_always_inline float quantize_and_encode_band_cost_template(
                                PutBitContext *pb, const float *in, float *out,
                                const float *scaled, int size, int scale_idx,
                                int cb, const float lambda, const float uplim,
-                                int *bits, int BT_ZERO, int BT_UNSIGNED,
+                                int *bits, float *energy, int BT_ZERO, int BT_UNSIGNED,
                                int BT_PAIR, int BT_ESC, int BT_NOISE, int BT_STEREO,
                                const float ROUNDING)
 {
@ -54,6 +54,7 @@ static av_always_inline float quantize_and_encode_band_cost_template(
    const float CLIPPED_ESCAPE = 165140.0f*IQ;
    int i, j;
    float cost = 0;
+    float qenergy = 0;
    const int dim = BT_PAIR ? 2 : 4;
    int resbits = 0;
    int off;
@ -63,6 +64,8 @@ static av_always_inline float quantize_and_encode_band_cost_template(
            cost += in[i]*in[i];
        if (bits)
            *bits = 0;
+        if (energy)
+            *energy = qenergy;
        if (out) {
            for (i = 0; i < size; i += dim)
                for (j = 0; j < dim; j++)
@ -113,11 +116,13 @@ static av_always_inline float quantize_and_encode_band_cost_template(
                    out[i+j] = in[i+j] >= 0 ? quantized : -quantized;
                if (vec[j] != 0.0f)
                    curbits++;
+                qenergy += quantized*quantized;
                rd += di*di;
            }
        } else {
            for (j = 0; j < dim; j++) {
                quantized = vec[j]*IQ;
+                qenergy += quantized*quantized;
                if (out)
                    out[i+j] = quantized;
                rd += (in[i+j] - quantized)*(in[i+j] - quantized);
@ -149,6 +154,8 @@ static av_always_inline float quantize_and_encode_band_cost_template(

    if (bits)
        *bits = resbits;
+    if (energy)
+        *energy = qenergy;
    return cost;
 }

@ -156,7 +163,7 @@ static inline float quantize_and_encode_band_cost_NONE(struct AACEncContext *s,
                                                const float *in, float *quant, const float *scaled,
                                                int size, int scale_idx, int cb,
                                                const float lambda, const float uplim,
-                                                int *bits) {
+                                                int *bits, float *energy) {
    av_assert0(0);
    return 0.0f;
 }
@ -167,10 +174,10 @@ static float quantize_and_encode_band_cost_ ## NAME(
                                PutBitContext *pb, const float *in, float *quant,            \
                                const float *scaled, int size, int scale_idx,                \
                                int cb, const float lambda, const float uplim,               \
-                                int *bits) {                                                 \
+                                int *bits, float *energy) {                                  \
    return quantize_and_encode_band_cost_template(                                           \
                                s, pb, in, quant, scaled, size, scale_idx,                   \
-                                BT_ESC ? ESC_BT : cb, lambda, uplim, bits,                   \
+                                BT_ESC ? ESC_BT : cb, lambda, uplim, bits, energy,           \
                                BT_ZERO, BT_UNSIGNED, BT_PAIR, BT_ESC, BT_NOISE, BT_STEREO,  \
                                ROUNDING);                                                   \
 }
@ -190,7 +197,7 @@ static float (*const quantize_and_encode_band_cost_arr[])(
                                PutBitContext *pb, const float *in, float *quant,
                                const float *scaled, int size, int scale_idx,
                                int cb, const float lambda, const float uplim,
-                                int *bits) = {
+                                int *bits, float *energy) = {
    quantize_and_encode_band_cost_ZERO,
    quantize_and_encode_band_cost_SQUAD,
    quantize_and_encode_band_cost_SQUAD,
@ -214,7 +221,7 @@ static float (*const quantize_and_encode_band_cost_rtz_arr[])(
                                PutBitContext *pb, const float *in, float *quant,
                                const float *scaled, int size, int scale_idx,
                                int cb, const float lambda, const float uplim,
-                                int *bits) = {
+                                int *bits, float *energy) = {
    quantize_and_encode_band_cost_ZERO,
    quantize_and_encode_band_cost_SQUAD,
    quantize_and_encode_band_cost_SQUAD,
@ -235,32 +242,32 @@ static float (*const quantize_and_encode_band_cost_rtz_arr[])(

 #define quantize_and_encode_band_cost(                                  \
                                s, pb, in, quant, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits, rtz)               \
+                                lambda, uplim, bits, energy, rtz)               \
    ((rtz) ? quantize_and_encode_band_cost_rtz_arr : quantize_and_encode_band_cost_arr)[cb]( \
                                s, pb, in, quant, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits)
+                                lambda, uplim, bits, energy)

 static inline float quantize_band_cost(struct AACEncContext *s, const float *in,
                                const float *scaled, int size, int scale_idx,
                                int cb, const float lambda, const float uplim,
-                                int *bits, int rtz)
+                                int *bits, float *energy, int rtz)
 {
    return quantize_and_encode_band_cost(s, NULL, in, NULL, scaled, size, scale_idx,
-                                         cb, lambda, uplim, bits, rtz);
+                                         cb, lambda, uplim, bits, energy, rtz);
 }

 static inline int quantize_band_cost_bits(struct AACEncContext *s, const float *in,
                                const float *scaled, int size, int scale_idx,
                                int cb, const float lambda, const float uplim,
-                                int *bits, int rtz)
+                                int *bits, float *energy, int rtz)
 {
-    int _bits;
+    int auxbits;
    quantize_and_encode_band_cost(s, NULL, in, NULL, scaled, size, scale_idx,
-                                         cb, 0.0f, uplim, &_bits, rtz);
+                                         cb, 0.0f, uplim, &auxbits, energy, rtz);
    if (bits) {
-        *bits = _bits;
+        *bits = auxbits;
    }
-    return _bits;
+    return auxbits;
 }

 static inline void quantize_and_encode_band(struct AACEncContext *s, PutBitContext *pb,
@ -268,7 +275,7 @@ static inline void quantize_and_encode_band(struct AACEncContext *s, PutBitConte
                                            int cb, const float lambda, int rtz)
 {
    quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
-                                  INFINITY, NULL, rtz);
+                                  INFINITY, NULL, NULL, rtz);
 }

 #endif /* AVCODEC_AACENC_QUANTIZATION_H */
--- a/libavcodec/aacenc_utils.h
+++ b/libavcodec/aacenc_utils.h
@ -96,6 +96,54 @@ static inline int find_min_book(float maxval, int sf)
    return cb;
 }

+static float find_form_factor(int group_len, int swb_size, float thresh, const float *scaled, float nzslope) {
+    const float iswb_size = 1.0f / swb_size;
+    const float iswb_sizem1 = 1.0f / (swb_size - 1);
+    const float ethresh = thresh;
+    float form = 0.0f, weight = 0.0f;
+    int w2, i;
+    for (w2 = 0; w2 < group_len; w2++) {
+        float e = 0.0f, e2 = 0.0f, var = 0.0f, maxval = 0.0f;
+        float nzl = 0;
+        for (i = 0; i < swb_size; i++) {
+            float s = fabsf(scaled[w2*128+i]);
+            maxval = FFMAX(maxval, s);
+            e += s;
+            e2 += s *= s;
+            /* We really don't want a hard non-zero-line count, since
+             * even below-threshold lines do add up towards band spectral power.
+             * So, fall steeply towards zero, but smoothly
+             */
+            if (s >= ethresh) {
+                nzl += 1.0f;
+            } else {
+                nzl += powf(s / ethresh, nzslope);
+            }
+        }
+        if (e2 > thresh) {
+            float frm;
+            e *= iswb_size;
+
+            /** compute variance */
+            for (i = 0; i < swb_size; i++) {
+                float d = fabsf(scaled[w2*128+i]) - e;
+                var += d*d;
+            }
+            var = sqrtf(var * iswb_sizem1);
+
+            e2 *= iswb_size;
+            frm = e / FFMIN(e+4*var,maxval);
+            form += e2 * sqrtf(frm) / FFMAX(0.5f,nzl);
+            weight += e2;
+        }
+    }
+    if (weight > 0) {
+        return form / weight;
+    } else {
+        return 1.0f;
+    }
+}
+
 /** Return the minimum scalefactor where the quantized coef does not clip. */
 static inline uint8_t coef2minsf(float coef)
 {
@ -125,6 +173,14 @@ static inline int quant_array_idx(const float val, const float *arr, const int n
    return index;
 }

+/**
+ * approximates exp10f(-3.0f*(0.5f + 0.5f * cosf(FFMIN(b,15.5f) / 15.5f)))
+ */
+static av_always_inline float bval2bmax(float b)
+{
+    return 0.001f + 0.0035f * (b*b*b) / (15.5f*15.5f*15.5f);
+}
+
 /*
 * linear congruential pseudorandom number generator, copied from the decoder
 */
--- a/libavcodec/aacpsy.c
+++ b/libavcodec/aacpsy.c
@ -158,6 +158,7 @@ typedef struct AacPsyContext{
    } pe;
    AacPsyCoeffs psy_coef[2][64];
    AacPsyChannel *ch;
+    float global_quality; ///< normalized global quality taken from avctx
 }AacPsyContext;

 /**
@ -300,7 +301,8 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
    float bark;
    int i, j, g, start;
    float prev, minscale, minath, minsnr, pe_min;
-    const int chan_bitrate = ctx->avctx->bit_rate / ctx->avctx->channels;
+    int chan_bitrate = ctx->avctx->bit_rate / ((ctx->avctx->flags & CODEC_FLAG_QSCALE) ? 2.0f : ctx->avctx->channels);
+
    const int bandwidth    = ctx->avctx->cutoff ? ctx->avctx->cutoff : AAC_CUTOFF(ctx->avctx);
    const float num_bark   = calc_bark((float)bandwidth);

@ -308,9 +310,15 @@ static av_cold int psy_3gpp_init(FFPsyContext *ctx) {
    if (!ctx->model_priv_data)
        return AVERROR(ENOMEM);
    pctx = (AacPsyContext*) ctx->model_priv_data;
+    pctx->global_quality = (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120) * 0.01f;
+
+    if (ctx->avctx->flags & CODEC_FLAG_QSCALE) {
+        /* Use the target average bitrate to compute spread parameters */
+        chan_bitrate = (int)(chan_bitrate / 120.0 * (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120));
+    }

    pctx->chan_bitrate = chan_bitrate;
-    pctx->frame_bits   = chan_bitrate * AAC_BLOCK_SIZE_LONG / ctx->avctx->sample_rate;
+    pctx->frame_bits   = FFMIN(2560, chan_bitrate * AAC_BLOCK_SIZE_LONG / ctx->avctx->sample_rate);
    pctx->pe.min       =  8.0f * AAC_BLOCK_SIZE_LONG * bandwidth / (ctx->avctx->sample_rate * 2.0f);
    pctx->pe.max       = 12.0f * AAC_BLOCK_SIZE_LONG * bandwidth / (ctx->avctx->sample_rate * 2.0f);
    ctx->bitres.size   = 6144 - pctx->frame_bits;
@ -398,7 +406,7 @@ static av_unused FFPsyWindowInfo psy_3gpp_window(FFPsyContext *ctx,
                                                 int channel, int prev_type)
 {
    int i, j;
-    int br               = ctx->avctx->bit_rate / ctx->avctx->channels;
+    int br               = ((AacPsyContext*)ctx->model_priv_data)->chan_bitrate;
    int attack_ratio     = br <= 16000 ? 18 : 10;
    AacPsyContext *pctx = (AacPsyContext*) ctx->model_priv_data;
    AacPsyChannel *pch  = &pctx->ch[channel];
@ -508,7 +516,12 @@ static int calc_bit_demand(AacPsyContext *ctx, float pe, int bits, int size,
    ctx->pe.max = FFMAX(pe, ctx->pe.max);
    ctx->pe.min = FFMIN(pe, ctx->pe.min);

-    return FFMIN(ctx->frame_bits * bit_factor, ctx->frame_bits + size - bits);
+    /* NOTE: allocate a minimum of 1/8th average frame bits, to avoid
+     *   reservoir starvation from producing zero-bit frames
+     */
+    return FFMIN(
+        ctx->frame_bits * bit_factor,
+        FFMAX(ctx->frame_bits + size - bits, ctx->frame_bits / 8));
 }

 static float calc_pe_3gpp(AacPsyBand *band)
@ -678,8 +691,26 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,

    /* 5.6.1.3.2 "Calculation of the desired perceptual entropy" */
    ctx->ch[channel].entropy = pe;
+    if (ctx->avctx->flags & CODEC_FLAG_QSCALE) {
+        /* (2.5 * 120) achieves almost transparent rate, and we want to give
+         * ample room downwards, so we make that equivalent to QSCALE=2.4
+         */
+        desired_pe = pe * (ctx->avctx->global_quality ? ctx->avctx->global_quality : 120) / (2 * 2.5f * 120.0f);
+        desired_bits = FFMIN(2560, PSY_3GPP_PE_TO_BITS(desired_pe));
+        desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits); // reflect clipping
+
+        /* PE slope smoothing */
+        if (ctx->bitres.bits > 0) {
+            desired_bits = FFMIN(2560, PSY_3GPP_PE_TO_BITS(desired_pe));
+            desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits); // reflect clipping
+        }
+
+        pctx->pe.max = FFMAX(pe, pctx->pe.max);
+        pctx->pe.min = FFMIN(pe, pctx->pe.min);
+    } else {
    desired_bits = calc_bit_demand(pctx, pe, ctx->bitres.bits, ctx->bitres.size, wi->num_windows == 8);
    desired_pe = PSY_3GPP_BITS_TO_PE(desired_bits);
+
    /* NOTE: PE correction is kept simple. During initial testing it had very
     *       little effect on the final bitrate. Probably a good idea to come
     *       back and do more testing later.
@ -687,6 +718,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel,
    if (ctx->bitres.bits > 0)
        desired_pe *= av_clipf(pctx->pe.previous / PSY_3GPP_BITS_TO_PE(ctx->bitres.bits),
                               0.85f, 1.15f);
+    }
    pctx->pe.previous = PSY_3GPP_BITS_TO_PE(desired_bits);
    ctx->bitres.alloc = desired_bits;

--- a/libavcodec/mathops.h
+++ b/libavcodec/mathops.h
@ -233,6 +233,11 @@ static inline av_const unsigned int ff_sqrt(unsigned int a)
 }
 #endif

+static inline av_const float ff_sqrf(float a)
+{
+    return a*a;
+}
+
 static inline int8_t ff_u8_to_s8(uint8_t a)
 {
    union {
--- a/libavcodec/mips/aaccoder_mips.c
+++ b/libavcodec/mips/aaccoder_mips.c
@ -178,6 +178,7 @@ static int find_min_book(float maxval, int sf) {
    float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
    float Q34 = sqrtf(Q * sqrtf(Q));
    int qmaxval, cb;
+    qmaxval = maxval * Q34 + 0.4054f;
    if (qmaxval >= (FF_ARRAY_ELEMS(aac_maxval_cb)))
        cb = 11;
    else
@ -192,12 +193,13 @@ static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
                                                     PutBitContext *pb, const float *in, float *out,
                                                     const float *scaled, int size, int scale_idx,
                                                     int cb, const float lambda, const float uplim,
-                                                     int *bits, const float ROUNDING)
+                                                     int *bits, float *energy, const float ROUNDING)
 {
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;

    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
@ -262,26 +264,38 @@ static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,

        put_bits(pb, p_bits[curidx], p_codes[curidx]);

-        if (out) {
-           vec = &p_vec[curidx*4];
-           out[i+0] = vec[0] * IQ;
-           out[i+1] = vec[1] * IQ;
-           out[i+2] = vec[2] * IQ;
-           out[i+3] = vec[3] * IQ;
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec = &p_vec[curidx*4];
+            e1 = vec[0] * IQ;
+            e2 = vec[1] * IQ;
+            e3 = vec[2] * IQ;
+            e4 = vec[3] * IQ;
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
        }
    }
+    if (energy)
+        *energy = qenergy;
 }

 static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
                                                     PutBitContext *pb, const float *in, float *out,
                                                     const float *scaled, int size, int scale_idx,
                                                     int cb, const float lambda, const float uplim,
-                                                     int *bits, const float ROUNDING)
+                                                     int *bits, float *energy, const float ROUNDING)
 {
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;

    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
@ -365,26 +379,38 @@ static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
        v_bits  = p_bits[curidx] + count;
        put_bits(pb, v_bits, v_codes);

-        if (out) {
-           vec = &p_vec[curidx*4];
-           out[i+0] = copysignf(vec[0] * IQ, in[i+0]);
-           out[i+1] = copysignf(vec[1] * IQ, in[i+1]);
-           out[i+2] = copysignf(vec[2] * IQ, in[i+2]);
-           out[i+3] = copysignf(vec[3] * IQ, in[i+3]);
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec = &p_vec[curidx*4];
+            e1 = copysignf(vec[0] * IQ, in[i+0]);
+            e2 = copysignf(vec[1] * IQ, in[i+1]);
+            e3 = copysignf(vec[2] * IQ, in[i+2]);
+            e4 = copysignf(vec[3] * IQ, in[i+3]);
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
        }
    }
+    if (energy)
+        *energy = qenergy;
 }

 static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
                                                     PutBitContext *pb, const float *in, float *out,
                                                     const float *scaled, int size, int scale_idx,
                                                     int cb, const float lambda, const float uplim,
-                                                     int *bits, const float ROUNDING)
+                                                     int *bits, float *energy, const float ROUNDING)
 {
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;

    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];
@ -455,27 +481,39 @@ static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
        v_bits  = p_bits[curidx] + p_bits[curidx2];
        put_bits(pb, v_bits, v_codes);

-        if (out) {
-           vec1 = &p_vec[curidx*2 ];
-           vec2 = &p_vec[curidx2*2];
-           out[i+0] = vec1[0] * IQ;
-           out[i+1] = vec1[1] * IQ;
-           out[i+2] = vec2[0] * IQ;
-           out[i+3] = vec2[1] * IQ;
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec1 = &p_vec[curidx*2 ];
+            vec2 = &p_vec[curidx2*2];
+            e1 = vec1[0] * IQ;
+            e2 = vec1[1] * IQ;
+            e3 = vec2[0] * IQ;
+            e4 = vec2[1] * IQ;
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
        }
    }
+    if (energy)
+        *energy = qenergy;
 }

 static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
                                                      PutBitContext *pb, const float *in, float *out,
                                                      const float *scaled, int size, int scale_idx,
                                                      int cb, const float lambda, const float uplim,
-                                                      int *bits, const float ROUNDING)
+                                                      int *bits, float *energy, const float ROUNDING)
 {
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;

    uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
@ -561,27 +599,39 @@ static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
        v_bits  = p_bits[curidx2] + count2;
        put_bits(pb, v_bits, v_codes);

-        if (out) {
-           vec1 = &p_vec[curidx1*2];
-           vec2 = &p_vec[curidx2*2];
-           out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
-           out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
-           out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
-           out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec1 = &p_vec[curidx1*2];
+            vec2 = &p_vec[curidx2*2];
+            e1 = copysignf(vec1[0] * IQ, in[i+0]);
+            e2 = copysignf(vec1[1] * IQ, in[i+1]);
+            e3 = copysignf(vec2[0] * IQ, in[i+2]);
+            e4 = copysignf(vec2[1] * IQ, in[i+3]);
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
        }
    }
+    if (energy)
+        *energy = qenergy;
 }

 static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
                                                       PutBitContext *pb, const float *in, float *out,
                                                       const float *scaled, int size, int scale_idx,
                                                       int cb, const float lambda, const float uplim,
-                                                       int *bits, const float ROUNDING)
+                                                       int *bits, float *energy, const float ROUNDING)
 {
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;

    uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];
@ -666,27 +716,39 @@ static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
        v_bits  = p_bits[curidx2] + count2;
        put_bits(pb, v_bits, v_codes);

-        if (out) {
-           vec1 = &p_vec[curidx1*2];
-           vec2 = &p_vec[curidx2*2];
-           out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
-           out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
-           out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
-           out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
+        if (out || energy) {
+            float e1,e2,e3,e4;
+            vec1 = &p_vec[curidx1*2];
+            vec2 = &p_vec[curidx2*2];
+            e1 = copysignf(vec1[0] * IQ, in[i+0]);
+            e2 = copysignf(vec1[1] * IQ, in[i+1]);
+            e3 = copysignf(vec2[0] * IQ, in[i+2]);
+            e4 = copysignf(vec2[1] * IQ, in[i+3]);
+            if (out) {
+                out[i+0] = e1;
+                out[i+1] = e2;
+                out[i+2] = e3;
+                out[i+3] = e4;
+            }
+            if (energy)
+                qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
        }
    }
+    if (energy)
+        *energy = qenergy;
 }

 static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
                                                   PutBitContext *pb, const float *in, float *out,
                                                   const float *scaled, int size, int scale_idx,
                                                   int cb, const float lambda, const float uplim,
-                                                   int *bits, const float ROUNDING)
+                                                   int *bits, float *energy, const float ROUNDING)
 {
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;
+    float qenergy = 0.0f;

    uint8_t  *p_bits    = (uint8_t* )ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes   = (uint16_t*)ff_aac_spectral_codes[cb-1];
@ -772,13 +834,22 @@ static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
            v_bits  = p_bits[curidx2] + count2;
            put_bits(pb, v_bits, v_codes);

-            if (out) {
-               vec1 = &p_vectors[curidx*2 ];
-               vec2 = &p_vectors[curidx2*2];
-               out[i+0] = copysignf(vec1[0] * IQ, in[i+0]);
-               out[i+1] = copysignf(vec1[1] * IQ, in[i+1]);
-               out[i+2] = copysignf(vec2[0] * IQ, in[i+2]);
-               out[i+3] = copysignf(vec2[1] * IQ, in[i+3]);
+            if (out || energy) {
+                float e1,e2,e3,e4;
+                vec1 = &p_vectors[curidx*2 ];
+                vec2 = &p_vectors[curidx2*2];
+                e1 = copysignf(vec1[0] * IQ, in[i+0]);
+                e2 = copysignf(vec1[1] * IQ, in[i+1]);
+                e3 = copysignf(vec2[0] * IQ, in[i+2]);
+                e4 = copysignf(vec2[1] * IQ, in[i+3]);
+                if (out) {
+                    out[i+0] = e1;
+                    out[i+1] = e2;
+                    out[i+2] = e3;
+                    out[i+3] = e4;
+                }
+                if (energy)
+                    qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
            }
        }
    } else {
@ -892,23 +963,34 @@ static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
                put_bits(pb, len * 2 - 3, v_codes);
            }

-            if (out) {
-               vec1 = &p_vectors[curidx*2];
-               vec2 = &p_vectors[curidx2*2];
-               out[i+0] = copysignf(c1 * cbrtf(c1) * IQ, in[i+0]);
-               out[i+1] = copysignf(c2 * cbrtf(c2) * IQ, in[i+1]);
-               out[i+2] = copysignf(c3 * cbrtf(c3) * IQ, in[i+2]);
-               out[i+3] = copysignf(c4 * cbrtf(c4) * IQ, in[i+3]);
+            if (out || energy) {
+                float e1, e2, e3, e4;
+                vec1 = &p_vectors[curidx*2];
+                vec2 = &p_vectors[curidx2*2];
+                e1 = copysignf(c1 * cbrtf(c1) * IQ, in[i+0]);
+                e2 = copysignf(c2 * cbrtf(c2) * IQ, in[i+1]);
+                e3 = copysignf(c3 * cbrtf(c3) * IQ, in[i+2]);
+                e4 = copysignf(c4 * cbrtf(c4) * IQ, in[i+3]);
+                if (out) {
+                    out[i+0] = e1;
+                    out[i+1] = e2;
+                    out[i+2] = e3;
+                    out[i+3] = e4;
+                }
+                if (energy)
+                    qenergy += (e1*e1 + e2*e2) + (e3*e3 + e4*e4);
            }
        }
    }
+    if (energy)
+        *energy = qenergy;
 }

 static void quantize_and_encode_band_cost_NONE_mips(struct AACEncContext *s,
                                                         PutBitContext *pb, const float *in, float *out,
                                                         const float *scaled, int size, int scale_idx,
                                                         int cb, const float lambda, const float uplim,
-                                                         int *bits, const float ROUNDING) {
+                                                         int *bits, float *energy, const float ROUNDING) {
    av_assert0(0);
 }

@ -916,7 +998,7 @@ static void quantize_and_encode_band_cost_ZERO_mips(struct AACEncContext *s,
                                                         PutBitContext *pb, const float *in, float *out,
                                                         const float *scaled, int size, int scale_idx,
                                                         int cb, const float lambda, const float uplim,
-                                                         int *bits, const float ROUNDING) {
+                                                         int *bits, float *energy, const float ROUNDING) {
    int i;
    if (bits)
        *bits = 0;
@ -928,13 +1010,15 @@ static void quantize_and_encode_band_cost_ZERO_mips(struct AACEncContext *s,
           out[i+3] = 0.0f;
        }
    }
+    if (energy)
+        *energy = 0.0f;
 }

 static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
                                                         PutBitContext *pb, const float *in, float *out,
                                                         const float *scaled, int size, int scale_idx,
                                                         int cb, const float lambda, const float uplim,
-                                                         int *bits, const float ROUNDING) = {
+                                                         int *bits, float *energy, const float ROUNDING) = {
    quantize_and_encode_band_cost_ZERO_mips,
    quantize_and_encode_band_cost_SQUAD_mips,
    quantize_and_encode_band_cost_SQUAD_mips,
@ -955,17 +1039,17 @@ static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s

 #define quantize_and_encode_band_cost(                                       \
                                s, pb, in, out, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits, ROUNDING)               \
+                                lambda, uplim, bits, energy, ROUNDING)       \
    quantize_and_encode_band_cost_arr[cb](                                   \
                                s, pb, in, out, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits, ROUNDING)
+                                lambda, uplim, bits, energy, ROUNDING)

 static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
                                          const float *in, float *out, int size, int scale_idx,
                                          int cb, const float lambda, int rtz)
 {
    quantize_and_encode_band_cost(s, pb, in, out, NULL, size, scale_idx, cb, lambda,
-                                  INFINITY, NULL, (rtz) ? ROUND_TO_ZERO : ROUND_STANDARD);
+                                  INFINITY, NULL, NULL, (rtz) ? ROUND_TO_ZERO : ROUND_STANDARD);
 }

 /**
@ -1445,7 +1529,7 @@ static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
 static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
                                     const float *scaled, int size, int scale_idx,
                                     int cb, const float lambda, const float uplim,
-                                     int *bits, int rtz)
+                                     int *bits, float *energy, int rtz)
 {
    return get_band_numbits(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
 }
@ -1458,7 +1542,7 @@ static float get_band_cost_ZERO_mips(struct AACEncContext *s,
                                     PutBitContext *pb, const float *in,
                                     const float *scaled, int size, int scale_idx,
                                     int cb, const float lambda, const float uplim,
-                                     int *bits)
+                                     int *bits, float *energy)
 {
    int i;
    float cost = 0;
@ -1471,6 +1555,8 @@ static float get_band_cost_ZERO_mips(struct AACEncContext *s,
    }
    if (bits)
        *bits = 0;
+    if (energy)
+        *energy = 0.0f;
    return cost * lambda;
 }

@ -1478,7 +1564,7 @@ static float get_band_cost_NONE_mips(struct AACEncContext *s,
                                     PutBitContext *pb, const float *in,
                                     const float *scaled, int size, int scale_idx,
                                     int cb, const float lambda, const float uplim,
-                                     int *bits)
+                                     int *bits, float *energy)
 {
    av_assert0(0);
    return 0;
@ -1488,12 +1574,13 @@ static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
                                      PutBitContext *pb, const float *in,
                                      const float *scaled, int size, int scale_idx,
                                      int cb, const float lambda, const float uplim,
-                                      int *bits)
+                                      int *bits, float *energy)
 {
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    float cost = 0;
+    float qenergy = 0.0f;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

@ -1560,6 +1647,9 @@ static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
        curbits += p_bits[curidx];
        vec     = &p_codes[curidx*4];

+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec[2]*vec[2] + vec[3]*vec[3];
+
        __asm__ volatile (
            ".set push                                  \n\t"
            ".set noreorder                             \n\t"
@ -1594,6 +1684,8 @@ static float get_band_cost_SQUAD_mips(struct AACEncContext *s,

    if (bits)
        *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
    return cost * lambda + curbits;
 }

@ -1601,12 +1693,13 @@ static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
                                      PutBitContext *pb, const float *in,
                                      const float *scaled, int size, int scale_idx,
                                      int cb, const float lambda, const float uplim,
-                                      int *bits)
+                                      int *bits, float *energy)
 {
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    float cost = 0;
+    float qenergy = 0.0f;
    int curbits = 0;
    int qc1, qc2, qc3, qc4;

@ -1659,6 +1752,9 @@ static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
        curbits += uquad_sign_bits[curidx];
        vec     = &p_codes[curidx*4];

+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec[2]*vec[2] + vec[3]*vec[3];
+
        __asm__ volatile (
            ".set push                                  \n\t"
            ".set noreorder                             \n\t"
@ -1696,6 +1792,8 @@ static float get_band_cost_UQUAD_mips(struct AACEncContext *s,

    if (bits)
        *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
    return cost * lambda + curbits;
 }

@ -1703,12 +1801,13 @@ static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
                                      PutBitContext *pb, const float *in,
                                      const float *scaled, int size, int scale_idx,
                                      int cb, const float lambda, const float uplim,
-                                      int *bits)
+                                      int *bits, float *energy)
 {
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    float cost = 0;
+    float qenergy = 0.0f;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

@ -1780,6 +1879,9 @@ static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
        vec     = &p_codes[curidx*2];
        vec2    = &p_codes[curidx2*2];

+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
+
        __asm__ volatile (
            ".set push                                  \n\t"
            ".set noreorder                             \n\t"
@ -1814,6 +1916,8 @@ static float get_band_cost_SPAIR_mips(struct AACEncContext *s,

    if (bits)
        *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
    return cost * lambda + curbits;
 }

@ -1821,12 +1925,13 @@ static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
                                       PutBitContext *pb, const float *in,
                                       const float *scaled, int size, int scale_idx,
                                       int cb, const float lambda, const float uplim,
-                                       int *bits)
+                                       int *bits, float *energy)
 {
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    float cost = 0;
+    float qenergy = 0.0f;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

@ -1910,6 +2015,9 @@ static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
        curbits += upair7_sign_bits[curidx2];
        vec2    = &p_codes[curidx2*2];

+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
+
        __asm__ volatile (
            ".set push                                          \n\t"
            ".set noreorder                                     \n\t"
@ -1947,6 +2055,8 @@ static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,

    if (bits)
        *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
    return cost * lambda + curbits;
 }

@ -1954,12 +2064,13 @@ static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
                                        PutBitContext *pb, const float *in,
                                        const float *scaled, int size, int scale_idx,
                                        int cb, const float lambda, const float uplim,
-                                        int *bits)
+                                        int *bits, float *energy)
 {
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    int i;
    float cost = 0;
+    float qenergy = 0.0f;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

@ -2043,6 +2154,9 @@ static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
        vec     = &p_codes[curidx*2];
        vec2    = &p_codes[curidx2*2];

+        qenergy += vec[0]*vec[0] + vec[1]*vec[1]
+                +  vec2[0]*vec2[0] + vec2[1]*vec2[1];
+
        __asm__ volatile (
            ".set push                                          \n\t"
            ".set noreorder                                     \n\t"
@ -2080,6 +2194,8 @@ static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,

    if (bits)
        *bits = curbits;
+    if (energy)
+        *energy = qenergy * (IQ*IQ);
    return cost * lambda + curbits;
 }

@ -2087,13 +2203,14 @@ static float get_band_cost_ESC_mips(struct AACEncContext *s,
                                    PutBitContext *pb, const float *in,
                                    const float *scaled, int size, int scale_idx,
                                    int cb, const float lambda, const float uplim,
-                                    int *bits)
+                                    int *bits, float *energy)
 {
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    const float CLIPPED_ESCAPE = 165140.0f * IQ;
    int i;
    float cost = 0;
+    float qenergy = 0.0f;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

@ -2103,7 +2220,7 @@ static float get_band_cost_ESC_mips(struct AACEncContext *s,
    for (i = 0; i < size; i += 4) {
        const float *vec, *vec2;
        int curidx, curidx2;
-        float t1, t2, t3, t4;
+        float t1, t2, t3, t4, V;
        float di1, di2, di3, di4;
        int cond0, cond1, cond2, cond3;
        int c1, c2, c3, c4;
@ -2175,38 +2292,54 @@ static float get_band_cost_ESC_mips(struct AACEncContext *s,
        if (cond0) {
            if (t1 >= CLIPPED_ESCAPE) {
                di1 = t1 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
            } else {
-                di1 = t1 - c1 * cbrtf(c1) * IQ;
+                di1 = t1 - (V = c1 * cbrtf(c1) * IQ);
+                qenergy += V*V;
            }
-        } else
-            di1 = t1 - vec[0] * IQ;
+        } else {
+            di1 = t1 - (V = vec[0] * IQ);
+            qenergy += V*V;
+        }

        if (cond1) {
            if (t2 >= CLIPPED_ESCAPE) {
                di2 = t2 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
            } else {
-                di2 = t2 - c2 * cbrtf(c2) * IQ;
+                di2 = t2 - (V = c2 * cbrtf(c2) * IQ);
+                qenergy += V*V;
            }
-        } else
-            di2 = t2 - vec[1] * IQ;
+        } else {
+            di2 = t2 - (V = vec[1] * IQ);
+            qenergy += V*V;
+        }

        if (cond2) {
            if (t3 >= CLIPPED_ESCAPE) {
                di3 = t3 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
            } else {
-                di3 = t3 - c3 * cbrtf(c3) * IQ;
+                di3 = t3 - (V = c3 * cbrtf(c3) * IQ);
+                qenergy += V*V;
            }
-        } else
-            di3 = t3 - vec2[0] * IQ;
+        } else {
+            di3 = t3 - (V = vec2[0] * IQ);
+            qenergy += V*V;
+        }

        if (cond3) {
            if (t4 >= CLIPPED_ESCAPE) {
                di4 = t4 - CLIPPED_ESCAPE;
+                qenergy += CLIPPED_ESCAPE*CLIPPED_ESCAPE;
            } else {
-                di4 = t4 - c4 * cbrtf(c4) * IQ;
+                di4 = t4 - (V = c4 * cbrtf(c4) * IQ);
+                qenergy += V*V;
            }
-        } else
-            di4 = t4 - vec2[1]*IQ;
+        } else {
+            di4 = t4 - (V = vec2[1]*IQ);
+            qenergy += V*V;
+        }

        cost += di1 * di1 + di2 * di2
                + di3 * di3 + di4 * di4;
@ -2221,7 +2354,7 @@ static float (*const get_band_cost_arr[])(struct AACEncContext *s,
                                          PutBitContext *pb, const float *in,
                                          const float *scaled, int size, int scale_idx,
                                          int cb, const float lambda, const float uplim,
-                                          int *bits) = {
+                                          int *bits, float *energy) = {
    get_band_cost_ZERO_mips,
    get_band_cost_SQUAD_mips,
    get_band_cost_SQUAD_mips,
@ -2242,17 +2375,87 @@ static float (*const get_band_cost_arr[])(struct AACEncContext *s,

 #define get_band_cost(                                  \
                                s, pb, in, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits)                    \
+                                lambda, uplim, bits, energy)            \
    get_band_cost_arr[cb](                              \
                                s, pb, in, scaled, size, scale_idx, cb, \
-                                lambda, uplim, bits)
+                                lambda, uplim, bits, energy)

 static float quantize_band_cost(struct AACEncContext *s, const float *in,
                                const float *scaled, int size, int scale_idx,
                                int cb, const float lambda, const float uplim,
-                                int *bits, int rtz)
+                                int *bits, float *energy, int rtz)
 {
-    return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
+    return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits, energy);
+}
+
+static float find_form_factor(int group_len, int swb_size, float thresh, const float *scaled, float nzslope) {
+    const float iswb_size = 1.0f / swb_size;
+    const float iswb_sizem1 = 1.0f / (swb_size - 1);
+    const float ethresh = thresh, iethresh = 1.0f / ethresh;
+    float form = 0.0f, weight = 0.0f;
+    int w2, i;
+    for (w2 = 0; w2 < group_len; w2++) {
+        float e = 0.0f, e2 = 0.0f, var = 0.0f, maxval = 0.0f;
+        float nzl = 0;
+        for (i = 0; i < swb_size; i+=4) {
+            float s1 = fabsf(scaled[w2*128+i  ]);
+            float s2 = fabsf(scaled[w2*128+i+1]);
+            float s3 = fabsf(scaled[w2*128+i+2]);
+            float s4 = fabsf(scaled[w2*128+i+3]);
+            maxval = FFMAX(maxval, FFMAX(FFMAX(s1, s2), FFMAX(s3, s4)));
+            e += (s1+s2)+(s3+s4);
+            s1 *= s1;
+            s2 *= s2;
+            s3 *= s3;
+            s4 *= s4;
+            e2 += (s1+s2)+(s3+s4);
+            /* We really don't want a hard non-zero-line count, since
+             * even below-threshold lines do add up towards band spectral power.
+             * So, fall steeply towards zero, but smoothly
+             */
+            if (s1 >= ethresh) {
+                nzl += 1.0f;
+            } else {
+                nzl += powf(s1 * iethresh, nzslope);
+            }
+            if (s2 >= ethresh) {
+                nzl += 1.0f;
+            } else {
+                nzl += powf(s2 * iethresh, nzslope);
+            }
+            if (s3 >= ethresh) {
+                nzl += 1.0f;
+            } else {
+                nzl += powf(s3 * iethresh, nzslope);
+            }
+            if (s4 >= ethresh) {
+                nzl += 1.0f;
+            } else {
+                nzl += powf(s4 * iethresh, nzslope);
+            }
+        }
+        if (e2 > thresh) {
+            float frm;
+            e *= iswb_size;
+
+            /** compute variance */
+            for (i = 0; i < swb_size; i++) {
+                float d = fabsf(scaled[w2*128+i]) - e;
+                var += d*d;
+            }
+            var = sqrtf(var * iswb_sizem1);
+
+            e2 *= iswb_size;
+            frm = e / FFMIN(e+4*var,maxval);
+            form += e2 * sqrtf(frm) / FFMAX(0.5f,nzl);
+            weight += e2;
+        }
+    }
+    if (weight > 0) {
+        return form / weight;
+    } else {
+        return 1.0f;
+    }
 }

 #include "libavcodec/aaccoder_twoloop.h"
@ -2305,25 +2508,25 @@ static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe)
                                                sce0->ics.swb_sizes[g],
                                                sce0->sf_idx[(w+w2)*16+g],
                                                sce0->band_type[(w+w2)*16+g],
-                                                lambda / band0->threshold, INFINITY, NULL, 0);
+                                                lambda / band0->threshold, INFINITY, NULL, NULL, 0);
                    dist1 += quantize_band_cost(s, &sce1->coeffs[start + (w+w2)*128],
                                                R34,
                                                sce1->ics.swb_sizes[g],
                                                sce1->sf_idx[(w+w2)*16+g],
                                                sce1->band_type[(w+w2)*16+g],
-                                                lambda / band1->threshold, INFINITY, NULL, 0);
+                                                lambda / band1->threshold, INFINITY, NULL, NULL, 0);
                    dist2 += quantize_band_cost(s, M,
                                                M34,
                                                sce0->ics.swb_sizes[g],
                                                sce0->sf_idx[(w+w2)*16+g],
                                                sce0->band_type[(w+w2)*16+g],
-                                                lambda / maxthr, INFINITY, NULL, 0);
+                                                lambda / maxthr, INFINITY, NULL, NULL, 0);
                    dist2 += quantize_band_cost(s, S,
                                                S34,
                                                sce1->ics.swb_sizes[g],
                                                sce1->sf_idx[(w+w2)*16+g],
                                                sce1->band_type[(w+w2)*16+g],
-                                                lambda / minthr, INFINITY, NULL, 0);
+                                                lambda / minthr, INFINITY, NULL, NULL, 0);
                }
                cpe->ms_mask[w*16+g] = dist2 < dist1;
            }
--- a/libavcodec/psymodel.c
+++ b/libavcodec/psymodel.c
@ -109,25 +109,21 @@ av_cold struct FFPsyPreprocessContext* ff_psy_preprocess_init(AVCodecContext *av
        return NULL;
    ctx->avctx = avctx;

+    /* AAC has its own LP method */
+    if (avctx->codec_id != AV_CODEC_ID_AAC) {
    if (avctx->cutoff > 0)
        cutoff_coeff = 2.0 * avctx->cutoff / avctx->sample_rate;

-    if (!cutoff_coeff && avctx->codec_id == AV_CODEC_ID_AAC)
-        cutoff_coeff = 2.0 * AAC_CUTOFF(avctx) / avctx->sample_rate;
-
    if (cutoff_coeff && cutoff_coeff < 0.98)
    ctx->fcoeffs = ff_iir_filter_init_coeffs(avctx, FF_FILTER_TYPE_BUTTERWORTH,
                                             FF_FILTER_MODE_LOWPASS, FILT_ORDER,
                                             cutoff_coeff, 0.0, 0.0);
    if (ctx->fcoeffs) {
-        ctx->fstate = av_mallocz_array(sizeof(ctx->fstate[0]), avctx->channels);
-        if (!ctx->fstate) {
-            av_free(ctx);
-            return NULL;
-        }
+        ctx->fstate = av_mallocz(sizeof(ctx->fstate[0]) * avctx->channels);
        for (i = 0; i < avctx->channels; i++)
            ctx->fstate[i] = ff_iir_filter_init_state(FILT_ORDER);
    }
+    }

    ff_iir_filter_init(&ctx->fiir);

--- a/libavcodec/psymodel.h
+++ b/libavcodec/psymodel.h
@ -29,7 +29,20 @@
 /** maximum number of channels */
 #define PSY_MAX_CHANS 20

-#define AAC_CUTOFF(s) ((s)->bit_rate ? FFMIN3(4000 + (s)->bit_rate/8, 12000 + (s)->bit_rate/32, (s)->sample_rate / 2) : ((s)->sample_rate / 2))
+/* cutoff for VBR is purposedly increased, since LP filtering actually
+ * hinders VBR performance rather than the opposite
+ */
+#define AAC_CUTOFF_FROM_BITRATE(bit_rate,channels,sample_rate) (bit_rate ? FFMIN3(FFMIN3( \
+    FFMAX(bit_rate/channels/5, bit_rate/channels*15/32 - 5500), \
+    3000 + bit_rate/channels/4, \
+    12000 + bit_rate/channels/16), \
+    22000, \
+    sample_rate / 2): (sample_rate / 2))
+#define AAC_CUTOFF(s) ( \
+    (s->flags & CODEC_FLAG_QSCALE) \
+    ? s->sample_rate / 2 \
+    : AAC_CUTOFF_FROM_BITRATE(s->bit_rate, s->channels, s->sample_rate) \
+)

 /**
 * single band psychoacoustic information
--- a/tests/fate/aac.mak
+++ b/tests/fate/aac.mak
@ -146,7 +146,7 @@ fate-aac-aref-encode: CMD = enc_dec_pcm adts wav s16le $(REF) -strict -2 -c:a aa
 fate-aac-aref-encode: CMP = stddev
 fate-aac-aref-encode: REF = ./tests/data/asynth-44100-2.wav
 fate-aac-aref-encode: CMP_SHIFT = -4096
-fate-aac-aref-encode: CMP_TARGET = 584
+fate-aac-aref-encode: CMP_TARGET = 1127
 fate-aac-aref-encode: SIZE_TOLERANCE = 2464
 fate-aac-aref-encode: FUZZ = 6

@ -155,51 +155,52 @@ fate-aac-ln-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-ref
 fate-aac-ln-encode: CMP = stddev
 fate-aac-ln-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
 fate-aac-ln-encode: CMP_SHIFT = -4096
-fate-aac-ln-encode: CMP_TARGET = 68
+fate-aac-ln-encode: CMP_TARGET = 80
 fate-aac-ln-encode: SIZE_TOLERANCE = 3560
+fate-aac-ln-encode: FUZZ = 30

 FATE_AAC_ENCODE += fate-aac-ln-encode-128k
-fate-aac-ln-encode-128k: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_is 0 -aac_pns 0 -b:a 128k
+fate-aac-ln-encode-128k: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_is 0 -aac_pns 0 -b:a 128k -cutoff 22050
 fate-aac-ln-encode-128k: CMP = stddev
 fate-aac-ln-encode-128k: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
 fate-aac-ln-encode-128k: CMP_SHIFT = -4096
-fate-aac-ln-encode-128k: CMP_TARGET = 638
+fate-aac-ln-encode-128k: CMP_TARGET = 745
 fate-aac-ln-encode-128k: SIZE_TOLERANCE = 3560
 fate-aac-ln-encode-128k: FUZZ = 5

 FATE_AAC_ENCODE += fate-aac-pns-encode
-fate-aac-pns-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_pns 1 -aac_is 0 -b:a 128k
+fate-aac-pns-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_pns 1 -aac_is 0 -b:a 128k -cutoff 22050
 fate-aac-pns-encode: CMP = stddev
 fate-aac-pns-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
 fate-aac-pns-encode: CMP_SHIFT = -4096
-fate-aac-pns-encode: CMP_TARGET = 623.77
+fate-aac-pns-encode: CMP_TARGET = 695
 fate-aac-pns-encode: SIZE_TOLERANCE = 3560
 fate-aac-pns-encode: FUZZ = 25

 FATE_AAC_ENCODE += fate-aac-tns-encode
-fate-aac-tns-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_tns 1 -aac_is 0 -aac_pns 0 -b:a 128k
+fate-aac-tns-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_tns 1 -aac_is 0 -aac_pns 0 -b:a 128k -cutoff 22050
 fate-aac-tns-encode: CMP = stddev
 fate-aac-tns-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
 fate-aac-tns-encode: CMP_SHIFT = -4096
-fate-aac-tns-encode: CMP_TARGET = 644.50
+fate-aac-tns-encode: CMP_TARGET = 768
 fate-aac-tns-encode: FUZZ = 2.8
 fate-aac-tns-encode: SIZE_TOLERANCE = 3560

 FATE_AAC_ENCODE += fate-aac-is-encode
-fate-aac-is-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_pns 0 -aac_is 1 -b:a 128k
+fate-aac-is-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -c:a aac -aac_pns 0 -aac_is 1 -b:a 128k -cutoff 22050
 fate-aac-is-encode: CMP = stddev
 fate-aac-is-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
 fate-aac-is-encode: CMP_SHIFT = -4096
-fate-aac-is-encode: CMP_TARGET = 614.04
+fate-aac-is-encode: CMP_TARGET = 582
 fate-aac-is-encode: SIZE_TOLERANCE = 3560
 fate-aac-is-encode: FUZZ = 1

 FATE_AAC_ENCODE += fate-aac-pred-encode
-fate-aac-pred-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -profile:a aac_main -c:a aac -aac_is 0 -aac_pns 0 -b:a 128k
+fate-aac-pred-encode: CMD = enc_dec_pcm adts wav s16le $(TARGET_SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav -strict -2 -profile:a aac_main -c:a aac -aac_is 0 -aac_pns 0 -b:a 128k -cutoff 22050
 fate-aac-pred-encode: CMP = stddev
 fate-aac-pred-encode: REF = $(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
 fate-aac-pred-encode: CMP_SHIFT = -4096
-fate-aac-pred-encode: CMP_TARGET = 657
+fate-aac-pred-encode: CMP_TARGET = 790
 fate-aac-pred-encode: FUZZ = 5
 fate-aac-pred-encode: SIZE_TOLERANCE = 3560