Use memcpy in the special case amp==0 (no amplification) and optimize the code in
the performance-critical loop. Interestingly, using the likely()/unlikely() macros made the
code slower.
Results (three runs on identical input data on a 32-bit x86 machine under Linux, gcc-4.4.0; averages in milliseconds):
old with --amp 3:
0m0.776s 0m0.790s 0m0.812s, avg: 792
new with --amp 3:
0m0.456s 0m0.492s 0m0.477s, avg: 475
speedup: 1.67
old with --amp 0:
0m0.791s 0m0.808s 0m0.810s, avg: 803
new with --amp 0:
0m0.100s 0m0.103s 0m0.094s, avg: 99
speedup: 8.1
static ssize_t amp_convert(char *inbuf, size_t inbuf_len, struct filter_node *fn)
{
static ssize_t amp_convert(char *inbuf, size_t inbuf_len, struct filter_node *fn)
{
- size_t i, length = PARA_MIN((inbuf_len / 2) * 2,
- (fn->bufsize - fn->loaded) / 2 * 2);
+ size_t i, length = PARA_MIN((inbuf_len / 2),
+ (fn->bufsize - fn->loaded) / 2);
struct private_amp_data *pad = fn->private_data;
int16_t *ip = (int16_t *)inbuf, *op = (int16_t *)(fn->buf + fn->loaded);
struct private_amp_data *pad = fn->private_data;
int16_t *ip = (int16_t *)inbuf, *op = (int16_t *)(fn->buf + fn->loaded);
+ int factor = 64 + pad->amp;
- for (i = 0; i < length / 2; i++) {
- int x = (PARA_ABS(*ip) * (64 + pad->amp)) >> 6;
- *op++ = *ip++ > 0? PARA_MIN(x, 32767) : PARA_MAX(-x, -32768);
+
+ if (pad->amp == 0) {
+ memcpy(op, ip, length * 2);
+ goto out;
+ }
+ for (i = 0; i < length; i++) {
+ int x = (ip[i] * factor) >> 6;
+
+ op[i] = x;
+ if (op[i] != x)
+ op[i] = (x >= 32768)? 32767 : -32768;
- fn->loaded += length;
- return length;
+out:
+ fn->loaded += length * 2;
+ return length * 2;
}
static void amp_close(struct filter_node *fn)
}
static void amp_close(struct filter_node *fn)