#ifndef INCLUDED_JPEG_DECODER_H
#define INCLUDED_JPEG_DECODER_H

// This is a cosmetic restructuring and port to C++ class of 'NanoJPEG', found
// at http://keyj.s2000.ws/?p=137. It's been made somewhat thread safe in that
// all context information is pulled into an object, rather than being global
// as the original was. Other than that, the original is superior in
// configurability, comments, cleanliness, portability, etc. and should be
// preferred. The only other possible benefit this version can claim is that
// it's crammed into one header file.
//
// Scott Graham <scott.jpegdecoder@h4ck3r.net>
//
// The original license follows:
//
// NanoJPEG -- KeyJ's Tiny Baseline JPEG Decoder
// version 1.0 (2009-04-29)
// by Martin J. Fiedler <martin.fiedler@gmx.net>
//
// This software is published under the terms of KeyJ's Research License,
// version 0.2. Usage of this software is subject to the following conditions:
// 0. There's no warranty whatsoever. The author(s) of this software can not
//    be held liable for any damages that occur when using this software.
// 1. This software may be used freely for both non-commercial and commercial
//    purposes.
// 2. This software may be redistributed freely as long as no fees are charged
//    for the distribution and this license information is included.
// 3. This software may be modified freely except for this license information,
//    which must not be changed in any way.
// 4. If anything other than configuration, indentation or comments have been
//    altered in the code, the original author(s) must receive a copy of the
//    modified code.

#include <stdlib.h>
#include <string.h>

#ifdef _MSC_VER
    #pragma warning(push)
    #pragma warning(disable: 4127) // conditional expression is constant
    #pragma warning(disable: 4706) // assignment within conditional
#endif


namespace Jpeg
{
    // Decoder for baseline (8-bit, Huffman coded, non-progressive) JPEG data
    // with one (grayscale) or three (YCbCr) components.
    //
    // All the work happens in the constructor; afterwards query GetResult()
    // and, if it is OK, the image accessors. The decoded image is owned by
    // the Decoder and released by its destructor. Instances are not copyable
    // because they own raw buffers obtained from the allocation callback.
    class Decoder
    {
    public:
        enum DecodeResult
        {
            OK = 0,        // decoding successful
            NotAJpeg,      // not a JPEG file
            Unsupported,   // unsupported format
            OutOfMemory,   // out of memory (also: image too large to address)
            InternalError, // internal error
            SyntaxError,   // syntax error
            Internal_Finished, // used internally, will never be reported
        };

        // decode the raw data. object is very large (it embeds the Huffman
        // lookup tables), and probably shouldn't go on the stack.
        //   data, size         - the complete JPEG stream; only read during
        //                        construction. Sizes above 2^31-1 are clamped.
        //   allocFunc/freeFunc - used for every image buffer allocation.
        Decoder(const unsigned char* data, size_t size, void *(*allocFunc)(size_t) = malloc, void (*freeFunc)(void*) = free);
        ~Decoder();

        // the result of decode
        DecodeResult GetResult() const;

        // all remaining functions below are only valid if GetResult() == OK.

        int GetWidth() const;
        int GetHeight() const;
        bool IsColor() const;

        // if IsColor() then 24bit as R,G,B bytes
        // else 8 bit luminance
        // The buffer stays owned by the Decoder.
        unsigned char* GetImage() const;

        // in bytes
        size_t GetImageSize() const;

        //////////////////////////////////////////////////////////////////////
        //
        // Implementation follows
        //
        //////////////////////////////////////////////////////////////////////

    private:
        // Copying would make two objects free the same buffers, so it is
        // disabled (declared, intentionally never defined).
        Decoder(const Decoder&);
        Decoder& operator=(const Decoder&);

        // One entry of a 16-bit-indexed Huffman lookup table: the length of
        // the matching code (0 = invalid) and the symbol it decodes to.
        struct VlcCode {
            unsigned char bits, code;
        };

        // Per-component state taken from the frame and scan headers.
        struct Component {
            int cid;                // component id from SOF
            int ssx, ssy;           // horizontal / vertical sampling factors
            int width, height;      // size of this plane in samples
            int stride;             // bytes per row of 'pixels'
            int qtsel;              // quantization table index
            int actabsel, dctabsel; // indices into Context::vlctab
            int dcpred;             // running DC predictor
            unsigned char *pixels;  // plane buffer (AllocMem'd)
        };

        // Complete decoder state (was global in the original NanoJPEG).
        struct Context {
            DecodeResult error;
            const unsigned char *pos;   // read cursor into the input
            int size;                   // bytes remaining after 'pos'
            int length;                 // bytes remaining in current segment
            int width, height;
            int mbwidth, mbheight;      // image size in MCUs
            int mbsizex, mbsizey;       // MCU size in pixels
            int ncomp;
            Component comp[3];
            int qtused, qtavail;
            unsigned char qtab[4][64];
            VlcCode vlctab[4][65536];
            int buf, bufbits;           // bit reader accumulator / fill level
            int block[64];
            int rstinterval;
            unsigned char *rgb;         // interleaved RGB output (color only)
        };

        Context ctx;
        char ZZ[64];                    // zig-zag index -> natural index
        void *(*AllocMem)(size_t);
        void (*FreeMem)(void*);


        // Saturate x to the range 0..255.
        inline unsigned char _Clip(const int x) {
            return (x < 0) ? 0 : ((x > 0xFF) ? 0xFF : (unsigned char) x);
        }

        // Fixed-point IDCT multipliers: Wk = 2048*sqrt(2)*cos(k*pi/16).
        enum {
            W1 = 2841,
            W2 = 2676,
            W3 = 2408,
            W5 = 1609,
            W6 = 1108,
            W7 = 565,
        };

        // In-place 1-D inverse DCT over the 8 coefficients of one row.
        inline void _RowIDCT(int* blk) {
            int x0, x1, x2, x3, x4, x5, x6, x7, x8;
            // shortcut: only the DC term is non-zero
            if (!((x1 = blk[4] << 11)
                | (x2 = blk[6])
                | (x3 = blk[2])
                | (x4 = blk[1])
                | (x5 = blk[7])
                | (x6 = blk[5])
                | (x7 = blk[3])))
            {
                blk[0] = blk[1] = blk[2] = blk[3] = blk[4] = blk[5] = blk[6] = blk[7] = blk[0] << 3;
                return;
            }
            x0 = (blk[0] << 11) + 128;
            x8 = W7 * (x4 + x5);
            x4 = x8 + (W1 - W7) * x4;
            x5 = x8 - (W1 + W7) * x5;
            x8 = W3 * (x6 + x7);
            x6 = x8 - (W3 - W5) * x6;
            x7 = x8 - (W3 + W5) * x7;
            x8 = x0 + x1;
            x0 -= x1;
            x1 = W6 * (x3 + x2);
            x2 = x1 - (W2 + W6) * x2;
            x3 = x1 + (W2 - W6) * x3;
            x1 = x4 + x6;
            x4 -= x6;
            x6 = x5 + x7;
            x5 -= x7;
            x7 = x8 + x3;
            x8 -= x3;
            x3 = x0 + x2;
            x0 -= x2;
            x2 = (181 * (x4 + x5) + 128) >> 8;
            x4 = (181 * (x4 - x5) + 128) >> 8;
            blk[0] = (x7 + x1) >> 8;
            blk[1] = (x3 + x2) >> 8;
            blk[2] = (x0 + x4) >> 8;
            blk[3] = (x8 + x6) >> 8;
            blk[4] = (x8 - x6) >> 8;
            blk[5] = (x0 - x4) >> 8;
            blk[6] = (x3 - x2) >> 8;
            blk[7] = (x7 - x1) >> 8;
        }

        // 1-D inverse DCT over one column of the 8x8 block (blk points at
        // the column's first row). The result is level-shifted by +128,
        // clipped and written to 8 output samples 'stride' bytes apart.
        inline void _ColIDCT(const int* blk, unsigned char *out, int stride) {
            int x0, x1, x2, x3, x4, x5, x6, x7, x8;
            // shortcut: only the DC term is non-zero
            if (!((x1 = blk[8*4] << 8)
                | (x2 = blk[8*6])
                | (x3 = blk[8*2])
                | (x4 = blk[8*1])
                | (x5 = blk[8*7])
                | (x6 = blk[8*5])
                | (x7 = blk[8*3])))
            {
                x1 = _Clip(((blk[0] + 32) >> 6) + 128);
                for (x0 = 8;  x0;  --x0) {
                    *out = (unsigned char) x1;
                    out += stride;
                }
                return;
            }
            x0 = (blk[0] << 8) + 8192;
            x8 = W7 * (x4 + x5) + 4;
            x4 = (x8 + (W1 - W7) * x4) >> 3;
            x5 = (x8 - (W1 + W7) * x5) >> 3;
            x8 = W3 * (x6 + x7) + 4;
            x6 = (x8 - (W3 - W5) * x6) >> 3;
            x7 = (x8 - (W3 + W5) * x7) >> 3;
            x8 = x0 + x1;
            x0 -= x1;
            x1 = W6 * (x3 + x2) + 4;
            x2 = (x1 - (W2 + W6) * x2) >> 3;
            x3 = (x1 + (W2 - W6) * x3) >> 3;
            x1 = x4 + x6;
            x4 -= x6;
            x6 = x5 + x7;
            x5 -= x7;
            x7 = x8 + x3;
            x8 -= x3;
            x3 = x0 + x2;
            x0 -= x2;
            x2 = (181 * (x4 + x5) + 128) >> 8;
            x4 = (181 * (x4 - x5) + 128) >> 8;
            *out = _Clip(((x7 + x1) >> 14) + 128);  out += stride;
            *out = _Clip(((x3 + x2) >> 14) + 128);  out += stride;
            *out = _Clip(((x0 + x4) >> 14) + 128);  out += stride;
            *out = _Clip(((x8 + x6) >> 14) + 128);  out += stride;
            *out = _Clip(((x8 - x6) >> 14) + 128);  out += stride;
            *out = _Clip(((x0 - x4) >> 14) + 128);  out += stride;
            *out = _Clip(((x3 - x2) >> 14) + 128);  out += stride;
            *out = _Clip(((x7 - x1) >> 14) + 128);
        }

        // Record error 'e' and return from the *current* function only.
        // Callers of a function that may throw must re-check ctx.error.
        #define JPEG_DECODER_THROW(e) do { ctx.error = e; return; } while (0)

        // Return the next 'bits' bits of entropy-coded data without
        // consuming them. Handles 0xFF byte stuffing and markers; once the
        // input is exhausted (or EOI was seen) the stream is padded with
        // 0xFF bytes. Sets ctx.error on an unexpected marker.
        inline int _ShowBits(int bits) {
            unsigned char newbyte;
            if (!bits) return 0;
            while (ctx.bufbits < bits) {
                if (ctx.size <= 0) {
                    ctx.buf = (ctx.buf << 8) | 0xFF;
                    ctx.bufbits += 8;
                    continue;
                }
                newbyte = *ctx.pos++;
                ctx.size--;
                ctx.bufbits += 8;
                ctx.buf = (ctx.buf << 8) | newbyte;
                if (newbyte == 0xFF) {
                    if (ctx.size) {
                        unsigned char marker = *ctx.pos++;
                        ctx.size--;
                        switch (marker) {
                            case 0:    break;                 // stuffed 0xFF
                            case 0xD9: ctx.size = 0; break;   // EOI
                            default:
                                if ((marker & 0xF8) != 0xD0)
                                    ctx.error = SyntaxError;
                                else {
                                    // RSTn: keep it in the bit buffer so
                                    // _DecodeScan can read it back
                                    ctx.buf = (ctx.buf << 8) | marker;
                                    ctx.bufbits += 8;
                                }
                        }
                    } else
                        ctx.error = SyntaxError;
                }
            }
            return (ctx.buf >> (ctx.bufbits - bits)) & ((1 << bits) - 1);
        }

        // Consume 'bits' bits, refilling the buffer first if necessary.
        inline void _SkipBits(int bits) {
            if (ctx.bufbits < bits)
                (void) _ShowBits(bits);
            ctx.bufbits -= bits;
        }

        // Read and consume 'bits' bits.
        inline int _GetBits(int bits) {
            int res = _ShowBits(bits);
            _SkipBits(bits);
            return res;
        }

        // Drop buffered bits up to the next byte boundary.
        inline void _ByteAlign(void) {
            ctx.bufbits &= 0xF8;
        }

        // Advance the read cursor by 'count' bytes; running past the end of
        // the input is reported as SyntaxError.
        inline void _Skip(int count) {
            ctx.pos += count;
            ctx.size -= count;
            ctx.length -= count;
            if (ctx.size < 0) ctx.error = SyntaxError;
        }

        // Read a big-endian 16-bit value.
        inline unsigned short _Decode16(const unsigned char *pos) {
            return (pos[0] << 8) | pos[1];
        }

        // Read the 16-bit segment length at the cursor into ctx.length and
        // step over it (ctx.length then counts the remaining payload).
        // On failure ctx.error is set and the cursor is NOT advanced, so
        // callers must check ctx.error before touching the payload.
        inline void _DecodeLength(void) {
            if (ctx.size < 2) JPEG_DECODER_THROW(SyntaxError);
            ctx.length = _Decode16(ctx.pos);
            if (ctx.length > ctx.size) JPEG_DECODER_THROW(SyntaxError);
            _Skip(2);
        }

        // Skip a whole segment whose content is not interpreted.
        inline void _SkipMarker(void) {
            _DecodeLength();
            if (ctx.error) return;
            _Skip(ctx.length);
        }

        // Parse the SOF0 (baseline frame header) segment: image size,
        // component layout, and allocation of the component planes and of
        // the RGB output buffer.
        inline void _DecodeSOF(void) {
            int i, ssxmax = 0, ssymax = 0;
            Component* c;
            _DecodeLength();
            if (ctx.error) return;
            // ncomp is only non-zero after a successfully parsed SOF; a
            // second one would overwrite (leak) the buffers allocated below.
            if (ctx.ncomp) JPEG_DECODER_THROW(SyntaxError);
            if (ctx.length < 9) JPEG_DECODER_THROW(SyntaxError);
            if (ctx.pos[0] != 8) JPEG_DECODER_THROW(Unsupported);
            ctx.height = _Decode16(ctx.pos+1);
            ctx.width = _Decode16(ctx.pos+3);
            if (!ctx.width || !ctx.height) JPEG_DECODER_THROW(SyntaxError);
            ctx.ncomp = ctx.pos[5];
            _Skip(6);
            switch (ctx.ncomp) {
                case 1:
                case 3:
                    break;
                default:
                    JPEG_DECODER_THROW(Unsupported);
            }
            if (ctx.length < (ctx.ncomp * 3)) JPEG_DECODER_THROW(SyntaxError);
            for (i = 0, c = ctx.comp;  i < ctx.ncomp;  ++i, ++c) {
                c->cid = ctx.pos[0];
                if (!(c->ssx = ctx.pos[1] >> 4)) JPEG_DECODER_THROW(SyntaxError);
                if (c->ssx & (c->ssx - 1)) JPEG_DECODER_THROW(Unsupported);  // non-power of two
                if (!(c->ssy = ctx.pos[1] & 15)) JPEG_DECODER_THROW(SyntaxError);
                if (c->ssy & (c->ssy - 1)) JPEG_DECODER_THROW(Unsupported);  // non-power of two
                if ((c->qtsel = ctx.pos[2]) & 0xFC) JPEG_DECODER_THROW(SyntaxError);
                _Skip(3);
                ctx.qtused |= 1 << c->qtsel;
                if (c->ssx > ssxmax) ssxmax = c->ssx;
                if (c->ssy > ssymax) ssymax = c->ssy;
            }
            ctx.mbsizex = ssxmax << 3;
            ctx.mbsizey = ssymax << 3;
            ctx.mbwidth = (ctx.width + ctx.mbsizex - 1) / ctx.mbsizex;
            ctx.mbheight = (ctx.height + ctx.mbsizey - 1) / ctx.mbsizey;
            {
                // All buffer sizes and pixel offsets are computed in 'int'.
                // Refuse images whose MCU-padded size times the component
                // count does not fit, instead of overflowing and allocating
                // an undersized buffer.
                const int paddedw = ctx.mbwidth * ctx.mbsizex;
                const int paddedh = ctx.mbheight * ctx.mbsizey;
                if (paddedw > (0x7FFFFFFF / ctx.ncomp) / paddedh) JPEG_DECODER_THROW(OutOfMemory);
            }
            for (i = 0, c = ctx.comp;  i < ctx.ncomp;  ++i, ++c) {
                c->width = (ctx.width * c->ssx + ssxmax - 1) / ssxmax;
                c->height = (ctx.height * c->ssy + ssymax - 1) / ssymax;
                c->stride = ctx.mbwidth * ctx.mbsizex * c->ssx / ssxmax;
                // the upsampling filters read at least 3 samples per line
                if (((c->width < 3) && (c->ssx != ssxmax)) || ((c->height < 3) && (c->ssy != ssymax))) JPEG_DECODER_THROW(Unsupported);
                if (!(c->pixels = (unsigned char*)AllocMem(c->stride * (ctx.mbheight * ctx.mbsizey * c->ssy / ssymax)))) JPEG_DECODER_THROW(OutOfMemory);
            }
            if (ctx.ncomp == 3) {
                ctx.rgb = (unsigned char*)AllocMem(ctx.width * ctx.height * ctx.ncomp);
                if (!ctx.rgb) JPEG_DECODER_THROW(OutOfMemory);
            }
            _Skip(ctx.length);
        }

        // Parse a DHT segment and expand every Huffman table in it into a
        // direct lookup table indexed by the next 16 bits of the stream.
        inline void _DecodeDHT(void) {
            int codelen, currcnt, remain, spread, i, j;
            VlcCode *vlc;
            unsigned char counts[16];
            _DecodeLength();
            if (ctx.error) return;
            while (ctx.length >= 17) {
                i = ctx.pos[0];
                if (i & 0xEC) JPEG_DECODER_THROW(SyntaxError);
                if (i & 0x02) JPEG_DECODER_THROW(Unsupported);
                i = (i | (i >> 3)) & 3;  // combined DC/AC + tableid value
                for (codelen = 1;  codelen <= 16;  ++codelen)
                    counts[codelen - 1] = ctx.pos[codelen];
                _Skip(17);
                vlc = &ctx.vlctab[i][0];
                remain = spread = 65536;
                for (codelen = 1;  codelen <= 16;  ++codelen) {
                    // a code of this length covers 'spread' table entries
                    spread >>= 1;
                    currcnt = counts[codelen - 1];
                    if (!currcnt) continue;
                    if (ctx.length < currcnt) JPEG_DECODER_THROW(SyntaxError);
                    remain -= currcnt << (16 - codelen);
                    if (remain < 0) JPEG_DECODER_THROW(SyntaxError);
                    for (i = 0;  i < currcnt;  ++i) {
                        const unsigned char code = ctx.pos[i];
                        for (j = spread;  j;  --j) {
                            vlc->bits = (unsigned char) codelen;
                            vlc->code = code;
                            ++vlc;
                        }
                    }
                    _Skip(currcnt);
                }
                // entries not covered by any code are marked invalid
                while (remain--) {
                    vlc->bits = 0;
                    ++vlc;
                }
            }
            if (ctx.length) JPEG_DECODER_THROW(SyntaxError);
        }

        // Parse a DQT segment (one or more 8-bit quantization tables).
        inline void _DecodeDQT(void) {
            int i;
            unsigned char *t;
            _DecodeLength();
            if (ctx.error) return;
            while (ctx.length >= 65) {
                i = ctx.pos[0];
                if (i & 0xFC) JPEG_DECODER_THROW(SyntaxError);
                ctx.qtavail |= 1 << i;
                t = &ctx.qtab[i][0];
                for (i = 0;  i < 64;  ++i)
                    t[i] = ctx.pos[i + 1];
                _Skip(65);
            }
            if (ctx.length) JPEG_DECODER_THROW(SyntaxError);
        }

        // Parse a DRI segment (restart interval, in MCUs).
        inline void _DecodeDRI(void) {
            _DecodeLength();
            if (ctx.error) return;
            if (ctx.length < 2) JPEG_DECODER_THROW(SyntaxError);
            ctx.rstinterval = _Decode16(ctx.pos);
            _Skip(ctx.length);
        }

        // Decode one Huffman symbol using table 'vlc' followed by the
        // value bits it announces (low nibble of the symbol). Returns the
        // sign-extended value; the raw symbol is stored in *code if 'code'
        // is non-null. On an invalid code ctx.error is set, 0 is returned
        // and *code is left untouched.
        inline int _GetVLC(VlcCode* vlc, unsigned char* code) {
            int value = _ShowBits(16);
            int bits = vlc[value].bits;
            if (!bits) { ctx.error = SyntaxError; return 0; }
            _SkipBits(bits);
            value = vlc[value].code;
            if (code) *code = (unsigned char) value;
            bits = value & 15;
            if (!bits) return 0;
            value = _GetBits(bits);
            if (value < (1 << (bits - 1)))
                value += 1 - (1 << bits);  // negative value; avoids shifting -1
            return value;
        }

        // Decode, dequantize and inverse-transform one 8x8 block of
        // component 'c', writing the samples to 'out' (row pitch c->stride).
        inline void _DecodeBlock(Component* c, unsigned char* out) {
            // initialised so that a failing _GetVLC (which does not write
            // it) is treated as end-of-block instead of reading garbage
            unsigned char code = 0;
            int value, coef = 0;
            memset(ctx.block, 0, sizeof(ctx.block));
            c->dcpred += _GetVLC(&ctx.vlctab[c->dctabsel][0], NULL);
            ctx.block[0] = (c->dcpred) * ctx.qtab[c->qtsel][0];
            do {
                value = _GetVLC(&ctx.vlctab[c->actabsel][0], &code);
                if (!code) break;  // EOB
                if (!(code & 0x0F) && (code != 0xF0)) JPEG_DECODER_THROW(SyntaxError);
                coef += (code >> 4) + 1;  // high nibble = run of zeros
                if (coef > 63) JPEG_DECODER_THROW(SyntaxError);
                ctx.block[(int) ZZ[coef]] = value * ctx.qtab[c->qtsel][coef];
            } while (coef < 63);
            for (coef = 0;  coef < 64;  coef += 8)
                _RowIDCT(&ctx.block[coef]);
            for (coef = 0;  coef < 8;  ++coef)
                _ColIDCT(&ctx.block[coef], &out[coef], c->stride);
        }

        // Parse the SOS header and decode the entropy-coded scan into the
        // component planes. On success ctx.error is set to
        // Internal_Finished to stop the marker loop in _Decode.
        inline void _DecodeScan(void) {
            int i, mbx, mby, sbx, sby;
            int rstcount = ctx.rstinterval, nextrst = 0;
            Component* c;
            _DecodeLength();
            if (ctx.error) return;
            if (!ctx.ncomp) JPEG_DECODER_THROW(SyntaxError);  // no frame header yet
            if (ctx.length < (4 + 2 * ctx.ncomp)) JPEG_DECODER_THROW(SyntaxError);
            if (ctx.pos[0] != ctx.ncomp) JPEG_DECODER_THROW(Unsupported);
            _Skip(1);
            for (i = 0, c = ctx.comp;  i < ctx.ncomp;  ++i, ++c) {
                if (ctx.pos[0] != c->cid) JPEG_DECODER_THROW(SyntaxError);
                if (ctx.pos[1] & 0xEE) JPEG_DECODER_THROW(SyntaxError);
                c->dctabsel = ctx.pos[1] >> 4;
                c->actabsel = (ctx.pos[1] & 1) | 2;  // AC tables live at 2..3
                _Skip(2);
            }
            // spectral selection 0..63, no successive approximation
            if (ctx.pos[0] || (ctx.pos[1] != 63) || ctx.pos[2]) JPEG_DECODER_THROW(Unsupported);
            _Skip(ctx.length);
            for (mby = 0;  mby < ctx.mbheight;  ++mby)
                for (mbx = 0;  mbx < ctx.mbwidth;  ++mbx) {
                    for (i = 0, c = ctx.comp;  i < ctx.ncomp;  ++i, ++c)
                        for (sby = 0;  sby < c->ssy;  ++sby)
                            for (sbx = 0;  sbx < c->ssx;  ++sbx) {
                                _DecodeBlock(c, &c->pixels[((mby * c->ssy + sby) * c->stride + mbx * c->ssx + sbx) << 3]);
                                if (ctx.error)
                                    return;
                            }
                    if (ctx.rstinterval && !(--rstcount)) {
                        // expect the restart markers RST0..RST7 in sequence
                        _ByteAlign();
                        i = _GetBits(16);
                        if (((i & 0xFFF8) != 0xFFD0) || ((i & 7) != nextrst)) JPEG_DECODER_THROW(SyntaxError);
                        nextrst = (nextrst + 1) & 7;
                        rstcount = ctx.rstinterval;
                        for (i = 0;  i < 3;  ++i)
                            ctx.comp[i].dcpred = 0;
                    }
                }
            ctx.error = Internal_Finished;
        }

        // Upsampling filter taps, scaled by 128 (see CF()).
        enum {
            CF4A = (-9),
            CF4B = (111),
            CF4C = (29),
            CF4D = (-3),
            CF3A = (28),
            CF3B = (109),
            CF3C = (-9),
            CF3X = (104),
            CF3Y = (27),
            CF3Z = (-3),
            CF2A = (139),
            CF2B = (-11),
        };

        // Round and rescale a filter sum (divide by 128), then clip.
        inline unsigned char CF(const int x) {
            return _Clip((x + 64) >> 7);
        }

        // Double the width of component 'c' into a freshly allocated,
        // tightly packed plane that replaces c->pixels.
        // Requires c->width >= 3 (checked in _DecodeSOF).
        inline void _UpsampleH(Component* c) {
            const int xmax = c->width - 3;
            unsigned char *out, *lin, *lout;
            int x, y;
            out = (unsigned char*)AllocMem((c->width * c->height) << 1);
            if (!out) JPEG_DECODER_THROW(OutOfMemory);
            lin = c->pixels;
            lout = out;
            for (y = c->height;  y;  --y) {
                lout[0] = CF(CF2A * lin[0] + CF2B * lin[1]);
                lout[1] = CF(CF3X * lin[0] + CF3Y * lin[1] + CF3Z * lin[2]);
                lout[2] = CF(CF3A * lin[0] + CF3B * lin[1] + CF3C * lin[2]);
                for (x = 0;  x < xmax;  ++x) {
                    lout[(x << 1) + 3] = CF(CF4A * lin[x] + CF4B * lin[x + 1] + CF4C * lin[x + 2] + CF4D * lin[x + 3]);
                    lout[(x << 1) + 4] = CF(CF4D * lin[x] + CF4C * lin[x + 1] + CF4B * lin[x + 2] + CF4A * lin[x + 3]);
                }
                // advance to the next line first, then fill in the last
                // three samples of the line just processed
                lin += c->stride;
                lout += c->width << 1;
                lout[-3] = CF(CF3A * lin[-1] + CF3B * lin[-2] + CF3C * lin[-3]);
                lout[-2] = CF(CF3X * lin[-1] + CF3Y * lin[-2] + CF3Z * lin[-3]);
                lout[-1] = CF(CF2A * lin[-1] + CF2B * lin[-2]);
            }
            c->width <<= 1;
            c->stride = c->width;
            FreeMem(c->pixels);
            c->pixels = out;
        }

        // Double the height of component 'c' into a freshly allocated,
        // tightly packed plane that replaces c->pixels.
        // Requires c->height >= 3 (checked in _DecodeSOF).
        inline void _UpsampleV(Component* c) {
            const int w = c->width, s1 = c->stride, s2 = s1 + s1;
            unsigned char *out, *cin, *cout;
            int x, y;
            out = (unsigned char*)AllocMem((c->width * c->height) << 1);
            if (!out) JPEG_DECODER_THROW(OutOfMemory);
            for (x = 0;  x < w;  ++x) {
                cin = &c->pixels[x];
                cout = &out[x];
                *cout = CF(CF2A * cin[0] + CF2B * cin[s1]);  cout += w;
                *cout = CF(CF3X * cin[0] + CF3Y * cin[s1] + CF3Z * cin[s2]);  cout += w;
                *cout = CF(CF3A * cin[0] + CF3B * cin[s1] + CF3C * cin[s2]);  cout += w;
                cin += s1;
                for (y = c->height - 3;  y;  --y) {
                    *cout = CF(CF4A * cin[-s1] + CF4B * cin[0] + CF4C * cin[s1] + CF4D * cin[s2]);  cout += w;
                    *cout = CF(CF4D * cin[-s1] + CF4C * cin[0] + CF4B * cin[s1] + CF4A * cin[s2]);  cout += w;
                    cin += s1;
                }
                cin += s1;
                *cout = CF(CF3A * cin[0] + CF3B * cin[-s1] + CF3C * cin[-s2]);  cout += w;
                *cout = CF(CF3X * cin[0] + CF3Y * cin[-s1] + CF3Z * cin[-s2]);  cout += w;
                *cout = CF(CF2A * cin[0] + CF2B * cin[-s1]);
            }
            c->height <<= 1;
            c->stride = c->width;
            FreeMem(c->pixels);
            c->pixels = out;
        }

        // Produce the final image: upsample subsampled planes to full size,
        // then either convert YCbCr to interleaved RGB (3 components) or
        // compact the grayscale plane so that it has no row padding.
        inline void _Convert() {
            int i;
            Component* c;
            for (i = 0, c = ctx.comp;  i < ctx.ncomp;  ++i, ++c) {
                while ((c->width < ctx.width) || (c->height < ctx.height)) {
                    if (c->width < ctx.width) _UpsampleH(c);
                    if (ctx.error) return;
                    if (c->height < ctx.height) _UpsampleV(c);
                    if (ctx.error) return;
                }
                if ((c->width < ctx.width) || (c->height < ctx.height)) JPEG_DECODER_THROW(InternalError);
            }
            if (ctx.ncomp == 3) {
                // convert to RGB (coefficients are scaled by 256)
                int x, yy;
                unsigned char *prgb = ctx.rgb;
                const unsigned char *py  = ctx.comp[0].pixels;
                const unsigned char *pcb = ctx.comp[1].pixels;
                const unsigned char *pcr = ctx.comp[2].pixels;
                for (yy = ctx.height;  yy;  --yy) {
                    for (x = 0;  x < ctx.width;  ++x) {
                        const int y = py[x] << 8;
                        const int cb = pcb[x] - 128;
                        const int cr = pcr[x] - 128;
                        *prgb++ = _Clip((y            + 359 * cr + 128) >> 8);
                        *prgb++ = _Clip((y -  88 * cb - 183 * cr + 128) >> 8);
                        *prgb++ = _Clip((y + 454 * cb            + 128) >> 8);
                    }
                    py += ctx.comp[0].stride;
                    pcb += ctx.comp[1].stride;
                    pcr += ctx.comp[2].stride;
                }
            } else if (ctx.comp[0].width != ctx.comp[0].stride) {
                // grayscale -> only remove stride
                // (row 0 is already in place; rows move towards the front,
                // so source and destination of one copy never overlap)
                unsigned char *pin = &ctx.comp[0].pixels[ctx.comp[0].stride];
                unsigned char *pout = &ctx.comp[0].pixels[ctx.comp[0].width];
                int y;
                for (y = ctx.comp[0].height - 1;  y;  --y) {
                    memcpy(pout, pin, ctx.comp[0].width);
                    pin += ctx.comp[0].stride;
                    pout += ctx.comp[0].width;
                }
                ctx.comp[0].stride = ctx.comp[0].width;
            }
        }

        // Top-level driver: check the SOI signature, dispatch on each
        // marker until the scan has been decoded, then build the output.
        // Returns the final status. Note that some failures are reported
        // only through the return value and not through ctx.error, so the
        // caller has to store the result.
        DecodeResult _Decode(const unsigned char* jpeg, const int size) {
            ctx.pos = (const unsigned char*) jpeg;
            ctx.size = size & 0x7FFFFFFF;
            if (ctx.size < 2) return NotAJpeg;
            if ((ctx.pos[0] ^ 0xFF) | (ctx.pos[1] ^ 0xD8)) return NotAJpeg;
            _Skip(2);
            while (!ctx.error) {
                if ((ctx.size < 2) || (ctx.pos[0] != 0xFF)) return SyntaxError;
                _Skip(2);
                switch (ctx.pos[-1]) {
                    case 0xC0: _DecodeSOF();  break;
                    case 0xC4: _DecodeDHT();  break;
                    case 0xDB: _DecodeDQT();  break;
                    case 0xDD: _DecodeDRI();  break;
                    case 0xDA: _DecodeScan(); break;
                    case 0xFE: _SkipMarker(); break;   // comment
                    default:
                        if ((ctx.pos[-1] & 0xF0) == 0xE0)
                            _SkipMarker();             // APPn
                        else
                            return Unsupported;
                }
            }
            if (ctx.error != Internal_Finished) return ctx.error;
            ctx.error = OK;
            _Convert();
            return ctx.error;
        }
    };


// Decodes 'data' immediately; the outcome is available from GetResult().
inline Decoder::Decoder(const unsigned char* data, size_t size, void *(*allocFunc)(size_t), void (*freeFunc)(void*))
    : AllocMem(allocFunc)
    , FreeMem(freeFunc)
{
    // should be static data, but this keeps us as a header
    const char temp[64] = { 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18,
        11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35,
        42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45,
        38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63 };
    memcpy(ZZ, temp, sizeof(ZZ));
    memset(&ctx, 0, sizeof(Context));
    // The decoder counts bytes in an int, so clamp oversized inputs.
    const size_t maxSize = 0x7FFFFFFF;
    // _Decode reports several failures (NotAJpeg, bad marker, ...) only via
    // its return value; store it so GetResult() does not claim OK for them.
    ctx.error = _Decode(data, static_cast<int>(size > maxSize ? maxSize : size));
}

inline Decoder::DecodeResult Decoder::GetResult() const { return ctx.error; }
inline int Decoder::GetWidth() const { return ctx.width; }
inline int Decoder::GetHeight() const { return ctx.height; }
inline bool Decoder::IsColor() const { return ctx.ncomp != 1; }
inline unsigned char* Decoder::GetImage() const { return (ctx.ncomp == 1) ? ctx.comp[0].pixels : ctx.rgb; }
inline size_t Decoder::GetImageSize(void) const { return static_cast<size_t>(ctx.width) * ctx.height * ctx.ncomp; }

// Releases the component planes and the RGB buffer (if any).
inline Decoder::~Decoder()
{
    int i;
    for (i = 0;  i < 3;  ++i)
        if (ctx.comp[i].pixels) FreeMem((void*) ctx.comp[i].pixels);
    if (ctx.rgb) FreeMem((void*) ctx.rgb);
}

}
#ifdef _MSC_VER
    #pragma warning(pop)
#endif

#undef JPEG_DECODER_THROW

#endif

