28/12/2009

Optimisation: Getting it right... eventually

I keep forgetting how best to find the integer part of the logarithm base 2 of an integer - there are naive methods which are usually good enough:
    __forceinline UInteger32 IntLog2Naive() const
    {
        UInteger32 xReturn = 0;
        unsigned int uTest = m_uData >> 1;
        while( uTest )
        {
            uTest >>= 1;
            ++xReturn;
        }

        return xReturn;
    }

    __forceinline UInteger32 IntLog2Naive2() const
    {
        for( unsigned int u = 31; u != 0; --u )
        {
            if( m_uData & ( 1 << u ) )
            {
                return u;
            }
        }

        return 0;
    }
There is also a binary search method:
    __forceinline UInteger32 IntLog2BinarySearch() const
    {
        UInteger32 xReturn = 0;
        unsigned int uCopy = m_uData;

        if( uCopy & 0xffff0000 )
        {
            xReturn += 16;
            uCopy >>= 16;
        }

        if( uCopy & 0xff00 )
        {
            xReturn += 8;
            uCopy >>= 8;
        }

        if( uCopy & 0xf0 )
        {
            xReturn += 4;
            uCopy >>= 4;
        }

        if( uCopy & 0xc )
        {
            xReturn += 2;
            uCopy >>= 2;
        }

        if( uCopy & 0x2 )
        {
            xReturn += 1;
        }

        return xReturn;
    }
With floating point operations we can also do this:
    __forceinline UInteger32 IntLog2FloatCast() const
    {
        double dData = static_cast< double >( m_uData );

        // fpu control word must be set to round down
        // and for double precision
        __asm
        {
            fld1
            fld qword ptr [ dData ]
            fyl2x
            frndint
            fst qword ptr [ dData ]
        }

        return UInteger32( static_cast< unsigned int >( dData ) );
    }
However, the correct solution for my target platform is none of the above, but is instead this, using an opcode which I stumbled upon in the Intel manuals:
    __forceinline UInteger32 IntLog2() const
    {
        __asm
        {
            mov ecx, dword ptr[ this ]
            bsr eax, dword ptr[ ecx ]
        }
    }
This is pretty much the same as the first result on Google as well... so I really should have looked before solving the problem myself.

EDIT: I should probably point out that the first 3 compile out when the value is constant, but the latter two do not - although the last one is the fastest for values which are not known at compile time.

24/12/2009

It made Wikipedia....

http://news.bbc.co.uk/1/hi/uk/8425718.stm

I ditched my car and walked when I got within a mile of home - collected it the next morning. :)

EDIT: whoops, wrong link: http://en.wikipedia.org/wiki/December_2009_European_snowfall

Christmas Coding Tips

Here is my Christmas present to you this year, some of my favorite coding tips:

Use plenty of whitespace. Do you prefer tabs or spaces? Who cares, just make sure you use plenty of whitespace, especially newlines in your code. The motivation is making the code easier to read. Consider the following example from Fridge:
void Panel::Render()
{
    if( !m_bVisible )
    {
        return;
    }

    if( m_xImage != INVALID_STRING_HASH )
    {
        int iTextureID = TextureManager::GetTextureID( m_xImage );
        
        glColor4f( 1.0f, 1.0f, 1.0f, 1.0f );
        glBindTexture( GL_TEXTURE_2D, iTextureID );
    }
    else
    {
        glBindTexture( GL_TEXTURE_2D, INVALID_TEXTURE_ID );
        
        u_int uSwapped = MathsHelper::RGBAtoABGR( m_uColour );

        glColor4ubv( reinterpret_cast<GLubyte*>( &uSwapped ) );
    }

    glBegin( GL_QUADS );

    glTexCoord2f( 0.0f, 0.0f );
    glVertex2i( GetResizedX(), GetResizedY() );
    glTexCoord2f( 0.0f, 1.0f );
    glVertex2i( GetResizedX(), GetResizedY() + GetResizedHeight() );
    glTexCoord2f( 1.0f, 1.0f );
    glVertex2i( GetResizedX() + GetResizedWidth(), GetResizedY() + GetResizedHeight() );
    glTexCoord2f( 1.0f, 0.0f );
    glVertex2i( GetResizedX() + GetResizedWidth(), GetResizedY() );

    glEnd();

    if( m_bBorder )
    {
        const float fPositionX = static_cast< float >( GetResizedX() );
        const float fPositionY = static_cast< float >( GetResizedY() );
        const float fWidth = static_cast< float >( GetResizedWidth() );
        const float fHeight = static_cast< float >( GetResizedHeight() );

        glBindTexture( GL_TEXTURE_2D, INVALID_TEXTURE_ID );

        u_int uSwapped = MathsHelper::RGBAtoABGR( uPANEL_BORDER_COLOUR );

        glColor4ubv( reinterpret_cast<GLubyte*>( &uSwapped ) );

        glBegin( GL_LINE_LOOP );

        // Top
        // glVertex2f( fPositionX, fPositionY );
        glVertex2f( fPositionX + fWidth, fPositionY );

        // Right
        // glVertex2f( fPositionX + fWidth, fPositionY );
        glVertex2f( fPositionX + fWidth, fPositionY + fHeight );

        // Bottom
        // glVertex2f( fPositionX + fWidth , fPositionY + fHeight );
        glVertex2f( fPositionX, fPositionY + fHeight );

        // Left
        // glVertex2f( fPositionX, fPositionY + fHeight );
        glVertex2f( fPositionX, fPositionY );


        glEnd( );

    }

    Widget::Render();
}
And again with more spartan use of whitespace:
    if(!m_bVisible) return;
    if(m_xImage!=INVALID_STRING_HASH)
    {
        int iTextureID=TextureManager::GetTextureID(m_xImage);
        glColor4f(1.0f,1.0f,1.0f,1.0f);
        glBindTexture(GL_TEXTURE_2D,iTextureID);
    }
    else
    {
        glBindTexture(GL_TEXTURE_2D,INVALID_TEXTURE_ID);
        u_int uSwapped=MathsHelper::RGBAtoABGR(m_uColour);
        glColor4ubv(reinterpret_cast<GLubyte*>(&uSwapped));
    }
    glBegin(GL_QUADS);
    glTexCoord2f(0.0f,0.0f);
    glVertex2i(GetResizedX(),GetResizedY());
    glTexCoord2f(0.0f,1.0f);
    glVertex2i(GetResizedX(),GetResizedY()+GetResizedHeight());
    glTexCoord2f(1.0f,1.0f);
    glVertex2i(GetResizedX()+GetResizedWidth(),GetResizedY()+GetResizedHeight());
    glTexCoord2f(1.0f,0.0f);
    glVertex2i(GetResizedX()+GetResizedWidth(),GetResizedY());
    glEnd();
    if(m_bBorder)
    {
        const float fPositionX=static_cast<float>(GetResizedX());
        const float fPositionY=static_cast<float>(GetResizedY());
        const float fWidth=static_cast<float>(GetResizedWidth());
        const float fHeight=static_cast<float>(GetResizedHeight());
        glBindTexture(GL_TEXTURE_2D,INVALID_TEXTURE_ID);
        u_int uSwapped=MathsHelper::RGBAtoABGR(uPANEL_BORDER_COLOUR);
        glColor4ubv(reinterpret_cast<GLubyte*>(&uSwapped));
        glBegin(GL_LINE_LOOP);
        // Top
        // glVertex2f(fPositionX,fPositionY);
        glVertex2f(fPositionX+fWidth,fPositionY);
        // Right
        // glVertex2f(fPositionX+fWidth,fPositionY);
        glVertex2f(fPositionX+fWidth,fPositionY+fHeight);
        // Bottom
        // glVertex2f(fPositionX+fWidth ,fPositionY+fHeight);
        glVertex2f(fPositionX,fPositionY+fHeight);
        // Left
        // glVertex2f(fPositionX,fPositionY+fHeight);
        glVertex2f(fPositionX,fPositionY);
        glEnd();
    }
    Widget::Render();
In C++ this extends to using scope wherever possible, single line if or while statements can be difficult to debug in MSVS and using scope inside of switch statements is useful since you gain the ability to declare variables local only to that scope. Without spaces and scope the code becomes a blob of characters which is inherently difficult to read - even with comments, good variable names, Hungarian notation and efficient code.

Check your pointers. If you are using C, C++ or some other language with raw memory access and pointers then initialise them to some value (0x00000000 and 0xCCCCCCCC are popular) and make sure they have been set to something else before dereferencing them. Even though I try to be religious about this I still get stung with an uninitialised pointer to debug every once in a while, and unless it breaks things horribly it can be subtle and difficult to trace. Then again, it hurts to check them mindlessly - a silent, well handled pointer failure can hide problems if some rarely used feature breaks. My usual solutions are to report such failures via assertions, returning an error code from the function or presenting an error message to the user. If you care that much about the speed you can lose by doing all of this, you should have a seperate build configuration which will compile out any pointer checks that should be redundant along with the assert messages and the like.

Look at the assembler code. You should probably step through it too if you are getting confusing results when stepping through the source code.This one applies especially to C++ and optimisation, because its difficult to guess whether something will optimise away or not - the only way to be sure is to inspect the assembler code, it might also reveal more about the effects of compiler options, inlining and other features which are otherwise invisible when only looking at the source code - if you have inline assembler statements then you can also make sure that they are integrated with the surrounding code correctly, or are being compiled the way you want them to be. MSVS helps a lot here, interleaving your C++ code segments or inline assembler instructions with the compiled code, making it easier to interpret.

Practice. Self explanatory really.

Merry Christmas! :)

13/12/2009

More maths

I stumbled on something quite surprising recently whilst reading up on primality tests - there is known to be a certain polynomial (see here) whose positive integer values is precisely the set of primes. This polynomial is implied to exist from an interesting property relating Diophantine equations and countable sets. Its quite a monstrous thing, with order 25 and 26 variables - just enough that we can write it without running out of letters.


Each combination of the 26 variables (which must be positive and non-zero) produces either a prime or a negative number.

Can turn this polynomial into a primality test? Unfortunately solving Diophantine equations is a very hard problem, and it has been proven that no general algorithm exists (Hilbert's tenth problem). Although there is nothing to say that this particular one can not be solved.

Something I find interesting is that this polynomial can be reduced to a function of a single variable, although I haven't been able to find such a formulation, and the mathematical notation for it would be horrible, implementing a C++ function is possible. This feels like a step a closer to a function which maps n to the nth prime.

The problem is reduced to evaluating this polynomial using values from a second function (or 26 extra functions) which convert an integer into 26 values mapping the set of positive non-zero integers precisely to the set of permutations of 26 positive non-zero integers. But do such functions exist? Well, the case for two values without the restriction of being positive or non-zero is especially easy to solve geometrically, observing that a sort of square spiral maps a one dimensional value (distance along the spiral from the centre) to a sequence of unique permutations of 2 integers.

Extending this to three or more dimensions is also possible - here is a hand waving derivation/proof by induction: The spiral can be seen as some concentric squares, and in three dimensions these can be replaced with cubes, then we can iterate across the integral points on the surfaces of the concentric cubes, starting from the origin, then using a 2x2x2 cube, a 4x4x4 cube and so on - which can be reduced to the two dimensional case being applied to each of the six faces of each of the cubes (for induction, in practice we would just nest two "for" loops). We can extend this to hypercubes, since they have cubic 'faces', and so on up through the dimensions until we reach 26.

Now, what about removing the zeros and the negatives? We can map the negative numbers n to |2n| and the positives and zero to 2n+1. It is possible to directly produce a sequence of permutations which is already positive and non-zero, however I couldn't find a way to simplify the logic as nicely as by using this method. e.g. We could start from a different "spiral" and use a set of overlapping cubes, hypercubes etc.

I should probably go and write this code now, and see how useful it really is... but at first glance it looks like this might be a practical method for generating enormous primes complete with a proof of their primality and a step towards mapping n to the nth prime. We just have to plug huge values of x in to the function until the result is positive. The proof comes from the polynomial, and the fact that the only positive values which can be represented with this polynomial when all of the variables are positive and non-zero are primes.

13/11/2009

Eratosthenes' sieve

I've recently been working on various side projects to help hone my programming skills. One previous project that I took way too far and enjoyed working on was a set of tools for factorising numbers and testing for primality, so I decided to have another pop... unfortunately I've long since lost the source code.

One of the first things I decided to recreate was a modified version of Eratothenes' sieve I had written before, which has similar run-time complexity to the original algorithm, but requiring much less memory, roughly on the order of n/ln(n), or more accurately the number of primes below n and a constant, where n is the amount of numbers tested.

Eratosthenes sieve uses relatively simple techniques to find all prime numbers below n without requiring the expensive calculation of many remainders from divisions (mod, or % in C like languages). In doing so it simultaneously tests a large batch of numbers for primality significantly faster than testing the individual numbers, even with modern primality tests, with a run-time on the order of n(logn)(loglogn) and memory usage proportional to n where n is the amount of numbers being tested. An outline of the original algorithm can be found on Wikipedia.

Despite my love of reinventing the wheel, I couldn't really be bothered to re-write it, especially since I had forgotten much of the logic underlying my modification of the algorithm, so I thought I'd look on the internet to try to find source code or pseudocode to help make the process easier on my brain.

The first source code I found was correct, but unfortunately it missed what makes it so fast. Wikipedia was not much help either, and I found much of the discussion on the page to be somewhat pointless. It discusses "Turner's sieve" which in fact is just a functional language implementation of the sieve, which makes the same mistake as the first source code I found. It also mentions that someone recently proved this to be slower than other implementations in imperative languages. Both of these things hardly seems worth mentioning or even attributing credit for since they are quite obvious from the nature of the algorithm. I looked at another page, and although the implementation was better it still didn't seem particularly good, using multiplication where it was not necessary and incrementing a value at the same time.

There are two more I looked at, both with similar flaws. I don't think they really deserve the high ranking they have on Google. But, is that a realistic expectation? I'm probably boosting their rankings by some small amount by linking them from here. It seems that there is more and more poor quality information available through Google, Wikipedia and the rest of the internet and as much as the internet is a great tool for learning, there is also a tremendous wealth of bad information out there to be careful of. These pages do deliver what they promise, but it would have been nicer to see some much better implementations - after all the algorithm is extremely simple.

So here is my implementation, its not perfect and can certainly be optimised further, but it does the job. The most important feature is that it retains a large amount of the speed despite being quite long winded, and the speed it does lose is to reduce the memory overhead. The mod operation is used only once for every prime removed from the list during each pass of the array used to store numbers that need to be tested, the rest is all fast bitwise operations and additions. It can easily be modified to allow for larger numbers by replacing the unsigned ints with a suitable type or class. It happily generates the first 10,000,000 primes in about three and a half seconds.
#include <math.h>
#include <stdio.h>

static const unsigned int uWORKING_ARRAY_SIZE = 1024;

void EratosthenesSieve( unsigned int uCount, unsigned int* puOutput, const bool bOutput )
{
    if( !puOutput || uCount == 0 )
    {
        return;
    }
    
    for( unsigned int u = 1; u < uCount; ++u )
    {
        puOutput[ u ] = 0;
    }
    puOutput[ 0 ] = 2;

    unsigned int auWorkingArray[ uWORKING_ARRAY_SIZE ];
    unsigned int uPrimesFound = 1;
    unsigned int uStartPosition = 3;

    // do a first pass to find the primes up to uWORKING_ARRAY_SIZE
    auWorkingArray[ 0 ] = uStartPosition;
    for( unsigned int u = 1; u < uWORKING_ARRAY_SIZE; ++u )
    {
        auWorkingArray[ u ] = auWorkingArray[ u - 1 ] + 2;
    }

    unsigned int uPos = 0;
    while( uPos < uWORKING_ARRAY_SIZE && uPrimesFound < uCount )
    {
        const unsigned int uTestPrime = auWorkingArray[ uPos ];
        puOutput[ uPrimesFound ] = uTestPrime;
        ++uPrimesFound;
        // only zero out entries from the square of the prime onwards
        // all other entries should have already been removed
        for( unsigned int u = ( uTestPrime * uTestPrime >> 1 ) - 1; u < uWORKING_ARRAY_SIZE; u += uTestPrime )
        {
            auWorkingArray[ u ] = 0;
        }
        
        ++uPos;
        while( auWorkingArray[ uPos ] == 0 && uPos < uWORKING_ARRAY_SIZE )
        {
            ++uPos;
        }
    }

    uStartPosition += uWORKING_ARRAY_SIZE << 1;

    // now find primes until we reach uCount, if we haven't already
    while( uPrimesFound < uCount )
    {
        if( bOutput )
        {
            printf( "Found %d primes so far...\r\n", uPrimesFound );
        }
        // fill out the working array, ignoring multiples of 2
        auWorkingArray[ 0 ] = uStartPosition;
        for( unsigned int u = 1; u < uWORKING_ARRAY_SIZE; ++u )
        {
            auWorkingArray[ u ] = auWorkingArray[ u - 1 ] + 2;
        }

        // 

        // remove multiples of any already found primes, except 2
        unsigned int uUpperPrimeLimit = static_cast< unsigned int >( ceil( sqrt( static_cast< double >( auWorkingArray[ uWORKING_ARRAY_SIZE - 1 ] ) ) ) );
        const unsigned int uOriginalFirstWorkingValue = auWorkingArray[ 0 ];
        for( unsigned int u = 1; u < uPrimesFound, puOutput[ u ] < uUpperPrimeLimit; ++u )
        {
            const unsigned int uTestPrime = puOutput[ u ];
            const unsigned int uStartMod = uOriginalFirstWorkingValue % uTestPrime;
            unsigned int uPos;
            if( uStartMod == 0 )
            {
                uPos = 0;
            }
            else if( uStartMod & 0x1 )
            {
                uPos = ( uTestPrime >> 1 ) - ( uStartMod >> 1 );
            }
            else
            {
                uPos = uTestPrime - ( uStartMod >> 1 );
            }
            
            while( uPos < uWORKING_ARRAY_SIZE )
            {
                auWorkingArray[ uPos ] = 0;
                uPos += uTestPrime;
            }
        }

        // step through numbers until picking out the non-zero values, which are prime
        unsigned int uPos = 0;
        while( uPos < uWORKING_ARRAY_SIZE )
        {
            while( auWorkingArray[ uPos ] == 0 )
            {
                ++uPos;
            }

            if( uPos >= uWORKING_ARRAY_SIZE )
            {
                break;
            }

            puOutput[ uPrimesFound ] = auWorkingArray[ uPos ];
            ++uPrimesFound;

            if( uPrimesFound == uCount )
            {
                break;
            }

            ++uPos;
        }

        uStartPosition += uWORKING_ARRAY_SIZE << 1;
    }
}

24/10/2009

Yet another car...


Yup, I traded in the 323f for this pretty thing... a Daewoo Nubira 1.6 SE.