Character name hash function (was Re: Unique character names)

From: Mike (mike-list@pobox.com)
Date: Wed Dec 19 2007 - 23:20:14 CST

  • Next message: William J Poser: "Re: CLDR Usage of Gregorian Calendar Era Terms: BC and AD -- Can we please have "CE" and "BCE" ?"

    > (... I'm playing around with
    > algorithms for efficient storage of character names.)

    I've built a Unicode character name hash function that
    takes as input a character name and returns a 32-bit
    unsigned integer hash of the name. A unique value is
    returned for every character name as of version 5.0,
    and also produces no conflicts with named sequences.
    It is case-insensitive and treats hyphens, underscores,
    and spaces as equivalent (except for the special cases
    noted).

    This can be used to look up a character given its name,
    where the hash of the name is used as the key and the
    code point is the value; if you are going the other way,
    then I'm sorry for mis-interpreting what you're looking
    for.

    The C source code is attached -- I'll place it in the
    public domain. There is of course no warranty.

    Mike


    /* CharNameHash takes as input the name of a Unicode character
       and returns a 32-bit unsigned integer hash of the name which
       is unique among all characters in Unicode version 5.0. There
       are also no conflicts when using this function to hash the
       names of named character sequences.

       CharNameHash also recognizes the names of control characters
       such as "control A", "control-[", or "^C".

       Note that the variable names use a sort of Hungarian notation
       which makes it easier for me to understand code I've written
       weeks or months ago. A quick reference:

          c char
          u unsigned integer (any size)
          pz pointer to zero-terminated string (e.g. const char*)
    */

    #include <ctype.h>

    /* you may need to define:
    typedef unsigned int uint32;
    typedef unsigned char uchar;
    */

    uint32 CharNameHash (const char* pz_CharacterName)
    {
    #define u_StateStart 0
    #define u_StateSpace 1
    #define u_StateHyphen 2
    #define u_StateO 3
    #define u_StateOHyphen 4
    #define u_StateAE 5
    #define u_StateCaret 6
    #define u_StateL 7
    #define u_StateLSpace 8
    #define u_StateLUnder 9

        uint32 u_Hash = 0;
        uint32 u_State = u_StateStart;
        uint32 u_Base = 0x20;

        /* space, underscore, and hyphen are ignored in general, but we need
        / to distinguish the following characters:
        /
        / tibetan letter a (U+0F68)
        / tibetan letter -a (U+0F60)
        /
        / tibetan subjoined letter a (U+0FB8)
        / tibetan subjoined letter -a (U+0FB0)
        /
        / hangul jungseong oe (U+116C)
        / hangul jungseong o-e (U+1180)
        /
        / so the hash is incremented if we see either a space or underscore
        / followed by "-a", or if we see a space or underscore followed by
        / "o-e" (for the korean character)
        /
        / we also need to recognize the following character names:
        /
        / control _
        / ^_
        /
        / to represent code point U+001F, so if we see either ^ or L (L can
        / be followed by an optional space) and an underscore, we treat the
        / underscore as a regular character
        */

        if (pz_CharacterName)
        {
            for (;;)
            {
                char c = *pz_CharacterName++;

                switch (c)
                {
                    case '\0':
                    {
                        if (u_State == u_StateAE)
                        {
                            ++u_Hash;
                        }
                        else if (u_State == u_StateLUnder)
                        {
                            c = '_' - u_Base;

                            u_Hash += u_Hash << 6;
                            u_Hash ^= (uchar) c;
                        }

                        return u_Hash;
                    }

                    case ' ':
                    {
                        switch (u_State)
                        {
                            case u_StateAE:
                            {
                                ++u_Hash;

                                u_State = u_StateSpace;

                                break;
                            }

                            case u_StateCaret:
                            {
                                break;
                            }

                            case u_StateL:
                            case u_StateLSpace:
                            {
                                u_State = u_StateLSpace;

                                break;
                            }

                            case u_StateLUnder:
                            {
                                c = '_' - u_Base;

                                u_Hash += u_Hash << 6;
                                u_Hash ^= (uchar) c;

                                u_State = u_StateSpace;

                                break;
                            }

                            default:
                            {
                                u_State = u_StateSpace;

                                break;
                            }
                        }

                        break;
                    }

                    case '_':
                    {
                        switch (u_State)
                        {
                            case u_StateAE:
                            {
                                ++u_Hash;

                                u_State = u_StateSpace;

                                break;
                            }

                            case u_StateSpace:
                            {
                                u_State = u_StateHyphen;

                                break;
                            }

                            case u_StateO:
                            {
                                u_State = u_StateOHyphen;
                                
                                break;
                            }

                            case u_StateCaret:
                            {
                                c = '_' - u_Base;

                                u_Hash += u_Hash << 6;
                                u_Hash ^= (uchar) c;

                                u_State = u_StateStart;

                                break;
                            }

                            case u_StateL:
                            case u_StateLSpace:
                            case u_StateLUnder:
                            {
                                u_State = u_StateLUnder;

                                break;
                            }

                            default:
                            {
                                u_State = u_StateSpace;

                                break;
                            }
                        }

                        break;
                    }

                    case '-':
                    {
                        switch (u_State)
                        {
                            case u_StateAE:
                            {
                                ++u_Hash;

                                u_State = u_StateSpace;

                                break;
                            }

                            case u_StateSpace:
                            {
                                u_State = u_StateHyphen;

                                break;
                            }

                            case u_StateO:
                            {
                                u_State = u_StateOHyphen;
                                
                                break;
                            }

                            case u_StateL:
                            {
                                u_State = u_StateLSpace;

                                break;
                            }

                            default:
                            {
                                u_State = u_StateSpace;
                                
                                break;
                            }
                        }

                        break;
                    }

                    case '^':
                    {
                        c = '^' - u_Base;

                        u_Hash += u_Hash << 6;
                        u_Hash ^= (uchar) c;

                        if (u_State == u_StateCaret)
                        {
                            u_State = u_StateStart;
                        }
                        else
                        {
                            u_State = u_StateCaret;
                        }

                        break;
                    }

                    case 'a':
                    case 'A':
                    {
                        c = 'A' - u_Base;

                        u_Hash += u_Hash << 6;
                        u_Hash ^= (uchar) c;

                        if (u_State == u_StateHyphen)
                        {
                            u_State = u_StateAE;
                        }
                        else
                        {
                            u_State = u_StateStart;
                        }

                        break;
                    }

                    case 'o':
                    case 'O':
                    {
                        c = 'O' - u_Base;

                        u_Hash += u_Hash << 6;
                        u_Hash ^= (uchar) c;

                        if (u_State == u_StateSpace)
                        {
                            u_State = u_StateO;
                        }
                        else
                        {
                            u_State = u_StateStart;
                        }

                        break;
                    }

                    case 'e':
                    case 'E':
                    {
                        c = 'E' - u_Base;

                        u_Hash += u_Hash << 6;
                        u_Hash ^= (uchar) c;

                        if (u_State == u_StateOHyphen)
                        {
                            u_State = u_StateAE;
                        }
                        else
                        {
                            u_State = u_StateStart;
                        }

                        break;
                    }

                    case 'l':
                    case 'L':
                    {
                        c = 'L' - u_Base;

                        u_Hash += u_Hash << 6;
                        u_Hash ^= (uchar) c;

                        u_State = u_StateL;

                        break;
                    }

                    default:
                    {
                        c = (char) (toupper (c) - u_Base);

                        u_Hash += u_Hash << 6;
                        u_Hash ^= (uchar) c;

                        u_State = u_StateStart;

                        break;
                    }
                }
            }
        }

        return u_Hash;
    }



    This archive was generated by hypermail 2.1.5 : Wed Dec 19 2007 - 23:53:05 CST