海量数据处理：一亿个浮点数的排序算法

有1亿个浮点数，请找出其中最小的10000个。提示：假设每个浮点数占4个字节，1亿个浮点数就要站到相当大的空间，因此不能一次将全部读入内存进行排序。
问题分析：
1) 1亿个浮点数，其数据大小为 400 M。如此规模的排序，首先想到分批处理。每次读取 1 000 000 个数据并进行快速排序。需要的内存空间为 1 000 000 * 4 = 4M。需要100 次这样的排序。

2）完全没的规律的数据，考虑使用快速排序。快速排序的平均复杂度是 O( Nlog(N) )。我们可以直接使用 stl 提供的全局函数 sort() ，它使用了快速排序算法（实际是三平均分区法　median-of-three　）。
3) 最后只要最大的 10000 个。则每个批次只需要保留排序结果的前 10000 个数据。这段数据已经是分段有序的。数据量为 10000 * 100。
解法：
1) 数据结构定义：
定义数据规模：

enum
{
    batchCapacity = 1000000 ,
    batchCount     = 100 ,
    resultCount     = 10000
}

２）生成数据样本：

void dataPrepare( const char * filName  )
{
     float * pbuf;
     if ( ( pbuf = (   float * )malloc( batchCapacity   * sizeof ( float ) )) == NULL )
    {
         throw ( " failed to malloc " );
    }
     //   生成 batchCapacity * batchCount 个随机实数，并保存到文件
    ofstream fout;
    fout.exceptions(std::ios::badbit | std::ios::failbit | std::ios::eofbit );
    fout.open( filName, ios::binary ) ;
     if ( ! fout   )
    {
         throw ( " file not exits " );
    }
     for ( size_t index = 0 ; index < batchCount; index ++ )
    {
         for ( size_t index = 0 ; index < batchCapacity; index ++   )
        {
            pbuf[index] = RandomFloat( 0 , 65537 );
        }
        fout.write( ( char * )pbuf, batchCapacity * sizeof ( float )  );
    }
    fout.close();
    delete pbuf;
    pbuf = NULL;
     return ;
}

以上，用 RandomFloat() 生成随机数。其定义如下：

/* *******************************************
* Rand::rand 线性同余算法获得随机数
* 会循环出现相同的数。有待改进
*
******************************************** */

#include < cstdlib >
#include < ctime >
class Rand
{
public :
     static long long r;
     static int rand() // 产生随机数
    {
       // 三个参数的取值　关键字：辗转相除二次同余
        r = ( r * 1010557   + 79390691 ) % 100663363 ;
         return r;
    }
};
long long Rand::r = 43215 ;

float RandomFloat( float low, float high) {
     float d = float ( Rand::rand()) / ( float (RAND_MAX) + 1 );
         return low + d * (high - low);
}

3 ) 排序

void dataOrder( const char * filName  )
{
     float * pbuf   = (   float * )malloc( batchCapacity   * sizeof ( float ) );
     if ( pbuf == NULL )
    {
         throw ( " failed to malloc " );
    }
    ifstream fin;
    ofstream fout;
    fin.exceptions(std::ios::badbit | std::ios::failbit | std::ios::eofbit );
    fout.exceptions(std::ios::badbit | std::ios::failbit | std::ios::eofbit );
    fin.open( filName, ios::binary );
    fout.open( string ( string (filName).append( " .order " ) ).c_str(), ios::binary );
     for ( size_t index = 0 ;index < batchCount;index ++ )
    {
         // 　分批读入，排序
        fin.read( ( char * )pbuf, batchCapacity * sizeof ( float )  );
        std::sort( pbuf, pbuf + batchCapacity );
        fout.write( ( char * )pbuf,  resultCount * sizeof ( float ) );
        cout <<    " writed bytes: "    << resultCount * index + 1 <<   endl;
    }
    fin.close();
    fout.close();
    delete pbuf;
     // 将分组的数据综合排序
    pbuf =   (   float * )malloc( resultCount * batchCount * sizeof ( float ) );
     if ( pbuf == NULL )
    {
         throw ( " failed to malloc " );
    }
    fin.open( string ( string (filName).append( " .order " ) ).c_str(), ios::binary );
    fin.read( ( char * )pbuf, resultCount * batchCount * sizeof ( float )  );
    std::sort( pbuf, pbuf +   resultCount * batchCount  );

     // merge_sort<float>(  pbuf,0,( resultCount *  batchCount ) - 1 );
     // 输出
     for (  size_t index = 0 ; index < resultCount; index ++ )
    {
        printf( " %d\t%f\n " ,  index, pbuf[index ] );
    }
    fin.close();
    delete pbuf;
    pbuf = NULL;
}

性能测试结果： p4 的 cpu，每秒大约处理 30万个记录。
整个程序：

const char * filName = " c:\\float.df " ;
void dataPrepare( const char * filName  );
void helpInfo();
void dataOrder( const char * filName  );
int main( int argc, char * argv[] )
{
     try
    {
         if   ( argc == 1 )
        {
            helpInfo();
             return 0 ;
        }
         const char * filename = argv[ 1 ] + 2 ;
         if ( filename != NULL && strlen( filename ) > 0 )
        {
            filName = filename;
        }
         switch ( argv[ 1 ][ 1 ] )
        {
         case ' g ' :
            dataPrepare( filName  );
             break ;

         case ' o ' :
            dataOrder( filName  );
             break ;
         default :
            helpInfo();
             return 0 ;
             break ;
        }
    }
     catch (   const char * e)
    {
        cout <<   e << endl;
    }
     catch (   海量数据处理：一亿个浮点数的排序算法

  )
    {
        cout <<    " unknown error " << endl;
    }
    system( " pause " );
     return 0 ;
}

Referance:
快速排序
the c++ programming lanauage, by bjarne stroustrup chapter 18: Algorithms and Function Objects
线性同余法生成随机数
Introduction to Algorithms, Second Edition,by Thomas H. Cormen, Charles E. 11.3 Hash functions 介绍了线性同余法的原理和用法。

三平均分区法：
Introduction to Algorithms, Second Edition,by Thomas H. Cormen, Charles E.　Problems 7-5: Median-of-3 partition

秒客网

海量数据处理：一亿个浮点数的排序算法

相关文章