面试题——大数据处理解题思路

(1)给一个超过100G大小的log file，log中存着IP地址，设计算法找到出现次数最多的IP地址？

解题思路：IP地址是一个32位的字符串，100G在内存中肯定放不下，所以我们必须通过不同的手段去解决。我们可以将100G的文件切分成1000个小文件，对每个文件编号，每个文件的大小是512M，这样内存是很容易读取的。我们将IP地址通过字符串哈希函数处理成对应的整树，将整数模上文件个数（1000），对其取余，余数是几就放在编号为几的文件里。（这样，相同的IP肯定会出现在同一个文件里）分好之后，取每个文件中IP次数出现最多的IP进行比较，这时候问题转化成了1000个IP次数找次数出现最多的一个IP。

(2)与上题条件相同，如何找到top K的IP？如何直接用Linux系统命令实现？

解题思路：前半部分的处理和上题一样，后半部分处理不同的是，我们先建立一个堆（注意要建立小堆）。每个文件寻找出前K个出现次数较多的IP，1000分文件都找出前K个次数较多的IP，最后1000K个IP中在继续小堆查找判断，最后还是能找出前K个次数出现较多的IP

(3)给定100亿个整数，设计一个算法找到只出现一次的整数。

解题思路：整数一共有42亿多个，100亿个数中一定有重复的数字。
1M = 1024K = 2^10K
1K = 1024B = 2^10B
1B = 1024b = 2^10b
故：1G = 2^10M
= 2^10 * 2^10B
= 2^10 * 2^10 * 2^10b
如果一个整数占用一个字节，那么1G的内存可以处理约10亿的整数，一共有42亿多个无重复的整数，那么就需要4G的内存。1byte = 8位，每个位判断该数字存在与不存在，那么就需要4G/8 = 512M的内存，内存占用率大大降低，提高了效率。
然而，一个位只能表示这个数存在与否，不能断定这个数出现了几次。这个时候，我们应该考虑用两个位来存放一个数的状态。00表示这个数出现了0次，01表示这个数出现了1次，10表示这个数出现了多次，11状态我们暂且不用。最后，我们找出状态是01的数就可以了。
那么，这样占多少内存呢？一个位处理42亿多个整数占用了512M的内存，那么两个位处理42亿多个整数占用了1G的内存。如果这个时候，考官还想你500M左右处理这道题呢？那么，我们可以考虑正整数和负整数分开处理，也可以达到考官的要求。

(4)给两个文件，分别有100亿个整数，我们只有1G内存，如何找到两个文件的交集？

解题思路：给定1000个小文件，每个小文件都编号。对A，B两个大文件的100亿个数切分，每个数对1000取余，余数为几就放在编号为几的文件中。最后比较的时候，我们只取编号相同的小文件去比较。（将100亿个数分到1000个小文件中，每个文件大小512M，每次取A，B文件进行比较，两个文件刚好1G，符合题意）

(5)一个文件有100亿个int，1G内存，设计算法找出出现次数不超过2次的所有整数。

解题思路：这道题和第3题的解题思路是一样的，也是设置两个位来存放一个数的状态，00表示不存在，01表示出现一次，10表示出现两次，11表示出现多次。这时候我们只有知道状态为01或者10的数就可以了。两个位表示所需要占用的内存为1G（计算过程参考第三题）

(6)给两个文件，分别有100亿个query，我们只有1G内存，如何找到两个文件交集？给出精确算法和近似算法。

解题思路：
精确算法：同第四题的思路相同
近似算法：布隆过滤器

<strong><span style="font-size:18px;">#include<iostream>  
#include<vector>  
#include<string>  
using namespace std;  
  
struct HashFunc1  
{  
    size_t operator()(const char* str)  
    {  
        register size_t hash = 0;  
        while (size_t ch = (size_t)*str++)  
        {  
            hash = hash * 131 + ch;   // 也可以乘以31、131、1313、13131、131313..        
        }  
            return hash;  
    }  
};  
  
struct HashFunc2  
{  
    size_t operator()(const char* str)  
    {  
        register size_t hash = 0;  
        while (size_t ch = (size_t)*str++)  
        {  
            hash = 65599 * hash + ch;  
                        //hash = (size_t)ch + (hash << 6) + (hash << 16) - hash;    
        }  
        return hash;  
    }  
};  
  
struct HashFunc3  
{  
    size_t operator()(const char* str)  
    {  
        register size_t hash = 0;  
        size_t magic = 63689;  
        while (size_t ch = (size_t)*str++)  
        {  
            hash = hash * magic + ch;  
            magic *= 378551;  
        }  
        return hash;  
    }  
};  
  
struct HashFunc4  
{  
    size_t operator()(const char* str)  
    {  
        register size_t hash = 0;  
        size_t ch;  
        for (long i = 0; ch = (size_t)*str++; i++)  
        {  
            if ((i & 1) == 0)  
            {  
                hash ^= ((hash << 7) ^ ch ^ (hash >> 3));  
            }  
            else  
            {  
                hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));  
            }  
        }  
        return hash;  
    }  
};  
  
struct HashFunc5  
{  
    size_t operator()(const char* str)  
    {  
        if (!*str)          
           return 0;  
        register size_t hash = 1315423911;  
        while (size_t ch = (size_t)*str++)  
        {  
            hash ^= ((hash << 5) + ch + (hash >> 2));  
        }  
        return hash;  
    }  
};  
//一个字符串类型的布隆过滤器  
template<class T=string,class _HashFunc1=HashFunc1,  
        class _HashFunc2=HashFunc2,class _HashFunc3=HashFunc3,  
        class _HashFunc4=HashFunc4, class _HashFunc5=HashFunc5>  
class BloomFilter  
{  
public:  
    BloomFilter(int num)  
        :_range(num * 5)  
    {  
        _RefBloomfilter.resize(_range);  
    }  
  
    void Set(const char* str)  
    {  
        //分别通过5个字符串hash函数得到5个对应的位置  
        size_t hash1 = HashFunc1()(str) %_range;   
        size_t hash2 = HashFunc2()(str) % _range;  
        size_t hash3 = HashFunc3()(str) % _range;  
        size_t hash4 = HashFunc4()(str) % _range;  
        size_t hash5 = HashFunc5()(str) % _range;  
        _RefBloomfilter[hash1]++;  
        _RefBloomfilter[hash2]++;  
        _RefBloomfilter[hash3]++;  
        _RefBloomfilter[hash4]++;  
        _RefBloomfilter[hash5]++;  
        printf("%d,%d,%d,%d,%d", hash1, hash2, hash3, hash4, hash5);  //可以输出5个位置看一看  
        cout << endl;  
    }  
  
    bool ReSet(const char* str)  //删除  
    {  
        size_t hash1 = HashFunc1()(str) % _range;  
        size_t hash2 = HashFunc2()(str) % _range;  
        size_t hash3 = HashFunc3()(str) % _range;  
        size_t hash4 = HashFunc4()(str) % _range;  
        size_t hash5 = HashFunc5()(str) % _range;  
        if (_RefBloomfilter[hash1] == 0 || _RefBloomfilter[hash2] == 0   
            || _RefBloomfilter[hash3] == 0 || _RefBloomfilter[hash4] == 0   
            || _RefBloomfilter[hash5] == 0)  
            return false;  
        _RefBloomfilter[hash1]--;  
        _RefBloomfilter[hash2]--;  
        _RefBloomfilter[hash3]--;  
        _RefBloomfilter[hash4]--;  
        _RefBloomfilter[hash5]--;  
        return true;  
    }  
  
    bool Test(const char* str)  //查找  
    {  
        size_t hash1 = HashFunc1()(str) % _range;  
        size_t hash2 = HashFunc2()(str) % _range;  
        size_t hash3 = HashFunc3()(str) % _range;  
        size_t hash4 = HashFunc4()(str) % _range;  
        size_t hash5 = HashFunc5()(str) % _range;  
        if (_RefBloomfilter[hash1]&&  
            _RefBloomfilter[hash2] &&   
            _RefBloomfilter[hash3] &&   
            _RefBloomfilter[hash4] &&   
            _RefBloomfilter[hash5])//都不为0则存在  
            return true;  
        else  
            return false;  
    }  
protected:  
    vector<size_t> _RefBloomfilter;  //为了支持删除，直接使用一个size_t 的vector来做引用计数  
    size_t _range;  
};  
  
void TestBloomfilter()  
{  
    BloomFilter<> bf(100);  
    string url1("http://write.blog.csdn.net/postedit/53088473");  
    string url2("http://blog.csdn.net/pointer_y/article/details/52926334");  
    string url3("http://blog.csdn.net/pointer_y/article/details/52776595");  
    bf.Set(url1.c_str());  
    bf.Set(url2.c_str());  
    bf.Set(url3.c_str());  
    cout << bf.Test(url1.c_str()) << endl;  
    bf.ReSet(url1.c_str());  
    cout << bf.Test(url1.c_str())<<endl;  
}  </span></strong>

(7)如何扩展BloomFilter（布隆过滤器）使得它支持删除元素的操作？

解题思路：利用计数器，不过这样的布隆过滤器其实效率就不高了

<strong><span style="font-size:18px;">#include<iostream>  
#include<vector>  
#include<string>  
using namespace std;  
  
struct HashFunc1  
{  
    size_t operator()(const char* str)  
    {  
        register size_t hash = 0;  
        while (size_t ch = (size_t)*str++)  
        {  
            hash = hash * 131 + ch;   // 也可以乘以31、131、1313、13131、131313..        
        }  
            return hash;  
    }  
};  
  
struct HashFunc2  
{  
    size_t operator()(const char* str)  
    {  
        register size_t hash = 0;  
        while (size_t ch = (size_t)*str++)  
        {  
            hash = 65599 * hash + ch;  
                        //hash = (size_t)ch + (hash << 6) + (hash << 16) - hash;    
        }  
        return hash;  
    }  
};  
  
struct HashFunc3  
{  
    size_t operator()(const char* str)  
    {  
        register size_t hash = 0;  
        size_t magic = 63689;  
        while (size_t ch = (size_t)*str++)  
        {  
            hash = hash * magic + ch;  
            magic *= 378551;  
        }  
        return hash;  
    }  
};  
  
struct HashFunc4  
{  
    size_t operator()(const char* str)  
    {  
        register size_t hash = 0;  
        size_t ch;  
        for (long i = 0; ch = (size_t)*str++; i++)  
        {  
            if ((i & 1) == 0)  
            {  
                hash ^= ((hash << 7) ^ ch ^ (hash >> 3));  
            }  
            else  
            {  
                hash ^= (~((hash << 11) ^ ch ^ (hash >> 5)));  
            }  
        }  
        return hash;  
    }  
};  
  
struct HashFunc5  
{  
    size_t operator()(const char* str)  
    {  
        if (!*str)          
           return 0;  
        register size_t hash = 1315423911;  
        while (size_t ch = (size_t)*str++)  
        {  
            hash ^= ((hash << 5) + ch + (hash >> 2));  
        }  
        return hash;  
    }  
};  
//一个字符串类型的布隆过滤器  
template<class T=string,class _HashFunc1=HashFunc1,  
        class _HashFunc2=HashFunc2,class _HashFunc3=HashFunc3,  
        class _HashFunc4=HashFunc4, class _HashFunc5=HashFunc5>  
class BloomFilter  
{  
public:  
    BloomFilter(int num)  
        :_range(num * 5)  
    {  
        _RefBloomfilter.resize(_range);  
    }  
  
    void Set(const char* str)  
    {  
        //分别通过5个字符串hash函数得到5个对应的位置  
        size_t hash1 = HashFunc1()(str) %_range;   
        size_t hash2 = HashFunc2()(str) % _range;  
        size_t hash3 = HashFunc3()(str) % _range;  
        size_t hash4 = HashFunc4()(str) % _range;  
        size_t hash5 = HashFunc5()(str) % _range;  
        _RefBloomfilter[hash1]++;  
        _RefBloomfilter[hash2]++;  
        _RefBloomfilter[hash3]++;  
        _RefBloomfilter[hash4]++;  
        _RefBloomfilter[hash5]++;  
        printf("%d,%d,%d,%d,%d", hash1, hash2, hash3, hash4, hash5);  //可以输出5个位置看一看  
        cout << endl;  
    }  
  
    bool ReSet(const char* str)  //删除  
    {  
        size_t hash1 = HashFunc1()(str) % _range;  
        size_t hash2 = HashFunc2()(str) % _range;  
        size_t hash3 = HashFunc3()(str) % _range;  
        size_t hash4 = HashFunc4()(str) % _range;  
        size_t hash5 = HashFunc5()(str) % _range;  
        if (_RefBloomfilter[hash1] == 0 || _RefBloomfilter[hash2] == 0   
            || _RefBloomfilter[hash3] == 0 || _RefBloomfilter[hash4] == 0   
            || _RefBloomfilter[hash5] == 0)  
            return false;  
        _RefBloomfilter[hash1]--;  
        _RefBloomfilter[hash2]--;  
        _RefBloomfilter[hash3]--;  
        _RefBloomfilter[hash4]--;  
        _RefBloomfilter[hash5]--;  
        return true;  
    }  
  
    bool Test(const char* str)  //查找  
    {  
        size_t hash1 = HashFunc1()(str) % _range;  
        size_t hash2 = HashFunc2()(str) % _range;  
        size_t hash3 = HashFunc3()(str) % _range;  
        size_t hash4 = HashFunc4()(str) % _range;  
        size_t hash5 = HashFunc5()(str) % _range;  
        if (_RefBloomfilter[hash1]&&  
            _RefBloomfilter[hash2] &&   
            _RefBloomfilter[hash3] &&   
            _RefBloomfilter[hash4] &&   
            _RefBloomfilter[hash5])//都不为0则存在  
            return true;  
        else  
            return false;  
    }  
protected:  
    vector<size_t> _RefBloomfilter;  //为了支持删除，直接使用一个size_t 的vector来做引用计数  
    size_t _range;  
};  
  
void TestBloomfilter()  
{  
    BloomFilter<> bf(100);  
    string url1("http://write.blog.csdn.net/postedit/53088473");  
    string url2("http://blog.csdn.net/pointer_y/article/details/52926334");  
    string url3("http://blog.csdn.net/pointer_y/article/details/52776595");  
    bf.Set(url1.c_str());  
    bf.Set(url2.c_str());  
    bf.Set(url3.c_str());  
    cout << bf.Test(url1.c_str()) << endl;  
    bf.ReSet(url1.c_str());  
    cout << bf.Test(url1.c_str())<<endl;  
}  </span></strong>

(8)如何扩展BloomFliter（布隆过滤器）使得它支持计数操作？

解题思路：同上。

秒客网

面试题——大数据处理解题思路

相关文章