2024每日刷题(187)
Leetcode—1242. 多线程网页爬虫
实现代码
/**
* // This is the HtmlParser's API interface.
* // You should not implement it, or speculate about its implementation
* class HtmlParser {
* public:
* vector<string> getUrls(string url);
* };
*/
/**
 * Multithreaded web crawler.
 *
 * Starting from `startUrl`, repeatedly asks the parser for the links on a
 * page and follows only those whose hostname is exactly the hostname of
 * `startUrl`. Each URL is fetched at most once.
 */
class Solution {
public:
    /**
     * Crawls every page reachable from `startUrl` that lives on the same host.
     *
     * @param startUrl   URL to start from; it is always part of the result.
     * @param htmlParser Parser whose getUrls(url) returns the links on a page.
     *                   It is called concurrently from several threads and
     *                   without holding the internal lock.
     * @return All discovered same-host URLs (including `startUrl`), in
     *         unspecified order.
     */
    vector<string> crawl(string startUrl, HtmlParser htmlParser) {
        const string hostname = getHostName(startUrl);

        // Shared crawl state; everything below is guarded by `mtx`.
        queue<string> pending;
        unordered_set<string> seen;
        int inFlight = 0;  // number of workers currently inside getUrls()
        mutex mtx;
        condition_variable cv;

        pending.push(startUrl);
        seen.insert(startUrl);

        auto worker = [&] {
            unique_lock<mutex> lock(mtx);
            while (true) {
                // Sleep until there is work, or until it is certain that no
                // more work can ever appear (queue empty and nobody fetching).
                cv.wait(lock, [&] { return !pending.empty() || inFlight == 0; });
                if (pending.empty()) {
                    return;  // crawl finished
                }

                string cur = std::move(pending.front());
                pending.pop();
                ++inFlight;

                // Fetch without the lock so other workers can make progress.
                lock.unlock();
                vector<string> urls = htmlParser.getUrls(cur);
                lock.lock();

                for (string& url : urls) {
                    // Exact hostname comparison: a substring search would also
                    // accept foreign hosts that merely contain our hostname.
                    if (getHostName(url) != hostname) {
                        continue;
                    }
                    // insert().second is true only for URLs not seen before.
                    if (seen.insert(url).second) {
                        pending.push(std::move(url));
                    }
                }
                --inFlight;
                // Wake waiters: either new URLs were queued or the crawl may
                // now be complete.
                cv.notify_all();
            }
        };

        // hardware_concurrency() is allowed to return 0 when the value is
        // not computable; fall back to a small fixed pool in that case.
        const unsigned kFallbackThreads = 4;
        const unsigned detected = thread::hardware_concurrency();
        const unsigned nthreads = (detected == 0) ? kFallbackThreads : detected;

        vector<thread> threads;
        threads.reserve(nthreads);
        for (unsigned i = 0; i < nthreads; ++i) {
            threads.emplace_back(worker);
        }
        for (auto& th : threads) {
            th.join();
        }
        return {seen.begin(), seen.end()};
    }

private:
    /**
     * Extracts the hostname of a URL: the text between the first "//" and
     * the next '/' (or the end of the string when there is no path).
     * If the URL contains no "//", the host is taken to start at index 0.
     */
    static string getHostName(const string& url) {
        const size_t schemePos = url.find("//");
        const size_t hostBegin = (schemePos == string::npos) ? 0 : schemePos + 2;
        const size_t hostEnd = url.find('/', hostBegin);
        if (hostEnd == string::npos) {
            return url.substr(hostBegin);
        }
        return url.substr(hostBegin, hostEnd - hostBegin);
    }
};
运行结果
之后我会持续更新,如果喜欢我的文章,请记得一键三连哦,点赞关注收藏,你的每一个赞每一份关注每一次收藏都将是我前进路上的无限动力 !!!↖(▔▽▔)↗感谢支持!