最近编写NTFS文件实时搜索工具, 类似 Everything 这样, 速度快还小巧, 于是花了约3周进行研究, 总结下学习过程中一些经验
实现分3部分完成
一. 解析NTFS 主文件表(MFT)
这一步是获取文件数据的唯一迅速且可靠的来源
NTFS_MFT_Parse.h
#pragma once
#include "NTFS_Base.h"
#include <functional>
#define NTFS_MFT_PARSE_FILE_BUF_COUNT (1024 * 16) // MFT块分析文件项缓冲数量
// 文件名处理回调
using NtfsFilenameCb = std::function<bool(const NTFS_VOLUME_INFO& volInfo, PNTFS_FILE_RECORD_HEADER pHeader, FILE_REFERENCE record, const MFT_FILE_INFO_LIST& mftInfoList)>;
// NTFS 主文件表 解析类
// FlameCyclone 2024.12.11
class NTFS_MFT_Parse:
public NTFS_Base
{
public:
NTFS_MFT_Parse();
~NTFS_MFT_Parse();
//
// @brief: 解析所有 NTFS 卷文件记录
// @param: volFileList 卷文件列表
// @param: dwDriveIndexMask 驱动器索引掩码(位组合: C: 0x01 D: 0x02 E: 0x04 F: 0x08...)
// @ret: bool 操作是否成功
static bool ParseRecord(
DWORD dwDriveIndexMask = 0xFFFFFFFF,
NtfsFilenameCb cb = nullptr
);
private:
// 解析卷
static bool _ParseVolume(
const NTFS_VOLUME_INFO& volInfo,
NtfsFilenameCb cb
);
// 解析主文件表
static bool _ParseMasterFileTable(
HANDLE hFile,
PNTFS_BOOT_RECORD pBootRecord,
PNTFS_FILE_RECORD_HEADER pFileHeaderStart,
PNTFS_ATTRIBUTE_HEADER pAttrHeaderStart,
const NTFS_VOLUME_INFO& volInfo,
NtfsFilenameCb cb
);
// 获取文件记录
static bool _GetFileRecord(
HANDLE hFile,
FILE_REFERENCE RecordNumber,
PNTFS_FILE_RECORD_HEADER pOutFileHeader
);
// 解析文件名
static bool _ParseFileRecord_20_AttributeList(
HANDLE hFile,
PNTFS_BOOT_RECORD pBootRecord,
PNTFS_FILE_RECORD_HEADER pFileHeaderStart,
PNTFS_ATTRIBUTE_HEADER pAttrHeaderStart,
MFT_FILE_INFO_LIST& fileInfoList,
std::vector<FILE_REFERENCE>& vRes
);
// 解析文件名
static bool _ParseFileRecord_30_GetFilename(
HANDLE hFile,
PNTFS_BOOT_RECORD pBootRecord,
PNTFS_FILE_RECORD_HEADER pFileHeaderStart,
PNTFS_ATTRIBUTE_HEADER pAttrHeaderStart,
MFT_FILE_INFO_LIST& fileInfoList
);
// 解析文件名
static bool _ParseFileRecord_30_Filename(
HANDLE hFile,
PNTFS_BOOT_RECORD pBootRecord,
PNTFS_FILE_RECORD_HEADER pFileHeaderStart,
PNTFS_ATTRIBUTE_HEADER pAttrHeaderStart,
MFT_FILE_INFO_LIST& fileInfoList
);
// 解析索引根
static bool _ParseFileRecord_90_IndexRoot(
HANDLE hFile,
PNTFS_BOOT_RECORD pBootRecord,
PNTFS_FILE_RECORD_HEADER pFileHeaderStart,
PNTFS_ATTRIBUTE_HEADER pAttrHeaderStart,
MFT_FILE_INFO_LIST& fileInfoList
);
// 解析索引根
static bool _ParseFileRecord_A0_IndexAllocation(
HANDLE hFile,
PNTFS_BOOT_RECORD pBootRecord,
PNTFS_FILE_RECORD_HEADER pFileHeaderStart,
PNTFS_ATTRIBUTE_HEADER pAttrHeaderStart,
MFT_FILE_INFO_LIST& fileInfoList
);
// 解析文件记录
static bool _ParseFileRecordAttributes(
HANDLE hFile,
PNTFS_BOOT_RECORD pBootRecord,
PNTFS_FILE_RECORD_HEADER pFileHeaderStart,
PNTFS_ATTRIBUTE_HEADER pAttrHeaderStart,
MFT_FILE_INFO_LIST& fileInfoList
);
// 解析主文件表数据
static bool _ParseMasterFileTableData(
HANDLE hFile,
PNTFS_BOOT_RECORD pBootRecord,
PNTFS_FILE_RECORD_HEADER pFileHeaderStart,
PNTFS_ATTRIBUTE_HEADER pAttrHeaderStart,
const NTFS_VOLUME_INFO& volInfo,
NtfsFilenameCb cb
);
// 设置文件偏移
static bool SetFileOffset(
_In_ HANDLE hFile,
_In_ uint64_t liDistanceToMove,
_Out_opt_ uint64_t* lpNewFilePointer,
_In_ DWORD dwMoveMethod = FILE_BEGIN
);
};
二. 监控 USN 日志
为了动态监控每个文件的新建, 删除, 更名
NTFS_USN_Parse.h
#include "NTFS_USN_Parse.h"
#include <thread>
NTFS_USN_Parse::NTFS_USN_Parse()
{
}
NTFS_USN_Parse::~NTFS_USN_Parse()
{
}
bool NTFS_USN_Parse::_CreateJournal(
HANDLE hFile,
DWORDLONG MaximumSize,
DWORDLONG AllocationDelta
)
{
// https://learn.microsoft.com/zh-cn/windows/win32/api/winioctl/ns-winioctl-create_usn_journal_data
CREATE_USN_JOURNAL_DATA usnCreate = { 0 };
DWORD dwBytes = 0;
usnCreate.MaximumSize = MaximumSize; // NTFS 文件系统为更改日志分配的目标最大大小(以字节为单位)
usnCreate.AllocationDelta = AllocationDelta; // 添加到末尾并从更改日志开头删除的内存分配的大小
if (!::DeviceIoControl(
hFile,
FSCTL_CREATE_USN_JOURNAL,
&usnCreate, sizeof(usnCreate),
NULL, 0,
&dwBytes,
NULL))
{
return false;
}
return true;
}
bool NTFS_USN_Parse::_QueryJournal(
HANDLE hFile,
PUSN_JOURNAL_DATA_V2 pUsnJournalData
)
{
DWORD dwBytes = 0;
if (!::DeviceIoControl(
hFile,
FSCTL_QUERY_USN_JOURNAL,
NULL,
0,
pUsnJournalData,
sizeof(USN_JOURNAL_DATA_V2),
&dwBytes,
NULL))
{
return false;
}
return true;
}
bool NTFS_USN_Parse::_ReadJournal(
HANDLE hFile,
PREAD_USN_JOURNAL_DATA pUsnData,
LPVOID lpBuf,
DWORD dwBufsize,
LPDWORD dwBytes
)
{
if ( !::DeviceIoControl(
hFile,
FSCTL_READ_USN_JOURNAL,
pUsnData,
sizeof(READ_USN_JOURNAL_DATA),
lpBuf,
dwBufsize,
dwBytes,
NULL) )
{
return false;
}
return true;
}
void NTFS_USN_Parse::_ParseJournalV2(
HANDLE hFile,
const NTFS_VOLUME_INFO& volInfo,
PUSN_RECORD_V2 pUsnRecord,
NTFS_USN_INFO& usnInfo
)
{
usnInfo.strFileName = WStrToTStr(std::wstring(pUsnRecord->FileName, pUsnRecord->FileNameLength / sizeof(wchar_t)));
usnInfo.Reason.Data = pUsnRecord->Reason;
usnInfo.UpdateSequenceNumber = pUsnRecord->Usn;
usnInfo.ReferenceNumber = *(PFILE_REFERENCE)&pUsnRecord->FileReferenceNumber;
usnInfo.ParentReferenceNumber = *(PFILE_REFERENCE)&pUsnRecord->ParentFileReferenceNumber;
usnInfo.uDriveIndex = volInfo.uDriveIndex;
usnInfo.FileAttributes = pUsnRecord->FileAttributes;
}
void NTFS_USN_Parse::_ParseJournalV3(
HANDLE hFile,
const NTFS_VOLUME_INFO& volInfo,
PUSN_RECORD_V3 pUsnRecord,
NTFS_USN_INFO& usnInfo
)
{
usnInfo.strFileName = WStrToTStr(std::wstring(pUsnRecord->FileName, pUsnRecord->FileNameLength / sizeof(wchar_t)));
usnInfo.Reason.Data = pUsnRecord->Reason;
usnInfo.UpdateSequenceNumber = pUsnRecord->Usn;
usnInfo.ReferenceNumber = *(PFILE_REFERENCE)&pUsnRecord->FileReferenceNumber;
usnInfo.ParentReferenceNumber = *(PFILE_REFERENCE)&pUsnRecord->ParentFileReferenceNumber;
usnInfo.uDriveIndex = volInfo.uDriveIndex;
usnInfo.FileAttributes = pUsnRecord->FileAttributes;
}
bool NTFS_USN_Parse::_MonitorJournal(
const NTFS_VOLUME_INFO& volInfo,
DWORD dwReasonMask,
UsnNotifyCb usnCb
)
{
uint8_t* pBufData = nullptr;
HANDLE hFile = INVALID_HANDLE_VALUE;
USN usnStart = 0;
DWORD dwRetBytes = 0;
DWORD dwBytes = 0;
DWORD dwError = 0;
bool fResult = false;
bool fAbort = false;
do
{
// 打开卷
hFile = OpenVolume(volInfo);
if (INVALID_HANDLE_VALUE == hFile)
{
break;
}
// 查询日志
USN_JOURNAL_DATA_V2 usnData = { 0 };
usnData.Flags = FLAG_USN_TRACK_MODIFIED_RANGES_ENABLE;
usnData.FirstUsn = usnStart;
if (!_QueryJournal(hFile, &usnData))
{
dwError = ::GetLastError();
if (!(ERROR_JOURNAL_DELETE_IN_PROGRESS == dwError || ERROR_JOURNAL_NOT_ACTIVE == dwError))
{
break;
}
}
// 日志未活动则创建日志
if (ERROR_JOURNAL_NOT_ACTIVE == ::GetLastError())
{
if (!_CreateJournal(hFile, NTFS_USN_MAX_SIZE, NTFS_USN_ALLOCATION_DELTA_SIZE))
{
break;
}
usnData.Flags = FLAG_USN_TRACK_MODIFIED_RANGES_ENABLE;
usnData.FirstUsn = usnStart;
if (!_QueryJournal(hFile, &usnData))
{
dwError = ::GetLastError();
if (!(ERROR_JOURNAL_DELETE_IN_PROGRESS == dwError || ERROR_JOURNAL_NOT_ACTIVE == dwError))
{
break;
}
}
}
usnStart = usnData.NextUsn;
// 分配解析缓冲
pBufData = new (std::nothrow) uint8_t[usnData.MaximumSize];
if (!pBufData)
{
break;
}
NTFS_USN_INFO usnInfoEmpty;
while (true)
{
// 查询日志
if (!_QueryJournal(hFile, &usnData))
{
dwError = ::GetLastError();
if (!(ERROR_JOURNAL_DELETE_IN_PROGRESS == dwError || ERROR_JOURNAL_NOT_ACTIVE == dwError))
{
fAbort = true;
break;
}
}
// 读取日志
READ_USN_JOURNAL_DATA usnRead = { 0, 0xFFFFFFFF, FALSE, 0, 0 };
usnRead.MinMajorVersion = usnData.MinSupportedMajorVersion;
usnRead.MaxMajorVersion = usnData.MaxSupportedMajorVersion;
usnRead.UsnJournalID = usnData.UsnJournalID;
usnRead.StartUsn = usnStart;
usnRead.ReasonMask = dwReasonMask;
if (!_ReadJournal(hFile, &usnRead, pBufData, (DWORD)usnData.MaximumSize, &dwBytes))
{
dwError = ::GetLastError();
if (!(ERROR_JOURNAL_DELETE_IN_PROGRESS == dwError || ERROR_JOURNAL_NOT_ACTIVE == dwError))
{
fAbort = true;
break;
}
}
dwRetBytes = dwBytes - sizeof(USN);
// 未查询到数据
if (!dwRetBytes)
{
// 空闲时也回调一下
if (!usnCb(false, usnInfoEmpty))
{
fAbort = true;
break;
}
usnStart = usnData.NextUsn;
::Sleep(NTFS_USN_UPDATE_TIME_INTERVAL);
continue;
}
// 解析查询结果
PUSN_RECORD_COMMON_HEADER pUsnHeader = (PUSN_RECORD_COMMON_HEADER)(pBufData + sizeof(USN));
if (2 == pUsnHeader->MajorVersion)
{
PUSN_RECORD_V2 pUsnRecord = (PUSN_RECORD_V2)(pBufData + sizeof(USN));
NTFS_USN_INFO usnInfo;
while (dwRetBytes > 0)
{
_ParseJournalV2(hFile, volInfo, pUsnRecord, usnInfo);
if (NTFSFileNameType::e05_Root == usnInfo.ParentReferenceNumber.RecordNumber ||
usnInfo.ParentReferenceNumber.RecordNumber >= NTFSFileNameType::e16_Unuse_Start)
{
if (usnCb)
{
if (!usnCb(true, usnInfo))
{
fAbort = true;
break;
}
}
}
dwRetBytes -= pUsnRecord->RecordLength;
pUsnRecord = (PUSN_RECORD_V2)(((PCHAR)pUsnRecord) + pUsnRecord->RecordLength);
}
}
if (3 == pUsnHeader->MajorVersion)
{
PUSN_RECORD_V3 pUsnRecord = (PUSN_RECORD_V3)(pBufData + sizeof(USN));
NTFS_USN_INFO usnInfo;
while (dwRetBytes > 0)
{
_ParseJournalV3(hFile, volInfo, pUsnRecord, usnInfo);
if (NTFSFileNameType::e05_Root == usnInfo.ParentReferenceNumber.RecordNumber ||
usnInfo.ParentReferenceNumber.RecordNumber >= NTFSFileNameType::e16_Unuse_Start)
{
if (usnCb)
{
if (!usnCb(true, usnInfo))
{
fAbort = true;
break;
}
}
}
dwRetBytes -= pUsnRecord->RecordLength;
pUsnRecord = (PUSN_RECORD_V3)(((PCHAR)pUsnRecord) + pUsnRecord->RecordLength);
}
}
// 空闲时也回调一下
if (!usnCb(false, usnInfoEmpty))
{
fAbort = true;
break;
}
// 下次解析起始日志号
usnStart = usnData.NextUsn;
::Sleep(NTFS_USN_UPDATE_TIME_INTERVAL);
}
if (fAbort)
{
break;
}
fResult = true;
} while (false);
if (INVALID_HANDLE_VALUE != hFile)
{
::CloseHandle(hFile);
}
if (pBufData)
{
delete[] pBufData;
}
return fResult;
}
bool NTFS_USN_Parse::MonitorJournal(
DWORD dwDriveIndexMask/* = 0xFFFFFFFF*/,
DWORD dwReasonMask/* = 0xFFFFFFFF*/,
UsnNotifyCb usnCb/* = nullptr*/
)
{
std::vector<NTFS_VOLUME_INFO> vecDriveList = GetVolumeInfoList();
std::vector<std::thread> vTask;
for (auto& item : vecDriveList)
{
if ((0x01 << item.uDriveIndex) & dwDriveIndexMask)
{
vTask.emplace_back(std::thread([&item, dwReasonMask, &usnCb]() {
_MonitorJournal(item, dwReasonMask, usnCb);
}
));
}
}
for (auto& item : vTask)
{
if (item.joinable())
{
item.join();
}
}
return true;
}
三. 数据库查询
采用Sqlite3进行数据库操作
NTFS_Search.h
#pragma once
#include "NTFS_Utils/NTFS_MFT_Parse.h"
#include "NTFS_Utils/NTFS_USN_Parse.h"
#include "Sqlite3/CSqlite3.h"
#include <thread>
#include <mutex>
#include <vector>
#define SQL_UPDATE_TIME_INTERVAL (1000) // 数据库更新间隔时间
// 计数
#define SQL_QUERY_COUNT R"(
SELECT count(*) AS count FROM file_list WHERE path NOT NULL
)"
// 更新根路径
#define SQL_QUERY_UPDATE_ROOT_PATH R"(
WITH RECURSIVE sub_tree(id, parent_id, name, path) AS (
SELECT id, parent_id, name, name AS path
FROM file_list
WHERE parent_id = 0
UNION ALL
SELECT c.id, c.parent_id, c.name, p.path || '\' || c.name
FROM file_list c
INNER JOIN sub_tree p ON c.parent_id = p.id
)
UPDATE file_list
SET path = (
SELECT path FROM sub_tree
WHERE sub_tree.id = file_list.id AND sub_tree.parent_id = file_list.parent_id AND sub_tree.name = file_list.name
);
)"
// 删除索引
#define SQL_QUERY_DROP_INDEX R"(
DROP INDEX IF EXISTS idx_file_list_id ON file_list;
DROP INDEX IF EXISTS idx_file_list_parent_id ON file_list;
--DROP INDEX IF EXISTS idx_file_list_name ON file_list;
--DROP INDEX IF EXISTS idx_file_list_path ON file_list;
--DROP INDEX IF EXISTS idx_file_list_id_parent_id_name_path ON file_list;
)"
// 创建索引
#define SQL_QUERY_CREATE_INDEX R"(
CREATE INDEX IF NOT EXISTS idx_file_list_id ON file_list(id);
CREATE INDEX IF NOT EXISTS idx_file_list_parent_id ON file_list(parent_id);
--CREATE INDEX IF NOT EXISTS idx_file_list_name ON file_list(name);
--CREATE INDEX IF NOT EXISTS idx_file_list_path ON file_list(path);
--CREATE INDEX IF NOT EXISTS idx_file_list_id_parent_id_name_path ON file_list(id, parent_id, name, path);
)"
// 更新子路径
#define SQL_QUERY_UPDATE_CHILD_PATH R"(
WITH RECURSIVE sub_tree(id, parent_id, name, path) AS (
SELECT id, parent_id, name, path AS path
FROM file_list
WHERE id = %llu AND parent_id = %llu
UNION ALL
SELECT c.id, c.parent_id, c.name, p.path || '\' || c.name
FROM file_list c
INNER JOIN sub_tree p ON c.parent_id = p.id
)
UPDATE file_list
SET path = st.path
FROM sub_tree st
WHERE file_list.id = st.id AND file_list.parent_id = st.parent_id AND file_list.name = st.name;
)"
// 更新文件路径
#define SQL_QUERY_UPDATE_FILE_PATH R"(
WITH RECURSIVE path_cte(id, parent_id, name, path) AS (
SELECT id, parent_id, name, name
FROM file_list
WHERE id = %llu AND parent_id = %llu AND name = "%s"
UNION ALL
SELECT f.id, f.parent_id, f.name, f.name || '\' ||p.path
FROM file_list f
INNER JOIN path_cte p ON (f.id = p.parent_id)
)
UPDATE file_list
SET path = (SELECT path FROM path_cte WHERE parent_id = 0)
WHERE id = %llu AND parent_id = %llu AND name = "%s";
)"
// 按文件名查找
#define SQL_QUERY_SEARCH_NAME R"(
SELECT path FROM file_list WHERE name like "%s" ESCAPE '\' AND path NOT NULL ORDER BY path LIMIT %lld
)"
// 按路径查找
#define SQL_QUERY_SEARCH_PATH R"(
SELECT path FROM file_list WHERE path like "%s" ESCAPE '\' AND path NOT NULL ORDER BY path LIMIT %lld
)"
// 搜索全部
#define SQL_QUERY_SEARCH_ALL R"(
SELECT path FROM file_list WHERE path NOT NULL ORDER BY path LIMIT %lld
)"
//删除表
#define SQL_QUERY_DELETE_TABLE R"(
DROP TABLE IF EXISTS file_list;
)"
//创建表
#define SQL_QUERY_CREATE_TABLE R"(
CREATE TABLE IF NOT EXISTS file_list (
id INTEGER NOT NULL,
parent_id INT,
name TEXT,
attr INT,
path TEXT,
PRIMARY KEY(id, parent_id, name)
);
)"
//建表更新数据
#define SQL_QUERY_REPLACE_PREPQRE R"(
REPLACE INTO file_list (id, parent_id, name, attr, path) VALUES (?, ?, ?, ?, ?);
)"
// 替换记录
#define SQL_QUERY_REPLACE R"(
REPLACE INTO file_list (id, parent_id, name, attr) VALUES (%llu, %llu, "%s", %d);
)"
// 删除记录
#define SQL_QUERY_DELETE R"(
DELETE FROM file_list where id = %llu AND parent_id = %llu AND name = "%s";
)"
// 数据库使用的文件ID
typedef union _SQL_FILE_ID
{
uint64_t data;
struct {
uint64_t ReferenceNumber : 32; // 文件引用号
uint64_t dwDriveIndex : 32; // 驱动器号
};
}SQL_FILE_ID;
bool operator < (const _SQL_FILE_ID& a, const _SQL_FILE_ID& b);
class NTFS_Search
{
public:
NTFS_Search();
~NTFS_Search();
//
// @brief: 初始化
// @param: dwDriveIndexMask 驱动器索引掩码(位组合: C: 0x01 D: 0x02 E: 0x04 F: 0x08...)
// @param: strDbPath 数据库文件路径
// @param: fRebuildDb 是否重建数据库
// @ret: bool 操作是否成功
bool Initialize(
DWORD dwDriveIndexMask = 0xFFFFFFFF,
const _tstring& strDbPath = _T(""),
bool fRebuildDb = true
);
//
// @brief: 初始化
// @param: strDriveList 驱动器列表
// @param: strDbPath 数据库文件路径
// @param: fRebuildDb 是否重建数据库
// @ret: bool 操作是否成功
bool Initialize(
const _tstring& strDriveList = _T("ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
const _tstring& strDbPath = _T(""),
bool fRebuildDb = true
);
//
// @brief: 反初始化
// @ret: void 操作是否成功
void Uninitialize();
//
// @brief: 搜索
// @param: strKeyWord 关键字
// @param: fileList 输出文件路径列表
// @param: nLimit 限制结果数量
// @ret: bool 操作是否成功
bool Search(
const _tstring& strKeyWord,
std::vector<_tstring>& fileList,
int64_t nLimit = -1
);
//
// @brief: 获取数据库文件总数
// @ret: size_t 数据库文件总数
size_t GetCount();
//
// @brief: 获取驱动器索引掩码
// @ret: DWORD 驱动器索引掩码
DWORD GetDriveListMask(const _tstring& strDriveList);
private:
bool _BuildDb(
DWORD dwDriveIndexMask,
const _tstring& strDbPath,
bool fRebuildDb
);
bool _BuildRecord(
DWORD dwDriveIndexMask
);
bool _BuildFile(
const NTFS_VOL_FILE_MAP& volFileList
);
bool _BuildDrive(
DWORD dwDriveIndexMask
);
bool _StartMonitor(DWORD dwDriveIndexMask);
void _Sleep(int dwMilliseconds);
int _UpdateFilePath(SQL_FILE_ID fileID, SQL_FILE_ID parentID, _tstring strFilename);
int _UpdateChildPath(SQL_FILE_ID fileID, SQL_FILE_ID parentID);
private:
std::vector<NTFS_USN_INFO> m_usnInfoList; // 日志变化列表
std::mutex m_mutex; // 互斥锁
_tstring m_strDbPath; // 数据库文件路径
CSqlite3 m_sql3; // 数据库
std::thread m_monTask; // 监控任务
std::thread m_sqlTask; // 数据库任务
bool m_fInit; // 初始化状态
bool m_fQuit; // 退出状态
bool m_fDbReady; // 数据库准备状态
};
对比:
性能上重建数据库耗时是everything的 2倍, 不过也差不多了, 以后慢慢优化
搜索速度比不上everything, 但是也算是秒速了