【.Net】 大文件可使用的文本分组统计工具(附带源码,原创)

时间:2022-06-19 15:59:47

本工具可实现的效果:

1.读取大文件(大于1GB)

2.根据分隔符分割后的列分组

3.速度快。

4.处理过程中,可以随时停止处理,操作不卡死。

5.有对当前内存的实时监测,避免过多占用内存,影响系统运行。

6.实时显示处理的行数。

处理类代码:

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text; namespace DaZhongLogTool
{ // 定义事件的参数类
/// <summary>
/// Event data carrying the number of rows processed so far.
/// </summary>
public class ValueEventArgs : EventArgs
{
    /// <summary>Row count at the moment the event was raised.</summary>
    public int Value { get; set; }
}

/// <summary>
/// Signature of the handlers attached to <see cref="BigFileTongJiJobs.ValueChanged"/>.
/// </summary>
/// <remarks>
/// The "Enent" spelling is kept on purpose: the name is public and is referenced by callers.
/// </remarks>
public delegate void ValueChangedEnentHandler(object sender, ValueEventArgs e);

/// <summary>
/// Streams a (potentially very large) text file line by line, groups the lines by the
/// text of one separator-delimited column and writes the per-group counts, most frequent
/// first, to a result file.
/// </summary>
public class BigFileTongJiJobs
{
    // Working-set ceiling (1 GiB). The analysis ends early once the process exceeds it.
    private const long MaxAllowedWorkingSetBytes = 1024L * 1024 * 1024;

    // Group keys longer than this are truncated before they are counted.
    private const int MaxKeyLength = 300;

    // Progress is reported for every one of the first rows, then once per interval.
    // Raising the event for every row would make processing very slow.
    private const int AlwaysReportedRows = 100;
    private const int ProgressInterval = 10000;

    // Written by the caller's thread and polled by the worker loop, hence volatile.
    private volatile bool _startFlag;

    /// <summary>
    /// Must be set to <c>true</c> before <see cref="StartAnalyseBigFile"/> is called.
    /// Setting it to <c>false</c> while the analysis runs makes the loop stop at the next row.
    /// </summary>
    public bool StartFlag
    {
        get { return _startFlag; }
        set { _startFlag = value; }
    }

    /// <summary>
    /// Raised with the current row count while the analysis runs, and once more at the end.
    /// Handlers are called on the thread that runs <see cref="StartAnalyseBigFile"/>.
    /// </summary>
    public event ValueChangedEnentHandler ValueChanged;

    /// <summary>Raises <see cref="ValueChanged"/> if anything is subscribed.</summary>
    /// <param name="e">Event data passed to the handlers.</param>
    public void OnValueChange(ValueEventArgs e)
    {
        // Copy to a local so an unsubscribe between the null check and the call cannot fail.
        ValueChangedEnentHandler handler = ValueChanged;
        if (handler != null)
        {
            handler(this, e);
        }
    }

    /// <summary>
    /// Runs the grouping analysis described by <paramref name="paramsInfo"/>.
    /// </summary>
    /// <param name="paramsInfo">Input path, output path, separator and 1-based column number.</param>
    /// <returns>
    /// -1: not started because the parameters are unusable (no output path, column number
    /// below 1, or the output path is the input file itself);
    /// -2: the input file does not exist;
    /// -3: an I/O, access or path-format error occurred;
    /// otherwise the number of rows processed (0 for an empty file or an immediate stop).
    /// </returns>
    public int StartAnalyseBigFile(TongjiParamsInfoStruct paramsInfo)
    {
        string inputPath = paramsInfo.inputPath;
        string outputPath = paramsInfo.outputPath;

        if (string.IsNullOrEmpty(outputPath) || paramsInfo.columnNum < 1)
        {
            return -1;
        }

        // The input is checked before the output file is touched, so a wrong input path
        // no longer leaves an empty result file behind.
        if (string.IsNullOrEmpty(inputPath) || !File.Exists(inputPath))
        {
            return -2;
        }

        try
        {
            // The output file is overwritten; refuse to do that to the input file.
            if (string.Equals(Path.GetFullPath(inputPath), Path.GetFullPath(outputPath), StringComparison.OrdinalIgnoreCase))
            {
                return -1;
            }

            return AnalyseFile(inputPath, outputPath, paramsInfo.separator, paramsInfo.columnNum);
        }
        catch (IOException)
        {
            return -3;
        }
        catch (UnauthorizedAccessException)
        {
            return -3;
        }
        catch (ArgumentException)
        {
            // Malformed path.
            return -3;
        }
        catch (NotSupportedException)
        {
            // Malformed path.
            return -3;
        }
    }

    // Reads the input line by line, counts the groups and writes the report.
    // Returns the number of rows processed.
    private int AnalyseFile(string inputPath, string outputPath, string separator, int columnNum)
    {
        int rowCount = 0;
        Dictionary<string, int> groupCounts = new Dictionary<string, int>();
        string[] separators = new string[] { separator };

        using (Process currentProcess = Process.GetCurrentProcess())
        // ReadLine accepts CR, LF and CRLF terminators and also returns a final
        // unterminated line, so no row is lost and no terminator ends up in a key.
        using (StreamReader reader = new StreamReader(inputPath, Encoding.UTF8))
        // UTF-8 without a byte order mark; WriteLine emits the platform newline.
        using (StreamWriter writer = new StreamWriter(outputPath, false, new UTF8Encoding(false)))
        {
            writer.WriteLine("分析开始,本进程使用内存大小:{0} KB,Date:{1}", GetWorkingSetBytes(currentProcess) / 1024, DateTime.Now);

            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (!StartFlag)
                {
                    writer.WriteLine("强制停止分析");
                    break;
                }

                string key = GetSpecificInfoFromLineText(line, separators, columnNum);
                if (!string.IsNullOrEmpty(key))
                {
                    if (key.Length > MaxKeyLength)
                    {
                        key = key.Substring(0, MaxKeyLength);
                    }

                    int seen;
                    groupCounts.TryGetValue(key, out seen); // seen stays 0 for a new key
                    groupCounts[key] = seen + 1;
                }

                if (rowCount % ProgressInterval == 0 || rowCount < AlwaysReportedRows)
                {
                    // The memory check shares the progress cadence to keep it cheap.
                    if (GetWorkingSetBytes(currentProcess) > MaxAllowedWorkingSetBytes)
                    {
                        // Say so in the report; otherwise truncated counts look complete.
                        writer.WriteLine("内存占用超过上限,提前结束分析");
                        break;
                    }

                    OnValueChange(new ValueEventArgs() { Value = rowCount });
                }

                rowCount++;
            }

            if (groupCounts.Count > 0)
            {
                writer.WriteLine("分析结束:本进程使用内存大小:{0} KB,Date:{1},分组个数:{2}", GetWorkingSetBytes(currentProcess) / 1024, DateTime.Now, groupCounts.Count);
            }

            writer.Write("本次处理的文本对象是");
            writer.WriteLine(inputPath);

            foreach (var item in groupCounts.OrderByDescending(t => t.Value))
            {
                writer.WriteLine("统计次数Value:{0}\t Key: {1}", item.Value, item.Key);
            }
        }

        // Final progress update, sent after the report has been flushed and closed.
        OnValueChange(new ValueEventArgs() { Value = rowCount });
        return rowCount;
    }

    // Current working set of the given process in bytes. Process caches its
    // counters, so they have to be refreshed before every read.
    private static long GetWorkingSetBytes(Process process)
    {
        process.Refresh();
        return process.WorkingSet64;
    }

    // Returns the text of the 1-based column columnNum of lineText, or "" when the
    // line has fewer columns. Real tab characters are first rewritten to the two
    // characters backslash + 't', because that is the form in which a tab separator
    // typed into the UI reaches this class.
    private static string GetSpecificInfoFromLineText(string lineText, string[] separators, int columnNum)
    {
        string[] columns = lineText.Replace("\t", "\\t").Split(separators, StringSplitOptions.None);
        if (columns.Length < columnNum)
        {
            return "";
        }

        return columns[columnNum - 1];
    }
}

/// <summary>
/// Parameters of one analysis run.
/// </summary>
public struct TongjiParamsInfoStruct
{
    /// <summary>Path of the text file to analyse.</summary>
    public string inputPath { get; set; }

    /// <summary>Path of the report file; an existing file is overwritten.</summary>
    public string outputPath { get; set; }

    /// <summary>Column separator. A tab is given as the two characters backslash + 't'.</summary>
    public string separator { get; set; }

    /// <summary>1-based number of the column to group by.</summary>
    public int columnNum { get; set; }
}
}

  调用代码:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Diagnostics;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Windows.Forms;

namespace DaZhongLogTool
{
    /// <summary>
    /// Window that collects the analysis parameters, runs the grouping job on a
    /// background thread and shows its progress in the status bar.
    /// </summary>
    public partial class Form3 : Form
    {
        // Appearance of the start button captured when a job starts. Nothing in this
        // class changes the button at present, so the values are only recorded.
        Color originalTongJiButtonColor;
        string originalTongJiButtonText;

        BigFileTongJiJobs tongjiJobs = new BigFileTongJiJobs();

        public Form3()
        {
            InitializeComponent();

            // Subscribe exactly once. Subscribing on every button click stacked up
            // duplicate handlers, so each later run reported every update several times.
            tongjiJobs.ValueChanged += new ValueChangedEnentHandler(Line_ValueChange);
        }

        // Validates the inputs and starts the analysis on a thread-pool thread.
        private void btnTongJi_Click(object sender, EventArgs e)
        {
            if (tongjiJobs.StartFlag)
            {
                MessageBox.Show("正在处理中...,如需停止,请单击停止");
                return;
            }

            if (string.IsNullOrEmpty(txtSeparator.Text))
            {
                MessageBox.Show("请输入分隔符");
                return;
            }

            if (numericUpDown1.Value < 1)
            {
                MessageBox.Show("请输入按照分隔符分割的待统计内容的对应的列数,从1开始");
                return;
            }

            string inputPath = txtInputPath.Text.Trim();
            if (string.IsNullOrEmpty(inputPath))
            {
                MessageBox.Show("请输入等待统计的文本路径");
                return;
            }

            if (!File.Exists(inputPath))
            {
                MessageBox.Show("待统计的文本文件不存在,请重新输入");
                return;
            }

            // The report goes next to the input file. The full path is resolved first so
            // that a relative input path does not turn into a path at the drive root.
            string resultFileName = DateTime.Now.ToString("yyyyMMdd_HHmm") + "_result.log";
            string inputDirectory = Path.GetDirectoryName(Path.GetFullPath(inputPath));

            TongjiParamsInfoStruct paramsInfo = new TongjiParamsInfoStruct();
            paramsInfo.inputPath = inputPath;
            paramsInfo.outputPath = Path.Combine(inputDirectory, resultFileName);
            paramsInfo.separator = txtSeparator.Text;
            paramsInfo.columnNum = (int)numericUpDown1.Value;

            originalTongJiButtonColor = this.btnTongJi.BackColor;
            originalTongJiButtonText = this.btnTongJi.Text;

            // The flag must be set before the worker starts: the job polls it and stops
            // as soon as it reads false.
            tongjiJobs.StartFlag = true;

            Func<TongjiParamsInfoStruct, int> worker = new Func<TongjiParamsInfoStruct, int>(tongjiJobs.StartAnalyseBigFile);
            worker.BeginInvoke(paramsInfo, new AsyncCallback(AsyncCallback1), worker);
        }

        // Completion callback of the asynchronous delegate call. It runs on a
        // thread-pool thread, so all UI work is posted to the UI thread.
        private void AsyncCallback1(IAsyncResult ar)
        {
            Func<TongjiParamsInfoStruct, int> handler = (Func<TongjiParamsInfoStruct, int>)ar.AsyncState;

            int result;
            string failure = null;
            try
            {
                result = handler.EndInvoke(ar);
            }
            catch (Exception ex)
            {
                // EndInvoke rethrows whatever the job threw. Left unhandled on this
                // thread it would take the whole process down, so it is reported instead.
                result = -3;
                failure = ex.Message;
            }

            // The job is over; allow a new run before any (blocking) message box is shown.
            tongjiJobs.StartFlag = false;

            PostToUi(delegate { ShowJobResult(result, failure); });
        }

        // Runs on the UI thread: updates the status bar and reports the outcome.
        private void ShowJobResult(int result, string failure)
        {
            toolStripStatusLabel1.Text = "上次任务处理完毕,等待下次开始。" + DateTime.Now.ToString();

            if (result > 0)
            {
                MessageBox.Show("本次成功处理了" + result + "行数据", "成功");
            }
            else if (result == -2)
            {
                MessageBox.Show("文件不存在,请重新选择");
            }
            else if (result < 0)
            {
                MessageBox.Show(failure == null ? "处理失败,错误码:" + result : "处理失败:" + failure);
            }
        }

        // Progress handler. The job raises the event on its own worker thread, so the
        // status-bar update is posted to the UI thread instead of being done in place.
        private void Line_ValueChange(object sender, ValueEventArgs e)
        {
            int processedRows = e.Value;
            PostToUi(delegate
            {
                toolStripStatusLabel1.Text = string.Format("统计中……,已处理了{0}行日志,时间:{1}", processedRows, DateTime.Now);
            });
        }

        // Queues an action on the UI thread without blocking the caller. The action is
        // dropped when the form no longer has a window to receive it.
        private void PostToUi(MethodInvoker action)
        {
            if (IsDisposed || !IsHandleCreated)
            {
                return;
            }

            try
            {
                BeginInvoke(action);
            }
            catch (InvalidOperationException)
            {
                // The form was closed between the check above and the call;
                // there is nothing left to update.
            }
        }

        // Lets the user pick the input file.
        private void btnSelectFile_Click(object sender, EventArgs e)
        {
            using (OpenFileDialog fileDialog = new OpenFileDialog())
            {
                fileDialog.Multiselect = false;
                fileDialog.Filter = "(*.*)|*.*";
                fileDialog.RestoreDirectory = false;

                if (fileDialog.ShowDialog() == DialogResult.OK)
                {
                    txtInputPath.Text = fileDialog.FileName;
                }
            }
        }

        // Asks the running job to stop; the job notices at its next row.
        private void btnStopTongJi_Click(object sender, EventArgs e)
        {
            tongjiJobs.StartFlag = false;
        }
    }
}

  该工具,是来自于实际工作的需求,用于根据某一列统计次数。简单实用。

源代码下载:【源码】大文件分组统计简单工具          【EXE】大文件分组统计简单工具

需要的小伙伴尽管拿走,不要忘记推荐一下,谢谢