I want to know the best way to read a txt file that contains two lines of numbers using the gets function in C, and to save them in an array, all within 1 second.
我想知道在c中使用gets函数读取包含两行数字的txt文件的最佳选项是什么,并在1秒内将它们保存在一个数组中。
Take as an example a text file called ooo.txt: its first line holds the number 2.000.000 (which will be the size of the array), and its second line holds 2.000.000 numbers that will be stored in the array.
假设以下示例为名为ooo.txt的txt文件,第一行中的数字为2.000.000(将是数组的大小),第二行包含2.000.000个数字,这些数字将存储在数组中。
Eg
例如
2000000
59 595 45 492 89289 5 8959 (+1.999.993 numbers)
Code I tried (only the fscanf function):
我尝试的代码(只有fscanf函数)
// Snippet from the question: reads the element count, allocates the array,
// then reads one integer per fscanf call. `fp` is opened outside this snippet.
int t_size;
fscanf(fp, "%d",&t_size); // reads the element count from the first line; the return value is not checked
int* my_array = NULL;
my_array = malloc(t_size*sizeof(*my_array));
if (my_array==NULL) {
printf("Error allocating memory!\n"); //print an error message
return 1; //return with failure
getchar(); // NOTE(review): unreachable, because it follows the return above
}
int i =0;
for ( i = 0; i < t_size; i++ )
{
fscanf(fp, "%d",&my_array[i]); /* stores the next integer in my_array[i]; &my_array[i] is the address of the element at index i */
}
The best code so far for completing the procedure within 1 second:
到目前为止,最好的代码,使程序在1秒内完成
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <time.h>
/* Returns 1 when `input` points at the terminating NUL byte, 0 otherwise. */
int is_end(char* input) {
    if (input[0] != '\0')
        return 0;
    return 1;
}
/*
 * Returns 1 when the character at `input` is a separator, 0 otherwise.
 * Carriage return, line feed and the space character all count as separators.
 */
int is_linebreak(char* input) {
    switch (*input) {
    case '\r':
    case '\n':
    case ' ':
        return 1;
    default:
        return 0;
    }
}
/*
 * Skips the run of separator characters starting at `input` and returns a
 * pointer to the first character that is_linebreak() does not accept.
 */
char* eat_linebreaks(char* input) {
    char* cursor;

    for (cursor = input; is_linebreak(cursor); cursor++)
        ;
    return cursor;
}
/*
 * Counts the pieces that `input` splits into: 0 for an empty string,
 * otherwise one plus the number of separator runs found in the string.
 */
size_t count_lines(char* input) {
    char* cursor = input;
    size_t total;

    if (is_end(cursor))
        return 0;

    for (total = 1; !is_end(cursor); ) {
        if (!is_linebreak(cursor)) {
            ++cursor;
            continue;
        }
        /* A whole run of separators counts as a single boundary. */
        ++total;
        cursor = eat_linebreaks(cursor);
    }
    return total;
}
/*
 * Duplicates the `length` bytes starting at `from` into a freshly allocated,
 * NUL-terminated string. Returns NULL if the allocation fails.
 */
static char* copy_token(const char* from, size_t length) {
    char* token = malloc(length + 1);

    if (token != NULL) {
        memcpy(token, from, length);
        token[length] = '\0';
    }
    return token;
}

/* Frees the first `used` entries of `lines` and then the array itself. */
static void discard_lines(char** lines, size_t used) {
    size_t k;

    for (k = 0; k < used; ++k)
        free(lines[k]);
    free(lines);
}

/*
 * Split `input` into separately allocated strings, one per piece delimited
 * by runs of separator characters (see is_linebreak()).
 *
 * input      - NUL-terminated text to split; it is not modified.
 * line_count - capacity of the result, normally count_lines(input).
 *
 * Returns an array with room for `line_count` entries (at least one slot is
 * always allocated). At most `line_count` entries are filled; any entry that
 * is not filled is NULL. An empty input produces no entries. Returns NULL
 * only when an allocation fails, in which case nothing is leaked.
 * The caller owns the array and every non-NULL string in it.
 */
char** get_lines(char* input, size_t line_count) {
    char* p = input;
    char* from = input;
    size_t line = 0;
    /* calloc keeps unused slots NULL and checks the size multiplication. */
    char** lines = calloc(line_count > 0 ? line_count : 1, sizeof *lines);

    if (lines == NULL)
        return NULL;

    /* Never store past line_count, even if the caller's count is too small. */
    while (!is_end(p) && line < line_count) {
        if (!is_linebreak(p)) {
            ++p;
            continue;
        }
        lines[line] = copy_token(from, (size_t)(p - from));
        if (lines[line] == NULL) {
            discard_lines(lines, line);
            return NULL;
        }
        ++line;
        p = eat_linebreaks(p);
        from = p;
    }

    /*
     * Copy the last piece as well in case the input doesn't end in a
     * separator. When it does, this piece is an empty string, which matches
     * what count_lines() counts. An empty input has no pieces at all.
     */
    if (!is_end(input) && line < line_count) {
        lines[line] = copy_token(from, (size_t)(p - from));
        if (lines[line] == NULL) {
            discard_lines(lines, line);
            return NULL;
        }
    }
    return lines;
}
/*
 * Loads the file named by argv[1] into memory, splits it into pieces with
 * count_lines()/get_lines(), converts them to an int array and prints the
 * processor time the whole procedure took.
 *
 * The first piece is the number of elements; the following pieces are the
 * elements themselves. The declared count is validated against the number
 * of pieces actually present so that a short or corrupt file cannot cause
 * reads past the end of the `lines` array.
 *
 * Returns 0 on success and 1 on a usage, I/O, memory or input-format error.
 */
int main(int argc, char* argv[]) {
    clock_t start = clock();
    FILE *stream = NULL;
    char *contents = NULL;
    char **lines = NULL;
    int *my_array = NULL;
    size_t number_of_rows = 0;
    size_t count;
    size_t size;
    size_t i;
    long file_size;
    long declared;
    char *end;
    float seconds;
    int status = 1;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <file>\n", argc > 0 ? argv[0] : "program");
        return 1;
    }

    // Open file, find the size of it
    stream = fopen(argv[1], "rb");
    if (stream == NULL) {
        perror(argv[1]);
        return 1;
    }
    if (fseek(stream, 0L, SEEK_END) != 0)
        goto io_error;
    file_size = ftell(stream);
    if (file_size < 0 || fseek(stream, 0L, SEEK_SET) != 0)
        goto io_error;

    // Allocate space for the entire file content plus a terminating NUL
    contents = malloc((size_t)file_size + 1);
    if (contents == NULL) {
        fprintf(stderr, "Error allocating memory!\n");
        goto cleanup;
    }

    // Stream file into memory
    size = fread(contents, 1, (size_t)file_size, stream);
    if (ferror(stream))
        goto io_error;
    contents[size] = 0;
    fclose(stream);
    stream = NULL;

    // Count rows in content; an empty file has nothing to parse
    number_of_rows = count_lines(contents);
    if (number_of_rows == 0) {
        fprintf(stderr, "%s: file is empty\n", argv[1]);
        goto cleanup;
    }

    // Get array of char*, one for each piece
    lines = get_lines(contents, number_of_rows);
    if (lines == NULL) {
        fprintf(stderr, "Error allocating memory!\n");
        goto cleanup;
    }

    // First row has count; it may not exceed the pieces that follow it
    declared = strtol(lines[0], &end, 10);
    if (end == lines[0] || declared < 0 ||
        (size_t)declared > number_of_rows - 1) {
        fprintf(stderr, "%s: invalid element count\n", argv[1]);
        goto cleanup;
    }
    count = (size_t)declared;

    // Get the numbers out of the remaining pieces
    my_array = calloc(count > 0 ? count : 1, sizeof *my_array);
    if (my_array == NULL) {
        fprintf(stderr, "Error allocating memory!\n");
        goto cleanup;
    }
    for (i = 0; i < count; ++i) {
        my_array[i] = (int)strtol(lines[i + 1], &end, 10);
        if (end == lines[i + 1]) {
            fprintf(stderr, "%s: element %lu is not a number\n",
                    argv[1], (unsigned long)i);
            goto cleanup;
        }
    }

    // clock() ticks are converted with CLOCKS_PER_SEC, which is not
    // 1000000 on every platform
    seconds = (float)(clock() - start) / CLOCKS_PER_SEC;
    printf("Took %fs", seconds);
    status = 0;
    goto cleanup;

io_error:
    perror(argv[1]);
cleanup:
    if (stream != NULL)
        fclose(stream);
    if (lines != NULL) {
        for (i = 0; i < number_of_rows; ++i)
            free(lines[i]);
        free(lines);
    }
    free(my_array);
    free(contents);
    return status;
}
3 个解决方案
#1
1
First of all, you will want to use fgets instead, to avoid dangerous buffer overflows. Second, you want to remove all punctuation from your numbers, so that 2.000.000 becomes 2000000. Then you can use pointers and the strtol function to convert characters to integers; there are also other functions to convert to floats and other types.
首先,您将需要使用fgets来避免危险的缓冲区溢出。其次,您要删除数字中的所有标点符号。因此2.000.000变为2000000。然后你可以使用指针和strtol函数将字符转换为整数;还有其他函数可以转换为浮点数和其他类型。
#2
1
Since the code wants speed and I/O is typically the bottleneck, reading the entire file at once after using fstat() to find its length (@Charlon) makes some sense. The following is a quick parse of that buffer.
由于代码需要速度和IO通常是瓶颈,因此在使用fstat()查找其长度(@Charlon)之后立即读取整个文件是有道理的。以下是该缓冲区的快速解析。
// Snippet meant to replace the parsing part of the question's main():
// the whole file is already in `contents`; the count and the numbers are
// parsed straight out of that buffer without splitting it into strings.
// NOTE(review): the Handle_* helpers are not defined in this snippet and are
// presumably expected not to return - confirm.
// Stream file into memory
size = fread(contents, 1, fileSize, stream);
// NUL-terminate so the buffer can be parsed as a C string
contents[size] = 0;
fclose(stream);
#if 1
// new code
size_t array_n;
int n;
// Read the element count; %n stores how many characters sscanf consumed
if (sscanf(contents, "%zu%n", &array_n, &n) != 1) Handle_BadInput();
my_array = malloc(array_n * sizeof *my_array);
if (my_array == NULL) Handle_OOM();
// Continue parsing right after the count
char *p = &contents[n];
// Cleared once; a range error in any strtol call below sets it
errno = 0;
char *endptr;
for (size_t count = 0; count < array_n; count++) {
// strtol skips leading white space and leaves endptr after the digits.
// NOTE(review): strtol returns long; if my_array holds int, values outside
// the int range are converted without a check - confirm.
my_array[count] = strtol(p, &endptr, 10);
// p == endptr means no digits were consumed (bad token or end of buffer)
if (p == endptr || errno)
Handle_BadInput();
p = endptr;
}
char ch;
// The leading space skips white space; a match means non-blank text remains
if (sscanf(p, " %c", &ch) == 1) Handle_ExtraInput();
#else
//old code
// Count rows in content
number_of_rows = count_lines(contents);
// Get array of char*, one for each line
lines = get_lines(contents, number_of_rows);
// Get the numbers out of the lines
count = atoi(lines[0]); // First row has count
my_array = (int*)malloc(count * sizeof(int));
for (i = 0; i < count; ++i) {
my_array[i] = atoi(lines[i + 1]);
}
#endif
I still prefer the scalable approach of reading one number at a time.
仍然更喜欢一次读取一个数字的可扩展方法。
#3
0
The fastest way needs a lot of RAM :
最快的方法需要大量的RAM:
1) open the file (man open)
1)打开文件(man open)
2) use the fstat function to get the size of your file (man fstat)
2)使用fstat函数获取你文件的大小(man fstat)
3) read the file into a buffer malloc-ed with the size you just got in 2) (man malloc)
3)用缓冲区malloc-ed读取文件,其大小只是2)(man malloc)
4) close the file (man close)
4)关闭文件(man close)
5) parse your buffer and transform each block of digits (each time until ' ' or '\0') to int
5)解析缓冲区并将每个数字块(每次直到' '或'\0')转换为int
EDIT: if your RAM is not large enough, you need to create a get_next_int function that stores only the next number from the file in your buffer
编辑:如果你的RAM不够大,你需要创建一个get_next_int函数,它只在你的缓冲区中存储文件中的下一个数字
EDIT 2: You can read just far enough to learn how many ints you will need to store, compare that number (with a safety margin) to the size of your RAM, and then choose the appropriate approach so that your program won't end up with errno set to ENOMEM, if you know what I'm talking about ;)
编辑2:你可以阅读,直到你知道你需要存储的int数量,并将这个数字与你的ram大小的安全系数进行比较,并使用好方法,这样你的程序就不会将errno设置为ENOMEM你懂我在说什么 ;)
#1
1
First of all, you will want to use fgets instead, to avoid dangerous buffer overflows. Second, you want to remove all punctuation from your numbers, so that 2.000.000 becomes 2000000. Then you can use pointers and the strtol function to convert characters to integers; there are also other functions to convert to floats and other types.
首先,您将需要使用fgets来避免危险的缓冲区溢出。其次,您要删除数字中的所有标点符号。因此2.000.000变为2000000。然后你可以使用指针和strtol函数将字符转换为整数;还有其他函数可以转换为浮点数和其他类型。
#2
1
Since the code wants speed and I/O is typically the bottleneck, reading the entire file at once after using fstat() to find its length (@Charlon) makes some sense. The following is a quick parse of that buffer.
由于代码需要速度和IO通常是瓶颈,因此在使用fstat()查找其长度(@Charlon)之后立即读取整个文件是有道理的。以下是该缓冲区的快速解析。
// Snippet meant to replace the parsing part of the question's main():
// the whole file is already in `contents`; the count and the numbers are
// parsed straight out of that buffer without splitting it into strings.
// NOTE(review): the Handle_* helpers are not defined in this snippet and are
// presumably expected not to return - confirm.
// Stream file into memory
size = fread(contents, 1, fileSize, stream);
// NUL-terminate so the buffer can be parsed as a C string
contents[size] = 0;
fclose(stream);
#if 1
// new code
size_t array_n;
int n;
// Read the element count; %n stores how many characters sscanf consumed
if (sscanf(contents, "%zu%n", &array_n, &n) != 1) Handle_BadInput();
my_array = malloc(array_n * sizeof *my_array);
if (my_array == NULL) Handle_OOM();
// Continue parsing right after the count
char *p = &contents[n];
// Cleared once; a range error in any strtol call below sets it
errno = 0;
char *endptr;
for (size_t count = 0; count < array_n; count++) {
// strtol skips leading white space and leaves endptr after the digits.
// NOTE(review): strtol returns long; if my_array holds int, values outside
// the int range are converted without a check - confirm.
my_array[count] = strtol(p, &endptr, 10);
// p == endptr means no digits were consumed (bad token or end of buffer)
if (p == endptr || errno)
Handle_BadInput();
p = endptr;
}
char ch;
// The leading space skips white space; a match means non-blank text remains
if (sscanf(p, " %c", &ch) == 1) Handle_ExtraInput();
#else
//old code
// Count rows in content
number_of_rows = count_lines(contents);
// Get array of char*, one for each line
lines = get_lines(contents, number_of_rows);
// Get the numbers out of the lines
count = atoi(lines[0]); // First row has count
my_array = (int*)malloc(count * sizeof(int));
for (i = 0; i < count; ++i) {
my_array[i] = atoi(lines[i + 1]);
}
#endif
I still prefer the scalable approach of reading one number at a time.
仍然更喜欢一次读取一个数字的可扩展方法。
#3
0
The fastest way needs a lot of RAM :
最快的方法需要大量的RAM:
1) open the file (man open)
1)打开文件(man open)
2) use the fstat function to get the size of your file (man fstat)
2)使用fstat函数获取你文件的大小(man fstat)
3) read the file into a buffer malloc-ed with the size you just got in 2) (man malloc)
3)用缓冲区malloc-ed读取文件,其大小只是2)(man malloc)
4) close the file (man close)
4)关闭文件(man close)
5) parse your buffer and transform each block of digits (each time until ' ' or '\0') to int
5)解析缓冲区并将每个数字块(每次直到' '或'\0')转换为int
EDIT: if your RAM is not large enough, you need to create a get_next_int function that stores only the next number from the file in your buffer
编辑:如果你的RAM不够大,你需要创建一个get_next_int函数,它只在你的缓冲区中存储文件中的下一个数字
EDIT 2: You can read just far enough to learn how many ints you will need to store, compare that number (with a safety margin) to the size of your RAM, and then choose the appropriate approach so that your program won't end up with errno set to ENOMEM, if you know what I'm talking about ;)
编辑2:你可以阅读,直到你知道你需要存储的int数量,并将这个数字与你的ram大小的安全系数进行比较,并使用好方法,这样你的程序就不会将errno设置为ENOMEM你懂我在说什么 ;)