如何根据单词作为键分隔符读取文本文件的块?

时间:2021-01-03 21:37:37

I have a .txt file with this format:

我有一个这种格式的.txt文件:

Part #368 - XXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

Part #369 - XXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

Part #370 - XXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

I read the file like this:

我是这样读取文件的:

// Read the whole file synchronously as UTF-8 text, cut it into lines at every
// '\n', and drop the empty strings that blank lines leave behind.
var lines = fs
  .readFileSync('file.txt', 'utf-8')
  .split('\n')
  .filter(function (line) {
    return line !== '';
  });

So it returns an array of the lines of the file. How can I get the chunks of the file starting with the "Part" string?

所以它返回文件行的数组。如何从“Part”字符串开始获取文件的块?

// Keep only the lines that contain the substring 'Part' anywhere in them.
var parts = _.filter(lines, function (line) {
  return line.indexOf('Part') >= 0;
});

Something like this but instead of getting the strings starting with "Part" I want all the lines from "Part" string to next "Part" string.

类似这样,但我想要的不是以“Part”开头的那些字符串,而是从一个“Part”字符串到下一个“Part”字符串之间的所有行。

2 个解决方案

#1


0  

This creates an array of arrays of lines.

这将创建一个由行数组组成的数组。

// Group the lines into chunks: every line containing 'Part' opens a new chunk,
// and each line is appended to the chunk that was opened most recently.
var parts = _.reduce(lines, function (result, line) {
    // Also open a chunk when none exists yet. Otherwise lines that precede the
    // first 'Part' line would hit `_.last([])`, which is undefined, and the
    // following `.push` would throw a TypeError.
    if (line.indexOf('Part') !== -1 || result.length === 0) result.push([]);
    _.last(result).push(line);
    return result;
}, []);

#2


0  

JSON Stream

As per @Brad's suggestion, here is a class extended from stream.Transform that delimits the file into a JSON array stream:

根据@Brad的建议,这里是一个从stream.Transform扩展的类,它将文件分隔为JSON数组流:

const { Transform } = require('stream');

/**
 * Transform stream that splits incoming text on a delimiter and emits the
 * sections as the text of one JSON array of strings.
 *
 * The output is produced incrementally: `[` is written before the first
 * section, `,` before every later one, and the closing `]` on flush. The
 * concatenated output is therefore only parseable once the stream has ended.
 */
class Delimited extends Transform {
  /**
   * @param {object} [options]
   * @param {RegExp|string} [options.delimiter] - Section separator; defaults
   *   to a regex matching LF or CRLF. A string is compiled as a
   *   regular-expression source with the `g` flag.
   * @param {string} [options.encoding='utf8'] - Encoding used to turn
   *   incoming chunks into the internal text buffer.
   */
  constructor({ delimiter = /\r?\n/g, encoding = 'utf8' } = {}) {
    super();

    // initialize internal values
    this._delimiter = delimiter instanceof RegExp ? delimiter : new RegExp(delimiter, 'g');
    this._encoding = encoding;
    // text received so far that has not been closed off by a delimiter
    this._buffer = '';
    // true until the first section has been written (selects '[' over ',')
    this._first = true;
  }

  /**
   * Appends the chunk to the internal buffer and emits every section that is
   * now terminated by a delimiter.
   *
   * @param {Buffer|string} chunk - Incoming data.
   * @param {string} encoding - 'buffer' for Buffer chunks, otherwise the
   *   encoding of the string chunk.
   * @param {Function} callback - Called with `(null, partialJSON)`.
   */
  _transform(chunk, encoding, callback) {
    // convert input encoding into output encoding
    // and append to internal buffer
    // NOTE(review): each Buffer chunk is decoded on its own, so a multi-byte
    // character split across two chunks would be decoded incorrectly.
    if (encoding === 'buffer') {
      this._buffer += chunk.toString(this._encoding);
    } else if (encoding === this._encoding) {
      this._buffer += chunk;
    } else {
      this._buffer += Buffer.from(chunk, encoding).toString(this._encoding);
    }

    // Split unconditionally instead of guarding with `RegExp#test`. On a
    // global regex `test` remembers `lastIndex` between calls, so a guard
    // could start searching past a delimiter in the next (shorter) buffer and
    // leave that delimiter unsplit. When nothing matches, `split` returns the
    // whole buffer as its only element and no section is emitted.
    const sections = this._buffer.split(this._delimiter);
    // put possibly incomplete section from array back into internal buffer
    this._buffer = sections.pop();

    // add each complete section to the partial json array
    let partialJSON = '';
    for (const section of sections) {
      partialJSON += `${this._first ? '[' : ','}${JSON.stringify(section)}`;
      this._first = false;
    }

    // push partial json array to readable stream
    callback(null, partialJSON);
  }

  /**
   * Writes whatever is left in the buffer as the last array element and
   * closes the JSON array.
   *
   * @param {Function} callback - Called with `(null, closingJSON)`.
   */
  _flush(callback) {
    // add remaining buffer as last section to json array
    callback(null, `${this._first ? '[' : ','}${JSON.stringify(this._buffer)}]`);
  }
}

Example usage:

用法示例:

const fs = require('fs');

// Read file.txt as UTF-8 text and split it wherever two newlines are followed
// by "Part #<digit>"; the lookahead leaves that header in the next section.
const stream = fs.createReadStream('file.txt', 'utf8');
const transform = new Delimited({ delimiter: /\n\n(?=Part #\d)/g });

// The transform emits fragments of one JSON array, so gather everything and
// parse only after the stream has ended.
let json = '';

transform
  .on('data', function (chunk) {
    json += chunk;
  })
  .on('end', function () {
    console.log(JSON.parse(json));
  });

stream.pipe(transform);

Try it online!

在线尝试!

String Stream

Alternatively, if you prefer not to transfer the JSON to another file, process, or as a client response, you can emit each section as a chunk by setting the output stream to objectMode: true:

或者,如果您不打算把JSON传输到另一个文件、进程或作为客户端响应,可以将输出流设置为 objectMode: true,把每个节作为一个块发出:

const { Transform } = require('stream');

/**
 * Object-mode transform stream that splits incoming text on a delimiter and
 * emits each section as its own string chunk.
 */
class Delimited extends Transform {
  /**
   * @param {RegExp|string} [delimiter] - Section separator; defaults to a
   *   regex matching LF or CRLF. A string is compiled as a regular-expression
   *   source with the `g` flag.
   */
  constructor(delimiter = /\r?\n/g) {
    super({ objectMode: true });

    // initialize internal values
    this._delimiter = delimiter instanceof RegExp ? delimiter : new RegExp(delimiter, 'g');
    this._encoding = 'utf8';
    // text received so far that has not been closed off by a delimiter
    this._buffer = '';
  }

  /**
   * Appends the chunk to the internal buffer and pushes every section that is
   * now terminated by a delimiter.
   *
   * @param {Buffer|string} chunk - Incoming data.
   * @param {string} encoding - 'buffer' for Buffer chunks, otherwise the
   *   encoding of the string chunk.
   * @param {Function} callback - Called without arguments when done.
   */
  _transform(chunk, encoding, callback) {
    // convert input encoding into output encoding
    // and append to internal buffer
    if (encoding === 'buffer') {
      this._buffer += chunk.toString(this._encoding);
    } else if (encoding === this._encoding) {
      this._buffer += chunk;
    } else {
      this._buffer += Buffer.from(chunk, encoding).toString(this._encoding);
    }

    // Split unconditionally instead of guarding with `RegExp#test`. On a
    // global regex `test` remembers `lastIndex` between calls, so a guard
    // could start searching past a delimiter in the next (shorter) buffer and
    // leave that delimiter unsplit. When nothing matches, `split` returns the
    // whole buffer as its only element and nothing is pushed.
    const sections = this._buffer.split(this._delimiter);
    // put possibly incomplete section from array back into internal buffer
    this._buffer = sections.pop();

    // Push each complete section on its own. Calling `push` explicitly keeps
    // the array index that `forEach` supplies from being passed as the
    // encoding argument.
    for (const section of sections) {
      this.push(section);
    }

    callback();
  }

  /**
   * Emits whatever is left in the buffer as the final section.
   *
   * @param {Function} callback - Called with `(null, lastSection)`.
   */
  _flush(callback) {
    // push remaining buffer to readable stream
    callback(null, this._buffer);
  }
}

Example usage:

用法示例:

const fs = require('fs');

// Read file.txt as UTF-8 text and split it wherever two newlines are followed
// by "Part #<digit>"; the lookahead leaves that header in the next section.
const stream = fs.createReadStream('file.txt', 'utf8');
const transform = new Delimited(/\n\n(?=Part #\d)/g);

// Every 'data' event carries one section; collect them in arrival order.
const array = [];

transform
  .on('data', function (chunk) {
    array.push(chunk);
  })
  .on('end', function () {
    console.log(array);
  });

stream.pipe(transform);

Try it online!

在线尝试!

#1


0  

This creates an array of arrays of lines.

这将创建一个由行数组组成的数组。

// Group the lines into chunks: every line containing 'Part' opens a new chunk,
// and each line is appended to the chunk that was opened most recently.
var parts = _.reduce(lines, function (result, line) {
    // Also open a chunk when none exists yet. Otherwise lines that precede the
    // first 'Part' line would hit `_.last([])`, which is undefined, and the
    // following `.push` would throw a TypeError.
    if (line.indexOf('Part') !== -1 || result.length === 0) result.push([]);
    _.last(result).push(line);
    return result;
}, []);

#2


0  

JSON Stream

As per @Brad's suggestion, here is a class extended from stream.Transform that delimits the file into a JSON array stream:

根据@Brad的建议,这里是一个从stream.Transform扩展的类,它将文件分隔为JSON数组流:

const { Transform } = require('stream');

/**
 * Transform stream that splits incoming text on a delimiter and emits the
 * sections as the text of one JSON array of strings.
 *
 * The output is produced incrementally: `[` is written before the first
 * section, `,` before every later one, and the closing `]` on flush. The
 * concatenated output is therefore only parseable once the stream has ended.
 */
class Delimited extends Transform {
  /**
   * @param {object} [options]
   * @param {RegExp|string} [options.delimiter] - Section separator; defaults
   *   to a regex matching LF or CRLF. A string is compiled as a
   *   regular-expression source with the `g` flag.
   * @param {string} [options.encoding='utf8'] - Encoding used to turn
   *   incoming chunks into the internal text buffer.
   */
  constructor({ delimiter = /\r?\n/g, encoding = 'utf8' } = {}) {
    super();

    // initialize internal values
    this._delimiter = delimiter instanceof RegExp ? delimiter : new RegExp(delimiter, 'g');
    this._encoding = encoding;
    // text received so far that has not been closed off by a delimiter
    this._buffer = '';
    // true until the first section has been written (selects '[' over ',')
    this._first = true;
  }

  /**
   * Appends the chunk to the internal buffer and emits every section that is
   * now terminated by a delimiter.
   *
   * @param {Buffer|string} chunk - Incoming data.
   * @param {string} encoding - 'buffer' for Buffer chunks, otherwise the
   *   encoding of the string chunk.
   * @param {Function} callback - Called with `(null, partialJSON)`.
   */
  _transform(chunk, encoding, callback) {
    // convert input encoding into output encoding
    // and append to internal buffer
    // NOTE(review): each Buffer chunk is decoded on its own, so a multi-byte
    // character split across two chunks would be decoded incorrectly.
    if (encoding === 'buffer') {
      this._buffer += chunk.toString(this._encoding);
    } else if (encoding === this._encoding) {
      this._buffer += chunk;
    } else {
      this._buffer += Buffer.from(chunk, encoding).toString(this._encoding);
    }

    // Split unconditionally instead of guarding with `RegExp#test`. On a
    // global regex `test` remembers `lastIndex` between calls, so a guard
    // could start searching past a delimiter in the next (shorter) buffer and
    // leave that delimiter unsplit. When nothing matches, `split` returns the
    // whole buffer as its only element and no section is emitted.
    const sections = this._buffer.split(this._delimiter);
    // put possibly incomplete section from array back into internal buffer
    this._buffer = sections.pop();

    // add each complete section to the partial json array
    let partialJSON = '';
    for (const section of sections) {
      partialJSON += `${this._first ? '[' : ','}${JSON.stringify(section)}`;
      this._first = false;
    }

    // push partial json array to readable stream
    callback(null, partialJSON);
  }

  /**
   * Writes whatever is left in the buffer as the last array element and
   * closes the JSON array.
   *
   * @param {Function} callback - Called with `(null, closingJSON)`.
   */
  _flush(callback) {
    // add remaining buffer as last section to json array
    callback(null, `${this._first ? '[' : ','}${JSON.stringify(this._buffer)}]`);
  }
}

Example usage:

用法示例:

const fs = require('fs');

// Read file.txt as UTF-8 text and split it wherever two newlines are followed
// by "Part #<digit>"; the lookahead leaves that header in the next section.
const stream = fs.createReadStream('file.txt', 'utf8');
const transform = new Delimited({ delimiter: /\n\n(?=Part #\d)/g });

// The transform emits fragments of one JSON array, so gather everything and
// parse only after the stream has ended.
let json = '';

transform
  .on('data', function (chunk) {
    json += chunk;
  })
  .on('end', function () {
    console.log(JSON.parse(json));
  });

stream.pipe(transform);

Try it online!

在线尝试!

String Stream

Alternatively, if you prefer not to transfer the JSON to another file, process, or as a client response, you can emit each section as a chunk by setting the output stream to objectMode: true:

或者,如果您不打算把JSON传输到另一个文件、进程或作为客户端响应,可以将输出流设置为 objectMode: true,把每个节作为一个块发出:

const { Transform } = require('stream');

/**
 * Object-mode transform stream that splits incoming text on a delimiter and
 * emits each section as its own string chunk.
 */
class Delimited extends Transform {
  /**
   * @param {RegExp|string} [delimiter] - Section separator; defaults to a
   *   regex matching LF or CRLF. A string is compiled as a regular-expression
   *   source with the `g` flag.
   */
  constructor(delimiter = /\r?\n/g) {
    super({ objectMode: true });

    // initialize internal values
    this._delimiter = delimiter instanceof RegExp ? delimiter : new RegExp(delimiter, 'g');
    this._encoding = 'utf8';
    // text received so far that has not been closed off by a delimiter
    this._buffer = '';
  }

  /**
   * Appends the chunk to the internal buffer and pushes every section that is
   * now terminated by a delimiter.
   *
   * @param {Buffer|string} chunk - Incoming data.
   * @param {string} encoding - 'buffer' for Buffer chunks, otherwise the
   *   encoding of the string chunk.
   * @param {Function} callback - Called without arguments when done.
   */
  _transform(chunk, encoding, callback) {
    // convert input encoding into output encoding
    // and append to internal buffer
    if (encoding === 'buffer') {
      this._buffer += chunk.toString(this._encoding);
    } else if (encoding === this._encoding) {
      this._buffer += chunk;
    } else {
      this._buffer += Buffer.from(chunk, encoding).toString(this._encoding);
    }

    // Split unconditionally instead of guarding with `RegExp#test`. On a
    // global regex `test` remembers `lastIndex` between calls, so a guard
    // could start searching past a delimiter in the next (shorter) buffer and
    // leave that delimiter unsplit. When nothing matches, `split` returns the
    // whole buffer as its only element and nothing is pushed.
    const sections = this._buffer.split(this._delimiter);
    // put possibly incomplete section from array back into internal buffer
    this._buffer = sections.pop();

    // Push each complete section on its own. Calling `push` explicitly keeps
    // the array index that `forEach` supplies from being passed as the
    // encoding argument.
    for (const section of sections) {
      this.push(section);
    }

    callback();
  }

  /**
   * Emits whatever is left in the buffer as the final section.
   *
   * @param {Function} callback - Called with `(null, lastSection)`.
   */
  _flush(callback) {
    // push remaining buffer to readable stream
    callback(null, this._buffer);
  }
}

Example usage:

用法示例:

const fs = require('fs');

// Read file.txt as UTF-8 text and split it wherever two newlines are followed
// by "Part #<digit>"; the lookahead leaves that header in the next section.
const stream = fs.createReadStream('file.txt', 'utf8');
const transform = new Delimited(/\n\n(?=Part #\d)/g);

// Every 'data' event carries one section; collect them in arrival order.
const array = [];

transform
  .on('data', function (chunk) {
    array.push(chunk);
  })
  .on('end', function () {
    console.log(array);
  });

stream.pipe(transform);

Try it online!

在线尝试!