正则表达式:匹配DNA序列的排列

时间:2021-09-24 18:46:07

How do I make a regular expression to evaluate the following string?

如何制作正则表达式来评估以下字符串?

TGATGCCGTCCCCTCAACTTGAGTGCTCCTAATGCGTTGC

and extract the pattern CTCCT.

并提取模式CTCCT。

The pattern must be 3 C's and 2 T's in any order.

模式必须是3 C和2 T的任何顺序。

I tried /[C | T]{5}/ but it matches CCCCT and TCCCC

我试过 /[C | T]{5}/,但它会匹配 CCCCT 和 TCCCC

Thanks in Advance.

提前致谢。

2 个解决方案

#1


2  

Compute all permutations of "CTCCT" and concatenate them to a regex:

计算“CTCCT”的所有排列并将它们连接到正则表达式:

CCCTT|CCTCT|CCTTC|CTCCT|CTCTC|CTTCC|TCCCT|TCCTC|TCTCC|TTCCC

This pattern can be optimized:

这种模式可以优化:

C(?:C(?:T(?:CT|TC)|CTT)|T(?:C(?:CT|TC)|TCC))|T(?:C(?:C(?:CT|TC)|TCC)|TCCC)

// Global-flag matcher covering every arrangement of three C's and two T's.
const regex = /C(?:C(?:T(?:CT|TC)|CTT)|T(?:C(?:CT|TC)|TCC))|T(?:C(?:C(?:CT|TC)|TCC)|TCCC)/g;
const string = "TGATGCCGTCCCCTCAACTTGAGTGCTCCTAATGCGTTGC";

// Log the first match array (exec yields null when nothing matches).
const firstMatch = regex.exec(string);
console.log(firstMatch);

This pattern doesn't find overlapping matches; e.g., it would find only one match in CCCTTCCC.

此模式不会找到重叠的匹配,例如在 CCCTTCCC 中只会找到一个匹配。

To find overlapping matches, use lookahead:

要查找重叠匹配,请使用前瞻:

C(?=C(?=T(?=CT|TC)|CTT)|T(?=C(?=CT|TC)|TCC))|T(?=C(?=C(?=CT|TC)|TCC)|TCCC)

// Each alternative consumes only the first letter of a five-letter window and
// checks the remaining four through lookahead, so the next search starts one
// character later and overlapping windows are all reported.
const regex = /C(?=C(?=T(?=CT|TC)|CTT)|T(?=C(?=CT|TC)|TCC))|T(?=C(?=C(?=CT|TC)|TCC)|TCCC)/g;
const string = "CCCTTCCC";

// Declaring `match` in the loop header keeps it from leaking as an implicit
// global (which is a ReferenceError in strict mode and ES modules).
for (let match = regex.exec(string); match !== null; match = regex.exec(string)) {
  // The match itself is one character long; print the full five-letter window.
  console.log(match.index, string.substring(match.index, match.index + 5));
}

Regex can only deal with a fairly limited number of permutations. If you want to match segments of possibly arbitrary size, use a non-regex solution:

正则表达式只能处理相当有限数量的排列。如果要匹配可能任意大小的段,请使用非正则表达式解决方案:

/**
 * Locates the first run of five consecutive characters made up of exactly
 * three 'C' and two 'T' (in any order), using a sliding window of counters.
 *
 * @param {string} str - Sequence to scan; the comparison is case-sensitive.
 * @returns {number} Start index of the first matching window, or -1 if none.
 */
function c3t2_optimized(str) {
  const WINDOW = 5;
  let cCount = 0;
  let tCount = 0;

  for (let end = 0; end < str.length; end += 1) {
    // Account for the character entering the window on the right.
    switch (str.charAt(end)) {
      case 'C':
        cCount += 1;
        break;
      case 'T':
        tCount += 1;
        break;
      default:
        break;
    }

    // Once more than five characters have been seen, drop the one that
    // just fell off the left edge of the window.
    if (end >= WINDOW) {
      switch (str.charAt(end - WINDOW)) {
        case 'C':
          cCount -= 1;
          break;
        case 'T':
          tCount -= 1;
          break;
        default:
          break;
      }
    }

    if (cCount === 3 && tCount === 2) {
      return end - WINDOW + 1;
    }
  }

  return -1;
}

const string = "TGATGCCGTCCCCTCAACTTGAGTGCTCCTAATGCGTTGC";

const firstIndex = c3t2_optimized(string);
console.log(firstIndex);

Or the same as above, just as a generator stepping through all possibly overlapping matches:

或者与上面相同,就像一个生成器逐步完成所有可能重叠的匹配:

/**
 * Yields the start index of every five-character window of `str` that holds
 * exactly three 'C' and two 'T' in any order. Overlapping windows are all
 * reported, in ascending index order.
 *
 * @param {string} str - Sequence to scan; the comparison is case-sensitive.
 * @yields {number} Start index of each matching window.
 */
function* c3t2_optimized(str) {
  let c = 0;
  let t = 0;
  for (let i = 0; i < str.length; ++i) {
    // Count the character entering the window on the right.
    const last = str.charAt(i);
    if (last === 'C') ++c;
    else if (last === 'T') ++t;
    if (i > 4) {
      // Un-count the character that just left the window on the left.
      const first = str.charAt(i - 5);
      if (first === 'C') --c;
      else if (first === 'T') --t;
    }
    // The window covers indices i-4 .. i, so report its start.
    if (c === 3 && t === 2) yield i - 4;
  }
}

const string = "CCCTTCCC";

// `const i` keeps the loop variable local instead of creating an implicit
// global (which throws a ReferenceError in strict mode and ES modules).
for (const i of c3t2_optimized(string)) {
  console.log(i, string.substring(i, i + 5));
}

Performance comparison: https://jsfiddle.net/24qguege/7/

性能比较:https://jsfiddle.net/24qguege/7/

Firefox 47:

  • 68.83ms - regex (see above)
  • 68.83ms - 正则表达式(见上文)

  • 97.51ms - non-regex (see above)
  • 97.51ms - 非正则表达式(见上文)

  • 9582.39ms - Andrew Rueckert's answer (better readability)
  • 9582.39ms - Andrew Rueckert的回答(更好的可读性)

#2


3  

This isn't the type of problem that is easily solved using regular expressions. It can, however, be solved fairly straightforwardly with a simple function:

这不是那种能用正则表达式轻松解决的问题。不过,用一个简单的函数就可以相当直接地解决:

 /**
  * Finds the first five-character window of `str` that contains exactly
  * three C's and two T's in any order, ignoring letter case.
  *
  * @param {string} str - Sequence to scan.
  * @returns {number|false} Start index of the first matching window, or
  *   `false` when no window matches.
  */
 function c3t2(str) {
   const lowerCaseStr = str.toLowerCase();
   // `let` keeps the counter local; an undeclared counter would become an
   // implicit global and throws a ReferenceError in strict mode / ES modules.
   for (let index = 0; index + 5 <= str.length; index++) {
     const substring = lowerCaseStr.substring(index, index + 5);
     // Sorting the letters maps every permutation onto the same string.
     const sortedLetters = substring.split("").sort().join("");
     if (sortedLetters === "ccctt") {
       return index;
     }
   }

   return false;
 }

#1


2  

Compute all permutations of "CTCCT" and concatenate them to a regex:

计算“CTCCT”的所有排列并将它们连接到正则表达式:

CCCTT|CCTCT|CCTTC|CTCCT|CTCTC|CTTCC|TCCCT|TCCTC|TCTCC|TTCCC

This pattern can be optimized:

这种模式可以优化:

C(?:C(?:T(?:CT|TC)|CTT)|T(?:C(?:CT|TC)|TCC))|T(?:C(?:C(?:CT|TC)|TCC)|TCCC)

// Global-flag matcher covering every arrangement of three C's and two T's.
const regex = /C(?:C(?:T(?:CT|TC)|CTT)|T(?:C(?:CT|TC)|TCC))|T(?:C(?:C(?:CT|TC)|TCC)|TCCC)/g;
const string = "TGATGCCGTCCCCTCAACTTGAGTGCTCCTAATGCGTTGC";

// Log the first match array (exec yields null when nothing matches).
const firstMatch = regex.exec(string);
console.log(firstMatch);

This pattern doesn't find overlapping matches; e.g., it would find only one match in CCCTTCCC.

此模式不会找到重叠的匹配,例如在 CCCTTCCC 中只会找到一个匹配。

To find overlapping matches, use lookahead:

要查找重叠匹配,请使用前瞻:

C(?=C(?=T(?=CT|TC)|CTT)|T(?=C(?=CT|TC)|TCC))|T(?=C(?=C(?=CT|TC)|TCC)|TCCC)

// Each alternative consumes only the first letter of a five-letter window and
// checks the remaining four through lookahead, so the next search starts one
// character later and overlapping windows are all reported.
const regex = /C(?=C(?=T(?=CT|TC)|CTT)|T(?=C(?=CT|TC)|TCC))|T(?=C(?=C(?=CT|TC)|TCC)|TCCC)/g;
const string = "CCCTTCCC";

// Declaring `match` in the loop header keeps it from leaking as an implicit
// global (which is a ReferenceError in strict mode and ES modules).
for (let match = regex.exec(string); match !== null; match = regex.exec(string)) {
  // The match itself is one character long; print the full five-letter window.
  console.log(match.index, string.substring(match.index, match.index + 5));
}

Regex can only deal with a fairly limited number of permutations. If you want to match segments of possibly arbitrary size, use a non-regex solution:

正则表达式只能处理相当有限数量的排列。如果要匹配可能任意大小的段,请使用非正则表达式解决方案:

/**
 * Locates the first run of five consecutive characters made up of exactly
 * three 'C' and two 'T' (in any order), using a sliding window of counters.
 *
 * @param {string} str - Sequence to scan; the comparison is case-sensitive.
 * @returns {number} Start index of the first matching window, or -1 if none.
 */
function c3t2_optimized(str) {
  const WINDOW = 5;
  let cCount = 0;
  let tCount = 0;

  for (let end = 0; end < str.length; end += 1) {
    // Account for the character entering the window on the right.
    switch (str.charAt(end)) {
      case 'C':
        cCount += 1;
        break;
      case 'T':
        tCount += 1;
        break;
      default:
        break;
    }

    // Once more than five characters have been seen, drop the one that
    // just fell off the left edge of the window.
    if (end >= WINDOW) {
      switch (str.charAt(end - WINDOW)) {
        case 'C':
          cCount -= 1;
          break;
        case 'T':
          tCount -= 1;
          break;
        default:
          break;
      }
    }

    if (cCount === 3 && tCount === 2) {
      return end - WINDOW + 1;
    }
  }

  return -1;
}

const string = "TGATGCCGTCCCCTCAACTTGAGTGCTCCTAATGCGTTGC";

const firstIndex = c3t2_optimized(string);
console.log(firstIndex);

Or the same as above, just as a generator stepping through all possibly overlapping matches:

或者与上面相同,就像一个生成器逐步完成所有可能重叠的匹配:

/**
 * Yields the start index of every five-character window of `str` that holds
 * exactly three 'C' and two 'T' in any order. Overlapping windows are all
 * reported, in ascending index order.
 *
 * @param {string} str - Sequence to scan; the comparison is case-sensitive.
 * @yields {number} Start index of each matching window.
 */
function* c3t2_optimized(str) {
  let c = 0;
  let t = 0;
  for (let i = 0; i < str.length; ++i) {
    // Count the character entering the window on the right.
    const last = str.charAt(i);
    if (last === 'C') ++c;
    else if (last === 'T') ++t;
    if (i > 4) {
      // Un-count the character that just left the window on the left.
      const first = str.charAt(i - 5);
      if (first === 'C') --c;
      else if (first === 'T') --t;
    }
    // The window covers indices i-4 .. i, so report its start.
    if (c === 3 && t === 2) yield i - 4;
  }
}

const string = "CCCTTCCC";

// `const i` keeps the loop variable local instead of creating an implicit
// global (which throws a ReferenceError in strict mode and ES modules).
for (const i of c3t2_optimized(string)) {
  console.log(i, string.substring(i, i + 5));
}

Performance comparison: https://jsfiddle.net/24qguege/7/

性能比较:https://jsfiddle.net/24qguege/7/

Firefox 47:

  • 68.83ms - regex (see above)
  • 68.83ms - 正则表达式(见上文)

  • 97.51ms - non-regex (see above)
  • 97.51ms - 非正则表达式(见上文)

  • 9582.39ms - Andrew Rueckert's answer (better readability)
  • 9582.39ms - Andrew Rueckert的回答(更好的可读性)

#2


3  

This isn't the type of problem that is easily solved using regular expressions. It can, however, be solved fairly straightforwardly with a simple function:

这不是那种能用正则表达式轻松解决的问题。不过,用一个简单的函数就可以相当直接地解决:

 /**
  * Finds the first five-character window of `str` that contains exactly
  * three C's and two T's in any order, ignoring letter case.
  *
  * @param {string} str - Sequence to scan.
  * @returns {number|false} Start index of the first matching window, or
  *   `false` when no window matches.
  */
 function c3t2(str) {
   const lowerCaseStr = str.toLowerCase();
   // `let` keeps the counter local; an undeclared counter would become an
   // implicit global and throws a ReferenceError in strict mode / ES modules.
   for (let index = 0; index + 5 <= str.length; index++) {
     const substring = lowerCaseStr.substring(index, index + 5);
     // Sorting the letters maps every permutation onto the same string.
     const sortedLetters = substring.split("").sort().join("");
     if (sortedLetters === "ccctt") {
       return index;
     }
   }

   return false;
 }