Consider I have these lists:
l = [5,6,7,8,9,10,5,15,20]
m = [10,5]
I want to get the index of m in l. I used a list comprehension to do that:
[(i,i+1) for i,j in enumerate(l) if m[0] == l[i] and m[1] == l[i+1]]
Output: [(5,6)]
But if I have more numbers in m, I feel it's not the right way. So is there any easy approach in Python or with NumPy?
Another example:
l = [5,6,7,8,9,10,5,15,20,50,16,18]
m = [10,5,15,20]
The output should be:
[(5,6,7,8)]
2 Answers
#1 (6 votes)
You are basically looking for the starting indices of a list in another list.
Approach #1 : One approach would be to create sliding windows over the list in which we are searching, giving us a 2D array, and then use NumPy broadcasting to compare the search list against each row of that 2D sliding-window view. Thus, one method would be -
import numpy as np

# strided_app is from https://*.com/a/40085052/
def strided_app(a, L, S):  # Window len = L, Stride len/stepsize = S
    nrows = ((a.size - L) // S) + 1
    n = a.strides[0]
    return np.lib.stride_tricks.as_strided(a, shape=(nrows, L), strides=(S*n, n))

def pattern_index_broadcasting(all_data, search_data):
    n = len(search_data)
    all_data = np.asarray(all_data)
    all_data_2D = strided_app(np.asarray(all_data), n, S=1)
    return np.flatnonzero((all_data_2D == search_data).all(1))

out = np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
Sample runs -
In [340]: l = [5,6,7,8,9,10,5,15,20,50,16,18]
...: m = [10,5,15,20]
...:
In [341]: np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
Out[341]: array([5, 6, 7, 8])
In [342]: l = [5,6,7,8,9,10,5,15,20,50,16,18,10,5,15,20]
...: m = [10,5,15,20]
...:
In [343]: np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
Out[343]:
array([[ 5, 6, 7, 8],
[12, 13, 14, 15]])
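As a side note, on NumPy 1.20+ the hand-rolled strided_app can likely be replaced by the built-in np.lib.stride_tricks.sliding_window_view. A minimal sketch of Approach #1 written that way (assuming that function is available in your NumPy version):

import numpy as np

def pattern_index_swv(all_data, search_data):
    all_data = np.asarray(all_data)
    search_data = np.asarray(search_data)
    # Each row holds len(search_data) consecutive elements of all_data.
    windows = np.lib.stride_tricks.sliding_window_view(all_data, len(search_data))
    # Rows that equal the pattern element-wise give the starting indices.
    return np.flatnonzero((windows == search_data).all(1))

The rest of the recipe (adding np.arange(len(m)) and np.squeeze) stays the same.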
Approach #2 : Another method would be to get the sliding windows and then collapse each row into a scalar view, for both the data being searched and the pattern being searched for, giving us 1D data to work with, like so -
# view1D is from https://*.com/a/45313353/
def view1D(a, b):  # a, b are arrays
    a = np.ascontiguousarray(a)
    void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
    return a.view(void_dt).ravel(), b.view(void_dt).ravel()

def pattern_index_view1D(all_data, search_data):
    a = strided_app(np.asarray(all_data), L=len(search_data), S=1)
    a0v, b0v = view1D(np.asarray(a), np.asarray(search_data))
    return np.flatnonzero(np.in1d(a0v, b0v))

out = np.squeeze(pattern_index_view1D(l, m)[:,None] + np.arange(len(m)))
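For reference, on the question's second example this should give the same result as Approach #1 (an expectation from reading the code above, not a recorded run):

l = [5,6,7,8,9,10,5,15,20,50,16,18]
m = [10,5,15,20]
out = np.squeeze(pattern_index_view1D(l, m)[:,None] + np.arange(len(m)))
# expected: array([5, 6, 7, 8])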
#2 (5 votes)
The easiest way (using pure Python) would be to iterate over the items and first check only whether the first item matches. This avoids doing sublist comparisons when not needed. Depending on the contents of your l, this could outperform even the NumPy broadcasting solutions:
def func(haystack, needle):  # obviously needs a better name ...
    if not needle:
        return
    # just optimization
    lengthneedle = len(needle)
    firstneedle = needle[0]
    for idx, item in enumerate(haystack):
        if item == firstneedle:
            if haystack[idx:idx+lengthneedle] == needle:
                yield tuple(range(idx, idx+lengthneedle))
>>> list(func(l, m))
[(5, 6, 7, 8)]
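As a quick sanity check with a list that contains the pattern twice (l2 and m2 are made-up inputs; the output shown is what the generator above should yield):

l2 = [10, 5, 15, 20, 0, 10, 5, 15, 20]
m2 = [10, 5, 15, 20]
print(list(func(l2, m2)))
# expected: [(0, 1, 2, 3), (5, 6, 7, 8)]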
In case you're interested in speed, I checked the performance of the approaches (borrowing from my setup here):
import random
import numpy as np

# strided_app is from https://*.com/a/40085052/
def strided_app(a, L, S):  # Window len = L, Stride len/stepsize = S
    nrows = ((a.size - L) // S) + 1
    n = a.strides[0]
    return np.lib.stride_tricks.as_strided(a, shape=(nrows, L), strides=(S*n, n))

def pattern_index_broadcasting(all_data, search_data):
    n = len(search_data)
    all_data = np.asarray(all_data)
    all_data_2D = strided_app(np.asarray(all_data), n, S=1)
    return np.flatnonzero((all_data_2D == search_data).all(1))

# view1D is from https://*.com/a/45313353/
def view1D(a, b):  # a, b are arrays
    a = np.ascontiguousarray(a)
    void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
    return a.view(void_dt).ravel(), b.view(void_dt).ravel()

def pattern_index_view1D(all_data, search_data):
    a = strided_app(np.asarray(all_data), L=len(search_data), S=1)
    a0v, b0v = view1D(np.asarray(a), np.asarray(search_data))
    return np.flatnonzero(np.in1d(a0v, b0v))

def find_sublist_indices(haystack, needle):
    if not needle:
        return
    # just optimization
    lengthneedle = len(needle)
    firstneedle = needle[0]
    restneedle = needle[1:]
    for idx, item in enumerate(haystack):
        if item == firstneedle:
            if haystack[idx+1:idx+lengthneedle] == restneedle:
                yield tuple(range(idx, idx+lengthneedle))

def Divakar1(l, m):
    return np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))

def Divakar2(l, m):
    return np.squeeze(pattern_index_view1D(l, m)[:,None] + np.arange(len(m)))

def MSeifert(l, m):
    return list(find_sublist_indices(l, m))

# Timing setup
timings = {Divakar1: [], Divakar2: [], MSeifert: []}
sizes = [2**i for i in range(5, 20, 2)]

# Timing
for size in sizes:
    l = [random.randint(0, 50) for _ in range(size)]
    m = [random.randint(0, 50) for _ in range(10)]
    larr = np.asarray(l)
    marr = np.asarray(m)
    for func in timings:
        # first timings:
        # res = %timeit -o func(l, m)
        # second timings:
        if func is MSeifert:
            res = %timeit -o func(l, m)
        else:
            res = %timeit -o func(larr, marr)
        timings[func].append(res)

%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np

fig = plt.figure(1)
ax = plt.subplot(111)
for func in timings:
    ax.plot(sizes,
            [time.best for time in timings[func]],
            label=str(func.__name__))
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('size')
ax.set_ylabel('time [seconds]')
ax.grid(which='both')
ax.legend()
plt.tight_layout()
In case your l and m are lists, my function outperforms the NumPy solutions for all sizes:
But in case you have these as NumPy arrays, you'll get faster results for large arrays (size > 1000 elements) when using Divakar's NumPy solutions: