collections模块

一、namedtuple--factory function for creating tuple subclasses with named fields

　　创建类似于元组的数据类型，除了能够用索引来访问数据、能够迭代之外，还能够方便地通过属性名来访问数据。

　　示例：

from collections import namedtuple

Friend=namedtuple("Friend",['name','age','email']) # 相当于定义了一个类，类中有三个属性

f1=Friend('xiaowang',33,'xiaowang@163.com') # 创建对象

print(f1)

print(f1.age)

print(f1.email)

f2=Friend(name='xiaozhang',email='xiaozhang@sina.com',age=30)

print(f2)

name,age,email=f2 # 元组的解包

print(name,age,email)

　　但是，它又保持着元组元素的不可变性！

f1.age = 25

Traceback (most recent call last):

 File "<stdin>", line 1, in <module>

AttributeError: can't set attribute

　　对namedtuple对象进行处理：

from collections import namedtuple

# Declare a tuple subclass with three named fields.
Friend = namedtuple("Friend", ["name", "age", "email"])

f1 = Friend("xiaowang", 33, "xiaowang@163.com")  # positional construction
f2 = Friend(name="xiaozhang", email="xiaozhang@sina.com", age=30)  # keyword construction
name, age, email = f2  # tuple unpacking

# _asdict() maps field names to values (an OrderedDict before Python 3.8,
# a regular dict from 3.8 on).
print(f1._asdict())
print(dict(f1._asdict()))  # always a plain dict

# _make() builds an instance from an iterable. Values are matched to fields
# BY POSITION, so they must be supplied in (name, age, email) order.
f3 = Friend._make(["alex", 40, "222@222.com"])
print(f3)  # Friend(name='alex', age=40, email='222@222.com')

　　如果想要修改namedtuple对象的某个属性值（_replace会返回一个新的对象，原对象保持不变）：

from collections import namedtuple

Friend = namedtuple("Friend", ['name', 'age', 'email'])  # 相当于定义了一个类，类中有三个属性

f1 = Friend('xiaowang', 33, 'xiaowang@163.com')  # 创建对象

f2 = Friend(name='xiaozhang', email='xiaozhang@sina.com', age=30)

print(f1._replace(name='shangsan'))

def namedtuple(typename, field_names, *, verbose=False, rename=False, module=None):
    """Build and return a new tuple subclass whose items can be read by name.

    Args:
        typename: Name given to the generated class.
        field_names: Either a sequence of names, or one string in which the
            names are separated by whitespace and/or commas.
        verbose: When true, print the generated class source.
        rename: When true, replace every unusable field name (not an
            identifier, a keyword, starting with an underscore, or a
            duplicate) by the positional name ``_<index>``.
        module: Value stored in the class's ``__module__``; when None it is
            looked up in the caller's globals.

    Raises:
        TypeError: A type or field name is not a plain ``str``.
        ValueError: A name is not a valid identifier, is a keyword, starts
            with an underscore (without ``rename``) or is duplicated.
    """
    # Normalise the inputs: accept "a b c" / "a, b, c" as well as sequences.
    if isinstance(field_names, str):
        field_names = field_names.replace(',', ' ').split()
    field_names = [str(field) for field in field_names]
    typename = str(typename)

    if rename:
        taken = set()
        for position, field in enumerate(field_names):
            unusable = (not field.isidentifier()
                        or _iskeyword(field)
                        or field.startswith('_')
                        or field in taken)
            if unusable:
                field_names[position] = '_%d' % position
            # The ORIGINAL name is recorded, so later repeats are renamed too.
            taken.add(field)

    # Every name (class name included) must be a non-keyword identifier.
    for candidate in [typename, *field_names]:
        if type(candidate) is not str:
            raise TypeError('Type names and field names must be strings')
        if not candidate.isidentifier():
            raise ValueError('Type names and field names must be valid '
                             'identifiers: %r' % candidate)
        if _iskeyword(candidate):
            raise ValueError('Type names and field names cannot be a '
                             'keyword: %r' % candidate)

    # Field-only rules: no leading underscore (unless renaming), no repeats.
    taken = set()
    for field in field_names:
        if not rename and field.startswith('_'):
            raise ValueError('Field names cannot start with an underscore: '
                             '%r' % field)
        if field in taken:
            raise ValueError('Encountered duplicate field name: %r' % field)
        taken.add(field)

    # Fill in the class template.
    names = tuple(field_names)
    repr_parts = [_repr_template.format(name=field) for field in field_names]
    def_parts = [_field_template.format(index=position, name=field)
                 for position, field in enumerate(field_names)]
    class_definition = _class_template.format(
        typename=typename,
        field_names=names,
        num_fields=len(field_names),
        arg_list=repr(names).replace("'", "")[1:-1],
        repr_fmt=', '.join(repr_parts),
        field_defs='\n'.join(def_parts),
    )

    # Execute the generated source in a scratch namespace and pull the class out.
    namespace = {'__name__': 'namedtuple_%s' % typename}
    exec(class_definition, namespace)
    result = namespace[typename]
    result._source = class_definition
    if verbose:
        print(result._source)

    # Default __module__ to the caller's module. This frame lookup must stay
    # directly in this function: depth 1 means "whoever called namedtuple".
    if module is None:
        try:
            module = _sys._getframe(1).f_globals.get('__name__', '__main__')
        except (AttributeError, ValueError):
            pass
    if module is not None:
        result.__module__ = module

    return result

namedtuple源码

二、deque-list-like container with fast appends and pops on either end

　　deque 是 double-ended queue的缩写，类似于 list，不过提供了在两端插入和删除的操作。　

from collections import deque


def process(item):
    """Stand-in for real work: report the item being handled and return it."""
    print("processing:", item)
    return item


queue = deque()

# Enqueue on the left ...
queue.appendleft("first")
queue.appendleft("second")
queue.appendleft("third")

# ... and dequeue on the right, so the oldest item comes out first (FIFO).
process(queue.pop())  # handles "first"

# New items can be added while processing is under way.
queue.appendleft("fourth")

# What does the queue look like now?
print(queue)  # deque(['fourth', 'third', 'second'])

　　1.append/appendleft/extend/extendleft

import collections

d1 = collections.deque()
d1.extend('abcdefg')  # append every item of the iterable on the right
print('extend:', d1)
d1.append('h')  # append a single item on the right
print('append:', d1)

d2 = collections.deque()
# Items are pushed on the left one at a time, so they end up in reverse order.
d2.extendleft(range(6))
print('extendleft', d2)
d2.appendleft(6)  # push a single item on the left
print('appendleft', d2)

　　2.pop/popleft

import collections

print("From the right:")
d = collections.deque('abcdefg')
while True:
    try:
        print(d.pop(), end=' ')
    except IndexError:
        # pop() on an empty deque raises IndexError: nothing left to take.
        break
print()

print("\nFrom the left:")
d = collections.deque(range(6))
while True:
    try:
        print(d.popleft(), end=' ')
    except IndexError:
        # popleft() on an empty deque raises IndexError as well.
        break
print()

　　由于双端队列是线程安全的，可以在不同的线程中同时从两端利用队列的内容。

import collections
import threading
import time

candle = collections.deque(range(5))


def burn(direction, nextSource):
    """Consume items from one end of the shared deque until it is empty.

    Args:
        direction: Label printed in front of every consumed item.
        nextSource: Zero-argument callable returning the next item; it must
            raise IndexError once nothing is left (``candle.pop`` and
            ``candle.popleft`` both do).
    """
    while True:
        try:
            item = nextSource()
        except IndexError:
            break
        else:
            print('%8s: %s' % (direction, item))
            time.sleep(0.1)  # give the other thread a chance to run
    print('%8s done' % direction)


# One thread eats the deque from the left, the other from the right.
left = threading.Thread(target=burn, args=('Left', candle.popleft))
right = threading.Thread(target=burn, args=('Right', candle.pop))

left.start()
right.start()

left.join()
right.join()

　　3.rotate

import collections

d = collections.deque(range(10))
print('Normal:', d)

d = collections.deque(range(10))
d.rotate(2)  # positive n: the last n items move to the front
print('Right rotation:', d)

d = collections.deque(range(10))
d.rotate(-2)  # negative n: the first n items move to the back
print('Left rotation:', d)

三、Counter-dict subclass for counting hashable objects

　　Counter类的目的是用来跟踪值出现的次数。它是一个无序的容器类型，以字典的键值对形式存储，其中元素作为key，其计数作为value。

　　1.创建

from collections import Counter

c1 = Counter() # 创建一个空的Counter

c2 = Counter('glad to see you')

c3 = Counter([1,1,1,1,2,3,4,1,2,3])

c4 = Counter({'a':4,"b":2,})

c5 = Counter(a=4,b=2)

　　2.计数值的访问与缺失的键

　　当所访问的键不存在时，返回0，而不是KeyError；否则返回它的计数。

from collections import Counter

c = Counter('glad to see you')

print(c['t']) # 1 -- 't' occurs once in the string

print(c['h']) # 0 -- a missing key yields 0 instead of raising KeyError

　　3.计数器的更新(update和subtract)

　　可以使用一个iterable对象或者另一个Counter对象来更新键值。

　　计数器的更新包括增加和减少两种：增加(update)，减少(subtract)

　　增加操作：

from collections import Counter

c = Counter('glad to see you')

d = Counter('me too')

c.update(d)

c.update([1,2,3,4]) # update里面可以是任何可迭代对象

print(c)

　　减少操作：

from collections import Counter

c = Counter('glad to see you')

d = Counter('me too')

c.subtract(d)

c.subtract([1,2,3,4])

print(c)

#############

Counter({' ': 2, 'g': 1, 'l': 1, 'a': 1, 'd': 1, 's': 1, 'e': 1, 'y': 1, 'u': 1, 't': 0, 'o': 0, 'm': -1, 1: -1, 2: -1, 3: -1, 4: -1})

　　4.键的删除

　　当计数值为0，并不代表元素被删除，应该使用del来删除数据。

from collections import Counter

c = Counter('glad to see you')

del c['g']

print(c)

　　5.elements()

　　返回一个迭代器。元素被重复了多少次，在该迭代器中就包含多少个该元素。元素按照首次出现的顺序返回（并不会按字母序排序，如需排序请使用sorted()），个数小于1的元素不被包含。

from collections import Counter

c = Counter('gladtoseeyou')

print(list(c)) # 取出Counter对象中的键值

print(list(c.elements()))  # 按照个数取出键值

print(list(sorted(c.elements())))  # 取出键值后排序

　　6.most_common(N)

　　返回一个TopN列表。如果n没有被指定，则返回所有元素。当多个元素计数值相同时，按照它们首次出现的顺序排列。

from collections import Counter

c = Counter('gladtoseeyou')

print(c.most_common(3)) # the 3 most frequent elements, highest count first

    def most_common(self, n=None):
        '''Return (element, count) pairs ordered from most to least common.

        With n given, only the n highest-count pairs are returned; with
        n left as None every pair is returned.

        >>> Counter('abcdeabcdabcaba').most_common(3)
        [('a', 5), ('b', 4), ('c', 3)]

        '''
        by_count = _itemgetter(1)  # sort key: the count of each pair
        pairs = self.items()
        if n is not None:
            # Only the top n are wanted, so select them with the heap helper.
            return _heapq.nlargest(n, pairs, key=by_count)
        return sorted(pairs, key=by_count, reverse=True)

内部实现：堆排序

　　7.算数和集合操作

　　　+、-、&、|操作也可以用于Counter。其中&和|操作分别返回两个Counter对象各元素的最小值和最大值。

　　需要注意的是，得到的Counter对象将删除小于1的元素。

from collections import Counter

c = Counter(a=1, b=2, c=3)

d = Counter(a=4, b=5)

print(c + d)

print(d - c)

print(c - d)  # note: elements whose resulting count is not positive are dropped

print(c & d)  # per-element minimum: min(c[x], d[x])

print(c | d)  # per-element maximum: max(c[x], d[x])

    def __add__(self, other):
        '''Return a new Counter holding the element-wise sum of both operands.

        Only strictly positive totals are kept in the result.

        >>> Counter('abbb') + Counter('bcc')
        Counter({'b': 4, 'c': 2, 'a': 1})

        '''
        if not isinstance(other, Counter):
            return NotImplemented
        total = Counter()
        # Elements of self: add whatever the other side has for them.
        for key, mine in self.items():
            combined = mine + other[key]
            if combined > 0:
                total[key] = combined
        # Elements found only in other: copy the positive ones.
        for key, theirs in other.items():
            if key in self:
                continue
            if theirs > 0:
                total[key] = theirs
        return total

    def __sub__(self, other):
        '''Return a new Counter with other's counts subtracted from self's.

        Only strictly positive results are kept.

        >>> Counter('abbbc') - Counter('bccd')
        Counter({'b': 2, 'a': 1})

        '''
        if not isinstance(other, Counter):
            return NotImplemented
        remaining = Counter()
        # Elements of self: take away what the other side holds.
        for key, mine in self.items():
            difference = mine - other[key]
            if difference > 0:
                remaining[key] = difference
        # Elements only in other: subtracting a negative count gives a
        # positive result, so those are kept with the sign flipped.
        for key, theirs in other.items():
            if key in self:
                continue
            if theirs < 0:
                remaining[key] = 0 - theirs
        return remaining

    def __or__(self, other):
        '''Return the union: for each element, the larger of the two counts.

        Only strictly positive counts are kept in the result.

        >>> Counter('abbb') | Counter('bcc')
        Counter({'b': 3, 'c': 2, 'a': 1})

        '''
        if not isinstance(other, Counter):
            return NotImplemented
        union = Counter()
        for key, mine in self.items():
            theirs = other[key]
            # Pick the bigger count; on a tie self's count is used.
            if mine < theirs:
                larger = theirs
            else:
                larger = mine
            if larger > 0:
                union[key] = larger
        # Elements found only in other contribute their own positive count.
        for key, theirs in other.items():
            if key in self:
                continue
            if theirs > 0:
                union[key] = theirs
        return union

    def __and__(self, other):
        '''Return the intersection: for each element, the smaller count.

        Only strictly positive counts are kept in the result.

        >>> Counter('abbb') & Counter('bcc')
        Counter({'b': 1})

        '''
        if not isinstance(other, Counter):
            return NotImplemented
        common = Counter()
        for key, mine in self.items():
            theirs = other[key]
            # Pick the smaller count; on a tie other's count is used.
            if mine < theirs:
                smaller = mine
            else:
                smaller = theirs
            if smaller > 0:
                common[key] = smaller
        return common

实现源码

　　8.常用操作

from collections import Counter

c = Counter("fasdfladfsdasdfsadfadsfwefadscvcxhaegdflffadsfwefnlsdfwefojcc")

sum(c.values())  # total of all counts
list(c)  # the distinct elements as a list
set(c)  # the distinct elements as a set
dict(c)  # the element -> count pairs as a regular dict
list_of_pairs = c.items()  # view of (elem, cnt) pairs
Counter(dict(list_of_pairs))  # build a Counter back from (elem, cnt) pairs

# The n least common elements are most_common()[:-n-1:-1]; for n = 5 the
# stop index is therefore -6 (a stop of -5 would return only 4 elements).
least_common = c.most_common()[:-6:-1]

c += Counter()  # drop elements whose count is zero or negative

# clear() empties the counter (the object itself stays usable), so it comes
# last: every idiom above needs the counts to still be there.
c.clear()

四、OrderedDict-dict subclass that remembers the order entries were added

　　OrderedDict类型是一个有序的字典,它其实就是比普通字典多了一个顺序。

import collections

dic = collections.OrderedDict()

dic["k1"] = "v1"

dic["k2"] = "v2"

dic["k3"] = "v3"

print(dic)

#实现原理：相当于用列表（有序）来维护字典（无序）排序,以下仅供理解

# dic = {"k1":"v1","k2":"v2"}

# li = ["k1","k2"]

# for i in li:

#     print(dic.get(i))

执行结果：无论执行多少次结果一样

OrderedDict([('k1', 'v1'), ('k2', 'v2'), ('k3', 'v3')])

#有序删除和指定删除

import collections

dic = collections.OrderedDict()

dic["k1"] = "v1"

dic["k2"] = "v2"

dic["k3"] = "v3"

print(dic)

dic.popitem()    #有序拿掉，每次拿掉最后一个，相当于内存的栈存放，后进先出原则，而pop()就是强制拿出指定的值

print(dic)

执行结果：

OrderedDict([('k1', 'v1'), ('k2', 'v2'), ('k3', 'v3')])

OrderedDict([('k1', 'v1'), ('k2', 'v2')])

#把指定键值移到最后

import collections

dic = collections.OrderedDict()

dic["k1"] = "v1"

dic["k2"] = "v2"

dic["k3"] = "v3"

print(dic)

dic.move_to_end("k1")    #把指定键值移到最后

print(dic)

#执行结果：

OrderedDict([('k1', 'v1'), ('k2', 'v2'), ('k3', 'v3')])

OrderedDict([('k2', 'v2'), ('k3', 'v3'), ('k1', 'v1')])

#添加默认键

import collections

dic = collections.OrderedDict()

dic["k1"] = "v1"

dic["k2"] = "v2"

dic["k3"] = "v3"

print(dic)

dic.setdefault("k4","v4")    #默认键值为None,不过可以添加值

print(dic)

#执行结果：

OrderedDict([('k1', 'v1'), ('k2', 'v2'), ('k3', 'v3')])

OrderedDict([('k1', 'v1'), ('k2', 'v2'), ('k3', 'v3'), ('k4', 'v4')])

class OrderedDict(dict):
    'Dictionary that remembers insertion order'
    # NOTE: this is an excerpt. It uses names that are not defined in the
    # visible code (_Link, _proxy, _OrderedDictValuesView, MutableMapping)
    # and methods that are not shown here (self.__update, item deletion).

    # An inherited dict maps keys to values.
    # The inherited dict provides __getitem__, __len__, __contains__, and get.
    # The remaining methods are order-aware.
    # Big-O running times for all methods are the same as regular dictionaries.

    # The internal self.__map dict maps keys to links in a doubly linked list.
    # The circular doubly linked list starts and ends with a sentinel element.
    # The sentinel element never gets deleted (this simplifies the algorithm).
    # The sentinel is in self.__hardroot with a weakref proxy in self.__root.
    # The prev links are weakref proxies (to prevent circular references).
    # Individual links are kept alive by the hard reference in self.__map.
    # Those hard references disappear when a key is deleted from an OrderedDict.

    def __init__(*args, **kwds):
        '''Initialize an ordered dictionary.  The signature is the same as
        regular dictionaries, but keyword arguments are not recommended because
        their insertion order is arbitrary.

        '''
        # self is taken out of *args by hand instead of being a named
        # parameter -- presumably so that a keyword argument called "self"
        # can still end up in kwds (TODO confirm).
        if not args:
            raise TypeError("descriptor '__init__' of 'OrderedDict' object "
                            "needs an argument")
        self, *args = args
        if len(args) > 1:
            raise TypeError('expected at most 1 arguments, got %d' % len(args))
        try:
            # Probe for an existing linked list; it is only built when the
            # attribute is missing, so calling __init__ again keeps it.
            self.__root
        except AttributeError:
            self.__hardroot = _Link()
            self.__root = root = _proxy(self.__hardroot)
            # An empty list is the sentinel pointing at itself both ways.
            root.prev = root.next = root
            self.__map = {}
        self.__update(*args, **kwds)

    def clear(self):
        'od.clear() -> None.  Remove all items from od.'
        root = self.__root
        # Re-close the circle on the sentinel, then drop links and items.
        root.prev = root.next = root
        self.__map.clear()
        dict.clear(self)

    def popitem(self, last=True):
        '''od.popitem() -> (k, v), return and remove a (key, value) pair.
        Pairs are returned in LIFO order if last is true or FIFO order if false.

        '''
        if not self:
            raise KeyError('dictionary is empty')
        root = self.__root
        if last:
            # Unlink the node just before the sentinel (the newest entry).
            link = root.prev
            link_prev = link.prev
            link_prev.next = root
            root.prev = link_prev
        else:
            # Unlink the node just after the sentinel (the oldest entry).
            link = root.next
            link_next = link.next
            root.next = link_next
            link_next.prev = root
        key = link.key
        del self.__map[key]
        value = dict.pop(self, key)
        return key, value

    def move_to_end(self, key, last=True):
        '''Move an existing element to the end (or beginning if last==False).

        Raises KeyError if the element does not exist.
        When last=True, acts like a fast version of self[key]=self.pop(key).

        '''
        link = self.__map[key]
        link_prev = link.prev
        link_next = link.next
        # link_next.prev is the reference to `link` that the list already
        # holds as a prev pointer (a weakref proxy, per the class notes);
        # it is reused below wherever `link` becomes somebody's prev.
        soft_link = link_next.prev
        # Cut the link out of its current position.
        link_prev.next = link_next
        link_next.prev = link_prev
        root = self.__root
        if last:
            # Splice it in between the current last node and the sentinel.
            # (`last` is rebound from the boolean flag to that node.)
            last = root.prev
            link.prev = last
            link.next = root
            root.prev = soft_link
            last.next = link
        else:
            # Splice it in between the sentinel and the current first node.
            first = root.next
            link.prev = root
            link.next = first
            first.prev = soft_link
            root.next = link

    def values(self):
        "D.values() -> an object providing a view on D's values"
        return _OrderedDictValuesView(self)

    __ne__ = MutableMapping.__ne__

    # Unique sentinel: lets pop() tell "no default given" apart from any
    # real default value, None included.
    __marker = object()

    def pop(self, key, default=__marker):
        '''od.pop(k[,d]) -> v, remove specified key and return the corresponding
        value.  If key is not found, d is returned if given, otherwise KeyError
        is raised.

        '''
        if key in self:
            result = self[key]
            del self[key]
            return result
        if default is self.__marker:
            raise KeyError(key)
        return default

    def setdefault(self, key, default=None):
        'od.setdefault(k[,d]) -> od.get(k,d), also set od[k]=d if k not in od'
        if key in self:
            return self[key]
        self[key] = default
        return default

    def copy(self):
        'od.copy() -> a shallow copy of od'
        return self.__class__(self)

源码

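　　注： OD 的有序实际上是由一个双向循环链表实现的。在上面给出的 Python 3 源码中，链表节点是 _Link 对象，通过 prev、next、key 三个属性相互连接。在较早的 Python 2.7 实现中，节点则是一个形如 [PREV, NEXT, KEY] 的 list：由于 list 是可变对象，节点里的 PREV 和 NEXT 就是对前驱和后继节点 list 的引用。其中，last = root[0]; last[1] = root[0] = self.__map[key] = [last, root, key]，实现了在 root 前插入节点。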

五、defaultdict-dict subclass that calls a factory function to supply missing values

　　空字典在没有进行初始化之前，是不能通过键来取值的，如果你试图强行取值，会报出异常。而默认字典则是为了解决这个问题。

　　下面看一个例子：

frequencies = {}

for word in wordlist:

    frequencies[word] += 1

　　python会抛出一个KeyError 异常，因为字典索引之前必须初始化，可以用下面的方法解决：

from collections import defaultdict

frequencies = defaultdict(int) #传入int()函数来初始化

for word in wordlist:

    frequencies[word] += 1

　　collections.defaultdict可以接受一个函数作为参数来初始化。

相关文章