tensorflow 在加载大型的embedding模型参数时，会遇到cannot be larger than 2GB

出现这个问题的原因是：TensorFlow 中的每一个变量（variable）都基于 protobuf 进行序列化，而 protobuf 存在 2GB 的大小限制。这个时候，我们需要将 embedding 拆开，拆分成 N 等份，使得每一个

variable 都在 2GB 以下。

 #!/usr/bin/env python

 # coding=utf-8

 import tensorflow as tf

 import numpy as np

 # Demo: split one embedding matrix into several equally sized variables so
 # that no single variable hits the 2GB protobuf limit, then look ids up
 # across all shards with tf.nn.embedding_lookup.

 # Batch of token ids to look up; both dimensions are left dynamic.
 input_ids = tf.placeholder(dtype=tf.int32, shape=[None, None])

 num_shards = 3
 weights = []

 # NOTE: despite its name, this is the full embedding matrix itself
 # (9 rows x 3 columns, values 0..26), not a shape.
 weights_shape = np.arange(27).reshape(9, 3)

 # The equal-size contiguous slicing below only matches the "div" partition
 # layout when the rows divide evenly across the shards, so fail loudly
 # otherwise (an assert would be stripped under `python -O`).
 if weights_shape.shape[0] % num_shards != 0:
     raise ValueError("number of embedding rows must be divisible by num_shards")

 # Floor division keeps the shard length an int on Python 3; true division
 # would give a float, and float slice bounds raise TypeError in NumPy.
 num_shards_len = weights_shape.shape[0] // num_shards

 for i in range(num_shards):
     # Shard i owns the contiguous row range [begin_, ends_).
     begin_ = i * num_shards_len
     ends_ = begin_ + num_shards_len
     weights_i = tf.get_variable(
         "words-%02d" % i,
         initializer=tf.constant(weights_shape[begin_:ends_]))
     weights.append(weights_i)

 # "div" assigns contiguous id ranges to consecutive shards, which is how the
 # matrix was sliced above, so id k resolves to row k of the original matrix.
 input_embedding = tf.nn.embedding_lookup(weights, input_ids, partition_strategy="div")

 # The context manager closes the session (and frees its resources) on exit.
 with tf.Session() as sess:
     sess.run(tf.global_variables_initializer())
     print(sess.run(weights))
     print(sess.run(input_embedding, feed_dict={input_ids: [[1, 2], [3, 0], [8, 2], [5, 1]]}))

结果为:

[array([[0, 1, 2],

       [3, 4, 5],

       [6, 7, 8]]), array([[ 9, 10, 11],

       [12, 13, 14],

       [15, 16, 17]]), array([[18, 19, 20],

       [21, 22, 23],

       [24, 25, 26]])]

[[[ 3  4  5]

  [ 6  7  8]]

 [[ 9 10 11]

  [ 0  1  2]]

 [[24 25 26]

  [ 6  7  8]]

 [[15 16 17]

  [ 3  4  5]]]

秒客网

tensorflow 在加载大型的embedding模型参数时，会遇到cannot be larger than 2GB

相关文章