1、Replicator运行代码具体分析

上篇文章中介绍了启动Replicator的具体过程，下面讲解Replicator执行代码的具体实现，首先看replicate方法：

def replicate(self, override_devices=None, override_partitions=None):
        """Run a replication pass.

        Resets the per-pass counters, collects the jobs for this pass and
        hands each one to a green pool as either ``update`` or
        ``update_deleted``, then waits for the pool to drain.

        :param override_devices: optional list of device names; when it is
                                 non-empty, jobs on other devices are skipped
        :param override_partitions: optional list of partitions; when it is
                                    non-empty, jobs whose ``partition`` value
                                    is not in it are skipped
        """
        self.start = time.time()
        self.suffix_count = 0
        self.suffix_sync = 0
        self.suffix_hash = 0
        self.replication_count = 0
        self.last_replication_count = -1
        self.partition_times = []
        if override_devices is None:
            override_devices = []
        if override_partitions is None:
            override_partitions = []
        # Two helper greenthreads run alongside the pass; both are killed in
        # the ``finally`` clause. (heartbeat / detect_lockups are not shown
        # here -- presumably periodic stats output and stall detection.)
        stats = eventlet.spawn(self.heartbeat)
        lockup_detector = eventlet.spawn(self.detect_lockups)
        eventlet.sleep()  # Give spawns a cycle
        try:
            # The pool size caps how many jobs run concurrently.
            self.run_pool = GreenPool(size=self.concurrency)
            # Returns a list of jobs (dictionaries) that specify the
            # partitions, nodes, etc to be synced.
            jobs = self.collect_jobs()
            for job in jobs:
                # Honour the device filter, if one was given.
                if override_devices and job['device'] not in override_devices:
                    continue
                # Honour the partition filter, if one was given.
                if (override_partitions and
                        job['partition'] not in override_partitions):
                    continue
                dev_path = join(self.devices_dir, job['device'])
                if self.mount_check and not ismount(dev_path):
                    self.logger.warn(_('%s is not mounted'), job['device'])
                    continue
                # Abort the whole pass as soon as check_ring() reports a
                # ring change; ``finally`` still runs on this return.
                if not self.check_ring():
                    self.logger.info(_("Ring change detected. Aborting "
                                       "current replication pass."))
                    return
                if job['delete']:
                    # The partition no longer belongs on this device.
                    self.run_pool.spawn(self.update_deleted, job)
                else:
                    # Regular case: sync the partition with its peers.
                    self.run_pool.spawn(self.update, job)
            with Timeout(self.lockup_timeout):
                self.run_pool.waitall()
        except (Exception, Timeout):
            self.logger.exception(_("Exception in top-level replication loop"))
            self.kill_coros()
        finally:
            stats.kill()
            lockup_detector.kill()
            self.stats_line()

在replicate方法中，首先是为执行replication所做的准备工作，其中最重要的是收集待执行job的collect_jobs方法，下面是其代码的具体实现：

def collect_jobs(self):
        """
        Returns a list of jobs (dictionaries) that specify the
        partitions, nodes, etc to be synced.

        Each job has the keys ``path``, ``device``, ``nodes``, ``delete`` and
        ``partition``. The list is shuffled; when ``self.handoffs_first`` is
        set, jobs with ``delete`` set are moved to the front. The number of
        jobs is also stored in ``self.job_count``.
        """
        jobs = []
        ips = whataremyips()
        # Local devices are the ring entries whose replication_ip is one of
        # this host's addresses and whose replication_port equals self.port.
        for local_dev in [dev for dev in self.object_ring.devs
                          if dev and dev['replication_ip'] in ips and
                          dev['replication_port'] == self.port]:
            dev_path = join(self.devices_dir, local_dev['device'])
            obj_path = join(dev_path, 'objects')
            tmp_path = join(dev_path, 'tmp')
            if self.mount_check and not ismount(dev_path):
                self.logger.warn(_('%s is not mounted'), local_dev['device'])
                continue
            # Remove any file under the device's tmp directory that was last
            # modified more than reclaim_age seconds ago.
            unlink_older_than(tmp_path, time.time() - self.reclaim_age)
            if not os.path.exists(obj_path):
                try:
                    mkdirs(obj_path)
                except Exception:
                    self.logger.exception('ERROR creating %s' % obj_path)
                # Whether or not the directory could be created, there is
                # nothing to list on this device in this pass.
                continue
            # Every entry under objects/ is expected to be a partition
            # directory named after the partition number.
            for partition in os.listdir(obj_path):
                try:
                    job_path = join(obj_path, partition)
                    if isfile(job_path):
                        # Clean up any (probably zero-byte) files where a
                        # partition should be.
                        self.logger.warning('Removing partition directory '
                                            'which was a file: %s', job_path)
                        os.remove(job_path)
                        continue
                    # Devices the ring assigns to this partition.
                    part_nodes = self.object_ring.get_part_nodes(
                        int(partition))
                    # Peers: the assigned devices other than the local one.
                    nodes = [node for node in part_nodes
                             if node['id'] != local_dev['id']]
                    jobs.append(
                        dict(path=job_path,
                             device=local_dev['device'],
                             nodes=nodes,
                             # If the local device is not among part_nodes,
                             # nothing was filtered out above and nodes is
                             # longer than len(part_nodes) - 1: the partition
                             # no longer belongs here.
                             delete=len(nodes) > len(part_nodes) - 1,
                             partition=partition))
                except (ValueError, OSError):
                    # Non-numeric directory names (int() fails) and
                    # filesystem errors just skip the entry.
                    continue
        # Randomise the processing order.
        random.shuffle(jobs)
        if self.handoffs_first:
            # Move the handoff parts to the front of the list
            jobs.sort(key=lambda job: not job['delete'])
        self.job_count = len(jobs)
        return jobs

对于第二层for循环，os.listdir(obj_path)列出objects目录下的全部partition。创建object时，会先在objects目录下创建以该object所映射的分区号命名的文件夹，再在partition目录下创建以object的hash值后三位为名称的后缀目录，然后在后缀目录下创建以object的hash值为目录名的目录，object最终存储为以上传时间戳为文件名、以.data为后缀的文件。通过理解一致性hash算法可知，增加虚拟节点后每个设备会有多个虚拟节点与其对应；假设一个设备对应的分区数为n，则obj_path下的子目录数目会<=n，因为已存入的文件并不一定能映射到当前设备所对应的每一个分区。for循环首先判断obj_path下的条目是否为文件，若是文件则删除；若不是则取得该分区号，根据分区号获得该分区所映射的备份设备（副本数为3时即三个），并将设备id与本地设备id不相等的设备加入nodes，再将nodes、path等信息加入jobs。最后打乱jobs的顺序，如果配置了handoffs_first，再将handoff分区的job移到队列前边，然后返回jobs。
再回到replicate方法，首先我们看job['delete']为False的情况。当job['delete']为False时会执行update方法，下面看update方法的具体实现：

def update(self, job):
        """
        High-level method that replicates a single partition.

        Compares the local suffix hashes of the partition with those
        reported by each peer node and pushes the suffixes that differ.
        Errors are logged rather than raised; the elapsed time is always
        recorded in ``self.partition_times``.

        :param job: a dict containing info about the partition to be replicated
        """
        self.replication_count += 1
        self.logger.increment('partition.update.count.%s' % (job['device'],))
        begin = time.time()
        try:
            # get_hashes is run via tpool_reraise on the partition directory
            # (job['path']). Per its docstring it returns the number of
            # suffix dirs hashed and the dictionary of suffix hashes.
            # do_listdir is switched on for every 10th job of the pass.
            hashed, local_hash = tpool_reraise(
                get_hashes, job['path'],
                do_listdir=(self.replication_count % 10) == 0,
                reclaim_age=self.reclaim_age)
            self.suffix_hash += hashed
            self.logger.update_stats('suffix.hashes', hashed)
            # One attempt per peer in job['nodes'] (which excludes the local
            # device), e.g. 2 attempts when there are 3 replicas.
            attempts_left = len(job['nodes'])
            # The peers come first, followed by whatever additional nodes
            # the ring yields from get_more_nodes for this partition.
            nodes = itertools.chain(
                job['nodes'],
                self.object_ring.get_more_nodes(int(job['partition'])))
            while attempts_left > 0:
                # If this throws StopIterator it will be caught way below
                node = next(nodes)
                attempts_left -= 1
                try:
                    with Timeout(self.http_timeout):
                        # Ask the remote node for its hashes of this
                        # partition using the REPLICATE verb.
                        resp = http_connect(
                            node['replication_ip'], node['replication_port'],
                            node['device'], job['partition'], 'REPLICATE',
                            '', headers=self.headers).getresponse()
                        if resp.status == HTTP_INSUFFICIENT_STORAGE:
                            self.logger.error(_('%(ip)s/%(device)s responded'
                                                ' as unmounted'), node)
                            # Give the attempt back so that one more node
                            # from the iterator is tried instead.
                            attempts_left += 1
                            continue
                        if resp.status != HTTP_OK:
                            self.logger.error(_("Invalid response %(resp)s "
                                                "from %(ip)s"),
                                              {'resp': resp.status,
                                               'ip': node['replication_ip']})
                            continue
                        # NOTE(review): this unpickles bytes received over
                        # the network; it is only safe if the replication
                        # network is trusted.
                        remote_hash = pickle.loads(resp.read())
                        del resp
                    # Suffixes whose local hash differs from the remote one;
                    # a suffix missing remotely compares against -1 and so
                    # always counts as different.
                    suffixes = [suffix for suffix in local_hash if
                                local_hash[suffix] !=
                                remote_hash.get(suffix, -1)]
                    # Nothing differs: move on to the next node.
                    if not suffixes:
                        continue
                    # Recalculate the differing suffixes locally and compare
                    # again with the refreshed hashes before pushing.
                    hashed, recalc_hash = tpool_reraise(
                        get_hashes,
                        job['path'], recalculate=suffixes,
                        reclaim_age=self.reclaim_age)
                    self.logger.update_stats('suffix.hashes', hashed)
                    local_hash = recalc_hash
                    suffixes = [suffix for suffix in local_hash if
                                local_hash[suffix] !=
                                remote_hash.get(suffix, -1)]
                    # Push model: the local data for the differing suffixes
                    # is sent to the remote node.
                    self.sync(node, job, suffixes)  # sync what changed
                    # Second REPLICATE request, this time naming the pushed
                    # suffixes in the path; the response body is discarded.
                    with Timeout(self.http_timeout):
                        conn = http_connect(
                            node['replication_ip'], node['replication_port'],
                            node['device'], job['partition'], 'REPLICATE',
                            '/' + '-'.join(suffixes),
                            headers=self.headers)
                        conn.getresponse().read()
                    self.suffix_sync += len(suffixes)
                    self.logger.update_stats('suffix.syncs', len(suffixes))
                except (Exception, Timeout):
                    self.logger.exception(_("Error syncing with node: %s") %
                                          node)
            # Number of local suffixes seen; reported in the stats output.
            self.suffix_count += len(local_hash)
        except (Exception, Timeout):
            self.logger.exception(_("Error syncing partition"))
        finally:
            self.partition_times.append(time.time() - begin)
            self.logger.timing_since('partition.update.timing', begin)

update方法中，首先获得本地当前分区对应的hashes.pkl文件里每一个后缀所对应的hash值，形如{'a83': '0db7b416c9808517a1bb2157af20b09b'}，其中key为object的hash值的后三位，value为该后缀目录下全部子目录（即以object的hash值为名字的目录）中全部.data文件的文件名的md5值，可以理解为全部文件名合在一起计算出的md5值。

            hashed, local_hash = tpool_reraise(

                get_hashes, job['path'],

                do_listdir=(self.replication_count % 10) == 0,

                reclaim_age=self.reclaim_age)

如上代码片段会运行get_hashes方法，并将后边參数传递给get_hashes

def get_hashes(partition_dir, recalculate=None, do_listdir=False,
               reclaim_age=ONE_WEEK):
    """
    Get a list of hashes for the suffix dir.  do_listdir causes it to mistrust
    the hash cache for suffix existence at the (unexpectedly high) cost of a
    listdir.  reclaim_age is just passed on to hash_suffix.

    :param partition_dir: absolute path of partition to get hashes for
    :param recalculate: list of suffixes which should be recalculated when
                        got, e.g. ``['a83']``.  A suffix is the name of a
                        directory inside the partition directory; the example
                        listing of ``/srv/1/node/sdb1/objects/94238`` shows
                        the suffix ``310`` next to ``hashes.pkl``.
    :param do_listdir: force existence check for all hashes in the partition
    :param reclaim_age: age at which to remove tombstones
    :returns: tuple of (number of suffix dirs hashed, dictionary of hashes)
    """

因为没有传递recalculate这个参数，所以只有do_listdir为True时才会强制重新计算后缀目录下全部文件名的hash值。文件名是时间戳，时间戳变了说明文件有更新，因此需要和远程同步：检查两端是否为同一个版本，不是同一个版本的需要把本地版本推送给远程server。

attempts_left = len(job['nodes'])

            #此时的nodes为除去本节点外的全部节点 由于 job['nodes]不包括本地节点get_more_nodes(int(job['partition']))能获得除去本partion所相应节点 外的其它全部节点

            nodes = itertools.chain(

                job['nodes'],

                self.object_ring.get_more_nodes(int(job['partition'])))

如上代码片段，attempts_left为当前job对应的分区去掉本地节点后其它备份节点的个数。得到attempts_left后，下面接着构造了nodes，其中get_more_nodes方法会得到除去本分区所对应节点之外的其它全部节点的迭代器，所以nodes是除去本节点外全部节点的一个迭代器。

下边就是while循环，循环attempts_left次，

resp = http_connect(

                            node['replication_ip'], node['replication_port'],

                            node['device'], job['partition'], 'REPLICATE',

                            '', headers=self.headers).getresponse()

根据迭代得到的node发起请求，因为副本节点首先被迭代到，所以首先请求副本节点。如果请求成功，则读取resp返回的内容，得到远程设备同一个partition下的remote_hash：

suffixes = [suffix for suffix in local_hash if

                                local_hash[suffix] !=

                                remote_hash.get(suffix, -1)]

                    #假设没有说明没有变动，则继续请求下一个节点

                    if not suffixes:

                        continue

对比两个设备相同partition下的hashes.pkl，找出本地与远程value不同的后缀（远程没有该key时也算不同）。suffixes为空则说明本地和远程备份是同一个版本，继续请求下一个备份。如果不为空，则需要处理：先重新计算一次本地这些后缀的hash，因为在上一次请求期间，可能已经有其它备份把新的更新推送到本server了。得到本地最新的hashes.pkl内容后再对比一次，得到相同分区下仍然不同的后缀。

运行同步：

self.sync(node, job, suffixes)  #同步变化的

在同步变化时，作者目前使用rsync方法，没有使用ssync，不过已经留出了ssync的实现，等ssync方法稳定后就会把rsync替换掉。（敬请期待）

 def sync(self, node, job, suffixes):  # Just exists for doc anchor point
        """
        Push suffix directories of a local partition to a remote node.

        This is a thin dispatcher: the work is done by whatever callable is
        stored in ``self.sync_method``.

        :param node: the "dev" entry for the remote node to sync with
        :param job: information about the partition being synced
        :param suffixes: a list of suffixes which need to be pushed
        :returns: boolean indicating success or failure
        """
        # sync_method is chosen elsewhere from the 'sync_method' config
        # value, falling back to this class's own rsync method.
        sync_impl = self.sync_method
        return sync_impl(node, job, suffixes)

sync_method通过如下方式获得，没有配置则执行rsync方法：

self.sync_method = getattr(self, conf.get('sync_method') or 'rsync')

def rsync(self, node, job, suffixes):
        """
        Sync method backed by the rsync binary (the first sync method in
        Swift).

        Builds an rsync command line that pushes every existing suffix
        directory of the partition to the remote device and runs it through
        ``self._rsync``.

        :param node: the "dev" entry for the remote node to sync with
        :param job: information about the partition being synced
        :param suffixes: a list of suffixes which need to be pushed
        :returns: True if rsync exited with 0; False if the partition
                  directory or all suffix directories are missing, or if
                  rsync failed
        """
        if not os.path.exists(job['path']):
            return False
        command = ['rsync', '--recursive', '--whole-file', '--human-readable',
                   '--xattrs', '--itemize-changes', '--ignore-existing']
        command.append('--timeout=%s' % self.rsync_io_timeout)
        command.append('--contimeout=%s' % self.rsync_io_timeout)
        command.append('--bwlimit=%s' % self.rsync_bwlimit)
        remote_ip = rsync_ip(node['replication_ip'])
        # In vm_test_mode the replication port is appended to the rsync
        # module name; otherwise the module is plain "object".
        rsync_module = ('%s::object%s' % (remote_ip, node['replication_port'])
                        if self.vm_test_mode else '%s::object' % remote_ip)
        # Only suffix directories that exist locally are passed as sources.
        candidates = (join(job['path'], suffix) for suffix in suffixes)
        sources = [spath for spath in candidates if os.path.exists(spath)]
        if not sources:
            return False
        command.extend(sources)
        # Destination: <module>/<device>/objects/<partition>
        destination = join(rsync_module, node['device'],
                           'objects', job['partition'])
        command.append(destination)
        return self._rsync(command) == 0

rsync方法将接收到的参数都放到args中，然后执行_rsync方法。

    def _rsync(self, args):
        """
        Execute the rsync binary to replicate a partition.

        :param args: the full rsync command line as a list; its last two
                     items are reported as source and destination in the
                     success log line
        :returns: return code of rsync process. 0 is successful; 1 is
                  returned when the run exceeds ``self.rsync_timeout``
        """
        start_time = time.time()
        ret_val = None
        # Bound before the try block so the timeout handler can tell whether
        # the child process was actually started.
        proc = None
        try:
            with Timeout(self.rsync_timeout):
                # This is the actual push: rsync runs as a child process
                # with stderr folded into stdout.
                proc = subprocess.Popen(args,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.STDOUT)
                results = proc.stdout.read()
                ret_val = proc.wait()
        except Timeout:
            self.logger.error(_("Killing long-running rsync: %s"), str(args))
            if proc is not None:
                proc.kill()
                # Reap the killed child so it does not linger as a zombie.
                proc.wait()
            return 1  # failure response code
        total_time = time.time() - start_time
        for result in results.split('\n'):
            if result == '':
                continue
            # Lines starting with 'cd+' are not logged (presumably the
            # itemized entries for created directories -- TODO confirm).
            if result.startswith('cd+'):
                continue
            # Output lines go to info on success and to error on failure.
            if not ret_val:
                self.logger.info(result)
            else:
                self.logger.error(result)
        if ret_val:
            error_line = (_('Bad rsync return code: %(ret)d <- %(args)s') %
                          {'args': str(args), 'ret': ret_val})
            # Optionally truncate the (potentially very long) command line.
            if self.rsync_error_log_line_length:
                error_line = error_line[:self.rsync_error_log_line_length]
            self.logger.error(error_line)
        elif results:
            # rsync printed something, so log the success at info level.
            self.logger.info(
                _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"),
                {'src': args[-2], 'dst': args[-1], 'time': total_time})
        else:
            # No output at all: only worth a debug line.
            self.logger.debug(
                _("Successful rsync of %(src)s at %(dst)s (%(time).03f)"),
                {'src': args[-2], 'dst': args[-1], 'time': total_time})
        return ret_val

其中如下代码片段就是执行具体的推送：

  #此处即为同步操作了，推送模式

                proc = subprocess.Popen(args,

                                        stdout=subprocess.PIPE,

                                        stderr=subprocess.STDOUT)

若job['delete']为True，可能的原因是：因为增删了设备，Ring重新调整（rebalance）后，当前partition的备份设备中不再包含本设备的id。例如partition号为45678的分区在rebalance前对应的备份设备id为[1,2,3]，当前设备id为1；如果rebalance后该partition对应的备份设备变为[4,2,3]，就会出现job['delete']为True的情况。下面看其代码的具体实现：

    def update_deleted(self, job):
        """
        High-level method that replicates a single partition that doesn't
        belong on this node.

        The partition's suffix directories are pushed to every node in
        ``job['nodes']``. The local partition directory is removed when it
        holds no suffix directories, or when enough pushes succeeded (see
        the ``handoff_delete`` handling below). Errors are logged rather
        than raised; the elapsed time is always recorded in
        ``self.partition_times``.

        :param job: a dict containing info about the partition to be replicated
        """

        def tpool_get_suffixes(path):
            # Suffix directories are the 3-character sub-directories of the
            # partition directory.
            return [suff for suff in os.listdir(path)
                    if len(suff) == 3 and isdir(join(path, suff))]

        self.replication_count += 1
        self.logger.increment('partition.delete.count.%s' % (job['device'],))
        begin = time.time()
        try:
            responses = []
            suffixes = tpool.execute(tpool_get_suffixes, job['path'])
            if suffixes:
                for node in job['nodes']:
                    # Push all suffixes to this node.
                    success = self.sync(node, job, suffixes)
                    if success:
                        # After a successful push, send a REPLICATE request
                        # naming the pushed suffixes; the response body is
                        # read and discarded.
                        with Timeout(self.http_timeout):
                            conn = http_connect(
                                node['replication_ip'],
                                node['replication_port'],
                                node['device'], job['partition'], 'REPLICATE',
                                '/' + '-'.join(suffixes), headers=self.headers)
                            conn.getresponse().read()
                    responses.append(success)
            if self.handoff_delete:
                # delete handoff if we have had handoff_delete successes
                delete_handoff = (len([resp for resp in responses if resp]) >=
                                  self.handoff_delete)
            else:
                # delete handoff if all syncs were successful
                delete_handoff = (len(responses) == len(job['nodes']) and
                                  all(responses))
            # Remove the local partition when there was nothing to push or
            # the pushes satisfied the rule above.
            if not suffixes or delete_handoff:
                self.logger.info(_("Removing partition: %s"), job['path'])
                tpool.execute(shutil.rmtree, job['path'], ignore_errors=True)
        except (Exception, Timeout):
            self.logger.exception(_("Error syncing handoff partition"))
        finally:
            self.partition_times.append(time.time() - begin)
            self.logger.timing_since('partition.delete.timing', begin)

至此replicate操作就讲解完了，文中若有理解不当之处，请指正，谢谢！

秒客网

OpenStack_Swift源代码分析——ObjectReplicator源代码分析(2)

1、Replicator运行代码具体分析

相关文章