Let's start with
AbstractFetcher
Conceptually, this is the component inside the consumer that actually pulls data from Kafka; a single fetcher can read from several partitions at the same time. Let's look at the code:
/**
* Base class for all fetchers, which implement the connections to Kafka brokers and
* pull records from Kafka partitions.
*
* <p>This fetcher base class implements the logic around emitting records and tracking offsets,
* as well as around the optional timestamp assignment and watermark generation.
*
* @param <T> The type of elements deserialized from Kafka's byte records, and emitted into
* the Flink data streams.
* @param <KPH> The type of topic/partition identifier used by Kafka in the specific version.
*/
public abstract class AbstractFetcher<T, KPH> {

/** The source context to emit records and watermarks to */
private final SourceContext<T> sourceContext; // the context used to emit records

/** All partitions (and their state) that this fetcher is subscribed to */
private final KafkaTopicPartitionState<KPH>[] allPartitions; // tracks per-partition state, e.g. the current offset

// ------------------------------------------------------------------------

protected AbstractFetcher(
SourceContext<T> sourceContext,
List<KafkaTopicPartition> assignedPartitions,
SerializedValue<AssignerWithPeriodicWatermarks<T>> watermarksPeriodic,
SerializedValue<AssignerWithPunctuatedWatermarks<T>> watermarksPunctuated,
StreamingRuntimeContext runtimeContext) throws Exception
{
// create our partition state according to the timestamp/watermark mode
this.allPartitions = initializePartitions(
assignedPartitions,
timestampWatermarkMode,
watermarksPeriodic, watermarksPunctuated,
runtimeContext.getUserCodeClassLoader());

// if we have periodic watermarks, kick off the interval scheduler
if (timestampWatermarkMode == PERIODIC_WATERMARKS) {
KafkaTopicPartitionStateWithPeriodicWatermarks<?, ?>[] parts =
(KafkaTopicPartitionStateWithPeriodicWatermarks<?, ?>[]) allPartitions;

PeriodicWatermarkEmitter periodicEmitter =
new PeriodicWatermarkEmitter(parts, sourceContext, runtimeContext);
periodicEmitter.start(); // periodically emits watermarks
}
}

// ------------------------------------------------------------------------
//  Core fetcher work methods
// ------------------------------------------------------------------------

public abstract void runFetchLoop() throws Exception; // the core method; each version-specific fetcher must implement it

// ------------------------------------------------------------------------
// Kafka version specifics
// ------------------------------------------------------------------------

/**
* Creates the Kafka version specific representation of the given
* topic partition.
*
* @param partition The Flink representation of the Kafka topic partition.
* @return The specific Kafka representation of the Kafka topic partition.
*/
public abstract KPH createKafkaPartitionHandle(KafkaTopicPartition partition); // creates the KafkaPartitionHandle, i.e. Kafka's own representation of the partition

/**
* Commits the given partition offsets to the Kafka brokers (or to ZooKeeper for
* older Kafka versions).
*
* @param offsets The offsets to commit to Kafka.
* @throws Exception This method forwards exceptions.
*/
public abstract void commitSpecificOffsetsToKafka(Map<KafkaTopicPartition, Long> offsets) throws Exception; // commits the given offsets back to Kafka, e.g. by writing them to ZooKeeper

// ------------------------------------------------------------------------
//  snapshot and restore the state
// ------------------------------------------------------------------------

/**
* Takes a snapshot of the partition offsets.
*
* <p>Important: This method must be called under the checkpoint lock.
*
* @return A map from partition to current offset.
*/
public HashMap<KafkaTopicPartition, Long> snapshotCurrentState() { // snapshots all partitions, essentially their current offsets
// this method assumes that the checkpoint lock is held
assert Thread.holdsLock(checkpointLock);

HashMap<KafkaTopicPartition, Long> state = new HashMap<>(allPartitions.length);
for (KafkaTopicPartitionState<?> partition : subscribedPartitions()) {
if (partition.isOffsetDefined()) {
state.put(partition.getKafkaTopicPartition(), partition.getOffset());
}
}
return state;
}

/**
* Restores the partition offsets.
*
* @param snapshotState The offsets for the partitions
*/
public void restoreOffsets(HashMap<KafkaTopicPartition, Long> snapshotState) { // restores the offsets from a checkpoint
for (KafkaTopicPartitionState<?> partition : allPartitions) {
Long offset = snapshotState.get(partition.getKafkaTopicPartition());
if (offset != null) {
partition.setOffset(offset);
}
}
}

// ------------------------------------------------------------------------
//  emitting records
// ------------------------------------------------------------------------

/**
*
* <p>Implementation Note: This method is kept brief to be JIT inlining friendly.
* That makes the fast path efficient, the extended paths are called as separate methods.
*
* @param record The record to emit
* @param partitionState The state of the Kafka partition from which the record was fetched
* @param offset The offset from which the record was fetched
*/
protected final void emitRecord(T record, KafkaTopicPartitionState<KPH> partitionState, long offset) { // this is where records are actually emitted
if (timestampWatermarkMode == NO_TIMESTAMPS_WATERMARKS) {
// fast path logic, in case there are no watermarks

// emit the record, using the checkpoint lock to guarantee
// atomicity of record emission and offset state update
synchronized (checkpointLock) {
sourceContext.collect(record); // emit the record
partitionState.setOffset(offset); // update the local offset
}
}
else if (timestampWatermarkMode == PERIODIC_WATERMARKS) { // periodic watermarks are required
emitRecordWithTimestampAndPeriodicWatermark(record, partitionState, offset);
}
else {
emitRecordWithTimestampAndPunctuatedWatermark(record, partitionState, offset);
}
}

/**
* Record emission, if a timestamp will be attached from an assigner that is
* also a periodic watermark generator.
*/
private void emitRecordWithTimestampAndPeriodicWatermark(
T record, KafkaTopicPartitionState<KPH> partitionState, long offset)
{
// extract timestamp - this accesses/modifies the per-partition state inside the
// watermark generator instance, so we need to lock the access on the
// partition state. concurrent access can happen from the periodic emitter
final long timestamp;
//noinspection SynchronizationOnLocalVariableOrMethodParameter
synchronized (withWatermarksState) {
timestamp = withWatermarksState.getTimestampForRecord(record); // calls the assigner's extractTimestamp() to obtain the record's event time
}

// emit the record with timestamp, using the usual checkpoint lock to guarantee
// atomicity of record emission and offset state update
synchronized (checkpointLock) {
sourceContext.collectWithTimestamp(record, timestamp); // this emit variant attaches the event time to the record
partitionState.setOffset(offset);
}
}

// ------------------------------------------------------------------------

/**
* The periodic watermark emitter. In its given interval, it checks all partitions for
* the current event time watermark, and possibly emits the next watermark.
*/
private static class PeriodicWatermarkEmitter implements Triggerable {

private final KafkaTopicPartitionStateWithPeriodicWatermarks<?, ?>[] allPartitions;
private final SourceContext<?> emitter;
private final StreamingRuntimeContext triggerContext;
private final long interval;
private long lastWatermarkTimestamp;

//-------------------------------------------------

PeriodicWatermarkEmitter(
KafkaTopicPartitionStateWithPeriodicWatermarks<?, ?>[] allPartitions,
SourceContext<?> emitter,
StreamingRuntimeContext runtimeContext)
{
this.allPartitions = checkNotNull(allPartitions);
this.emitter = checkNotNull(emitter);
this.triggerContext = checkNotNull(runtimeContext);
this.interval = runtimeContext.getExecutionConfig().getAutoWatermarkInterval();
this.lastWatermarkTimestamp = Long.MIN_VALUE;
}

//-------------------------------------------------

public void start() {
triggerContext.registerTimer(System.currentTimeMillis() + interval, this); // register the timer
}

@Override
public void trigger(long timestamp) throws Exception {
// sanity check
assert Thread.holdsLock(emitter.getCheckpointLock());

long minAcrossAll = Long.MAX_VALUE;
for (KafkaTopicPartitionStateWithPeriodicWatermarks<?, ?> state : allPartitions) {

// we access the current watermark for the periodic assigners under the state
// lock, to prevent concurrent modification to any internal variables
final long curr;
//noinspection SynchronizationOnLocalVariableOrMethodParameter
synchronized (state) {
curr = state.getCurrentWatermarkTimestamp(); // ask the per-partition assigner for its current watermark
}

minAcrossAll = Math.min(minAcrossAll, curr);
}

// emit next watermark, if there is one
if (minAcrossAll > lastWatermarkTimestamp) {
lastWatermarkTimestamp = minAcrossAll;
emitter.emitWatermark(new Watermark(minAcrossAll)); // emit the watermark
}

// schedule the next watermark
triggerContext.registerTimer(System.currentTimeMillis() + interval, this); // register the timer again
}
}
}
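As a side note, the assigner that getTimestampForRecord() and getCurrentWatermarkTimestamp() ultimately delegate to is supplied by the user. Below is a minimal sketch of such a periodic assigner, only to show what the fetcher is calling into; MyEvent, its getEventTime() accessor, and the 5-second lateness bound are assumptions for illustration, not part of the Flink code above.

import org.apache.flink.streaming.api.functions.AssignerWithPeriodicWatermarks;
import org.apache.flink.streaming.api.watermark.Watermark;

// extractTimestamp() is what getTimestampForRecord() ends up calling per record, and the
// PeriodicWatermarkEmitter polls getCurrentWatermark() per partition and emits the minimum.
public class BoundedLatenessAssigner implements AssignerWithPeriodicWatermarks<MyEvent> {

    private static final long MAX_LATENESS_MS = 5000L;                // arbitrary bound for this sketch
    private long maxTimestampSeen = Long.MIN_VALUE + MAX_LATENESS_MS; // avoids underflow before the first element

    @Override
    public long extractTimestamp(MyEvent element, long previousElementTimestamp) {
        long ts = element.getEventTime();                             // MyEvent and getEventTime() are assumed
        maxTimestampSeen = Math.max(maxTimestampSeen, ts);
        return ts;
    }

    @Override
    public Watermark getCurrentWatermark() {
        // the watermark trails the largest event time seen on this partition
        return new Watermark(maxTimestampSeen - MAX_LATENESS_MS);
    }
}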
Kafka08Fetcher
The fetcher implementation for Kafka 0.8:
public class Kafka08Fetcher<T> extends AbstractFetcher<T, TopicAndPartition>
The core method is the override of
runFetchLoop
@Override
public void runFetchLoop() throws Exception {
// the map from broker to the thread that is connected to that broker
final Map<Node, SimpleConsumerThread<T>> brokerToThread = new HashMap<>(); // caches the mapping from each broker node to its SimpleConsumerThread

// the offset handler handles the communication with ZooKeeper, to commit externally visible offsets
final ZookeeperOffsetHandler zookeeperOffsetHandler = new ZookeeperOffsetHandler(kafkaConfig); // used to read/write offsets from/to ZooKeeper
this.zookeeperOffsetHandler = zookeeperOffsetHandler;

PeriodicOffsetCommitter periodicCommitter = null;
try {
// read offsets from ZooKeeper for partitions that did not restore offsets
{
List<KafkaTopicPartition> partitionsWithNoOffset = new ArrayList<>();
for (KafkaTopicPartitionState<TopicAndPartition> partition : subscribedPartitions()) {
if (!partition.isOffsetDefined()) { // for every partition whose offset was not restored from the checkpoint, add it to partitionsWithNoOffset
partitionsWithNoOffset.add(partition.getKafkaTopicPartition());
}
}
// this step only applies to partitionsWithNoOffset, i.e. partitions whose offsets could not be restored from the checkpoint
Map<KafkaTopicPartition, Long> zkOffsets = zookeeperOffsetHandler.getOffsets(partitionsWithNoOffset); // read the offsets of these partitions from ZooKeeper
for (KafkaTopicPartitionState<TopicAndPartition> partition : subscribedPartitions()) {
Long offset = zkOffsets.get(partition.getKafkaTopicPartition());
if (offset != null) {
partition.setOffset(offset); // set the partition's offset to the value read from ZooKeeper
}
}
}

// start the periodic offset committer thread, if necessary
if (autoCommitInterval > 0) { // periodically commit offsets, e.g. to ZooKeeper under /consumers/<group.id>/offsets/<topic>/<partition>
periodicCommitter = new PeriodicOffsetCommitter(zookeeperOffsetHandler,
subscribedPartitions(), errorHandler, autoCommitInterval);
periodicCommitter.setName("Periodic Kafka partition offset committer");
periodicCommitter.setDaemon(true);
periodicCommitter.start();
}

// Main loop polling elements from the unassignedPartitions queue to the threads
while (running) {

// wait for max 5 seconds trying to get partitions to assign
// if threads shut down, this poll returns earlier, because the threads inject the
// special marker into the queue
List<KafkaTopicPartitionState<TopicAndPartition>> partitionsToAssign =
unassignedPartitionsQueue.getBatchBlocking(5000);
partitionsToAssign.remove(MARKER); // the MARKER is a wake-up sentinel that consumer threads inject when they shut down, so the poll above returns early; it is not a real partition, so drop it here

if (!partitionsToAssign.isEmpty()) {
LOG.info("Assigning {} partitions to broker threads", partitionsToAssign.size());
Map<Node, List<KafkaTopicPartitionState<TopicAndPartition>>> partitionsWithLeaders = // look up the leader broker of each partition; the result maps each leader to its list of partitions
findLeaderForPartitions(partitionsToAssign, kafkaConfig);

// assign the partitions to the leaders (maybe start the threads)
for (Map.Entry<Node, List<KafkaTopicPartitionState<TopicAndPartition>>> partitionsWithLeader :
partitionsWithLeaders.entrySet())
{
final Node leader = partitionsWithLeader.getKey(); // the leader node
final List<KafkaTopicPartitionState<TopicAndPartition>> partitions = partitionsWithLeader.getValue(); // the partitions served by this leader
SimpleConsumerThread<T> brokerThread = brokerToThread.get(leader); // look up the consumer thread for this leader node

if (brokerThread == null || !brokerThread.getNewPartitionsQueue().isOpen()) {
// start new thread
brokerThread = createAndStartSimpleConsumerThread(partitions, leader, errorHandler); // no consumer thread exists for this leader yet, so create and start one
brokerToThread.put(leader, brokerThread);
}
else {
// put elements into queue of thread
ClosableBlockingQueue<KafkaTopicPartitionState<TopicAndPartition>> newPartitionsQueue =
brokerThread.getNewPartitionsQueue();

for (KafkaTopicPartitionState<TopicAndPartition> fp : partitions) {
if (!newPartitionsQueue.addIfOpen(fp)) {
// we were unable to add the partition to the broker's queue
// the broker has closed in the meantime (the thread will shut down)
// create a new thread for connecting to this broker
List<KafkaTopicPartitionState<TopicAndPartition>> seedPartitions = new ArrayList<>();
seedPartitions.add(fp);
brokerThread = createAndStartSimpleConsumerThread(seedPartitions, leader, errorHandler);
brokerToThread.put(leader, brokerThread);
newPartitionsQueue = brokerThread.getNewPartitionsQueue(); // update queue for the subsequent partitions
}
}
}
}
}
}
}
catch (InterruptedException e) {
//......
}
finally {
//......
}
}
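About the MARKER question above: it is a wake-up sentinel. When a SimpleConsumerThread shuts down (see unassignedPartitions.add(MARKER) further below), it pushes the marker into the queue so that getBatchBlocking(5000) returns immediately instead of waiting out the full timeout; the marker carries no partition and is removed before processing. A generic sketch of this pattern with plain java.util.concurrent (not Flink's ClosableBlockingQueue; names here are illustrative only):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

class WakeUpMarkerDemo {
    static final String MARKER = "__wake_up__";           // sentinel, carries no work
    static final BlockingQueue<String> queue = new LinkedBlockingQueue<>();

    // main-loop side: block for at most 5 seconds, then strip the marker before processing
    static List<String> pollBatch() throws InterruptedException {
        List<String> batch = new ArrayList<>();
        String first = queue.poll(5, TimeUnit.SECONDS);   // returns immediately once MARKER arrives
        if (first != null) {
            batch.add(first);
            queue.drainTo(batch);                         // grab anything else already queued
        }
        batch.removeIf(MARKER::equals);                   // the marker itself is not a partition
        return batch;
    }

    // worker side: called when a consumer thread shuts down, so the main loop notices right away
    static void wakeUpMainLoop() {
        queue.add(MARKER);
    }
}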
Some of the other interface implementations:
// ------------------------------------------------------------------------
// Kafka 0.8 specific class instantiation
// ------------------------------------------------------------------------

@Override
public TopicAndPartition createKafkaPartitionHandle(KafkaTopicPartition partition) {
return new TopicAndPartition(partition.getTopic(), partition.getPartition()); // for Kafka 0.8, the KafkaPartitionHandle is simply TopicAndPartition
}

// ------------------------------------------------------------------------
// Offset handling
// ------------------------------------------------------------------------

@Override
public void commitSpecificOffsetsToKafka(Map<KafkaTopicPartition, Long> offsets) throws Exception {
ZookeeperOffsetHandler zkHandler = this.zookeeperOffsetHandler;
if (zkHandler != null) {
zkHandler.writeOffsets(offsets); // offsets are committed by writing them to ZooKeeper
}
}

// ------------------------------------------------------------------------
// Utilities
// ------------------------------------------------------------------------

private SimpleConsumerThread<T> createAndStartSimpleConsumerThread(
List<KafkaTopicPartitionState<TopicAndPartition>> seedPartitions,
Node leader,
ExceptionProxy errorHandler) throws IOException, ClassNotFoundException
{
// each thread needs its own copy of the deserializer, because the deserializer is
// not necessarily thread safe
final KeyedDeserializationSchema<T> clonedDeserializer =
InstantiationUtil.clone(deserializer, userCodeClassLoader);

// seed the thread with the list of fetch partitions (otherwise it would shut down immediately again)
SimpleConsumerThread<T> brokerThread = new SimpleConsumerThread<>(
this, errorHandler, kafkaConfig, leader, seedPartitions, unassignedPartitionsQueue,
clonedDeserializer, invalidOffsetBehavior);

brokerThread.setName(String.format("SimpleConsumer - %s - broker-%s (%s:%d)",
taskName, leader.id(), leader.host(), leader.port()));
brokerThread.setDaemon(true);
brokerThread.start(); // create and start the SimpleConsumerThread

LOG.info("Starting thread {}", brokerThread.getName());
return brokerThread;
}
Now let's look at SimpleConsumerThread
class SimpleConsumerThread<T> extends Thread
Its core method is run(), which keeps fetching data in a loop. First, a few of the Kafka configuration values it uses:
// these are the actual configuration values of Kafka + their original default values.
this.soTimeout = getInt(config, "socket.timeout.ms", 30000); // Kafka configuration values
this.minBytes = getInt(config, "fetch.min.bytes", 1);
this.maxWait = getInt(config, "fetch.wait.max.ms", 100);
this.fetchSize = getInt(config, "fetch.message.max.bytes", 1048576);
this.bufferSize = getInt(config, "socket.receive.buffer.bytes", 65536);
this.reconnectLimit = getInt(config, "flink.simple-consumer-reconnectLimit", 3);

// ------------------------------------------------------------------------
//  main work loop
// ------------------------------------------------------------------------

@Override
public void run() {
try {
// create the Kafka consumer that we actually use for fetching
consumer = new SimpleConsumer(broker.host(), broker.port(), soTimeout, bufferSize, clientId); // create the SimpleConsumer

// make sure that all partitions have some offsets to start with
// those partitions that do not have an offset from a checkpoint need to get
// their start offset from ZooKeeper
getMissingOffsetsFromKafka(partitions); // for partitions without an offset, reset the offset to latest or earliest

// Now, the actual work starts :-)
int offsetOutOfRangeCount = 0; // counters for invalid offsets and reconnect attempts
int reconnects = 0;
while (running) {

// ----------------------------------- partitions list maintenance ----------------------------

// check queue for new partitions to read from:
List<KafkaTopicPartitionState<TopicAndPartition>> newPartitions = newPartitionsQueue.pollBatch(); // handling new partitions mostly means adding them to this thread's partitions list
if (newPartitions != null) {
// found some new partitions for this thread's broker

// check if the new partitions need an offset lookup
getMissingOffsetsFromKafka(newPartitions); // reset the offsets of the new partitions

// add the new partitions (and check they are not already in there)
for (KafkaTopicPartitionState<TopicAndPartition> newPartition: newPartitions) {
partitions.add(newPartition);
}
}

if (partitions.size() == 0) { // no partitions left to consume
if (newPartitionsQueue.close()) { // if the queue could be closed, no new partitions can ever be added, so this thread has nothing left to do
// close succeeded. Closing thread
running = false; // stop the thread

LOG.info("Consumer thread {} does not have any partitions assigned anymore. Stopping thread.",
getName());

// add the wake-up marker into the queue to make the main thread
// immediately wake up and termination faster
unassignedPartitions.add(MARKER);

break;
} else { // the queue could not be closed, so new partitions are on their way; wait for them and continue
// close failed: fetcher main thread concurrently added new partitions into the queue.
// go to top of loop again and get the new partitions
continue;
}
}

// ----------------------------------- request / response with kafka ----------------------------

FetchRequestBuilder frb = new FetchRequestBuilder(); // build the fetch request
frb.clientId(clientId);
frb.maxWait(maxWait);
frb.minBytes(minBytes);

for (KafkaTopicPartitionState<?> partition : partitions) {
frb.addFetch(
partition.getKafkaTopicPartition().getTopic(),
partition.getKafkaTopicPartition().getPartition(),
partition.getOffset() + 1, // request the next record
fetchSize);
}

kafka.api.FetchRequest fetchRequest = frb.build(); // a single request can read from several partitions at once, depending on how many partitions this thread is responsible for

FetchResponse fetchResponse;
try {
fetchResponse = consumer.fetch(fetchRequest); // fetch data from Kafka; the FetchResponse may contain data from several partitions
}
catch (Throwable cce) {
//noinspection ConstantConditions
if (cce instanceof ClosedChannelException) { // connection to Kafka failed
// we don't know if the broker is overloaded or unavailable.
// retry a few times, then return ALL partitions for new leader lookup
if (++reconnects >= reconnectLimit) { // the reconnect limit is reached, so this broker really cannot be reached
LOG.warn("Unable to reach broker after {} retries. Returning all current partitions", reconnectLimit);
for (KafkaTopicPartitionState<TopicAndPartition> fp: this.partitions) {
unassignedPartitions.add(fp); // hand the partitions back to unassignedPartitions, marking them as currently unowned
}
this.partitions.clear(); // clear the local partitions list
continue; // jump to top of loop: will close thread or subscribe to new partitions (this hits the partitions.size() == 0 logic above)
}
try { // below the reconnect limit: close the consumer, recreate it, and retry
consumer.close();
} catch (Throwable t) {
LOG.warn("Error while closing consumer connection", t);
}
// delay & retry
Thread.sleep(100);
consumer = new SimpleConsumer(broker.host(), broker.port(), soTimeout, bufferSize, clientId);
continue; // retry
} else {
throw cce;
}
}
reconnects = 0;

// ---------------------------------------- error handling ----------------------------

if (fetchResponse == null) {
throw new IOException("Fetch from Kafka failed (request returned null)");
}

if (fetchResponse.hasError()) { // the fetch response reports errors
String exception = "";
List<KafkaTopicPartitionState<TopicAndPartition>> partitionsToGetOffsetsFor = new ArrayList<>();

// iterate over partitions to get individual error codes
Iterator<KafkaTopicPartitionState<TopicAndPartition>> partitionsIterator = partitions.iterator();
boolean partitionsRemoved = false;

while (partitionsIterator.hasNext()) {
final KafkaTopicPartitionState<TopicAndPartition> fp = partitionsIterator.next();
short code = fetchResponse.errorCode(fp.getTopic(), fp.getPartition()); // the error code for this particular partition

if (code == ErrorMapping.OffsetOutOfRangeCode()) { // invalid offset, so the partition's offset has to be re-initialized
// we were asked to read from an out-of-range-offset (maybe set wrong in Zookeeper)
// Kafka's high level consumer is resetting the offset according to 'auto.offset.reset'
partitionsToGetOffsetsFor.add(fp);
}
else if (code == ErrorMapping.NotLeaderForPartitionCode() || // the partition cannot be read from this broker for one of several availability reasons
code == ErrorMapping.LeaderNotAvailableCode() ||
code == ErrorMapping.BrokerNotAvailableCode() ||
code == ErrorMapping.UnknownCode())
{
// the broker we are connected to is not the leader for the partition.
LOG.warn("{} is not the leader of {}. Reassigning leader for partition", broker, fp);
LOG.debug("Error code = {}", code); unassignedPartitions.add(fp); //那么把该partition放回unassignedPartitions,等待重新分配 partitionsIterator.remove(); // unsubscribe the partition ourselves,从当前的partitions列表中把该partition删除
partitionsRemoved = true;
}
else if (code != ErrorMapping.NoError()) {
exception += "\nException for " + fp.getTopic() +":"+ fp.getPartition() + ": " +
StringUtils.stringifyException(ErrorMapping.exceptionFor(code));
}
}
if (partitionsToGetOffsetsFor.size() > 0) {
// safeguard against an infinite loop.
if (offsetOutOfRangeCount++ > 3) { // if offsets are still invalid after three resets, throw to avoid looping forever
throw new RuntimeException("Found invalid offsets more than three times in partitions "
+ partitionsToGetOffsetsFor + " Exceptions: " + exception);
}
// get valid offsets for these partitions and try again.
LOG.warn("The following partitions had an invalid offset: {}", partitionsToGetOffsetsFor);
getLastOffsetFromKafka(consumer, partitionsToGetOffsetsFor, invalidOffsetBehavior); // reset the offsets of these partitions; depending on the configuration they are reset to earliest or latest

LOG.warn("The new partition offsets are {}", partitionsToGetOffsetsFor);
continue; // jump back to create a new fetch request. The offset has not been touched.
}
else if (partitionsRemoved) {
continue; // create new fetch request
}
else {
// partitions failed on an error
throw new IOException("Error while fetching from broker '" + broker +"': " + exception);
}
} else {
// successful fetch, reset offsetOutOfRangeCount.
offsetOutOfRangeCount = 0;
}

// ----------------------------------- process fetch response ----------------------------

int messagesInFetch = 0;
int deletedMessages = 0;
Iterator<KafkaTopicPartitionState<TopicAndPartition>> partitionsIterator = partitions.iterator();

partitionsLoop:
while (partitionsIterator.hasNext()) {
final KafkaTopicPartitionState<TopicAndPartition> currentPartition = partitionsIterator.next();

final ByteBufferMessageSet messageSet = fetchResponse.messageSet( // the data of this partition from the fetch response, wrapped as a ByteBufferMessageSet
currentPartition.getTopic(), currentPartition.getPartition());

for (MessageAndOffset msg : messageSet) { // for each message
if (running) {
messagesInFetch++;
final ByteBuffer payload = msg.message().payload(); // the message value
final long offset = msg.offset(); // the message offset

if (offset <= currentPartition.getOffset()) { // old data, ignore it
// we have seen this message already
LOG.info("Skipping message with offset " + msg.offset()
+ " because we have seen messages until (including) "
+ currentPartition.getOffset()
+ " from topic/partition " + currentPartition.getTopic() + '/'
+ currentPartition.getPartition() + " already");
continue;
}

// If the message value is null, this represents a delete command for the message key.
// Log this and pass it on to the client who might want to also receive delete messages.
byte[] valueBytes;
if (payload == null) {
deletedMessages++;
valueBytes = null;
} else {
valueBytes = new byte[payload.remaining()];
payload.get(valueBytes); // copy the message value into valueBytes
}

// put key into byte array
byte[] keyBytes = null;
int keySize = msg.message().keySize();

if (keySize >= 0) { // message().hasKey() is doing the same. We save one int deserialization
ByteBuffer keyPayload = msg.message().key();
keyBytes = new byte[keySize]; // the key bytes are read into keyBytes
keyPayload.get(keyBytes);
}

final T value = deserializer.deserialize(keyBytes, valueBytes, // deserialize the message into an object
currentPartition.getTopic(), currentPartition.getPartition(), offset);

if (deserializer.isEndOfStream(value)) {
// remove partition from subscribed partitions.
partitionsIterator.remove();
continue partitionsLoop;
}

owner.emitRecord(value, currentPartition, offset); // emit the record
}
else {
// no longer running
return;
}
}
}
} // end of fetch loop

// ...... (exception handling and cleanup omitted)
}
}
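The deserializer used in the loop above is a user-supplied KeyedDeserializationSchema. As a reference point, here is a minimal sketch of one that matches the calls made above: deserialize(key, value, topic, partition, offset) per message, and isEndOfStream() to let a schema terminate a partition's subscription. Interpreting the value bytes as a UTF-8 string is just an assumption for illustration, and the package names reflect the Flink version this code comes from.

import java.nio.charset.StandardCharsets;

import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.util.serialization.KeyedDeserializationSchema;

public class StringValueSchema implements KeyedDeserializationSchema<String> {

    @Override
    public String deserialize(byte[] messageKey, byte[] message,
                              String topic, int partition, long offset) {
        // a null value is a delete marker (tombstone); map it to an empty string here for simplicity
        return message == null ? "" : new String(message, StandardCharsets.UTF_8);
    }

    @Override
    public boolean isEndOfStream(String nextElement) {
        return false;   // never treat a data element as end-of-stream
    }

    @Override
    public TypeInformation<String> getProducedType() {
        return BasicTypeInfo.STRING_TYPE_INFO;
    }
}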
Finally, let's look at
FlinkKafkaConsumerBase
/**
* Base class of all Flink Kafka Consumer data sources.
* This implements the common behavior across all Kafka versions.
*
* <p>The Kafka version specific behavior is defined mainly in the specific subclasses of the
* {@link AbstractFetcher}.
*
* @param <T> The type of records produced by this data source
*/
public abstract class FlinkKafkaConsumerBase<T> extends RichParallelSourceFunction<T> implements
CheckpointListener,
CheckpointedAsynchronously<HashMap<KafkaTopicPartition, Long>>,
ResultTypeQueryable<T>
This is the abstraction shared by all Kafka versions:
@Override
public void run(SourceContext<T> sourceContext) throws Exception {

// figure out which partitions this subtask should process
final List<KafkaTopicPartition> thisSubtaskPartitions = assignPartitions(allSubscribedPartitions, // given the number of topic partitions and the number of parallel consumers, decide which partitions this consumer gets; the logic is a simple modulo (see assignPartitions below and the example after it)
getRuntimeContext().getNumberOfParallelSubtasks(), getRuntimeContext().getIndexOfThisSubtask());

// we need only do work, if we actually have partitions assigned
if (!thisSubtaskPartitions.isEmpty()) {

// (1) create the fetcher that will communicate with the Kafka brokers
final AbstractFetcher<T, ?> fetcher = createFetcher( // create the fetcher
sourceContext, thisSubtaskPartitions,
periodicWatermarkAssigner, punctuatedWatermarkAssigner,
(StreamingRuntimeContext) getRuntimeContext());

// (2) set the fetcher to the restored checkpoint offsets
if (restoreToOffset != null) { // if offsets were restored from a checkpoint
fetcher.restoreOffsets(restoreToOffset); // restore them into the fetcher
}

// publish the reference, for snapshot-, commit-, and cancel calls
// IMPORTANT: We can only do that now, because only now will calls to
// the fetchers 'snapshotCurrentState()' method return at least
// the restored offsets
this.kafkaFetcher = fetcher;
if (!running) {
return;
}

// (3) run the fetcher's main work method
fetcher.runFetchLoop(); // start running the fetcher
}
}

@Override
public HashMap<KafkaTopicPartition, Long> snapshotState(long checkpointId, long checkpointTimestamp) throws Exception {

final AbstractFetcher<?, ?> fetcher = this.kafkaFetcher;

HashMap<KafkaTopicPartition, Long> currentOffsets = fetcher.snapshotCurrentState(); // snapshot the current offsets

// the map cannot be asynchronously updated, because only one checkpoint call can happen
// on this function at a time: either snapshotState() or notifyCheckpointComplete()
pendingCheckpoints.put(checkpointId, currentOffsets); // remember the offsets under this checkpoint id until the checkpoint completes

// truncate the map, to prevent infinite growth
while (pendingCheckpoints.size() > MAX_NUM_PENDING_CHECKPOINTS) { // drop the oldest pending checkpoints
pendingCheckpoints.remove(0);
}

return currentOffsets;
}

@Override
public void restoreState(HashMap<KafkaTopicPartition, Long> restoredOffsets) {
LOG.info("Setting restore state in the FlinkKafkaConsumer");
restoreToOffset = restoredOffsets;
}

@Override
public void notifyCheckpointComplete(long checkpointId) throws Exception { // only when the checkpoint completes is the consumer notified, and only then are the offsets actually committed

final AbstractFetcher<?, ?> fetcher = this.kafkaFetcher;

try {
final int posInMap = pendingCheckpoints.indexOf(checkpointId);

@SuppressWarnings("unchecked")
HashMap<KafkaTopicPartition, Long> checkpointOffsets =
(HashMap<KafkaTopicPartition, Long>) pendingCheckpoints.remove(posInMap);

// remove older checkpoints in map
for (int i = 0; i < posInMap; i++) { // pending checkpoints older than the completed one are obsolete, so drop them
pendingCheckpoints.remove(0);
}

fetcher.commitSpecificOffsetsToKafka(checkpointOffsets); // the actual offset commit, through the version-agnostic interface; for Kafka 0.8 the fetcher also commits periodically on its own, but checkpoints (typically at second granularity) are usually more frequent
}
// ...... (exception handling omitted)
}

/**
* Selects which of the given partitions should be handled by a specific consumer,
* given a certain number of consumers.
*
* @param allPartitions The partitions to select from
* @param numConsumers The number of consumers
* @param consumerIndex The index of the specific consumer
*
* @return The sublist of partitions to be handled by that consumer.
*/
protected static List<KafkaTopicPartition> assignPartitions(
List<KafkaTopicPartition> allPartitions,
int numConsumers, int consumerIndex)
{
final List<KafkaTopicPartition> thisSubtaskPartitions = new ArrayList<>(
allPartitions.size() / numConsumers + 1);

for (int i = 0; i < allPartitions.size(); i++) {
if (i % numConsumers == consumerIndex) {
thisSubtaskPartitions.add(allPartitions.get(i));
}
}

return thisSubtaskPartitions;
}
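A quick worked example of the modulo assignment, assuming p0 through p4 stand for five KafkaTopicPartition instances and the job runs with two parallel subtasks:

List<KafkaTopicPartition> all = Arrays.asList(p0, p1, p2, p3, p4);

assignPartitions(all, 2, 0);   // subtask 0 gets indexes 0, 2, 4  ->  [p0, p2, p4]
assignPartitions(all, 2, 1);   // subtask 1 gets indexes 1, 3     ->  [p1, p3]

Every partition ends up with exactly one subtask, and the load is spread as evenly as the modulo allows.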
And finally, the consumer for Kafka 0.8:
public class FlinkKafkaConsumer08<T> extends FlinkKafkaConsumerBase<T> {

/**
* Creates a new Kafka streaming source consumer for Kafka 0.8.x
*
* This constructor allows passing multiple topics and a key/value deserialization schema.
*
* @param topics
* The Kafka topics to read from.
* @param deserializer
* The keyed de-/serializer used to convert between Kafka's byte messages and Flink's objects.
* @param props
* The properties that are used to configure both the fetcher and the offset handler.
*/
public FlinkKafkaConsumer08(List<String> topics, KeyedDeserializationSchema<T> deserializer, Properties props) {
super(deserializer);

// validate the zookeeper properties
validateZooKeeperConfig(props);

this.invalidOffsetBehavior = getInvalidOffsetBehavior(props); // where to reset when an offset turns out to be invalid; earliest or latest is supported
this.autoCommitInterval = PropertiesUtil.getLong(props, "auto.commit.interval.ms", 60000); // the offset commit interval, one minute by default

// Connect to a broker to get the partitions for all topics
List<KafkaTopicPartition> partitionInfos =
KafkaTopicPartition.dropLeaderData(getPartitionsForTopic(topics, props)); // only the partition info of the topics is kept here, the leader data is dropped

setSubscribedPartitions(partitionInfos); // register the full partition list of these topics; which subset each parallel consumer actually reads is decided later by assignPartitions() in run()
}

/**
* Send request to Kafka to get partitions for topic.
*
* @param topics The name of the topics.
* @param properties The properties for the Kafka Consumer that is used to query the partitions for the topic.
*/
public static List<KafkaTopicPartitionLeader> getPartitionsForTopic(List<String> topics, Properties properties) { // fetches the partition info of the topics from Kafka; since the configuration is a broker list rather than ZooKeeper, it picks a random seed broker each time and asks it for the partition metadata
String seedBrokersConfString = properties.getProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG);
final int numRetries = getInt(properties, GET_PARTITIONS_RETRIES_KEY, DEFAULT_GET_PARTITIONS_RETRIES);

checkNotNull(seedBrokersConfString, "Configuration property %s not set", ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG);
String[] seedBrokers = seedBrokersConfString.split(",");
List<KafkaTopicPartitionLeader> partitions = new ArrayList<>();

final String clientId = "flink-kafka-consumer-partition-lookup";
final int soTimeout = getInt(properties, "socket.timeout.ms", 30000);
final int bufferSize = getInt(properties, "socket.receive.buffer.bytes", 65536);

Random rnd = new Random();
retryLoop: for (int retry = 0; retry < numRetries; retry++) {
// we pick a seed broker randomly to avoid overloading the first broker with all the requests when the
// parallel source instances start. Still, we try all available brokers.
int index = rnd.nextInt(seedBrokers.length);
brokersLoop: for (int arrIdx = 0; arrIdx < seedBrokers.length; arrIdx++) {
String seedBroker = seedBrokers[index];
LOG.info("Trying to get topic metadata from broker {} in try {}/{}", seedBroker, retry, numRetries);
if (++index == seedBrokers.length) {
index = 0;
}

URL brokerUrl = NetUtils.getCorrectHostnamePort(seedBroker);
SimpleConsumer consumer = null;
try {
consumer = new SimpleConsumer(brokerUrl.getHost(), brokerUrl.getPort(), soTimeout, bufferSize, clientId);

TopicMetadataRequest req = new TopicMetadataRequest(topics);
kafka.javaapi.TopicMetadataResponse resp = consumer.send(req);

List<TopicMetadata> metaData = resp.topicsMetadata();

// clear in case we have an incomplete list from previous tries
partitions.clear();
for (TopicMetadata item : metaData) {
if (item.errorCode() != ErrorMapping.NoError()) {
// warn and try more brokers
LOG.warn("Error while getting metadata from broker " + seedBroker + " to find partitions " +
"for " + topics.toString() + ". Error: " + ErrorMapping.exceptionFor(item.errorCode()).getMessage());
continue brokersLoop;
}
if (!topics.contains(item.topic())) {
LOG.warn("Received metadata from topic " + item.topic() + " even though it was not requested. Skipping ...");
continue brokersLoop;
}
for (PartitionMetadata part : item.partitionsMetadata()) {
Node leader = brokerToNode(part.leader());
KafkaTopicPartition ktp = new KafkaTopicPartition(item.topic(), part.partitionId());
KafkaTopicPartitionLeader pInfo = new KafkaTopicPartitionLeader(ktp, leader);
partitions.add(pInfo);
}
}
break retryLoop; // leave the loop through the brokers
} catch (Exception e) {
LOG.warn("Error communicating with broker " + seedBroker + " to find partitions for " + topics.toString() + "." +
"" + e.getClass() + ". Message: " + e.getMessage());
LOG.debug("Detailed trace", e);
// we sleep a bit. Retrying immediately doesn't make sense in cases where Kafka is reorganizing the leader metadata
try {
Thread.sleep(500);
} catch (InterruptedException e1) {
// sleep shorter.
}
} finally {
if (consumer != null) {
consumer.close();
}
}
} // brokers loop
} // retries loop
return partitions;
}

private static long getInvalidOffsetBehavior(Properties config) {
final String val = config.getProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "largest");
if (val.equals("none")) {
throw new IllegalArgumentException("Cannot use '" + ConsumerConfig.AUTO_OFFSET_RESET_CONFIG
+ "' value 'none'. Possible values: 'latest', 'largest', or 'earliest'.");
}
else if (val.equals("largest") || val.equals("latest")) { // largest is kafka 0.8, latest is kafka 0.9
return OffsetRequest.LatestTime();
} else {
return OffsetRequest.EarliestTime();
}
}
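To close, a minimal usage sketch of how the pieces above fit together in a job. The host names, topic, and group id are placeholders, and the property values simply mirror the defaults discussed above:

// the properties feed both the fetcher (bootstrap.servers, socket/fetch settings)
// and the ZookeeperOffsetHandler (zookeeper.connect, group.id)
Properties props = new Properties();
props.setProperty("bootstrap.servers", "broker1:9092,broker2:9092"); // used for partition metadata lookup and fetching
props.setProperty("zookeeper.connect", "zk1:2181");                  // used for offset commits
props.setProperty("group.id", "my-flink-group");
props.setProperty("auto.offset.reset", "largest");                   // invalid-offset behavior: reset to latest
props.setProperty("auto.commit.interval.ms", "60000");               // periodic ZooKeeper commit interval

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

DataStream<String> stream = env.addSource(
        new FlinkKafkaConsumer08<>("my-topic", new SimpleStringSchema(), props));

The bootstrap.servers list is what getPartitionsForTopic() and the SimpleConsumerThreads connect to, while zookeeper.connect and group.id drive the ZookeeperOffsetHandler that stores the committed offsets.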