A look at Flink's JobManagerGateway

序本文主要研究一下flink的JobManagerGatewayRestfulGatewayflink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/webmonitor/RestfulGateway.javapublic interface RestfulGateway extends RpcGateway { CompletableFuture<Acknowledge> cancelJob(JobID jobId, @RpcTimeout Time timeout); CompletableFuture<Acknowledge> stopJob(JobID jobId, @RpcTimeout Time timeout); CompletableFuture<String> requestRestAddress(@RpcTimeout Time timeout); CompletableFuture<? extends AccessExecutionGraph> requestJob(JobID jobId, @RpcTimeout Time timeout); CompletableFuture<JobResult> requestJobResult(JobID jobId, @RpcTimeout Time timeout); CompletableFuture<MultipleJobsDetails> requestMultipleJobDetails( @RpcTimeout Time timeout); CompletableFuture<ClusterOverview> requestClusterOverview(@RpcTimeout Time timeout); CompletableFuture<Collection<String>> requestMetricQueryServicePaths(@RpcTimeout Time timeout); CompletableFuture<Collection<Tuple2<ResourceID, String>>> requestTaskManagerMetricQueryServicePaths(@RpcTimeout Time timeout); default CompletableFuture<String> triggerSavepoint( JobID jobId, String targetDirectory, boolean cancelJob, @RpcTimeout Time timeout) { throw new UnsupportedOperationException(); } default CompletableFuture<Acknowledge> disposeSavepoint( final String savepointPath, @RpcTimeout final Time timeout) { throw new UnsupportedOperationException(); } default CompletableFuture<JobStatus> requestJobStatus( JobID jobId, @RpcTimeout Time timeout) { throw new UnsupportedOperationException(); } default CompletableFuture<OperatorBackPressureStatsResponse> requestOperatorBackPressureStats( JobID jobId, JobVertexID jobVertexId) { throw new UnsupportedOperationException(); } default CompletableFuture<Acknowledge> rescaleJob( JobID jobId, int newParallelism, RescalingBehaviour rescalingBehaviour, @RpcTimeout Time timeout) { throw new UnsupportedOperationException(); } default CompletableFuture<Acknowledge> shutDownCluster() { throw new UnsupportedOperationException(); }}RestfulGateway接口继承了RpcGateway接口,它定义了cancelJob、stopJob、requestRestAddress、requestJob、requestJobResult、requestMultipleJobDetails、requestClusterOverview、requestMetricQueryServicePaths、requestTaskManagerMetricQueryServicePaths方法;另外提供了triggerSavepoint、disposeSavepoint、requestJobStatus、requestOperatorBackPressureStats、rescaleJob、shutDownCluster这几个默认方法,其实现均抛出UnsupportedOperationException异常JobManagerGatewayflink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/jobmaster/JobManagerGateway.javapublic interface JobManagerGateway extends RestfulGateway { CompletableFuture<Integer> requestBlobServerPort(Time timeout); CompletableFuture<Acknowledge> submitJob(JobGraph jobGraph, ListeningBehaviour listeningBehaviour, Time timeout); CompletableFuture<String> cancelJobWithSavepoint(JobID jobId, @Nullable String savepointPath, Time timeout); CompletableFuture<Acknowledge> cancelJob(JobID jobId, Time timeout); CompletableFuture<Acknowledge> stopJob(JobID jobId, Time timeout); CompletableFuture<Optional<JobManagerMessages.ClassloadingProps>> requestClassloadingProps(JobID jobId, Time timeout); CompletableFuture<Optional<Instance>> requestTaskManagerInstance(ResourceID resourceId, Time timeout); CompletableFuture<Collection<Instance>> requestTaskManagerInstances(Time timeout); CompletableFuture<JobIdsWithStatusOverview> requestJobsOverview(Time 
timeout);}JobManagerGateway接口继承了RestfulGateway接口,它定义了requestBlobServerPort、submitJob、cancelJobWithSavepoint、cancelJob、stopJob、requestClassloadingProps、requestTaskManagerInstance、requestTaskManagerInstances、requestJobsOverview方法;它有一个实现类是AkkaJobManagerGatewayAkkaJobManagerGatewayflink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/akka/AkkaJobManagerGateway.javapublic class AkkaJobManagerGateway implements JobManagerGateway { private final ActorGateway jobManagerGateway; private final String hostname; public AkkaJobManagerGateway(ActorGateway jobManagerGateway) { this.jobManagerGateway = Preconditions.checkNotNull(jobManagerGateway); final Option<String> optHostname = jobManagerGateway.actor().path().address().host(); hostname = optHostname.isDefined() ? optHostname.get() : “localhost”; } @Override public String getAddress() { return jobManagerGateway.path(); } @Override public String getHostname() { return hostname; } @Override public CompletableFuture<Integer> requestBlobServerPort(Time timeout) { return FutureUtils.toJava( jobManagerGateway .ask(JobManagerMessages.getRequestBlobManagerPort(), FutureUtils.toFiniteDuration(timeout)) .mapTo(ClassTag$.MODULE$.apply(Integer.class))); } //——————————————————————————– // Job control //——————————————————————————– @Override public CompletableFuture<Acknowledge> submitJob(JobGraph jobGraph, ListeningBehaviour listeningBehaviour, Time timeout) { return FutureUtils .toJava( jobManagerGateway.ask( new JobManagerMessages.SubmitJob( jobGraph, listeningBehaviour), FutureUtils.toFiniteDuration(timeout))) .thenApply( (Object response) -> { if (response instanceof JobManagerMessages.JobSubmitSuccess) { JobManagerMessages.JobSubmitSuccess success = ((JobManagerMessages.JobSubmitSuccess) response); if (Objects.equals(success.jobId(), jobGraph.getJobID())) { return Acknowledge.get(); } else { throw new CompletionException(new FlinkException(“JobManager responded for wrong Job. 
This Job: " + jobGraph.getJobID() + “, response: " + success.jobId())); } } else if (response instanceof JobManagerMessages.JobResultFailure) { JobManagerMessages.JobResultFailure failure = ((JobManagerMessages.JobResultFailure) response); throw new CompletionException(new FlinkException(“Job submission failed.”, failure.cause())); } else { throw new CompletionException(new FlinkException(“Unknown response to SubmitJob message: " + response + ‘.’)); } } ); } @Override public CompletableFuture<String> cancelJobWithSavepoint(JobID jobId, String savepointPath, Time timeout) { CompletableFuture<JobManagerMessages.CancellationResponse> cancellationFuture = FutureUtils.toJava( jobManagerGateway .ask(new JobManagerMessages.CancelJobWithSavepoint(jobId, savepointPath), FutureUtils.toFiniteDuration(timeout)) .mapTo(ClassTag$.MODULE$.apply(JobManagerMessages.CancellationResponse.class))); return cancellationFuture.thenApply( (JobManagerMessages.CancellationResponse response) -> { if (response instanceof JobManagerMessages.CancellationSuccess) { return ((JobManagerMessages.CancellationSuccess) response).savepointPath(); } else { throw new CompletionException(new FlinkException(“Cancel with savepoint failed.”, ((JobManagerMessages.CancellationFailure) response).cause())); } }); } @Override public CompletableFuture<Acknowledge> cancelJob(JobID jobId, Time timeout) { CompletableFuture<JobManagerMessages.CancellationResponse> responseFuture = FutureUtils.toJava( jobManagerGateway .ask(new JobManagerMessages.CancelJob(jobId), FutureUtils.toFiniteDuration(timeout)) .mapTo(ClassTag$.MODULE$.apply(JobManagerMessages.CancellationResponse.class))); return responseFuture.thenApply( (JobManagerMessages.CancellationResponse response) -> { if (response instanceof JobManagerMessages.CancellationSuccess) { return Acknowledge.get(); } else { throw new CompletionException(new FlinkException(“Cancel job failed " + jobId + ‘.’, ((JobManagerMessages.CancellationFailure) response).cause())); } }); } @Override public CompletableFuture<Acknowledge> stopJob(JobID jobId, Time timeout) { CompletableFuture<JobManagerMessages.StoppingResponse> responseFuture = FutureUtils.toJava( jobManagerGateway .ask(new JobManagerMessages.StopJob(jobId), FutureUtils.toFiniteDuration(timeout)) .mapTo(ClassTag$.MODULE$.apply(JobManagerMessages.StoppingResponse.class))); return responseFuture.thenApply( (JobManagerMessages.StoppingResponse response) -> { if (response instanceof JobManagerMessages.StoppingSuccess) { return Acknowledge.get(); } else { throw new CompletionException(new FlinkException(“Stop job failed " + jobId + ‘.’, ((JobManagerMessages.StoppingFailure) response).cause())); } }); } //——————————————————————————– // JobManager information //——————————————————————————– @Override public CompletableFuture<Optional<Instance>> requestTaskManagerInstance(ResourceID resourceId, Time timeout) { return FutureUtils.toJava( jobManagerGateway .ask(new JobManagerMessages.RequestTaskManagerInstance(resourceId), FutureUtils.toFiniteDuration(timeout)) .mapTo(ClassTag$.MODULE$.<JobManagerMessages.TaskManagerInstance>apply(JobManagerMessages.TaskManagerInstance.class))) .thenApply( (JobManagerMessages.TaskManagerInstance taskManagerResponse) -> { if (taskManagerResponse.instance().isDefined()) { return Optional.of(taskManagerResponse.instance().get()); } else { return Optional.empty(); } }); } @Override public CompletableFuture<Collection<Instance>> requestTaskManagerInstances(Time timeout) { 
CompletableFuture<JobManagerMessages.RegisteredTaskManagers> taskManagersFuture = FutureUtils.toJava( jobManagerGateway .ask(JobManagerMessages.getRequestRegisteredTaskManagers(), FutureUtils.toFiniteDuration(timeout)) .mapTo(ClassTag$.MODULE$.apply(JobManagerMessages.RegisteredTaskManagers.class))); return taskManagersFuture.thenApply( JobManagerMessages.RegisteredTaskManagers::asJavaCollection); } @Override public CompletableFuture<Optional<JobManagerMessages.ClassloadingProps>> requestClassloadingProps(JobID jobId, Time timeout) { return FutureUtils .toJava(jobManagerGateway .ask( new JobManagerMessages.RequestClassloadingProps(jobId), FutureUtils.toFiniteDuration(timeout))) .thenApply( (Object response) -> { if (response instanceof JobManagerMessages.ClassloadingProps) { return Optional.of(((JobManagerMessages.ClassloadingProps) response)); } else if (response instanceof JobManagerMessages.JobNotFound) { return Optional.empty(); } else { throw new CompletionException(new FlinkException(“Unknown response: " + response + ‘.’)); } }); } @Override public CompletableFuture<MultipleJobsDetails> requestMultipleJobDetails(Time timeout) { return FutureUtils.toJava( jobManagerGateway .ask(new RequestJobDetails(true, true), FutureUtils.toFiniteDuration(timeout)) .mapTo(ClassTag$.MODULE$.apply(MultipleJobsDetails.class))); } @Override public CompletableFuture<AccessExecutionGraph> requestJob(JobID jobId, Time timeout) { CompletableFuture<JobManagerMessages.JobResponse> jobResponseFuture = FutureUtils.toJava( jobManagerGateway .ask(new JobManagerMessages.RequestJob(jobId), FutureUtils.toFiniteDuration(timeout)) .mapTo(ClassTag$.MODULE$.apply(JobManagerMessages.JobResponse.class))); return jobResponseFuture.thenApply( (JobManagerMessages.JobResponse jobResponse) -> { if (jobResponse instanceof JobManagerMessages.JobFound) { return ((JobManagerMessages.JobFound) jobResponse).executionGraph(); } else { throw new CompletionException(new FlinkJobNotFoundException(jobId)); } }); } @Override public CompletableFuture<JobResult> requestJobResult(JobID jobId, Time timeout) { return requestJob(jobId, timeout).thenApply(JobResult::createFrom); } @Override public CompletableFuture<ClusterOverview> requestClusterOverview(Time timeout) { return FutureUtils.toJava( jobManagerGateway .ask(RequestStatusOverview.getInstance(), FutureUtils.toFiniteDuration(timeout)) .mapTo(ClassTag$.MODULE$.apply(ClusterOverview.class))); } @Override public CompletableFuture<Collection<String>> requestMetricQueryServicePaths(Time timeout) { final String jobManagerPath = getAddress(); final String jobManagerMetricQueryServicePath = jobManagerPath.substring(0, jobManagerPath.lastIndexOf(’/’) + 1) + MetricQueryService.METRIC_QUERY_SERVICE_NAME; return CompletableFuture.completedFuture( Collections.singleton(jobManagerMetricQueryServicePath)); } @Override public CompletableFuture<Collection<Tuple2<ResourceID, String>>> requestTaskManagerMetricQueryServicePaths(Time timeout) { return requestTaskManagerInstances(timeout) .thenApply( (Collection<Instance> instances) -> instances .stream() .map( (Instance instance) -> { final String taskManagerAddress = instance.getTaskManagerGateway().getAddress(); final String taskManagerMetricQuerServicePath = taskManagerAddress.substring(0, taskManagerAddress.lastIndexOf(’/’) + 1) + MetricQueryService.METRIC_QUERY_SERVICE_NAME + ‘_’ + instance.getTaskManagerID().getResourceIdString(); return Tuple2.of(instance.getTaskManagerID(), taskManagerMetricQuerServicePath); }) .collect(Collectors.toList())); } 
@Override public CompletableFuture<JobIdsWithStatusOverview> requestJobsOverview(Time timeout) { return FutureUtils.toJava( jobManagerGateway .ask(RequestJobsWithIDsOverview.getInstance(), FutureUtils.toFiniteDuration(timeout)) .mapTo(ClassTag$.MODULE$.apply(JobIdsWithStatusOverview.class))); } @Override public CompletableFuture<String> requestRestAddress(Time timeout) { return FutureUtils.toJava( jobManagerGateway .ask(JobManagerMessages.getRequestRestAddress(), FutureUtils.toFiniteDuration(timeout)) .mapTo(ClassTag$.MODULE$.apply(String.class))); }}

AkkaJobManagerGateway implements the JobManagerGateway interface; its constructor requires the wrapped jobManagerGateway (an ActorGateway). requestBlobServerPort sends a RequestBlobManagerPort message; submitJob sends SubmitJob; cancelJobWithSavepoint sends CancelJobWithSavepoint; cancelJob sends CancelJob; stopJob sends StopJob; requestTaskManagerInstance sends RequestTaskManagerInstance; requestTaskManagerInstances sends RequestRegisteredTaskManagers; requestClassloadingProps sends RequestClassloadingProps; requestMultipleJobDetails sends RequestJobDetails; requestJob sends RequestJob; requestClusterOverview sends RequestStatusOverview; requestJobsOverview sends RequestJobsWithIDsOverview; requestRestAddress sends RequestRestAddress.

Summary: RestfulGateway extends RpcGateway and declares cancelJob, stopJob, requestRestAddress, requestJob, requestJobResult, requestMultipleJobDetails, requestClusterOverview, requestMetricQueryServicePaths and requestTaskManagerMetricQueryServicePaths, plus default methods (triggerSavepoint, disposeSavepoint, requestJobStatus, requestOperatorBackPressureStats, rescaleJob, shutDownCluster) whose default implementations all throw UnsupportedOperationException. JobManagerGateway extends RestfulGateway and adds requestBlobServerPort, submitJob, cancelJobWithSavepoint, cancelJob, stopJob, requestClassloadingProps, requestTaskManagerInstance, requestTaskManagerInstances and requestJobsOverview; its implementation is AkkaJobManagerGateway, which implements almost every method by sending the corresponding message from JobManagerMessages through the wrapped ActorGateway.

doc: JobManagerGateway ...
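To make the delegation concrete, here is a minimal usage sketch. It relies only on the constructor and methods shown above; the class name and the way an ActorGateway pointing at the JobManager actor is obtained are assumptions for illustration, not code from the post.

import java.util.concurrent.CompletableFuture;

import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.runtime.akka.AkkaJobManagerGateway;
import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.jobmaster.JobManagerGateway;

public class JobManagerGatewaySketch {

    // actorGateway is assumed to already point at the JobManager actor
    // (obtaining it, e.g. via leader retrieval, is outside the scope of this sketch)
    static void overviewAndCancel(ActorGateway actorGateway, JobID jobId) throws Exception {
        JobManagerGateway gateway = new AkkaJobManagerGateway(actorGateway);
        Time timeout = Time.seconds(10);

        // requestJobsOverview asks the JobManager with a RequestJobsWithIDsOverview message
        CompletableFuture<?> overview = gateway.requestJobsOverview(timeout);
        System.out.println("jobs overview: " + overview.get());

        // cancelJob sends a CancelJob message and completes with Acknowledge
        // once the JobManager answers with CancellationSuccess
        gateway.cancelJob(jobId, timeout).get();
    }
}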

March 18, 2019 · 4 min · jiezi

A look at Flink's MetricQueryServiceGateway

序本文主要研究一下flink的MetricQueryServiceGatewayMetricQueryServiceGatewayflink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/webmonitor/retriever/MetricQueryServiceGateway.javapublic interface MetricQueryServiceGateway { CompletableFuture<MetricDumpSerialization.MetricSerializationResult> queryMetrics(Time timeout); String getAddress();}MetricQueryServiceGateway定义了两个方法,一个是queryMetrics,一个是getAddress;它有一个实现类为AkkaQueryServiceGatewayAkkaQueryServiceGatewayflink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/webmonitor/retriever/impl/AkkaQueryServiceGateway.javapublic class AkkaQueryServiceGateway implements MetricQueryServiceGateway { private final ActorRef queryServiceActorRef; public AkkaQueryServiceGateway(ActorRef queryServiceActorRef) { this.queryServiceActorRef = Preconditions.checkNotNull(queryServiceActorRef); } @Override public CompletableFuture<MetricDumpSerialization.MetricSerializationResult> queryMetrics(Time timeout) { return FutureUtils.toJava( Patterns.ask(queryServiceActorRef, MetricQueryService.getCreateDump(), timeout.toMilliseconds()) .mapTo(ClassTag$.MODULE$.apply(MetricDumpSerialization.MetricSerializationResult.class)) ); } @Override public String getAddress() { return queryServiceActorRef.path().toString(); }}AkkaQueryServiceGateway实现了MetricQueryServiceGateway接口,它的构造器要求传入queryServiceActorRef;queryMetrics方法ask的消息类型为MetricQueryService.CreateDump;getAddress方法返回的是queryServiceActorRef.path()MetricQueryServiceflink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/metrics/dump/MetricQueryService.javapublic class MetricQueryService extends UntypedActor { private static final Logger LOG = LoggerFactory.getLogger(MetricQueryService.class); public static final String METRIC_QUERY_SERVICE_NAME = “MetricQueryService”; private static final String SIZE_EXCEEDED_LOG_TEMPLATE = “{} will not be reported as the metric dump would exceed the maximum size of {} bytes.”; private static final CharacterFilter FILTER = new CharacterFilter() { @Override public String filterCharacters(String input) { return replaceInvalidChars(input); } }; private final MetricDumpSerializer serializer = new MetricDumpSerializer(); private final Map<Gauge<?>, Tuple2<QueryScopeInfo, String>> gauges = new HashMap<>(); private final Map<Counter, Tuple2<QueryScopeInfo, String>> counters = new HashMap<>(); private final Map<Histogram, Tuple2<QueryScopeInfo, String>> histograms = new HashMap<>(); private final Map<Meter, Tuple2<QueryScopeInfo, String>> meters = new HashMap<>(); private final long messageSizeLimit; //…… @Override public void onReceive(Object message) { try { if (message instanceof AddMetric) { AddMetric added = (AddMetric) message; String metricName = added.metricName; Metric metric = added.metric; AbstractMetricGroup group = added.group; QueryScopeInfo info = group.getQueryServiceMetricInfo(FILTER); if (metric instanceof Counter) { counters.put((Counter) metric, new Tuple2<>(info, FILTER.filterCharacters(metricName))); } else if (metric instanceof Gauge) { gauges.put((Gauge<?>) metric, new Tuple2<>(info, FILTER.filterCharacters(metricName))); } else if (metric instanceof Histogram) { histograms.put((Histogram) metric, new Tuple2<>(info, FILTER.filterCharacters(metricName))); } else if (metric instanceof Meter) { meters.put((Meter) metric, new Tuple2<>(info, FILTER.filterCharacters(metricName))); } } else if (message instanceof RemoveMetric) { Metric metric = (((RemoveMetric) message).metric); if (metric instanceof Counter) { this.counters.remove(metric); } else if (metric 
instanceof Gauge) { this.gauges.remove(metric); } else if (metric instanceof Histogram) { this.histograms.remove(metric); } else if (metric instanceof Meter) { this.meters.remove(metric); } } else if (message instanceof CreateDump) { MetricDumpSerialization.MetricSerializationResult dump = serializer.serialize(counters, gauges, histograms, meters); dump = enforceSizeLimit(dump); getSender().tell(dump, getSelf()); } else { LOG.warn(“MetricQueryServiceActor received an invalid message. " + message.toString()); getSender().tell(new Status.Failure(new IOException(“MetricQueryServiceActor received an invalid message. " + message.toString())), getSelf()); } } catch (Exception e) { LOG.warn(“An exception occurred while processing a message.”, e); } } public static Object getCreateDump() { return CreateDump.INSTANCE; } private static class CreateDump implements Serializable { private static final CreateDump INSTANCE = new CreateDump(); } //……}MetricQueryService继承了UntypedActor,它的onReceive方法判断message类型,如果为CreateDump的话,则调用MetricDumpSerialization.MetricDumpSerializer.serialize(counters, gauges, histograms, meters)方法来序列化metrics得到MetricDumpSerialization.MetricSerializationResult,然后使用getSender().tell(dump, getSelf())返回数据MetricDumpSerializationflink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/metrics/dump/MetricDumpSerialization.javapublic class MetricDumpSerialization { //…… public static class MetricSerializationResult implements Serializable { private static final long serialVersionUID = 6928770855951536906L; public final byte[] serializedCounters; public final byte[] serializedGauges; public final byte[] serializedMeters; public final byte[] serializedHistograms; public final int numCounters; public final int numGauges; public final int numMeters; public final int numHistograms; public MetricSerializationResult( byte[] serializedCounters, byte[] serializedGauges, byte[] serializedMeters, byte[] serializedHistograms, int numCounters, int numGauges, int numMeters, int numHistograms) { Preconditions.checkNotNull(serializedCounters); Preconditions.checkNotNull(serializedGauges); Preconditions.checkNotNull(serializedMeters); Preconditions.checkNotNull(serializedHistograms); Preconditions.checkArgument(numCounters >= 0); Preconditions.checkArgument(numGauges >= 0); Preconditions.checkArgument(numMeters >= 0); Preconditions.checkArgument(numHistograms >= 0); this.serializedCounters = serializedCounters; this.serializedGauges = serializedGauges; this.serializedMeters = serializedMeters; this.serializedHistograms = serializedHistograms; this.numCounters = numCounters; this.numGauges = numGauges; this.numMeters = numMeters; this.numHistograms = numHistograms; } } public static class MetricDumpSerializer { private DataOutputSerializer countersBuffer = new DataOutputSerializer(1024 * 8); private DataOutputSerializer gaugesBuffer = new DataOutputSerializer(1024 * 8); private DataOutputSerializer metersBuffer = new DataOutputSerializer(1024 * 8); private DataOutputSerializer histogramsBuffer = new DataOutputSerializer(1024 * 8); public MetricSerializationResult serialize( Map<Counter, Tuple2<QueryScopeInfo, String>> counters, Map<Gauge<?>, Tuple2<QueryScopeInfo, String>> gauges, Map<Histogram, Tuple2<QueryScopeInfo, String>> histograms, Map<Meter, Tuple2<QueryScopeInfo, String>> meters) { countersBuffer.clear(); int numCounters = 0; for (Map.Entry<Counter, Tuple2<QueryScopeInfo, String>> entry : counters.entrySet()) { try { serializeCounter(countersBuffer, entry.getValue().f0, entry.getValue().f1, 
entry.getKey()); numCounters++; } catch (Exception e) { LOG.debug("Failed to serialize counter.", e); } } gaugesBuffer.clear(); int numGauges = 0; for (Map.Entry<Gauge<?>, Tuple2<QueryScopeInfo, String>> entry : gauges.entrySet()) { try { serializeGauge(gaugesBuffer, entry.getValue().f0, entry.getValue().f1, entry.getKey()); numGauges++; } catch (Exception e) { LOG.debug("Failed to serialize gauge.", e); } } histogramsBuffer.clear(); int numHistograms = 0; for (Map.Entry<Histogram, Tuple2<QueryScopeInfo, String>> entry : histograms.entrySet()) { try { serializeHistogram(histogramsBuffer, entry.getValue().f0, entry.getValue().f1, entry.getKey()); numHistograms++; } catch (Exception e) { LOG.debug("Failed to serialize histogram.", e); } } metersBuffer.clear(); int numMeters = 0; for (Map.Entry<Meter, Tuple2<QueryScopeInfo, String>> entry : meters.entrySet()) { try { serializeMeter(metersBuffer, entry.getValue().f0, entry.getValue().f1, entry.getKey()); numMeters++; } catch (Exception e) { LOG.debug("Failed to serialize meter.", e); } } return new MetricSerializationResult( countersBuffer.getCopyOfBuffer(), gaugesBuffer.getCopyOfBuffer(), metersBuffer.getCopyOfBuffer(), histogramsBuffer.getCopyOfBuffer(), numCounters, numGauges, numMeters, numHistograms); } public void close() { countersBuffer = null; gaugesBuffer = null; metersBuffer = null; histogramsBuffer = null; } } //……}

MetricDumpSerialization contains several static classes: MetricSerializationResult, MetricDumpSerializer and MetricDumpDeserializer. MetricDumpSerializer provides the serialize method, which serializes the counters, gauges, histograms and meters maps into a MetricSerializationResult.

Summary: MetricQueryServiceGateway defines two methods, queryMetrics and getAddress, and has one implementation, AkkaQueryServiceGateway, whose constructor takes a queryServiceActorRef; queryMetrics asks that actor with a MetricQueryService.CreateDump message, and getAddress returns queryServiceActorRef.path(). MetricQueryService extends UntypedActor; when onReceive gets a CreateDump message it serializes the registered metrics via MetricDumpSerialization.MetricDumpSerializer.serialize(counters, gauges, histograms, meters) and replies with the resulting MetricSerializationResult through getSender().tell(dump, getSelf()).

doc: MetricQueryServiceGateway ...
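As an illustration of how a caller might consume this gateway, here is a small sketch. It uses only the constructor, queryMetrics and the MetricSerializationResult fields shown above; the class name and the way the MetricQueryService ActorRef is obtained are assumed.

import akka.actor.ActorRef;

import org.apache.flink.api.common.time.Time;
import org.apache.flink.runtime.metrics.dump.MetricDumpSerialization;
import org.apache.flink.runtime.webmonitor.retriever.impl.AkkaQueryServiceGateway;

public class MetricQuerySketch {

    // queryServiceActorRef is assumed to point at an already started MetricQueryService
    // actor (e.g. resolved from its actor path elsewhere)
    static void dumpMetricCounts(ActorRef queryServiceActorRef) throws Exception {
        AkkaQueryServiceGateway gateway = new AkkaQueryServiceGateway(queryServiceActorRef);

        // queryMetrics asks the actor with a CreateDump message and completes
        // with the serialized metric dump
        MetricDumpSerialization.MetricSerializationResult result =
            gateway.queryMetrics(Time.seconds(10)).get();

        System.out.println("counters: " + result.numCounters
            + ", gauges: " + result.numGauges
            + ", meters: " + result.numMeters
            + ", histograms: " + result.numHistograms);
    }
}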

March 17, 2019 · 4 min · jiezi

A look at Flink's ActorGateway

序本文主要研究一下flink的ActorGatewayActorGatewayflink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/instance/ActorGateway.javapublic interface ActorGateway extends Serializable { /** * Sends a message asynchronously and returns its response. The response to the message is * returned as a future. * * @param message Message to be sent * @param timeout Timeout until the Future is completed with an AskTimeoutException * @return Future which contains the response to the sent message / Future<Object> ask(Object message, FiniteDuration timeout); /* * Sends a message asynchronously without a result. * * @param message Message to be sent / void tell(Object message); /* * Sends a message asynchronously without a result with sender being the sender. * * @param message Message to be sent * @param sender Sender of the message / void tell(Object message, ActorGateway sender); /* * Forwards a message. For the receiver of this message it looks as if sender has sent the * message. * * @param message Message to be sent * @param sender Sender of the forwarded message / void forward(Object message, ActorGateway sender); /* * Retries to send asynchronously a message up to numberRetries times. The response to this * message is returned as a future. The message is re-sent if the number of retries is not yet * exceeded and if an exception occurred while sending it. * * @param message Message to be sent * @param numberRetries Number of times to retry sending the message * @param timeout Timeout for each sending attempt * @param executionContext ExecutionContext which is used to send the message multiple times * @return Future of the response to the sent message / Future<Object> retry( Object message, int numberRetries, FiniteDuration timeout, ExecutionContext executionContext); /* * Returns the path of the remote instance. * * @return Path of the remote instance. / String path(); /* * Returns the underlying actor with which is communicated * * @return ActorRef of the target actor / ActorRef actor(); /* * Returns the leaderSessionID associated with the remote actor or null. * * @return Leader session ID if its associated with this gateway, otherwise null / UUID leaderSessionID();}ActorGateway接口定义了ask、tell、forward、retry、path、actor、leaderSessionID方法;它有一个实现类为AkkaActorGatewayAkkaActorGatewayflink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/instance/AkkaActorGateway.javapublic class AkkaActorGateway implements ActorGateway, Serializable { private static final long serialVersionUID = 42L; // ActorRef of the remote instance private final ActorRef actor; // Associated leader session ID, which is used for RequiresLeaderSessionID messages private final UUID leaderSessionID; // Decorator for messages private final MessageDecorator decorator; public AkkaActorGateway(ActorRef actor, UUID leaderSessionID) { this.actor = Preconditions.checkNotNull(actor); this.leaderSessionID = Preconditions.checkNotNull(leaderSessionID); // we want to wrap RequiresLeaderSessionID messages in a LeaderSessionMessage this.decorator = new LeaderSessionMessageDecorator(leaderSessionID); } /* * Sends a message asynchronously and returns its response. The response to the message is * returned as a future. 
* * @param message Message to be sent * @param timeout Timeout until the Future is completed with an AskTimeoutException * @return Future which contains the response to the sent message / @Override public Future<Object> ask(Object message, FiniteDuration timeout) { Object newMessage = decorator.decorate(message); return Patterns.ask(actor, newMessage, new Timeout(timeout)); } /* * Sends a message asynchronously without a result. * * @param message Message to be sent / @Override public void tell(Object message) { Object newMessage = decorator.decorate(message); actor.tell(newMessage, ActorRef.noSender()); } /* * Sends a message asynchronously without a result with sender being the sender. * * @param message Message to be sent * @param sender Sender of the message / @Override public void tell(Object message, ActorGateway sender) { Object newMessage = decorator.decorate(message); actor.tell(newMessage, sender.actor()); } /* * Forwards a message. For the receiver of this message it looks as if sender has sent the * message. * * @param message Message to be sent * @param sender Sender of the forwarded message / @Override public void forward(Object message, ActorGateway sender) { Object newMessage = decorator.decorate(message); actor.tell(newMessage, sender.actor()); } /* * Retries to send asynchronously a message up to numberRetries times. The response to this * message is returned as a future. The message is re-sent if the number of retries is not yet * exceeded and if an exception occurred while sending it. * * @param message Message to be sent * @param numberRetries Number of times to retry sending the message * @param timeout Timeout for each sending attempt * @param executionContext ExecutionContext which is used to send the message multiple times * @return Future of the response to the sent message / @Override public Future<Object> retry( Object message, int numberRetries, FiniteDuration timeout, ExecutionContext executionContext) { Object newMessage = decorator.decorate(message); return AkkaUtils.retry( actor, newMessage, numberRetries, executionContext, timeout); } /* * Returns the ActorPath of the remote instance. * * @return ActorPath of the remote instance. 
*/ @Override public String path() { return actor.path().toString(); } /** * Returns {@link ActorRef} of the target actor * * @return ActorRef of the target actor */ @Override public ActorRef actor() { return actor; } @Override public UUID leaderSessionID() { return leaderSessionID; } @Override public String toString() { return String.format("AkkaActorGateway(%s, %s)", actor.path(), leaderSessionID); }}

AkkaActorGateway implements the ActorGateway interface. Its constructor requires an ActorRef and a leaderSessionID and builds a LeaderSessionMessageDecorator from the leaderSessionID; ask, tell, forward and retry all first wrap the message parameter via LeaderSessionMessageDecorator.decorate and then call the corresponding ActorRef method.

MessageDecorator (flink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/messages/MessageDecorator.java):

public interface MessageDecorator extends java.io.Serializable { /** * Decorates a message * * @param message Message to decorate * @return Decorated message */ Object decorate(Object message);}

The MessageDecorator interface defines a decorate method for wrapping a message; its implementation is LeaderSessionMessageDecorator.

LeaderSessionMessageDecorator (flink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/messages/LeaderSessionMessageDecorator.java):

public class LeaderSessionMessageDecorator implements MessageDecorator { private static final long serialVersionUID = 5359618147408392706L; /** Leader session ID with which the RequiresLeaderSessionID messages will be decorated */ private final UUID leaderSessionID; /** * Sets the leader session ID with which the messages will be decorated. * * @param leaderSessionID Leader session ID to be used for decoration */ public LeaderSessionMessageDecorator(UUID leaderSessionID) { this.leaderSessionID = leaderSessionID; } @Override public Object decorate(Object message) { if (message instanceof RequiresLeaderSessionID) { return new JobManagerMessages.LeaderSessionMessage(leaderSessionID, message); } else { return message; } }}

LeaderSessionMessageDecorator implements the MessageDecorator interface; its decorate method checks whether the message is a RequiresLeaderSessionID and, if so, returns a JobManagerMessages.LeaderSessionMessage, otherwise it returns the original message.

JobManagerMessages.LeaderSessionMessage (flink-1.7.2/flink-runtime/src/main/scala/org/apache/flink/runtime/messages/JobManagerMessages.scala):

object JobManagerMessages { /** Wrapper class for leader session messages. Leader session messages implement the * [[RequiresLeaderSessionID]] interface and have to be wrapped in a [[LeaderSessionMessage]], * which also contains the current leader session ID. * * @param leaderSessionID Current leader session ID * @param message [[RequiresLeaderSessionID]] message to be wrapped in a [[LeaderSessionMessage]] */ case class LeaderSessionMessage(leaderSessionID: UUID, message: Any) //……}

JobManagerMessages.LeaderSessionMessage is a case class with two fields, leaderSessionID and message.

Summary: The ActorGateway interface defines ask, tell, forward, retry, path, actor and leaderSessionID; its implementation AkkaActorGateway takes an ActorRef and a leaderSessionID, wraps every outgoing message through LeaderSessionMessageDecorator.decorate and then delegates to the ActorRef. LeaderSessionMessageDecorator wraps RequiresLeaderSessionID messages into JobManagerMessages.LeaderSessionMessage (a case class holding leaderSessionID and message) and passes all other messages through unchanged.

doc: ActorGateway ...
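To illustrate the decorate contract, here is a hypothetical decorator (not part of Flink) that logs each outgoing message and then delegates to LeaderSessionMessageDecorator; it depends only on the MessageDecorator interface and the constructor shown above.

import java.util.UUID;

import org.apache.flink.runtime.messages.LeaderSessionMessageDecorator;
import org.apache.flink.runtime.messages.MessageDecorator;

public class LoggingMessageDecorator implements MessageDecorator {

    private static final long serialVersionUID = 1L;

    private final MessageDecorator delegate;

    public LoggingMessageDecorator(UUID leaderSessionID) {
        this.delegate = new LeaderSessionMessageDecorator(leaderSessionID);
    }

    @Override
    public Object decorate(Object message) {
        System.out.println("sending " + message.getClass().getSimpleName());
        // RequiresLeaderSessionID messages come back wrapped in a LeaderSessionMessage,
        // everything else is returned unchanged by the delegate
        return delegate.decorate(message);
    }
}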

March 16, 2019 · 5 min · jiezi

A look at Flink's FencedAkkaInvocationHandler

序本文主要研究一下flink的FencedAkkaInvocationHandlerFencedRpcGatewayflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rpc/FencedRpcGateway.javapublic interface FencedRpcGateway<F extends Serializable> extends RpcGateway { /** * Get the current fencing token. * * @return current fencing token / F getFencingToken();}FencedRpcGateway接口继承了RpcGateway接口,它定义一个泛型F,即为fencing token的泛型FencedMainThreadExecutableflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rpc/FencedMainThreadExecutable.javapublic interface FencedMainThreadExecutable extends MainThreadExecutable { /* * Run the given runnable in the main thread without attaching a fencing token. * * @param runnable to run in the main thread without validating the fencing token. / void runAsyncWithoutFencing(Runnable runnable); /* * Run the given callable in the main thread without attaching a fencing token. * * @param callable to run in the main thread without validating the fencing token. * @param timeout for the operation * @param <V> type of the callable result * @return Future containing the callable result */ <V> CompletableFuture<V> callAsyncWithoutFencing(Callable<V> callable, Time timeout);}FencedMainThreadExecutable接口继承了MainThreadExecutable,它定义了runAsyncWithoutFencing、callAsyncWithoutFencing方法用于运行unfenced runnable或者unfenced callable,之所以这样定义主要是因为FencedMainThreadExecutable继承了MainThreadExecutable,因而MainThreadExecutable里头定义的runAsync、callAsync、scheduleRunAsync方法的语义就变成是FencedFencedAkkaInvocationHandlerflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rpc/akka/FencedAkkaInvocationHandler.javapublic class FencedAkkaInvocationHandler<F extends Serializable> extends AkkaInvocationHandler implements FencedMainThreadExecutable, FencedRpcGateway<F> { private final Supplier<F> fencingTokenSupplier; public FencedAkkaInvocationHandler( String address, String hostname, ActorRef rpcEndpoint, Time timeout, long maximumFramesize, @Nullable CompletableFuture<Void> terminationFuture, Supplier<F> fencingTokenSupplier) { super(address, hostname, rpcEndpoint, timeout, maximumFramesize, terminationFuture); this.fencingTokenSupplier = Preconditions.checkNotNull(fencingTokenSupplier); } @Override public Object invoke(Object proxy, Method method, Object[] args) throws Throwable { Class<?> declaringClass = method.getDeclaringClass(); if (declaringClass.equals(FencedMainThreadExecutable.class) || declaringClass.equals(FencedRpcGateway.class)) { return method.invoke(this, args); } else { return super.invoke(proxy, method, args); } } @Override public void runAsyncWithoutFencing(Runnable runnable) { checkNotNull(runnable, “runnable”); if (isLocal) { getActorRef().tell( new UnfencedMessage<>(new RunAsync(runnable, 0L)), ActorRef.noSender()); } else { throw new RuntimeException(“Trying to send a Runnable to a remote actor at " + getActorRef().path() + “. This is not supported.”); } } @Override public <V> CompletableFuture<V> callAsyncWithoutFencing(Callable<V> callable, Time timeout) { checkNotNull(callable, “callable”); checkNotNull(timeout, “timeout”); if (isLocal) { @SuppressWarnings(“unchecked”) CompletableFuture<V> resultFuture = (CompletableFuture<V>) FutureUtils.toJava( Patterns.ask( getActorRef(), new UnfencedMessage<>(new CallAsync(callable)), timeout.toMilliseconds())); return resultFuture; } else { throw new RuntimeException(“Trying to send a Runnable to a remote actor at " + getActorRef().path() + “. 
This is not supported.”); } } @Override public void tell(Object message) { super.tell(fenceMessage(message)); } @Override public CompletableFuture<?> ask(Object message, Time timeout) { return super.ask(fenceMessage(message), timeout); } @Override public F getFencingToken() { return fencingTokenSupplier.get(); } private <P> FencedMessage<F, P> fenceMessage(P message) { if (isLocal) { return new LocalFencedMessage<>(fencingTokenSupplier.get(), message); } else { if (message instanceof Serializable) { @SuppressWarnings(“unchecked”) FencedMessage<F, P> result = (FencedMessage<F, P>) new RemoteFencedMessage<>(fencingTokenSupplier.get(), (Serializable) message); return result; } else { throw new RuntimeException(“Trying to send a non-serializable message " + message + " to a remote " + “RpcEndpoint. Please make sure that the message implements java.io.Serializable.”); } } }}FencedAkkaInvocationHandler继承了AkkaInvocationHandler,实现了FencedMainThreadExecutable、FencedRpcGateway接口;runAsyncWithoutFencing、callAsyncWithoutFencing发送的均为UnfencedMessageFencedAkkaInvocationHandler的invoke方法针对FencedRpcGateway、FencedMainThreadExecutable的方法则对当前对象进行对应方法调用,其他的就转为调用父类的invoke方法父类的runAsync、scheduleRunAsync、callAsync最后调用的是tell或者ask方法,而FencedAkkaInvocationHandler覆盖了父类的tell及ask方法,将runAsync、scheduleRunAsync、callAsync方法的语义变为Fenced;这里的tell及ask方法通过fenceMessage方法构造FencedMessage,而fenceMessage方法通过getFencingToken方法获取fencing token;getFencingToken方法调用的是fencingTokenSupplier.get(),fencingTokenSupplier由构造器传入UnfencedMessageflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rpc/messages/UnfencedMessage.javapublic class UnfencedMessage<P> { private final P payload; public UnfencedMessage(P payload) { this.payload = Preconditions.checkNotNull(payload); } public P getPayload() { return payload; } @Override public String toString() { return “UnfencedMessage(” + payload + ‘)’; }}UnfencedMessage即不需要fencing token的messageFencedMessageflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rpc/messages/FencedMessage.javapublic interface FencedMessage<F extends Serializable, P> { F getFencingToken(); P getPayload();}FencedMessage接口定义了getFencingToken及getPayload方法;它有两个子类,分别是LocalFencedMessage及RemoteFencedMessageLocalFencedMessageflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rpc/messages/LocalFencedMessage.javapublic class LocalFencedMessage<F extends Serializable, P> implements FencedMessage<F, P> { private final F fencingToken; private final P payload; public LocalFencedMessage(@Nullable F fencingToken, P payload) { this.fencingToken = fencingToken; this.payload = Preconditions.checkNotNull(payload); } @Override public F getFencingToken() { return fencingToken; } @Override public P getPayload() { return payload; } @Override public String toString() { return “LocalFencedMessage(” + fencingToken + “, " + payload + ‘)’; }}LocalFencedMessage实现了FencedMessage接口,其中fencingToken的类型要求实现Serializable接口,它有fencingToken及payload两个属性RemoteFencedMessageflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rpc/messages/RemoteFencedMessage.javapublic class RemoteFencedMessage<F extends Serializable, P extends Serializable> implements FencedMessage<F, P>, Serializable { private static final long serialVersionUID = 4043136067468477742L; private final F fencingToken; private final P payload; public RemoteFencedMessage(@Nullable F fencingToken, P payload) { this.fencingToken = fencingToken; this.payload = Preconditions.checkNotNull(payload); } @Override public F getFencingToken() { 
return fencingToken; } @Override public P getPayload() { return payload; } @Override public String toString() { return "RemoteFencedMessage(" + fencingToken + ", " + payload + ')'; }}

RemoteFencedMessage implements both FencedMessage and Serializable, and its payload type is also required to implement Serializable; it carries the two fields fencingToken and payload.

Summary: FencedRpcGateway extends RpcGateway and adds the type parameter F for the fencing token; FencedMainThreadExecutable extends MainThreadExecutable and adds runAsyncWithoutFencing and callAsyncWithoutFencing for running unfenced runnables and callables. FencedAkkaInvocationHandler extends AkkaInvocationHandler and implements both interfaces: runAsyncWithoutFencing and callAsyncWithoutFencing send UnfencedMessage, while the overridden tell and ask wrap every message via fenceMessage into a FencedMessage, which gives the inherited runAsync, scheduleRunAsync and callAsync fenced semantics. UnfencedMessage is simply a message that carries no fencing token; FencedMessage defines getFencingToken and getPayload and has two implementations, LocalFencedMessage and RemoteFencedMessage, which differ in that RemoteFencedMessage is Serializable and requires a Serializable payload.

doc: FencedAkkaInvocationHandler ...
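Below is a minimal sketch of the difference between the two message types; the UUID fencing token and the payloads are invented for illustration, but the constructors and accessors are the ones shown above.

import java.util.UUID;

import org.apache.flink.runtime.rpc.messages.FencedMessage;
import org.apache.flink.runtime.rpc.messages.LocalFencedMessage;
import org.apache.flink.runtime.rpc.messages.RemoteFencedMessage;

public class FencedMessageSketch {

    public static void main(String[] args) {
        UUID fencingToken = UUID.randomUUID();

        // local: the payload does not need to be serializable because the
        // message never leaves the JVM
        FencedMessage<UUID, Runnable> local =
            new LocalFencedMessage<>(fencingToken, () -> System.out.println("run"));

        // remote: both the token and the payload must be Serializable
        FencedMessage<UUID, String> remote =
            new RemoteFencedMessage<>(fencingToken, "a serializable payload");

        System.out.println(local.getFencingToken() + " / " + remote.getPayload());
    }
}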

March 15, 2019 · 3 min · jiezi

A look at Flink's RpcServer

序本文主要研究一下flink的RpcServerRpcGatewayflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rpc/RpcGateway.javapublic interface RpcGateway { /** * Returns the fully qualified address under which the associated rpc endpoint is reachable. * * @return Fully qualified (RPC) address under which the associated rpc endpoint is reachable / String getAddress(); /* * Returns the fully qualified hostname under which the associated rpc endpoint is reachable. * * @return Fully qualified hostname under which the associated rpc endpoint is reachable / String getHostname();}RpcGateway定义了getAddress、getHostname两个方法MainThreadExecutableflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rpc/MainThreadExecutable.javapublic interface MainThreadExecutable { /* * Execute the runnable in the main thread of the underlying RPC endpoint. * * @param runnable Runnable to be executed / void runAsync(Runnable runnable); /* * Execute the callable in the main thread of the underlying RPC endpoint and return a future for * the callable result. If the future is not completed within the given timeout, the returned * future will throw a {@link TimeoutException}. * * @param callable Callable to be executed * @param callTimeout Timeout for the future to complete * @param <V> Return value of the callable * @return Future of the callable result / <V> CompletableFuture<V> callAsync(Callable<V> callable, Time callTimeout); /* * Execute the runnable in the main thread of the underlying RPC endpoint, with * a delay of the given number of milliseconds. * * @param runnable Runnable to be executed * @param delay The delay, in milliseconds, after which the runnable will be executed / void scheduleRunAsync(Runnable runnable, long delay);}MainThreadExecutable定义了runAsync、callAsync、scheduleRunAsync三个方法StartStoppableflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rpc/StartStoppable.javapublic interface StartStoppable { /* * Starts the processing of remote procedure calls. / void start(); /* * Stops the processing of remote procedure calls. / void stop();}StartStoppable定义了start、stop方法RpcServerflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rpc/RpcServer.javapublic interface RpcServer extends StartStoppable, MainThreadExecutable, RpcGateway { /* * Return a future which is completed when the rpc endpoint has been terminated. * * @return Future indicating when the rpc endpoint has been terminated / CompletableFuture<Void> getTerminationFuture();}RpcServer接口继承了RpcGateway、MainThreadExecutable、StartStoppable三个接口,另外定义了getTerminationFuture方法;它有两个实现类,分别是AkkaInvocationHandler、FencedAkkaInvocationHandler;其中FencedAkkaInvocationHandler继承了AkkaInvocationHandlerAkkaBasedEndpointflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rpc/akka/AkkaBasedEndpoint.javainterface AkkaBasedEndpoint extends RpcGateway { /* * Returns the {@link ActorRef} of the underlying RPC actor. * * @return the {@link ActorRef} of the underlying RPC actor / ActorRef getActorRef();}AkkaBasedEndpoint接口继承了RpcGateway接口,它另外定义了getActorRef()方法用于获取ActorRefAkkaInvocationHandlerflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rpc/akka/AkkaInvocationHandler.javaclass AkkaInvocationHandler implements InvocationHandler, AkkaBasedEndpoint, RpcServer { private static final Logger LOG = LoggerFactory.getLogger(AkkaInvocationHandler.class); /* * The Akka (RPC) address of {@link #rpcEndpoint} including host and port of the ActorSystem in * which the actor is running. 
/ private final String address; /* * Hostname of the host, {@link #rpcEndpoint} is running on. */ private final String hostname; private final ActorRef rpcEndpoint; // whether the actor ref is local and thus no message serialization is needed protected final boolean isLocal; // default timeout for asks private final Time timeout; private final long maximumFramesize; // null if gateway; otherwise non-null @Nullable private final CompletableFuture<Void> terminationFuture; AkkaInvocationHandler( String address, String hostname, ActorRef rpcEndpoint, Time timeout, long maximumFramesize, @Nullable CompletableFuture<Void> terminationFuture) { this.address = Preconditions.checkNotNull(address); this.hostname = Preconditions.checkNotNull(hostname); this.rpcEndpoint = Preconditions.checkNotNull(rpcEndpoint); this.isLocal = this.rpcEndpoint.path().address().hasLocalScope(); this.timeout = Preconditions.checkNotNull(timeout); this.maximumFramesize = maximumFramesize; this.terminationFuture = terminationFuture; } @Override public Object invoke(Object proxy, Method method, Object[] args) throws Throwable { Class<?> declaringClass = method.getDeclaringClass(); Object result; if (declaringClass.equals(AkkaBasedEndpoint.class) || declaringClass.equals(Object.class) || declaringClass.equals(RpcGateway.class) || declaringClass.equals(StartStoppable.class) || declaringClass.equals(MainThreadExecutable.class) || declaringClass.equals(RpcServer.class)) { result = method.invoke(this, args); } else if (declaringClass.equals(FencedRpcGateway.class)) { throw new UnsupportedOperationException(“AkkaInvocationHandler does not support the call FencedRpcGateway#” + method.getName() + “. This indicates that you retrieved a FencedRpcGateway without specifying a " + “fencing token. Please use RpcService#connect(RpcService, F, Time) with F being the fencing token to " + “retrieve a properly FencedRpcGateway.”); } else { result = invokeRpc(method, args); } return result; } @Override public ActorRef getActorRef() { return rpcEndpoint; } @Override public void runAsync(Runnable runnable) { scheduleRunAsync(runnable, 0L); } @Override public void scheduleRunAsync(Runnable runnable, long delayMillis) { checkNotNull(runnable, “runnable”); checkArgument(delayMillis >= 0, “delay must be zero or greater”); if (isLocal) { long atTimeNanos = delayMillis == 0 ? 0 : System.nanoTime() + (delayMillis * 1_000_000); tell(new RunAsync(runnable, atTimeNanos)); } else { throw new RuntimeException(“Trying to send a Runnable to a remote actor at " + rpcEndpoint.path() + “. This is not supported.”); } } @Override public <V> CompletableFuture<V> callAsync(Callable<V> callable, Time callTimeout) { if (isLocal) { @SuppressWarnings(“unchecked”) CompletableFuture<V> resultFuture = (CompletableFuture<V>) ask(new CallAsync(callable), callTimeout); return resultFuture; } else { throw new RuntimeException(“Trying to send a Callable to a remote actor at " + rpcEndpoint.path() + “. 
This is not supported.”); } } @Override public void start() { rpcEndpoint.tell(Processing.START, ActorRef.noSender()); } @Override public void stop() { rpcEndpoint.tell(Processing.STOP, ActorRef.noSender()); } // ———————————————————————— // Private methods // ———————————————————————— private Object invokeRpc(Method method, Object[] args) throws Exception { String methodName = method.getName(); Class<?>[] parameterTypes = method.getParameterTypes(); Annotation[][] parameterAnnotations = method.getParameterAnnotations(); Time futureTimeout = extractRpcTimeout(parameterAnnotations, args, timeout); final RpcInvocation rpcInvocation = createRpcInvocationMessage(methodName, parameterTypes, args); Class<?> returnType = method.getReturnType(); final Object result; if (Objects.equals(returnType, Void.TYPE)) { tell(rpcInvocation); result = null; } else if (Objects.equals(returnType, CompletableFuture.class)) { // execute an asynchronous call result = ask(rpcInvocation, futureTimeout); } else { // execute a synchronous call CompletableFuture<?> futureResult = ask(rpcInvocation, futureTimeout); result = futureResult.get(futureTimeout.getSize(), futureTimeout.getUnit()); } return result; } protected RpcInvocation createRpcInvocationMessage( final String methodName, final Class<?>[] parameterTypes, final Object[] args) throws IOException { final RpcInvocation rpcInvocation; if (isLocal) { rpcInvocation = new LocalRpcInvocation( methodName, parameterTypes, args); } else { try { RemoteRpcInvocation remoteRpcInvocation = new RemoteRpcInvocation( methodName, parameterTypes, args); if (remoteRpcInvocation.getSize() > maximumFramesize) { throw new IOException(“The rpc invocation size exceeds the maximum akka framesize.”); } else { rpcInvocation = remoteRpcInvocation; } } catch (IOException e) { LOG.warn(“Could not create remote rpc invocation message. Failing rpc invocation because…”, e); throw e; } } return rpcInvocation; } // ———————————————————————— // Helper methods // ———————————————————————— private static Time extractRpcTimeout(Annotation[][] parameterAnnotations, Object[] args, Time defaultTimeout) { if (args != null) { Preconditions.checkArgument(parameterAnnotations.length == args.length); for (int i = 0; i < parameterAnnotations.length; i++) { if (isRpcTimeout(parameterAnnotations[i])) { if (args[i] instanceof Time) { return (Time) args[i]; } else { throw new RuntimeException(“The rpc timeout parameter must be of type " + Time.class.getName() + “. 
The type " + args[i].getClass().getName() + " is not supported."); } } } } return defaultTimeout; } private static boolean isRpcTimeout(Annotation[] annotations) { for (Annotation annotation : annotations) { if (annotation.annotationType().equals(RpcTimeout.class)) { return true; } } return false; } protected void tell(Object message) { rpcEndpoint.tell(message, ActorRef.noSender()); } protected CompletableFuture<?> ask(Object message, Time timeout) { return FutureUtils.toJava( Patterns.ask(rpcEndpoint, message, timeout.toMilliseconds())); } @Override public String getAddress() { return address; } @Override public String getHostname() { return hostname; } @Override public CompletableFuture<Void> getTerminationFuture() { return terminationFuture; }}

AkkaInvocationHandler implements RpcServer, AkkaBasedEndpoint and the JDK InvocationHandler interface. Its constructor takes address, hostname, rpcEndpoint (an ActorRef) and terminationFuture; getAddress, getHostname and getTerminationFuture simply return those fields. runAsync delegates to scheduleRunAsync, which uses tell to send a RunAsync message to rpcEndpoint; callAsync uses ask (Patterns.ask) to send a CallAsync message. start executes rpcEndpoint.tell(Processing.START, ActorRef.noSender()) and stop executes rpcEndpoint.tell(Processing.STOP, ActorRef.noSender()). invoke handles methods declared on Object, RpcGateway, MainThreadExecutable, StartStoppable, AkkaBasedEndpoint and RpcServer by calling them on the handler itself, throws UnsupportedOperationException for FencedRpcGateway methods, and for all other methods calls invokeRpc, which builds an RpcInvocation message and sends it with tell for void methods or ask for methods returning a CompletableFuture (blocking on the future for any other return type).

Summary: The RpcServer interface extends RpcGateway, MainThreadExecutable and StartStoppable and adds getTerminationFuture; it has two implementations, AkkaInvocationHandler and FencedAkkaInvocationHandler (the latter extending the former). AkkaInvocationHandler bridges the dynamic proxy to the underlying Akka actor: gateway calls become RpcInvocation messages delivered via tell or ask, runAsync/scheduleRunAsync/callAsync become RunAsync and CallAsync messages, and start/stop send Processing.START and Processing.STOP to the actor.

doc: RpcServer ...
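To see how the return type drives this dispatch, consider a hypothetical gateway interface (not part of Flink); the comments describe how AkkaInvocationHandler would treat each method when it backs the proxy.

import java.util.concurrent.CompletableFuture;

import org.apache.flink.api.common.time.Time;
import org.apache.flink.runtime.rpc.RpcGateway;
import org.apache.flink.runtime.rpc.RpcTimeout;

public interface CounterGateway extends RpcGateway {

    // void return type: the proxy sends the RpcInvocation with tell (fire and forget)
    void increment(int delta);

    // CompletableFuture return type: the proxy sends the RpcInvocation with ask,
    // using the @RpcTimeout argument instead of the default timeout
    CompletableFuture<Integer> getCount(@RpcTimeout Time timeout);

    // any other return type: the proxy asks and then blocks on the future
    // until the timeout expires
    int getCountSync(@RpcTimeout Time timeout);
}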

March 14, 2019 · 5 min · jiezi

A look at Flink's ScheduledExecutor

序本文主要研究一下flink的ScheduledExecutorExecutorjava.base/java/util/concurrent/Executor.javapublic interface Executor { /** * Executes the given command at some time in the future. The command * may execute in a new thread, in a pooled thread, or in the calling * thread, at the discretion of the {@code Executor} implementation. * * @param command the runnable task * @throws RejectedExecutionException if this task cannot be * accepted for execution * @throws NullPointerException if command is null / void execute(Runnable command);}jdk的Executor接口定义了execute方法,接收参数类型为RunnableScheduledExecutorflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/concurrent/ScheduledExecutor.javapublic interface ScheduledExecutor extends Executor { /* * Executes the given command after the given delay. * * @param command the task to execute in the future * @param delay the time from now to delay the execution * @param unit the time unit of the delay parameter * @return a ScheduledFuture representing the completion of the scheduled task / ScheduledFuture<?> schedule(Runnable command, long delay, TimeUnit unit); /* * Executes the given callable after the given delay. The result of the callable is returned * as a {@link ScheduledFuture}. * * @param callable the callable to execute * @param delay the time from now to delay the execution * @param unit the time unit of the delay parameter * @param <V> result type of the callable * @return a ScheduledFuture which holds the future value of the given callable / <V> ScheduledFuture<V> schedule(Callable<V> callable, long delay, TimeUnit unit); /* * Executes the given command periodically. The first execution is started after the * {@code initialDelay}, the second execution is started after {@code initialDelay + period}, * the third after {@code initialDelay + 2*period} and so on. * The task is executed until either an execution fails, or the returned {@link ScheduledFuture} * is cancelled. * * @param command the task to be executed periodically * @param initialDelay the time from now until the first execution is triggered * @param period the time after which the next execution is triggered * @param unit the time unit of the delay and period parameter * @return a ScheduledFuture representing the periodic task. This future never completes * unless an execution of the given task fails or if the future is cancelled / ScheduledFuture<?> scheduleAtFixedRate( Runnable command, long initialDelay, long period, TimeUnit unit); /* * Executed the given command repeatedly with the given delay between the end of an execution * and the start of the next execution. * The task is executed repeatedly until either an exception occurs or if the returned * {@link ScheduledFuture} is cancelled. * * @param command the task to execute repeatedly * @param initialDelay the time from now until the first execution is triggered * @param delay the time between the end of the current and the start of the next execution * @param unit the time unit of the initial delay and the delay parameter * @return a ScheduledFuture representing the repeatedly executed task. 
This future never * completes unless the execution of the given task fails or if the future is cancelled */ ScheduledFuture<?> scheduleWithFixedDelay( Runnable command, long initialDelay, long delay, TimeUnit unit);}ScheduledExecutor接口继承了Executor,它定义了schedule、scheduleAtFixedRate、scheduleWithFixedDelay方法,其中schedule方法可以接收Runnable或者Callable,这些方法返回的都是ScheduledFuture;该接口有两个实现类,分别是ScheduledExecutorServiceAdapter及ActorSystemScheduledExecutorAdapterScheduledExecutorServiceAdapterflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/concurrent/ScheduledExecutorServiceAdapter.javapublic class ScheduledExecutorServiceAdapter implements ScheduledExecutor { private final ScheduledExecutorService scheduledExecutorService; public ScheduledExecutorServiceAdapter(ScheduledExecutorService scheduledExecutorService) { this.scheduledExecutorService = Preconditions.checkNotNull(scheduledExecutorService); } @Override public ScheduledFuture<?> schedule(Runnable command, long delay, TimeUnit unit) { return scheduledExecutorService.schedule(command, delay, unit); } @Override public <V> ScheduledFuture<V> schedule(Callable<V> callable, long delay, TimeUnit unit) { return scheduledExecutorService.schedule(callable, delay, unit); } @Override public ScheduledFuture<?> scheduleAtFixedRate(Runnable command, long initialDelay, long period, TimeUnit unit) { return scheduledExecutorService.scheduleAtFixedRate(command, initialDelay, period, unit); } @Override public ScheduledFuture<?> scheduleWithFixedDelay(Runnable command, long initialDelay, long delay, TimeUnit unit) { return scheduledExecutorService.scheduleWithFixedDelay(command, initialDelay, delay, unit); } @Override public void execute(Runnable command) { scheduledExecutorService.execute(command); }}ScheduledExecutorServiceAdapter实现了ScheduledExecutor接口,它使用的是jdk的ScheduledExecutorService来实现,使用了scheduledExecutorService的schedule、scheduleAtFixedRate、scheduleWithFixedDelay、execute方法ActorSystemScheduledExecutorAdapterflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/concurrent/akka/ActorSystemScheduledExecutorAdapter.javapublic final class ActorSystemScheduledExecutorAdapter implements ScheduledExecutor { private final ActorSystem actorSystem; public ActorSystemScheduledExecutorAdapter(ActorSystem actorSystem) { this.actorSystem = Preconditions.checkNotNull(actorSystem, “rpcService”); } @Override @Nonnull public ScheduledFuture<?> schedule(@Nonnull Runnable command, long delay, @Nonnull TimeUnit unit) { ScheduledFutureTask<Void> scheduledFutureTask = new ScheduledFutureTask<>(command, unit.toNanos(delay), 0L); Cancellable cancellable = internalSchedule(scheduledFutureTask, delay, unit); scheduledFutureTask.setCancellable(cancellable); return scheduledFutureTask; } @Override @Nonnull public <V> ScheduledFuture<V> schedule(@Nonnull Callable<V> callable, long delay, @Nonnull TimeUnit unit) { ScheduledFutureTask<V> scheduledFutureTask = new ScheduledFutureTask<>(callable, unit.toNanos(delay), 0L); Cancellable cancellable = internalSchedule(scheduledFutureTask, delay, unit); scheduledFutureTask.setCancellable(cancellable); return scheduledFutureTask; } @Override @Nonnull public ScheduledFuture<?> scheduleAtFixedRate(@Nonnull Runnable command, long initialDelay, long period, @Nonnull TimeUnit unit) { ScheduledFutureTask<Void> scheduledFutureTask = new ScheduledFutureTask<>( command, triggerTime(unit.toNanos(initialDelay)), unit.toNanos(period)); Cancellable cancellable = actorSystem.scheduler().schedule( new FiniteDuration(initialDelay, 
unit), new FiniteDuration(period, unit), scheduledFutureTask, actorSystem.dispatcher()); scheduledFutureTask.setCancellable(cancellable); return scheduledFutureTask; } @Override @Nonnull public ScheduledFuture<?> scheduleWithFixedDelay(@Nonnull Runnable command, long initialDelay, long delay, @Nonnull TimeUnit unit) { ScheduledFutureTask<Void> scheduledFutureTask = new ScheduledFutureTask<>( command, triggerTime(unit.toNanos(initialDelay)), unit.toNanos(-delay)); Cancellable cancellable = internalSchedule(scheduledFutureTask, initialDelay, unit); scheduledFutureTask.setCancellable(cancellable); return scheduledFutureTask; } @Override public void execute(@Nonnull Runnable command) { actorSystem.dispatcher().execute(command); } private Cancellable internalSchedule(Runnable runnable, long delay, TimeUnit unit) { return actorSystem.scheduler().scheduleOnce( new FiniteDuration(delay, unit), runnable, actorSystem.dispatcher()); } private long now() { return System.nanoTime(); } private long triggerTime(long delay) { return now() + delay; } private final class ScheduledFutureTask<V> extends FutureTask<V> implements RunnableScheduledFuture<V> { private long time; private final long period; private volatile Cancellable cancellable; ScheduledFutureTask(Callable<V> callable, long time, long period) { super(callable); this.time = time; this.period = period; } ScheduledFutureTask(Runnable runnable, long time, long period) { super(runnable, null); this.time = time; this.period = period; } public void setCancellable(Cancellable newCancellable) { this.cancellable = newCancellable; } @Override public void run() { if (!isPeriodic()) { super.run(); } else if (runAndReset()){ if (period > 0L) { time += period; } else { cancellable = internalSchedule(this, -period, TimeUnit.NANOSECONDS); // check whether we have been cancelled concurrently if (isCancelled()) { cancellable.cancel(); } else { time = triggerTime(-period); } } } } @Override public boolean cancel(boolean mayInterruptIfRunning) { boolean result = super.cancel(mayInterruptIfRunning); return result && cancellable.cancel(); } @Override public long getDelay(@Nonnull TimeUnit unit) { return unit.convert(time - now(), TimeUnit.NANOSECONDS); } @Override public int compareTo(@Nonnull Delayed o) { if (o == this) { return 0; } long diff = getDelay(TimeUnit.NANOSECONDS) - o.getDelay(TimeUnit.NANOSECONDS); return (diff < 0L) ? -1 : (diff > 0L) ? 
1 : 0; } @Override public boolean isPeriodic() { return period != 0L; } }}ActorSystemScheduledExecutorAdapter实现了ScheduledExecutor接口,它使用的是actorSystem来实现;其中execute方法使用的是actorSystem.dispatcher().execute方法schedule及scheduleWithFixedDelay方法调用的是internalSchedule方法,它使用的是actorSystem.scheduler().scheduleOnce方法,只是它们的ScheduledFutureTask不同,其中schedule方法的ScheduledFutureTask的period为0,而scheduleWithFixedDelay方法的ScheduledFutureTask的period为unit.toNanos(-delay);ScheduledFutureTask的run方法会对period进行判断,小于等于0的,会再次调用internalSchedule方法,来实现以FixedDelay进行调度的效果scheduleAtFixedRate方法,它使用的是actorSystem.scheduler().schedule方法,其ScheduledFutureTask的period即为方法参数的period,没有像scheduleWithFixedDelay方法那样用unit.toNanos(-delay)作为period小结ScheduledExecutor接口继承了Executor,它定义了schedule、scheduleAtFixedRate、scheduleWithFixedDelay方法,其中schedule方法可以接收Runnable或者Callable,这些方法返回的都是ScheduledFuture;该接口有两个实现类,分别是ScheduledExecutorServiceAdapter及ActorSystemScheduledExecutorAdapterScheduledExecutorServiceAdapter实现了ScheduledExecutor接口,它使用的是jdk的ScheduledExecutorService来实现,使用了scheduledExecutorService的schedule、scheduleAtFixedRate、scheduleWithFixedDelay、execute方法ActorSystemScheduledExecutorAdapter实现了ScheduledExecutor接口,它使用的是actorSystem来实现;其中execute方法使用的是actorSystem.dispatcher().execute方法;schedule及scheduleWithFixedDelay方法调用的是internalSchedule方法,它使用的是actorSystem.scheduler().scheduleOnce方法,只是它们的ScheduledFutureTask不同,其中schedule方法的ScheduledFutureTask的period为0,而scheduleWithFixedDelay方法的ScheduledFutureTask的period为unit.toNanos(-delay);ScheduledFutureTask的run方法会对period进行判断,小于等于0的,会再次调用internalSchedule方法,来实现以FixedDelay进行调度的效果;scheduleAtFixedRate方法,它使用的是actorSystem.scheduler().schedule方法,其ScheduledFutureTask的period即为方法参数的period,没有像scheduleWithFixedDelay方法那样用unit.toNanos(-delay)作为perioddocScheduledExecutorScheduledExecutorServiceAdapterActorSystemScheduledExecutorAdapter ...
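
To make the adapter relationship concrete, here is a minimal self-contained sketch (my own example, not from the post) that drives the flink ScheduledExecutor abstraction through a plain JDK ScheduledExecutorService via ScheduledExecutorServiceAdapter; the class and package names come from the flink-runtime code quoted above, while the demo class itself and the delays are illustrative assumptions:

    import java.util.concurrent.Executors;
    import java.util.concurrent.ScheduledExecutorService;
    import java.util.concurrent.TimeUnit;

    import org.apache.flink.runtime.concurrent.ScheduledExecutor;
    import org.apache.flink.runtime.concurrent.ScheduledExecutorServiceAdapter;

    // illustrative demo class, not part of Flink
    public class ScheduledExecutorDemo {
        public static void main(String[] args) throws Exception {
            // back the Flink ScheduledExecutor abstraction with a JDK scheduled thread pool
            ScheduledExecutorService jdkScheduler = Executors.newSingleThreadScheduledExecutor();
            ScheduledExecutor scheduledExecutor = new ScheduledExecutorServiceAdapter(jdkScheduler);

            // one-shot task executed after a 100ms delay
            scheduledExecutor.schedule(() -> System.out.println("one-shot"), 100, TimeUnit.MILLISECONDS);

            // fixed-delay task: the next run starts 200ms after the previous run finishes
            scheduledExecutor.scheduleWithFixedDelay(
                () -> System.out.println("fixed-delay tick"), 0, 200, TimeUnit.MILLISECONDS);

            Thread.sleep(1000);
            jdkScheduler.shutdownNow();
        }
    }

The same calls would work unchanged against an ActorSystemScheduledExecutorAdapter, which is the point of the abstraction: callers schedule against the interface and do not care whether a thread pool or the Akka scheduler runs the task.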

March 13, 2019 · 5 min · jiezi

Talking about flink's RpcService

序本文主要研究一下flink的RpcServiceRpcServiceflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rpc/RpcService.javapublic interface RpcService { String getAddress(); int getPort(); <C extends RpcGateway> CompletableFuture<C> connect( String address, Class<C> clazz); <F extends Serializable, C extends FencedRpcGateway<F>> CompletableFuture<C> connect( String address, F fencingToken, Class<C> clazz); <C extends RpcEndpoint & RpcGateway> RpcServer startServer(C rpcEndpoint); <F extends Serializable> RpcServer fenceRpcServer(RpcServer rpcServer, F fencingToken); void stopServer(RpcServer selfGateway); CompletableFuture<Void> stopService(); CompletableFuture<Void> getTerminationFuture(); Executor getExecutor(); ScheduledExecutor getScheduledExecutor(); ScheduledFuture<?> scheduleRunnable(Runnable runnable, long delay, TimeUnit unit); void execute(Runnable runnable); <T> CompletableFuture<T> execute(Callable<T> callable);}RpcService用于连接到一个远程的rpc server,或者启动一个rpc server来转发远程调用到rpcEndpoint;它提供了connect、startServer、fenceRpcServer、stopServer、stopService、getTerminationFuture、scheduleRunnable、execute等方法AkkaRpcServiceflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rpc/akka/AkkaRpcService.java@ThreadSafepublic class AkkaRpcService implements RpcService { private static final Logger LOG = LoggerFactory.getLogger(AkkaRpcService.class); static final int VERSION = 1; static final String MAXIMUM_FRAME_SIZE_PATH = “akka.remote.netty.tcp.maximum-frame-size”; private final Object lock = new Object(); private final ActorSystem actorSystem; private final Time timeout; @GuardedBy(“lock”) private final Map<ActorRef, RpcEndpoint> actors = new HashMap<>(4); private final long maximumFramesize; private final String address; private final int port; private final ScheduledExecutor internalScheduledExecutor; private final CompletableFuture<Void> terminationFuture; private volatile boolean stopped; public AkkaRpcService(final ActorSystem actorSystem, final Time timeout) { this.actorSystem = checkNotNull(actorSystem, “actor system”); this.timeout = checkNotNull(timeout, “timeout”); if (actorSystem.settings().config().hasPath(MAXIMUM_FRAME_SIZE_PATH)) { maximumFramesize = actorSystem.settings().config().getBytes(MAXIMUM_FRAME_SIZE_PATH); } else { // only local communication maximumFramesize = Long.MAX_VALUE; } Address actorSystemAddress = AkkaUtils.getAddress(actorSystem); if (actorSystemAddress.host().isDefined()) { address = actorSystemAddress.host().get(); } else { address = “”; } if (actorSystemAddress.port().isDefined()) { port = (Integer) actorSystemAddress.port().get(); } else { port = -1; } internalScheduledExecutor = new ActorSystemScheduledExecutorAdapter(actorSystem); terminationFuture = new CompletableFuture<>(); stopped = false; } public ActorSystem getActorSystem() { return actorSystem; } protected int getVersion() { return VERSION; } @Override public String getAddress() { return address; } @Override public int getPort() { return port; } // this method does not mutate state and is thus thread-safe @Override public <C extends RpcGateway> CompletableFuture<C> connect( final String address, final Class<C> clazz) { return connectInternal( address, clazz, (ActorRef actorRef) -> { Tuple2<String, String> addressHostname = extractAddressHostname(actorRef); return new AkkaInvocationHandler( addressHostname.f0, addressHostname.f1, actorRef, timeout, maximumFramesize, null); }); } // this method does not mutate state and is thus thread-safe @Override public <F extends Serializable, C 
extends FencedRpcGateway<F>> CompletableFuture<C> connect(String address, F fencingToken, Class<C> clazz) { return connectInternal( address, clazz, (ActorRef actorRef) -> { Tuple2<String, String> addressHostname = extractAddressHostname(actorRef); return new FencedAkkaInvocationHandler<>( addressHostname.f0, addressHostname.f1, actorRef, timeout, maximumFramesize, null, () -> fencingToken); }); } @Override public <C extends RpcEndpoint & RpcGateway> RpcServer startServer(C rpcEndpoint) { checkNotNull(rpcEndpoint, “rpc endpoint”); CompletableFuture<Void> terminationFuture = new CompletableFuture<>(); final Props akkaRpcActorProps; if (rpcEndpoint instanceof FencedRpcEndpoint) { akkaRpcActorProps = Props.create(FencedAkkaRpcActor.class, rpcEndpoint, terminationFuture, getVersion()); } else { akkaRpcActorProps = Props.create(AkkaRpcActor.class, rpcEndpoint, terminationFuture, getVersion()); } ActorRef actorRef; synchronized (lock) { checkState(!stopped, “RpcService is stopped”); actorRef = actorSystem.actorOf(akkaRpcActorProps, rpcEndpoint.getEndpointId()); actors.put(actorRef, rpcEndpoint); } LOG.info(“Starting RPC endpoint for {} at {} .”, rpcEndpoint.getClass().getName(), actorRef.path()); final String akkaAddress = AkkaUtils.getAkkaURL(actorSystem, actorRef); final String hostname; Option<String> host = actorRef.path().address().host(); if (host.isEmpty()) { hostname = “localhost”; } else { hostname = host.get(); } Set<Class<?>> implementedRpcGateways = new HashSet<>(RpcUtils.extractImplementedRpcGateways(rpcEndpoint.getClass())); implementedRpcGateways.add(RpcServer.class); implementedRpcGateways.add(AkkaBasedEndpoint.class); final InvocationHandler akkaInvocationHandler; if (rpcEndpoint instanceof FencedRpcEndpoint) { // a FencedRpcEndpoint needs a FencedAkkaInvocationHandler akkaInvocationHandler = new FencedAkkaInvocationHandler<>( akkaAddress, hostname, actorRef, timeout, maximumFramesize, terminationFuture, ((FencedRpcEndpoint<?>) rpcEndpoint)::getFencingToken); implementedRpcGateways.add(FencedMainThreadExecutable.class); } else { akkaInvocationHandler = new AkkaInvocationHandler( akkaAddress, hostname, actorRef, timeout, maximumFramesize, terminationFuture); } // Rather than using the System ClassLoader directly, we derive the ClassLoader // from this class . That works better in cases where Flink runs embedded and all Flink // code is loaded dynamically (for example from an OSGI bundle) through a custom ClassLoader ClassLoader classLoader = getClass().getClassLoader(); @SuppressWarnings(“unchecked”) RpcServer server = (RpcServer) Proxy.newProxyInstance( classLoader, implementedRpcGateways.toArray(new Class<?>[implementedRpcGateways.size()]), akkaInvocationHandler); return server; } @Override public <F extends Serializable> RpcServer fenceRpcServer(RpcServer rpcServer, F fencingToken) { if (rpcServer instanceof AkkaBasedEndpoint) { InvocationHandler fencedInvocationHandler = new FencedAkkaInvocationHandler<>( rpcServer.getAddress(), rpcServer.getHostname(), ((AkkaBasedEndpoint) rpcServer).getActorRef(), timeout, maximumFramesize, null, () -> fencingToken); // Rather than using the System ClassLoader directly, we derive the ClassLoader // from this class . 
That works better in cases where Flink runs embedded and all Flink // code is loaded dynamically (for example from an OSGI bundle) through a custom ClassLoader ClassLoader classLoader = getClass().getClassLoader(); return (RpcServer) Proxy.newProxyInstance( classLoader, new Class<?>[]{RpcServer.class, AkkaBasedEndpoint.class}, fencedInvocationHandler); } else { throw new RuntimeException(“The given RpcServer must implement the AkkaGateway in order to fence it.”); } } @Override public void stopServer(RpcServer selfGateway) { if (selfGateway instanceof AkkaBasedEndpoint) { final AkkaBasedEndpoint akkaClient = (AkkaBasedEndpoint) selfGateway; final RpcEndpoint rpcEndpoint; synchronized (lock) { if (stopped) { return; } else { rpcEndpoint = actors.remove(akkaClient.getActorRef()); } } if (rpcEndpoint != null) { akkaClient.getActorRef().tell(PoisonPill.getInstance(), ActorRef.noSender()); } else { LOG.debug(“RPC endpoint {} already stopped or from different RPC service”, selfGateway.getAddress()); } } } @Override public CompletableFuture<Void> stopService() { synchronized (lock) { if (stopped) { return terminationFuture; } stopped = true; } LOG.info(“Stopping Akka RPC service.”); final CompletableFuture<Terminated> actorSystemTerminationFuture = FutureUtils.toJava(actorSystem.terminate()); actorSystemTerminationFuture.whenComplete( (Terminated ignored, Throwable throwable) -> { synchronized (lock) { actors.clear(); } if (throwable != null) { terminationFuture.completeExceptionally(throwable); } else { terminationFuture.complete(null); } LOG.info(“Stopped Akka RPC service.”); }); return terminationFuture; } @Override public CompletableFuture<Void> getTerminationFuture() { return terminationFuture; } @Override public Executor getExecutor() { return actorSystem.dispatcher(); } @Override public ScheduledExecutor getScheduledExecutor() { return internalScheduledExecutor; } @Override public ScheduledFuture<?> scheduleRunnable(Runnable runnable, long delay, TimeUnit unit) { checkNotNull(runnable, “runnable”); checkNotNull(unit, “unit”); checkArgument(delay >= 0L, “delay must be zero or larger”); return internalScheduledExecutor.schedule(runnable, delay, unit); } @Override public void execute(Runnable runnable) { actorSystem.dispatcher().execute(runnable); } @Override public <T> CompletableFuture<T> execute(Callable<T> callable) { Future<T> scalaFuture = Futures.<T>future(callable, actorSystem.dispatcher()); return FutureUtils.toJava(scalaFuture); } private <C extends RpcGateway> CompletableFuture<C> connectInternal( final String address, final Class<C> clazz, Function<ActorRef, InvocationHandler> invocationHandlerFactory) { checkState(!stopped, “RpcService is stopped”); LOG.debug(“Try to connect to remote RPC endpoint with address {}. 
Returning a {} gateway.”, address, clazz.getName()); final ActorSelection actorSel = actorSystem.actorSelection(address); final Future<ActorIdentity> identify = Patterns .ask(actorSel, new Identify(42), timeout.toMilliseconds()) .<ActorIdentity>mapTo(ClassTag$.MODULE$.<ActorIdentity>apply(ActorIdentity.class)); final CompletableFuture<ActorIdentity> identifyFuture = FutureUtils.toJava(identify); final CompletableFuture<ActorRef> actorRefFuture = identifyFuture.thenApply( (ActorIdentity actorIdentity) -> { if (actorIdentity.getRef() == null) { throw new CompletionException(new RpcConnectionException(“Could not connect to rpc endpoint under address " + address + ‘.’)); } else { return actorIdentity.getRef(); } }); final CompletableFuture<HandshakeSuccessMessage> handshakeFuture = actorRefFuture.thenCompose( (ActorRef actorRef) -> FutureUtils.toJava( Patterns .ask(actorRef, new RemoteHandshakeMessage(clazz, getVersion()), timeout.toMilliseconds()) .<HandshakeSuccessMessage>mapTo(ClassTag$.MODULE$.<HandshakeSuccessMessage>apply(HandshakeSuccessMessage.class)))); return actorRefFuture.thenCombineAsync( handshakeFuture, (ActorRef actorRef, HandshakeSuccessMessage ignored) -> { InvocationHandler invocationHandler = invocationHandlerFactory.apply(actorRef); // Rather than using the System ClassLoader directly, we derive the ClassLoader // from this class . That works better in cases where Flink runs embedded and all Flink // code is loaded dynamically (for example from an OSGI bundle) through a custom ClassLoader ClassLoader classLoader = getClass().getClassLoader(); @SuppressWarnings(“unchecked”) C proxy = (C) Proxy.newProxyInstance( classLoader, new Class<?>[]{clazz}, invocationHandler); return proxy; }, actorSystem.dispatcher()); } //……}AkkaRpcService实现了RpcService接口,其构造器要求传入actorSystem及timeout参数;connect方法会创建一个AkkaInvocationHandler或者FencedAkkaInvocationHandler,然后调用connectInternal方法使用akka进行连接startServer方法会利用actorSystem创建ActorRef,然后创建AkkaInvocationHandler或者FencedAkkaInvocationHandler,最后使用Proxy.newProxyInstance创建RpcServer;stopServer方法会使用PoisonPill来终止actor;stopService用于终止当前的RpcService,它会执行actorSystem.terminate()fenceRpcServer方法用于根据指定的fencingToken重新使用代理创建新的RpcServer;execute方法使用的是actorSystem.dispatcher()来调度执行;scheduleRunnable方法则使用的是ActorSystemScheduledExecutorAdapter来进行调度小结RpcService用于连接到一个远程的rpc server,或者启动一个rpc server来转发远程调用到rpcEndpoint;它提供了connect、startServer、fenceRpcServer、stopServer、stopService、getTerminationFuture、scheduleRunnable、execute等方法AkkaRpcService实现了RpcService接口,它的connect方法会创建一个AkkaInvocationHandler或者FencedAkkaInvocationHandler,然后调用connectInternal方法使用akka进行连接AkkaRpcService的startServer方法会利用actorSystem创建ActorRef,然后创建AkkaInvocationHandler或者FencedAkkaInvocationHandler,最后使用Proxy.newProxyInstance创建RpcServer;stopServer方法会使用PoisonPill来终止actor;stopService用于终止当前的RpcService,它会执行actorSystem.terminate()docRpcService ...
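
As a rough usage sketch (my own example, not from the flink sources), the snippet below builds an AkkaRpcService on top of a local ActorSystem and exercises the execute, scheduleRunnable and stopService methods discussed above; the actor system name, timeout and task bodies are arbitrary assumptions, and a real deployment would configure Akka remoting so that connect/startServer can reach remote endpoints:

    import java.util.concurrent.TimeUnit;

    import akka.actor.ActorSystem;
    import org.apache.flink.api.common.time.Time;
    import org.apache.flink.runtime.rpc.RpcService;
    import org.apache.flink.runtime.rpc.akka.AkkaRpcService;

    // illustrative demo class, not part of Flink
    public class RpcServiceDemo {
        public static void main(String[] args) throws Exception {
            // a local-only ActorSystem; remoting (host/port) is omitted for brevity
            ActorSystem actorSystem = ActorSystem.create("flink-demo");
            RpcService rpcService = new AkkaRpcService(actorSystem, Time.seconds(10));

            // run a Callable on the actor system's dispatcher and wait for the result
            String result = rpcService.execute(() -> "hello from dispatcher").get(5, TimeUnit.SECONDS);
            System.out.println(result);

            // schedule a Runnable with a delay; internally this goes through the
            // ActorSystemScheduledExecutorAdapter returned by getScheduledExecutor()
            rpcService.scheduleRunnable(() -> System.out.println("delayed task"), 100, TimeUnit.MILLISECONDS);

            Thread.sleep(500);
            // terminates the underlying ActorSystem
            rpcService.stopService().get();
        }
    }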

March 12, 2019 · 5 min · jiezi

Talking about flink taskmanager's data.port and rpc.port

序本文主要研究一下flink taskmanager的data.port与rpc.portTaskManagerServicesflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/taskexecutor/TaskManagerServices.javapublic class TaskManagerServices { //…… public static TaskManagerServices fromConfiguration( TaskManagerServicesConfiguration taskManagerServicesConfiguration, ResourceID resourceID, Executor taskIOExecutor, long freeHeapMemoryWithDefrag, long maxJvmHeapMemory) throws Exception { // pre-start checks checkTempDirs(taskManagerServicesConfiguration.getTmpDirPaths()); final NetworkEnvironment network = createNetworkEnvironment(taskManagerServicesConfiguration, maxJvmHeapMemory); network.start(); final TaskManagerLocation taskManagerLocation = new TaskManagerLocation( resourceID, taskManagerServicesConfiguration.getTaskManagerAddress(), network.getConnectionManager().getDataPort()); // this call has to happen strictly after the network stack has been initialized final MemoryManager memoryManager = createMemoryManager(taskManagerServicesConfiguration, freeHeapMemoryWithDefrag, maxJvmHeapMemory); // start the I/O manager, it will create some temp directories. final IOManager ioManager = new IOManagerAsync(taskManagerServicesConfiguration.getTmpDirPaths()); final BroadcastVariableManager broadcastVariableManager = new BroadcastVariableManager(); final List<ResourceProfile> resourceProfiles = new ArrayList<>(taskManagerServicesConfiguration.getNumberOfSlots()); for (int i = 0; i < taskManagerServicesConfiguration.getNumberOfSlots(); i++) { resourceProfiles.add(ResourceProfile.ANY); } final TimerService<AllocationID> timerService = new TimerService<>( new ScheduledThreadPoolExecutor(1), taskManagerServicesConfiguration.getTimerServiceShutdownTimeout()); final TaskSlotTable taskSlotTable = new TaskSlotTable(resourceProfiles, timerService); final JobManagerTable jobManagerTable = new JobManagerTable(); final JobLeaderService jobLeaderService = new JobLeaderService(taskManagerLocation); final String[] stateRootDirectoryStrings = taskManagerServicesConfiguration.getLocalRecoveryStateRootDirectories(); final File[] stateRootDirectoryFiles = new File[stateRootDirectoryStrings.length]; for (int i = 0; i < stateRootDirectoryStrings.length; ++i) { stateRootDirectoryFiles[i] = new File(stateRootDirectoryStrings[i], LOCAL_STATE_SUB_DIRECTORY_ROOT); } final TaskExecutorLocalStateStoresManager taskStateManager = new TaskExecutorLocalStateStoresManager( taskManagerServicesConfiguration.isLocalRecoveryEnabled(), stateRootDirectoryFiles, taskIOExecutor); return new TaskManagerServices( taskManagerLocation, memoryManager, ioManager, network, broadcastVariableManager, taskSlotTable, jobManagerTable, jobLeaderService, taskStateManager); } private static NetworkEnvironment createNetworkEnvironment( TaskManagerServicesConfiguration taskManagerServicesConfiguration, long maxJvmHeapMemory) { NetworkEnvironmentConfiguration networkEnvironmentConfiguration = taskManagerServicesConfiguration.getNetworkConfig(); final long networkBuf = calculateNetworkBufferMemory(taskManagerServicesConfiguration, maxJvmHeapMemory); int segmentSize = networkEnvironmentConfiguration.networkBufferSize(); // tolerate offcuts between intended and allocated memory due to segmentation (will be available to the user-space memory) final long numNetBuffersLong = networkBuf / segmentSize; if (numNetBuffersLong > Integer.MAX_VALUE) { throw new IllegalArgumentException(“The given number of memory bytes (” + networkBuf + “) corresponds to more than MAX_INT pages.”); } 
NetworkBufferPool networkBufferPool = new NetworkBufferPool( (int) numNetBuffersLong, segmentSize); ConnectionManager connectionManager; boolean enableCreditBased = false; NettyConfig nettyConfig = networkEnvironmentConfiguration.nettyConfig(); if (nettyConfig != null) { connectionManager = new NettyConnectionManager(nettyConfig); enableCreditBased = nettyConfig.isCreditBasedEnabled(); } else { connectionManager = new LocalConnectionManager(); } ResultPartitionManager resultPartitionManager = new ResultPartitionManager(); TaskEventDispatcher taskEventDispatcher = new TaskEventDispatcher(); KvStateRegistry kvStateRegistry = new KvStateRegistry(); QueryableStateConfiguration qsConfig = taskManagerServicesConfiguration.getQueryableStateConfig(); int numProxyServerNetworkThreads = qsConfig.numProxyServerThreads() == 0 ? taskManagerServicesConfiguration.getNumberOfSlots() : qsConfig.numProxyServerThreads(); int numProxyServerQueryThreads = qsConfig.numProxyQueryThreads() == 0 ? taskManagerServicesConfiguration.getNumberOfSlots() : qsConfig.numProxyQueryThreads(); final KvStateClientProxy kvClientProxy = QueryableStateUtils.createKvStateClientProxy( taskManagerServicesConfiguration.getTaskManagerAddress(), qsConfig.getProxyPortRange(), numProxyServerNetworkThreads, numProxyServerQueryThreads, new DisabledKvStateRequestStats()); int numStateServerNetworkThreads = qsConfig.numStateServerThreads() == 0 ? taskManagerServicesConfiguration.getNumberOfSlots() : qsConfig.numStateServerThreads(); int numStateServerQueryThreads = qsConfig.numStateQueryThreads() == 0 ? taskManagerServicesConfiguration.getNumberOfSlots() : qsConfig.numStateQueryThreads(); final KvStateServer kvStateServer = QueryableStateUtils.createKvStateServer( taskManagerServicesConfiguration.getTaskManagerAddress(), qsConfig.getStateServerPortRange(), numStateServerNetworkThreads, numStateServerQueryThreads, kvStateRegistry, new DisabledKvStateRequestStats()); // we start the network first, to make sure it can allocate its buffers first return new NetworkEnvironment( networkBufferPool, connectionManager, resultPartitionManager, taskEventDispatcher, kvStateRegistry, kvStateServer, kvClientProxy, networkEnvironmentConfiguration.ioMode(), networkEnvironmentConfiguration.partitionRequestInitialBackoff(), networkEnvironmentConfiguration.partitionRequestMaxBackoff(), networkEnvironmentConfiguration.networkBuffersPerChannel(), networkEnvironmentConfiguration.floatingNetworkBuffersPerGate(), enableCreditBased); } //……}TaskManagerServices的fromConfiguration方法从taskManagerServicesConfiguration读取配置,然后创建NetworkEnvironment,之后创建TaskManagerLocation用到了NetworkEnvironment.getConnectionManager().getDataPort()TaskExecutorToResourceManagerConnection及ConnectionID均从TaskManagerLocation获取了dataPort信息createNetworkEnvironment方法从taskManagerServicesConfiguration获取NetworkEnvironmentConfiguration(它从配置文件读取taskmanager.data.port),如果它的nettyConfig不为null,则根据它创建了NettyConnectionManagerNettyConnectionManagerflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/io/network/netty/NettyConnectionManager.javapublic class NettyConnectionManager implements ConnectionManager { private final NettyServer server; private final NettyClient client; private final NettyBufferPool bufferPool; private final PartitionRequestClientFactory partitionRequestClientFactory; public NettyConnectionManager(NettyConfig nettyConfig) { this.server = new NettyServer(nettyConfig); this.client = new NettyClient(nettyConfig); this.bufferPool = new 
NettyBufferPool(nettyConfig.getNumberOfArenas()); this.partitionRequestClientFactory = new PartitionRequestClientFactory(client); } @Override public void start(ResultPartitionProvider partitionProvider, TaskEventDispatcher taskEventDispatcher) throws IOException { NettyProtocol partitionRequestProtocol = new NettyProtocol( partitionProvider, taskEventDispatcher, client.getConfig().isCreditBasedEnabled()); client.init(partitionRequestProtocol, bufferPool); server.init(partitionRequestProtocol, bufferPool); } @Override public PartitionRequestClient createPartitionRequestClient(ConnectionID connectionId) throws IOException, InterruptedException { return partitionRequestClientFactory.createPartitionRequestClient(connectionId); } @Override public void closeOpenChannelConnections(ConnectionID connectionId) { partitionRequestClientFactory.closeOpenChannelConnections(connectionId); } @Override public int getNumberOfActiveConnections() { return partitionRequestClientFactory.getNumberOfActiveClients(); } @Override public int getDataPort() { if (server != null && server.getLocalAddress() != null) { return server.getLocalAddress().getPort(); } else { return -1; } } @Override public void shutdown() { client.shutdown(); server.shutdown(); } NettyClient getClient() { return client; } NettyServer getServer() { return server; } NettyBufferPool getBufferPool() { return bufferPool; }}NettyConnectionManager的构造器根据NettyConfig构造了NettyServer,而getDataPort则取的是server.getLocalAddress().getPort()TaskManagerRunnerflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/taskexecutor/TaskManagerRunner.javapublic class TaskManagerRunner implements FatalErrorHandler, AutoCloseableAsync { //…… public static RpcService createRpcService( final Configuration configuration, final HighAvailabilityServices haServices) throws Exception { checkNotNull(configuration); checkNotNull(haServices); String taskManagerHostname = configuration.getString(TaskManagerOptions.HOST); if (taskManagerHostname != null) { LOG.info(“Using configured hostname/address for TaskManager: {}.”, taskManagerHostname); } else { Time lookupTimeout = Time.milliseconds(AkkaUtils.getLookupTimeout(configuration).toMillis()); InetAddress taskManagerAddress = LeaderRetrievalUtils.findConnectingAddress( haServices.getResourceManagerLeaderRetriever(), lookupTimeout); taskManagerHostname = taskManagerAddress.getHostName(); LOG.info(“TaskManager will use hostname/address ‘{}’ ({}) for communication.”, taskManagerHostname, taskManagerAddress.getHostAddress()); } final String portRangeDefinition = configuration.getString(TaskManagerOptions.RPC_PORT); return AkkaRpcServiceUtils.createRpcService(taskManagerHostname, portRangeDefinition, configuration); } 
//……}TaskManagerRunner提供了createRpcService方法,其从配置文件读取taskmanager.rpc.port,然后调用AkkaRpcServiceUtils.createRpcService来创建RpcService小结TaskManagerServices的fromConfiguration方法从taskManagerServicesConfiguration读取配置,然后创建NetworkEnvironment,之后创建TaskManagerLocation用到了NetworkEnvironment.getConnectionManager().getDataPort();TaskExecutorToResourceManagerConnection及ConnectionID均从TaskManagerLocation获取了dataPort信息TaskManagerServices的createNetworkEnvironment方法从taskManagerServicesConfiguration获取NetworkEnvironmentConfiguration(它从配置文件读取taskmanager.data.port),如果它的nettyConfig不为null,则根据它创建了NettyConnectionManager;NettyConnectionManager的构造器根据NettyConfig构造了NettyServer,而getDataPort则取的是server.getLocalAddress().getPort()TaskManagerRunner提供了createRpcService方法,其从配置文件读取taskmanager.rpc.port,然后调用AkkaRpcServiceUtils.createRpcService来创建RpcServicedoctaskmanager-data-porttaskmanager-rpc-port ...
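
For reference, the two ports can also be set programmatically through the corresponding ConfigOptions; the snippet below is a small sketch of my own (the concrete values are arbitrary, and TaskManagerOptions.DATA_PORT is assumed to be the option backing taskmanager.data.port), equivalent to putting taskmanager.rpc.port and taskmanager.data.port into flink-conf.yaml:

    import org.apache.flink.configuration.Configuration;
    import org.apache.flink.configuration.TaskManagerOptions;

    // illustrative demo class, not part of Flink
    public class TaskManagerPortConfigDemo {
        public static void main(String[] args) {
            Configuration config = new Configuration();
            // taskmanager.rpc.port: a single port or a port range, read by TaskManagerRunner.createRpcService
            config.setString(TaskManagerOptions.RPC_PORT, "50100-50200");
            // taskmanager.data.port: the netty data port of the NetworkEnvironment; 0 lets the system pick a free port
            config.setInteger(TaskManagerOptions.DATA_PORT, 0);

            System.out.println(config.getString(TaskManagerOptions.RPC_PORT));
            System.out.println(config.getInteger(TaskManagerOptions.DATA_PORT));
        }
    }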

March 11, 2019 · 3 min · jiezi

Talking about flink's HistoryServer

序本文主要研究一下flink的HistoryServerHistoryServerflink-1.7.2/flink-runtime-web/src/main/java/org/apache/flink/runtime/webmonitor/history/HistoryServer.javapublic class HistoryServer { private static final Logger LOG = LoggerFactory.getLogger(HistoryServer.class); private final Configuration config; private final String webAddress; private final int webPort; private final long webRefreshIntervalMillis; private final File webDir; private final HistoryServerArchiveFetcher archiveFetcher; @Nullable private final SSLHandlerFactory serverSSLFactory; private WebFrontendBootstrap netty; private final Object startupShutdownLock = new Object(); private final AtomicBoolean shutdownRequested = new AtomicBoolean(false); private final Thread shutdownHook; public static void main(String[] args) throws Exception { ParameterTool pt = ParameterTool.fromArgs(args); String configDir = pt.getRequired(“configDir”); LOG.info(“Loading configuration from {}”, configDir); final Configuration flinkConfig = GlobalConfiguration.loadConfiguration(configDir); try { FileSystem.initialize(flinkConfig); } catch (IOException e) { throw new Exception(“Error while setting the default filesystem scheme from configuration.”, e); } // run the history server SecurityUtils.install(new SecurityConfiguration(flinkConfig)); try { SecurityUtils.getInstalledContext().runSecured(new Callable<Integer>() { @Override public Integer call() throws Exception { HistoryServer hs = new HistoryServer(flinkConfig); hs.run(); return 0; } }); System.exit(0); } catch (Throwable t) { final Throwable strippedThrowable = ExceptionUtils.stripException(t, UndeclaredThrowableException.class); LOG.error(“Failed to run HistoryServer.”, strippedThrowable); strippedThrowable.printStackTrace(); System.exit(1); } } public HistoryServer(Configuration config) throws IOException, FlinkException { this(config, new CountDownLatch(0)); } public HistoryServer(Configuration config, CountDownLatch numFinishedPolls) throws IOException, FlinkException { Preconditions.checkNotNull(config); Preconditions.checkNotNull(numFinishedPolls); this.config = config; if (config.getBoolean(HistoryServerOptions.HISTORY_SERVER_WEB_SSL_ENABLED) && SSLUtils.isRestSSLEnabled(config)) { LOG.info(“Enabling SSL for the history server.”); try { this.serverSSLFactory = SSLUtils.createRestServerSSLEngineFactory(config); } catch (Exception e) { throw new IOException(“Failed to initialize SSLContext for the history server.”, e); } } else { this.serverSSLFactory = null; } webAddress = config.getString(HistoryServerOptions.HISTORY_SERVER_WEB_ADDRESS); webPort = config.getInteger(HistoryServerOptions.HISTORY_SERVER_WEB_PORT); webRefreshIntervalMillis = config.getLong(HistoryServerOptions.HISTORY_SERVER_WEB_REFRESH_INTERVAL); String webDirectory = config.getString(HistoryServerOptions.HISTORY_SERVER_WEB_DIR); if (webDirectory == null) { webDirectory = System.getProperty(“java.io.tmpdir”) + File.separator + “flink-web-history-” + UUID.randomUUID(); } webDir = new File(webDirectory); String refreshDirectories = config.getString(HistoryServerOptions.HISTORY_SERVER_ARCHIVE_DIRS); if (refreshDirectories == null) { throw new FlinkException(HistoryServerOptions.HISTORY_SERVER_ARCHIVE_DIRS + " was not configured."); } List<RefreshLocation> refreshDirs = new ArrayList<>(); for (String refreshDirectory : refreshDirectories.split(",")) { try { Path refreshPath = WebMonitorUtils.validateAndNormalizeUri(new Path(refreshDirectory).toUri()); FileSystem refreshFS = refreshPath.getFileSystem(); refreshDirs.add(new 
RefreshLocation(refreshPath, refreshFS)); } catch (Exception e) { // there’s most likely something wrong with the path itself, so we ignore it from here on LOG.warn(“Failed to create Path or FileSystem for directory ‘{}’. Directory will not be monitored.”, refreshDirectory, e); } } if (refreshDirs.isEmpty()) { throw new FlinkException(“Failed to validate any of the configured directories to monitor.”); } long refreshIntervalMillis = config.getLong(HistoryServerOptions.HISTORY_SERVER_ARCHIVE_REFRESH_INTERVAL); archiveFetcher = new HistoryServerArchiveFetcher(refreshIntervalMillis, refreshDirs, webDir, numFinishedPolls); this.shutdownHook = ShutdownHookUtil.addShutdownHook( HistoryServer.this::stop, HistoryServer.class.getSimpleName(), LOG); } @VisibleForTesting int getWebPort() { return netty.getServerPort(); } public void run() { try { start(); new CountDownLatch(1).await(); } catch (Exception e) { LOG.error(“Failure while running HistoryServer.”, e); } finally { stop(); } } // ———————————————————————— // Life-cycle // ———————————————————————— void start() throws IOException, InterruptedException { synchronized (startupShutdownLock) { LOG.info(“Starting history server.”); Files.createDirectories(webDir.toPath()); LOG.info(“Using directory {} as local cache.”, webDir); Router router = new Router(); router.addGet("/:*", new HistoryServerStaticFileServerHandler(webDir)); if (!webDir.exists() && !webDir.mkdirs()) { throw new IOException(“Failed to create local directory " + webDir.getAbsoluteFile() + “.”); } createDashboardConfigFile(); archiveFetcher.start(); netty = new WebFrontendBootstrap(router, LOG, webDir, serverSSLFactory, webAddress, webPort, config); } } void stop() { if (shutdownRequested.compareAndSet(false, true)) { synchronized (startupShutdownLock) { LOG.info(“Stopping history server.”); try { netty.shutdown(); } catch (Throwable t) { LOG.warn(“Error while shutting down WebFrontendBootstrap.”, t); } archiveFetcher.stop(); try { LOG.info(“Removing web dashboard root cache directory {}”, webDir); FileUtils.deleteDirectory(webDir); } catch (Throwable t) { LOG.warn(“Error while deleting web root directory {}”, webDir, t); } LOG.info(“Stopped history server.”); // Remove shutdown hook to prevent resource leaks ShutdownHookUtil.removeShutdownHook(shutdownHook, getClass().getSimpleName(), LOG); } } } // ———————————————————————— // File generation // ———————————————————————— static FileWriter createOrGetFile(File folder, String name) throws IOException { File file = new File(folder, name + “.json”); if (!file.exists()) { Files.createFile(file.toPath()); } FileWriter fr = new FileWriter(file); return fr; } private void createDashboardConfigFile() throws IOException { try (FileWriter fw = createOrGetFile(webDir, “config”)) { fw.write(createConfigJson(DashboardConfiguration.from(webRefreshIntervalMillis, ZonedDateTime.now()))); fw.flush(); } catch (IOException ioe) { LOG.error(“Failed to write config file.”); throw ioe; } } private static String createConfigJson(DashboardConfiguration dashboardConfiguration) throws IOException { StringWriter writer = new StringWriter(); JsonGenerator gen = JsonFactory.JACKSON_FACTORY.createGenerator(writer); gen.writeStartObject(); gen.writeNumberField(DashboardConfiguration.FIELD_NAME_REFRESH_INTERVAL, dashboardConfiguration.getRefreshInterval()); gen.writeNumberField(DashboardConfiguration.FIELD_NAME_TIMEZONE_OFFSET, dashboardConfiguration.getTimeZoneOffset()); gen.writeStringField(DashboardConfiguration.FIELD_NAME_TIMEZONE_NAME, 
dashboardConfiguration.getTimeZoneName()); gen.writeStringField(DashboardConfiguration.FIELD_NAME_FLINK_VERSION, dashboardConfiguration.getFlinkVersion()); gen.writeStringField(DashboardConfiguration.FIELD_NAME_FLINK_REVISION, dashboardConfiguration.getFlinkRevision()); gen.writeEndObject(); gen.close(); return writer.toString(); } /** * Container for the {@link Path} and {@link FileSystem} of a refresh directory. / static class RefreshLocation { private final Path path; private final FileSystem fs; private RefreshLocation(Path path, FileSystem fs) { this.path = path; this.fs = fs; } public Path getPath() { return path; } public FileSystem getFs() { return fs; } }}HistoryServer提供了finished jobs的相关查询功能;构造器从配置中读取historyserver.web.address、historyserver.web.port(默认8082)、historyserver.web.refresh-interval(默认10秒)、historyserver.web.tmpdir、historyserver.archive.fs.dir、historyserver.archive.fs.refresh-interval(默认10秒),然后创建了HistoryServerArchiveFetcher其run方法主要是调用start方法,该方法主要是启动HistoryServerArchiveFetcher,然后创建WebFrontendBootstrap构造器使用ShutdownHookUtil.addShutdownHook注册了ShutdownHook,在shutdown时执行stop方法,stop方法主要是调用WebFrontendBootstrap的shutdown方法以及HistoryServerArchiveFetcher的stop方法,然后清理webDir,移除shutdownHookHistoryServerArchiveFetcherflink-1.7.2/flink-runtime-web/src/main/java/org/apache/flink/runtime/webmonitor/history/HistoryServerArchiveFetcher.javaclass HistoryServerArchiveFetcher { private static final Logger LOG = LoggerFactory.getLogger(HistoryServerArchiveFetcher.class); private static final JsonFactory jacksonFactory = new JsonFactory(); private static final ObjectMapper mapper = new ObjectMapper(); private final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor( new ExecutorThreadFactory(“Flink-HistoryServer-ArchiveFetcher”)); private final JobArchiveFetcherTask fetcherTask; private final long refreshIntervalMillis; HistoryServerArchiveFetcher(long refreshIntervalMillis, List<HistoryServer.RefreshLocation> refreshDirs, File webDir, CountDownLatch numFinishedPolls) { this.refreshIntervalMillis = refreshIntervalMillis; this.fetcherTask = new JobArchiveFetcherTask(refreshDirs, webDir, numFinishedPolls); if (LOG.isInfoEnabled()) { for (HistoryServer.RefreshLocation refreshDir : refreshDirs) { LOG.info(“Monitoring directory {} for archived jobs.”, refreshDir.getPath()); } } } void start() { executor.scheduleWithFixedDelay(fetcherTask, 0, refreshIntervalMillis, TimeUnit.MILLISECONDS); } void stop() { executor.shutdown(); try { if (!executor.awaitTermination(1, TimeUnit.SECONDS)) { executor.shutdownNow(); } } catch (InterruptedException ignored) { executor.shutdownNow(); } } /* * {@link TimerTask} that polls the directories configured as {@link HistoryServerOptions#HISTORY_SERVER_ARCHIVE_DIRS} for * new job archives. / static class JobArchiveFetcherTask extends TimerTask { private final List<HistoryServer.RefreshLocation> refreshDirs; private final CountDownLatch numFinishedPolls; /* Cache of all available jobs identified by their id. 
/ private final Set<String> cachedArchives; private final File webDir; private final File webJobDir; private final File webOverviewDir; private static final String JSON_FILE_ENDING = “.json”; JobArchiveFetcherTask(List<HistoryServer.RefreshLocation> refreshDirs, File webDir, CountDownLatch numFinishedPolls) { this.refreshDirs = checkNotNull(refreshDirs); this.numFinishedPolls = numFinishedPolls; this.cachedArchives = new HashSet<>(); this.webDir = checkNotNull(webDir); this.webJobDir = new File(webDir, “jobs”); webJobDir.mkdir(); this.webOverviewDir = new File(webDir, “overviews”); webOverviewDir.mkdir(); } @Override public void run() { try { for (HistoryServer.RefreshLocation refreshLocation : refreshDirs) { Path refreshDir = refreshLocation.getPath(); FileSystem refreshFS = refreshLocation.getFs(); // contents of /:refreshDir FileStatus[] jobArchives; try { jobArchives = refreshFS.listStatus(refreshDir); } catch (IOException e) { LOG.error(“Failed to access job archive location for path {}.”, refreshDir, e); continue; } if (jobArchives == null) { continue; } boolean updateOverview = false; for (FileStatus jobArchive : jobArchives) { Path jobArchivePath = jobArchive.getPath(); String jobID = jobArchivePath.getName(); try { JobID.fromHexString(jobID); } catch (IllegalArgumentException iae) { LOG.debug(“Archive directory {} contained file with unexpected name {}. Ignoring file.”, refreshDir, jobID, iae); continue; } if (cachedArchives.add(jobID)) { try { for (ArchivedJson archive : FsJobArchivist.getArchivedJsons(jobArchive.getPath())) { String path = archive.getPath(); String json = archive.getJson(); File target; if (path.equals(JobsOverviewHeaders.URL)) { target = new File(webOverviewDir, jobID + JSON_FILE_ENDING); } else if (path.equals("/joboverview”)) { // legacy path json = convertLegacyJobOverview(json); target = new File(webOverviewDir, jobID + JSON_FILE_ENDING); } else { target = new File(webDir, path + JSON_FILE_ENDING); } java.nio.file.Path parent = target.getParentFile().toPath(); try { Files.createDirectories(parent); } catch (FileAlreadyExistsException ignored) { // there may be left-over directories from the previous attempt } java.nio.file.Path targetPath = target.toPath(); // We overwrite existing files since this may be another attempt at fetching this archive. // Existing files may be incomplete/corrupt. 
Files.deleteIfExists(targetPath); Files.createFile(target.toPath()); try (FileWriter fw = new FileWriter(target)) { fw.write(json); fw.flush(); } } updateOverview = true; } catch (IOException e) { LOG.error(“Failure while fetching/processing job archive for job {}.”, jobID, e); // Make sure we attempt to fetch the archive again cachedArchives.remove(jobID); // Make sure we do not include this job in the overview try { Files.delete(new File(webOverviewDir, jobID + JSON_FILE_ENDING).toPath()); } catch (IOException ioe) { LOG.debug(“Could not delete file from overview directory.”, ioe); } // Clean up job files we may have created File jobDirectory = new File(webJobDir, jobID); try { FileUtils.deleteDirectory(jobDirectory); } catch (IOException ioe) { LOG.debug(“Could not clean up job directory.”, ioe); } } } } if (updateOverview) { updateJobOverview(webOverviewDir, webDir); } } } catch (Exception e) { LOG.error(“Critical failure while fetching/processing job archives.”, e); } numFinishedPolls.countDown(); } } private static String convertLegacyJobOverview(String legacyOverview) throws IOException { JsonNode root = mapper.readTree(legacyOverview); JsonNode finishedJobs = root.get(“finished”); JsonNode job = finishedJobs.get(0); JobID jobId = JobID.fromHexString(job.get(“jid”).asText()); String name = job.get(“name”).asText(); JobStatus state = JobStatus.valueOf(job.get(“state”).asText()); long startTime = job.get(“start-time”).asLong(); long endTime = job.get(“end-time”).asLong(); long duration = job.get(“duration”).asLong(); long lastMod = job.get(“last-modification”).asLong(); JsonNode tasks = job.get(“tasks”); int numTasks = tasks.get(“total”).asInt(); int pending = tasks.get(“pending”).asInt(); int running = tasks.get(“running”).asInt(); int finished = tasks.get(“finished”).asInt(); int canceling = tasks.get(“canceling”).asInt(); int canceled = tasks.get(“canceled”).asInt(); int failed = tasks.get(“failed”).asInt(); int[] tasksPerState = new int[ExecutionState.values().length]; // pending is a mix of CREATED/SCHEDULED/DEPLOYING // to maintain the correct number of task states we have to pick one of them tasksPerState[ExecutionState.SCHEDULED.ordinal()] = pending; tasksPerState[ExecutionState.RUNNING.ordinal()] = running; tasksPerState[ExecutionState.FINISHED.ordinal()] = finished; tasksPerState[ExecutionState.CANCELING.ordinal()] = canceling; tasksPerState[ExecutionState.CANCELED.ordinal()] = canceled; tasksPerState[ExecutionState.FAILED.ordinal()] = failed; JobDetails jobDetails = new JobDetails(jobId, name, startTime, endTime, duration, state, lastMod, tasksPerState, numTasks); MultipleJobsDetails multipleJobsDetails = new MultipleJobsDetails(Collections.singleton(jobDetails)); StringWriter sw = new StringWriter(); mapper.writeValue(sw, multipleJobsDetails); return sw.toString(); } /* * This method replicates the JSON response that would be given by the JobsOverviewHandler when * listing both running and finished jobs. * * <p>Every job archive contains a joboverview.json file containing the same structure. Since jobs are archived on * their own however the list of finished jobs only contains a single job. * * <p>For the display in the HistoryServer WebFrontend we have to combine these overviews. 
*/ private static void updateJobOverview(File webOverviewDir, File webDir) { try (JsonGenerator gen = jacksonFactory.createGenerator(HistoryServer.createOrGetFile(webDir, JobsOverviewHeaders.URL))) { File[] overviews = new File(webOverviewDir.getPath()).listFiles(); if (overviews != null) { Collection<JobDetails> allJobs = new ArrayList<>(overviews.length); for (File overview : overviews) { MultipleJobsDetails subJobs = mapper.readValue(overview, MultipleJobsDetails.class); allJobs.addAll(subJobs.getJobs()); } mapper.writeValue(gen, new MultipleJobsDetails(allJobs)); } } catch (IOException ioe) { LOG.error(“Failed to update job overview.”, ioe); } }}HistoryServerArchiveFetcher主要是以historyserver.archive.fs.refresh-interval的时间间隔从historyserver.archive.fs.dir目录拉取job archives;它内部创建了JobArchiveFetcherTask来执行这个任务JobArchiveFetcherTask继承了jdk的TimerTask,其run方法就是遍历refreshDirs,然后执行FileSystem.listStatus,然后使用FsJobArchivist.getArchivedJsons获取ArchivedJson根据不同path写入到指定文件如果path是/jobs/overview,则写入webDir/overviews/jobID.json文件;如果path是/joboverview,则先调用convertLegacyJobOverview转换json,然后再写入webDir/overviews/jobID.json文件;其他的path则写入webDir/path.json文件WebFrontendBootstrapflink-1.7.2/flink-runtime-web/src/main/java/org/apache/flink/runtime/webmonitor/utils/WebFrontendBootstrap.javapublic class WebFrontendBootstrap { private final Router router; private final Logger log; private final File uploadDir; private final ServerBootstrap bootstrap; private final Channel serverChannel; private final String restAddress; public WebFrontendBootstrap( Router router, Logger log, File directory, @Nullable SSLHandlerFactory serverSSLFactory, String configuredAddress, int configuredPort, final Configuration config) throws InterruptedException, UnknownHostException { this.router = Preconditions.checkNotNull(router); this.log = Preconditions.checkNotNull(log); this.uploadDir = directory; ChannelInitializer<SocketChannel> initializer = new ChannelInitializer<SocketChannel>() { @Override protected void initChannel(SocketChannel ch) { RouterHandler handler = new RouterHandler(WebFrontendBootstrap.this.router, new HashMap<>()); // SSL should be the first handler in the pipeline if (serverSSLFactory != null) { ch.pipeline().addLast(“ssl”, serverSSLFactory.createNettySSLHandler()); } ch.pipeline() .addLast(new HttpServerCodec()) .addLast(new ChunkedWriteHandler()) .addLast(new HttpRequestHandler(uploadDir)) .addLast(handler.getName(), handler) .addLast(new PipelineErrorHandler(WebFrontendBootstrap.this.log)); } }; NioEventLoopGroup bossGroup = new NioEventLoopGroup(1); NioEventLoopGroup workerGroup = new NioEventLoopGroup(); this.bootstrap = new ServerBootstrap(); this.bootstrap .group(bossGroup, workerGroup) .channel(NioServerSocketChannel.class) .childHandler(initializer); ChannelFuture ch; if (configuredAddress == null) { ch = this.bootstrap.bind(configuredPort); } else { ch = this.bootstrap.bind(configuredAddress, configuredPort); } this.serverChannel = ch.sync().channel(); InetSocketAddress bindAddress = (InetSocketAddress) serverChannel.localAddress(); InetAddress inetAddress = bindAddress.getAddress(); final String address; if (inetAddress.isAnyLocalAddress()) { address = config.getString(JobManagerOptions.ADDRESS, InetAddress.getLocalHost().getHostName()); } else { address = inetAddress.getHostAddress(); } int port = bindAddress.getPort(); this.log.info(“Web frontend listening at {}” + ‘:’ + “{}”, address, port); final String protocol = serverSSLFactory != null ? 
“https://” : “http://”; this.restAddress = protocol + address + ‘:’ + port; } public ServerBootstrap getBootstrap() { return bootstrap; } public int getServerPort() { Channel server = this.serverChannel; if (server != null) { try { return ((InetSocketAddress) server.localAddress()).getPort(); } catch (Exception e) { log.error(“Cannot access local server port”, e); } } return -1; } public String getRestAddress() { return restAddress; } public void shutdown() { if (this.serverChannel != null) { this.serverChannel.close().awaitUninterruptibly(); } if (bootstrap != null) { if (bootstrap.group() != null) { bootstrap.group().shutdownGracefully(); } if (bootstrap.childGroup() != null) { bootstrap.childGroup().shutdownGracefully(); } } }}WebFrontendBootstrap使用netty启动了一个http server,其pipeline有HttpServerCodec、ChunkedWriteHandler、HttpRequestHandler、RouterHandler、PipelineErrorHandler;其中这里的RouterHandler的Router有个GET的route,其使用的是HistoryServerStaticFileServerHandler,用于给HistoryServer提供静态文件服务小结HistoryServer提供了finished jobs的相关查询功能;其主要由HistoryServerArchiveFetcher以及WebFrontendBootstrap两部分组成;其run方法主要是调用start方法,该方法主要是启动HistoryServerArchiveFetcher,然后创建WebFrontendBootstrapHistoryServerArchiveFetcher主要是以historyserver.archive.fs.refresh-interval的时间间隔从historyserver.archive.fs.dir目录拉取job archives;它内部创建了JobArchiveFetcherTask来执行这个任务;JobArchiveFetcherTask继承了jdk的TimerTask,其run方法就是遍历refreshDirs,然后执行FileSystem.listStatus,然后使用FsJobArchivist.getArchivedJsons获取ArchivedJson根据不同path写入到指定文件WebFrontendBootstrap使用netty启动了一个http server,其pipeline有HttpServerCodec、ChunkedWriteHandler、HttpRequestHandler、RouterHandler、PipelineErrorHandler;其中这里的RouterHandler的Router有个GET的route,其使用的是HistoryServerStaticFileServerHandler,用于给HistoryServer提供静态文件服务docHistoryServer ...
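
As a rough sketch of how the pieces above fit together (my own example, not from the post), the following starts a HistoryServer directly from a Configuration; in a real installation you would instead put the historyserver.* keys into flink-conf.yaml and launch it via its main() with --configDir, and the archive directory below is a made-up example:

    import org.apache.flink.configuration.Configuration;
    import org.apache.flink.configuration.HistoryServerOptions;
    import org.apache.flink.runtime.webmonitor.history.HistoryServer;

    // illustrative demo class, not part of Flink; requires flink-runtime-web on the classpath
    public class HistoryServerDemo {
        public static void main(String[] args) throws Exception {
            Configuration config = new Configuration();
            // directory (or comma-separated list) that JobManagers archive completed jobs into
            config.setString(HistoryServerOptions.HISTORY_SERVER_ARCHIVE_DIRS, "file:///tmp/flink-archive");
            // web port of the frontend and how often HistoryServerArchiveFetcher polls the archive dirs
            config.setInteger(HistoryServerOptions.HISTORY_SERVER_WEB_PORT, 8082);
            config.setLong(HistoryServerOptions.HISTORY_SERVER_ARCHIVE_REFRESH_INTERVAL, 10_000L);

            HistoryServer historyServer = new HistoryServer(config);
            historyServer.run(); // blocks: starts the archive fetcher and the netty-based WebFrontendBootstrap
        }
    }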

March 10, 2019 · 9 min · jiezi

Talking about flink's jobstore configuration

序本文主要研究一下flink的jobstore配置JobManagerOptionsflink-1.7.2/flink-core/src/main/java/org/apache/flink/configuration/JobManagerOptions.java@PublicEvolvingpublic class JobManagerOptions { //…… /** * The job store cache size in bytes which is used to keep completed * jobs in memory. / public static final ConfigOption<Long> JOB_STORE_CACHE_SIZE = key(“jobstore.cache-size”) .defaultValue(50L * 1024L * 1024L) .withDescription(“The job store cache size in bytes which is used to keep completed jobs in memory.”); /* * The time in seconds after which a completed job expires and is purged from the job store. */ public static final ConfigOption<Long> JOB_STORE_EXPIRATION_TIME = key(“jobstore.expiration-time”) .defaultValue(60L * 60L) .withDescription(“The time in seconds after which a completed job expires and is purged from the job store.”); //……}jobstore.cache-size默认是50M;jobstore.expiration-time默认是1小时SessionClusterEntrypointflink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/entrypoint/SessionClusterEntrypoint.javapublic abstract class SessionClusterEntrypoint extends ClusterEntrypoint { public SessionClusterEntrypoint(Configuration configuration) { super(configuration); } @Override protected ArchivedExecutionGraphStore createSerializableExecutionGraphStore( Configuration configuration, ScheduledExecutor scheduledExecutor) throws IOException { final File tmpDir = new File(ConfigurationUtils.parseTempDirectories(configuration)[0]); final Time expirationTime = Time.seconds(configuration.getLong(JobManagerOptions.JOB_STORE_EXPIRATION_TIME)); final long maximumCacheSizeBytes = configuration.getLong(JobManagerOptions.JOB_STORE_CACHE_SIZE); return new FileArchivedExecutionGraphStore( tmpDir, expirationTime, maximumCacheSizeBytes, scheduledExecutor, Ticker.systemTicker()); }}SessionClusterEntrypoint的createSerializableExecutionGraphStore方法读取了JobManagerOptions.JOB_STORE_EXPIRATION_TIME及JobManagerOptions.JOB_STORE_CACHE_SIZE配置,然后创建FileArchivedExecutionGraphStoreFileArchivedExecutionGraphStoreflink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/dispatcher/FileArchivedExecutionGraphStore.javapublic class FileArchivedExecutionGraphStore implements ArchivedExecutionGraphStore { private static final Logger LOG = LoggerFactory.getLogger(FileArchivedExecutionGraphStore.class); private final File storageDir; private final Cache<JobID, JobDetails> jobDetailsCache; private final LoadingCache<JobID, ArchivedExecutionGraph> archivedExecutionGraphCache; private final ScheduledFuture<?> cleanupFuture; private final Thread shutdownHook; private int numFinishedJobs; private int numFailedJobs; private int numCanceledJobs; public FileArchivedExecutionGraphStore( File rootDir, Time expirationTime, long maximumCacheSizeBytes, ScheduledExecutor scheduledExecutor, Ticker ticker) throws IOException { final File storageDirectory = initExecutionGraphStorageDirectory(rootDir); LOG.info( “Initializing {}: Storage directory {}, expiration time {}, maximum cache size {} bytes.”, FileArchivedExecutionGraphStore.class.getSimpleName(), storageDirectory, expirationTime.toMilliseconds(), maximumCacheSizeBytes); this.storageDir = Preconditions.checkNotNull(storageDirectory); Preconditions.checkArgument( storageDirectory.exists() && storageDirectory.isDirectory(), “The storage directory must exist and be a directory.”); this.jobDetailsCache = CacheBuilder.newBuilder() .expireAfterWrite(expirationTime.toMilliseconds(), TimeUnit.MILLISECONDS) .removalListener( (RemovalListener<JobID, JobDetails>) notification -> 
deleteExecutionGraphFile(notification.getKey())) .ticker(ticker) .build(); this.archivedExecutionGraphCache = CacheBuilder.newBuilder() .maximumWeight(maximumCacheSizeBytes) .weigher(this::calculateSize) .build(new CacheLoader<JobID, ArchivedExecutionGraph>() { @Override public ArchivedExecutionGraph load(JobID jobId) throws Exception { return loadExecutionGraph(jobId); }}); this.cleanupFuture = scheduledExecutor.scheduleWithFixedDelay( jobDetailsCache::cleanUp, expirationTime.toMilliseconds(), expirationTime.toMilliseconds(), TimeUnit.MILLISECONDS); this.shutdownHook = ShutdownHookUtil.addShutdownHook(this, getClass().getSimpleName(), LOG); this.numFinishedJobs = 0; this.numFailedJobs = 0; this.numCanceledJobs = 0; } @Override public int size() { return Math.toIntExact(jobDetailsCache.size()); } @Override @Nullable public ArchivedExecutionGraph get(JobID jobId) { try { return archivedExecutionGraphCache.get(jobId); } catch (ExecutionException e) { LOG.debug(“Could not load archived execution graph for job id {}.”, jobId, e); return null; } } @Override public void put(ArchivedExecutionGraph archivedExecutionGraph) throws IOException { final JobStatus jobStatus = archivedExecutionGraph.getState(); final JobID jobId = archivedExecutionGraph.getJobID(); final String jobName = archivedExecutionGraph.getJobName(); Preconditions.checkArgument( jobStatus.isGloballyTerminalState(), “The job " + jobName + ‘(’ + jobId + “) is not in a globally terminal state. Instead it is in state " + jobStatus + ‘.’); switch (jobStatus) { case FINISHED: numFinishedJobs++; break; case CANCELED: numCanceledJobs++; break; case FAILED: numFailedJobs++; break; default: throw new IllegalStateException(“The job " + jobName + ‘(’ + jobId + “) should have been in a globally terminal state. " + “Instead it was in state " + jobStatus + ‘.’); } // write the ArchivedExecutionGraph to disk storeArchivedExecutionGraph(archivedExecutionGraph); final JobDetails detailsForJob = WebMonitorUtils.createDetailsForJob(archivedExecutionGraph); jobDetailsCache.put(jobId, detailsForJob); archivedExecutionGraphCache.put(jobId, archivedExecutionGraph); } @Override public JobsOverview getStoredJobsOverview() { return new JobsOverview(0, numFinishedJobs, numCanceledJobs, numFailedJobs); } @Override public Collection<JobDetails> getAvailableJobDetails() { return jobDetailsCache.asMap().values(); } @Nullable @Override public JobDetails getAvailableJobDetails(JobID jobId) { return jobDetailsCache.getIfPresent(jobId); } @Override public void close() throws IOException { cleanupFuture.cancel(false); jobDetailsCache.invalidateAll(); // clean up the storage directory FileUtils.deleteFileOrDirectory(storageDir); // Remove shutdown hook to prevent resource leaks ShutdownHookUtil.removeShutdownHook(shutdownHook, getClass().getSimpleName(), LOG); } // ————————————————————– // Internal methods // ————————————————————– private int calculateSize(JobID jobId, ArchivedExecutionGraph serializableExecutionGraph) { final File archivedExecutionGraphFile = getExecutionGraphFile(jobId); if (archivedExecutionGraphFile.exists()) { return Math.toIntExact(archivedExecutionGraphFile.length()); } else { LOG.debug(“Could not find archived execution graph file for {}. 
Estimating the size instead.”, jobId); return serializableExecutionGraph.getAllVertices().size() * 1000 + serializableExecutionGraph.getAccumulatorsSerialized().size() * 1000; } } private ArchivedExecutionGraph loadExecutionGraph(JobID jobId) throws IOException, ClassNotFoundException { final File archivedExecutionGraphFile = getExecutionGraphFile(jobId); if (archivedExecutionGraphFile.exists()) { try (FileInputStream fileInputStream = new FileInputStream(archivedExecutionGraphFile)) { return InstantiationUtil.deserializeObject(fileInputStream, getClass().getClassLoader()); } } else { throw new FileNotFoundException(“Could not find file for archived execution graph " + jobId + “. This indicates that the file either has been deleted or never written.”); } } private void storeArchivedExecutionGraph(ArchivedExecutionGraph archivedExecutionGraph) throws IOException { final File archivedExecutionGraphFile = getExecutionGraphFile(archivedExecutionGraph.getJobID()); try (FileOutputStream fileOutputStream = new FileOutputStream(archivedExecutionGraphFile)) { InstantiationUtil.serializeObject(fileOutputStream, archivedExecutionGraph); } } private File getExecutionGraphFile(JobID jobId) { return new File(storageDir, jobId.toString()); } private void deleteExecutionGraphFile(JobID jobId) { Preconditions.checkNotNull(jobId); final File archivedExecutionGraphFile = getExecutionGraphFile(jobId); try { FileUtils.deleteFileOrDirectory(archivedExecutionGraphFile); } catch (IOException e) { LOG.debug(“Could not delete file {}.”, archivedExecutionGraphFile, e); } archivedExecutionGraphCache.invalidate(jobId); jobDetailsCache.invalidate(jobId); } private static File initExecutionGraphStorageDirectory(File tmpDir) throws IOException { final int maxAttempts = 10; for (int attempt = 0; attempt < maxAttempts; attempt++) { final File storageDirectory = new File(tmpDir, “executionGraphStore-” + UUID.randomUUID()); if (storageDirectory.mkdir()) { return storageDirectory; } } throw new IOException(“Could not create executionGraphStorage directory in " + tmpDir + ‘.’); } // ————————————————————– // Testing methods // ————————————————————– @VisibleForTesting File getStorageDir() { return storageDir; } @VisibleForTesting LoadingCache<JobID, ArchivedExecutionGraph> getArchivedExecutionGraphCache() { return archivedExecutionGraphCache; }}FileArchivedExecutionGraphStore实现了ArchivedExecutionGraphStore接口,它的构造器使用guava cache创建了jobDetailsCache及archivedExecutionGraphCachejobDetailsCache的expireAfterWrite使用的是expirationTime,即使用jobstore.expiration-time配置;archivedExecutionGraphCache的maximumWeight使用的是maximumCacheSizeBytes,即jobstore.cache-size配置FileArchivedExecutionGraphStore还设置了一个定时任务,每隔expirationTime的时间去执行jobDetailsCache的cleanUp方法来清理缓存小结flink的jobstore有两个配置,分别是jobstore.cache-size默认是50M,jobstore.expiration-time默认是1小时SessionClusterEntrypoint的createSerializableExecutionGraphStore方法读取了JobManagerOptions.JOB_STORE_EXPIRATION_TIME及JobManagerOptions.JOB_STORE_CACHE_SIZE配置,然后创建FileArchivedExecutionGraphStoreFileArchivedExecutionGraphStore实现了ArchivedExecutionGraphStore接口,它的构造器使用guava cache创建了jobDetailsCache及archivedExecutionGraphCache;jobDetailsCache的expireAfterWrite使用的是expirationTime,即使用jobstore.expiration-time配置;archivedExecutionGraphCache的maximumWeight使用的是maximumCacheSizeBytes,即jobstore.cache-size配置;它还设置了一个定时任务,每隔expirationTime的时间去执行jobDetailsCache的cleanUp方法来清理缓存docjobstore-cache-sizejobstore-expiration-time ...
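
To tie the two options to code (a small sketch of my own; the concrete size and time are arbitrary), they can be set through JobManagerOptions on a Configuration, which is what SessionClusterEntrypoint later reads in createSerializableExecutionGraphStore:

    import org.apache.flink.configuration.Configuration;
    import org.apache.flink.configuration.JobManagerOptions;

    // illustrative demo class, not part of Flink
    public class JobStoreConfigDemo {
        public static void main(String[] args) {
            Configuration config = new Configuration();
            // jobstore.cache-size: keep at most ~100MB of archived execution graphs in memory (default 50MB)
            config.setLong(JobManagerOptions.JOB_STORE_CACHE_SIZE, 100L * 1024L * 1024L);
            // jobstore.expiration-time: purge completed jobs after 30 minutes (default 3600 seconds)
            config.setLong(JobManagerOptions.JOB_STORE_EXPIRATION_TIME, 30L * 60L);

            System.out.println(config.getLong(JobManagerOptions.JOB_STORE_CACHE_SIZE));
            System.out.println(config.getLong(JobManagerOptions.JOB_STORE_EXPIRATION_TIME));
        }
    }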

March 9, 2019

聊聊flink的RestClusterClientConfiguration

序本文主要研究一下flink的RestClusterClientConfigurationRestClusterClientConfigurationflink-release-1.7.2/flink-clients/src/main/java/org/apache/flink/client/program/rest/RestClusterClientConfiguration.javapublic final class RestClusterClientConfiguration { private final RestClientConfiguration restClientConfiguration; private final long awaitLeaderTimeout; private final int retryMaxAttempts; private final long retryDelay; private RestClusterClientConfiguration( final RestClientConfiguration endpointConfiguration, final long awaitLeaderTimeout, final int retryMaxAttempts, final long retryDelay) { checkArgument(awaitLeaderTimeout >= 0, “awaitLeaderTimeout must be equal to or greater than 0”); checkArgument(retryMaxAttempts >= 0, “retryMaxAttempts must be equal to or greater than 0”); checkArgument(retryDelay >= 0, “retryDelay must be equal to or greater than 0”); this.restClientConfiguration = Preconditions.checkNotNull(endpointConfiguration); this.awaitLeaderTimeout = awaitLeaderTimeout; this.retryMaxAttempts = retryMaxAttempts; this.retryDelay = retryDelay; } public RestClientConfiguration getRestClientConfiguration() { return restClientConfiguration; } /** * @see RestOptions#AWAIT_LEADER_TIMEOUT / public long getAwaitLeaderTimeout() { return awaitLeaderTimeout; } /* * @see RestOptions#RETRY_MAX_ATTEMPTS / public int getRetryMaxAttempts() { return retryMaxAttempts; } /* * @see RestOptions#RETRY_DELAY / public long getRetryDelay() { return retryDelay; } public static RestClusterClientConfiguration fromConfiguration(Configuration config) throws ConfigurationException { RestClientConfiguration restClientConfiguration = RestClientConfiguration.fromConfiguration(config); final long awaitLeaderTimeout = config.getLong(RestOptions.AWAIT_LEADER_TIMEOUT); final int retryMaxAttempts = config.getInteger(RestOptions.RETRY_MAX_ATTEMPTS); final long retryDelay = config.getLong(RestOptions.RETRY_DELAY); return new RestClusterClientConfiguration(restClientConfiguration, awaitLeaderTimeout, retryMaxAttempts, retryDelay); }}RestClusterClientConfiguration除了RestClientConfiguration外,还有3个属性,分别是awaitLeaderTimeout、retryMaxAttempts、retryDelay;awaitLeaderTimeout读取的是rest.await-leader-timeout配置,默认是30秒;retryMaxAttempts读取的是rest.retry.max-attempts配置,默认是20;retryDelay读取的是rest.retry.delay配置,默认是3秒RestClusterClientflink-release-1.7.2/flink-clients/src/main/java/org/apache/flink/client/program/rest/RestClusterClient.javapublic class RestClusterClient<T> extends ClusterClient<T> implements NewClusterClient { private final RestClusterClientConfiguration restClusterClientConfiguration; private final RestClient restClient; private final ExecutorService executorService = Executors.newFixedThreadPool(4, new ExecutorThreadFactory(“Flink-RestClusterClient-IO”)); private final WaitStrategy waitStrategy; private final T clusterId; private final LeaderRetrievalService webMonitorRetrievalService; private final LeaderRetrievalService dispatcherRetrievalService; private final LeaderRetriever webMonitorLeaderRetriever = new LeaderRetriever(); private final LeaderRetriever dispatcherLeaderRetriever = new LeaderRetriever(); /* ExecutorService to run operations that can be retried on exceptions. 
*/ private ScheduledExecutorService retryExecutorService; //…… private <C> CompletableFuture<C> retry( CheckedSupplier<CompletableFuture<C>> operation, Predicate<Throwable> retryPredicate) { return FutureUtils.retryWithDelay( CheckedSupplier.unchecked(operation), restClusterClientConfiguration.getRetryMaxAttempts(), Time.milliseconds(restClusterClientConfiguration.getRetryDelay()), retryPredicate, new ScheduledExecutorServiceAdapter(retryExecutorService)); } @VisibleForTesting CompletableFuture<URL> getWebMonitorBaseUrl() { return FutureUtils.orTimeout( webMonitorLeaderRetriever.getLeaderFuture(), restClusterClientConfiguration.getAwaitLeaderTimeout(), TimeUnit.MILLISECONDS) .thenApplyAsync(leaderAddressSessionId -> { final String url = leaderAddressSessionId.f0; try { return new URL(url); } catch (MalformedURLException e) { throw new IllegalArgumentException("Could not parse URL from " + url, e); } }, executorService); } //……}
The RestClusterClient constructor builds its RestClusterClientConfiguration from the Configuration via RestClusterClientConfiguration.fromConfiguration(configuration). The retry method delegates to FutureUtils.retryWithDelay, using restClusterClientConfiguration.getRetryMaxAttempts() as the retries parameter and Time.milliseconds(restClusterClientConfiguration.getRetryDelay()) as the retryDelay parameter. The getWebMonitorBaseUrl method uses FutureUtils.orTimeout, with restClusterClientConfiguration.getAwaitLeaderTimeout() as its timeout parameter.
Summary: besides the embedded RestClientConfiguration, RestClusterClientConfiguration has three more properties: awaitLeaderTimeout, retryMaxAttempts and retryDelay. awaitLeaderTimeout reads the rest.await-leader-timeout setting and defaults to 30 seconds; retryMaxAttempts reads rest.retry.max-attempts and defaults to 20; retryDelay reads rest.retry.delay and defaults to 3 seconds. RestClusterClient's retry method delegates to FutureUtils.retryWithDelay with restClusterClientConfiguration.getRetryMaxAttempts() as retries and Time.milliseconds(restClusterClientConfiguration.getRetryDelay()) as retryDelay; getWebMonitorBaseUrl uses FutureUtils.orTimeout with restClusterClientConfiguration.getAwaitLeaderTimeout() as timeout.
doc: RestClusterClientConfiguration
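As a small illustration of the three settings (my own sketch, not from the post), one can override them on a Configuration before calling the fromConfiguration factory shown above; the numeric values are arbitrary.

import org.apache.flink.client.program.rest.RestClusterClientConfiguration;
import org.apache.flink.configuration.Configuration;

public class RestClusterClientConfigExample {
    public static void main(String[] args) throws Exception {
        Configuration config = new Configuration();
        config.setLong("rest.await-leader-timeout", 60_000L); // default: 30000 ms
        config.setInteger("rest.retry.max-attempts", 10);     // default: 20
        config.setLong("rest.retry.delay", 1_000L);           // default: 3000 ms

        RestClusterClientConfiguration clientConfig =
            RestClusterClientConfiguration.fromConfiguration(config);
        System.out.println(clientConfig.getRetryMaxAttempts()); // 10
    }
}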

March 8, 2019

聊聊flink的RestClientConfiguration

序本文主要研究一下flink的RestClientConfigurationRestClientConfigurationflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rest/RestClientConfiguration.javapublic final class RestClientConfiguration { @Nullable private final SSLHandlerFactory sslHandlerFactory; private final long connectionTimeout; private final long idlenessTimeout; private final int maxContentLength; private RestClientConfiguration( @Nullable final SSLHandlerFactory sslHandlerFactory, final long connectionTimeout, final long idlenessTimeout, final int maxContentLength) { checkArgument(maxContentLength > 0, “maxContentLength must be positive, was: %d”, maxContentLength); this.sslHandlerFactory = sslHandlerFactory; this.connectionTimeout = connectionTimeout; this.idlenessTimeout = idlenessTimeout; this.maxContentLength = maxContentLength; } /** * Returns the {@link SSLEngine} that the REST client endpoint should use. * * @return SSLEngine that the REST client endpoint should use, or null if SSL was disabled / @Nullable public SSLHandlerFactory getSslHandlerFactory() { return sslHandlerFactory; } /* * {@see RestOptions#CONNECTION_TIMEOUT}. / public long getConnectionTimeout() { return connectionTimeout; } /* * {@see RestOptions#IDLENESS_TIMEOUT}. / public long getIdlenessTimeout() { return idlenessTimeout; } /* * Returns the max content length that the REST client endpoint could handle. * * @return max content length that the REST client endpoint could handle / public int getMaxContentLength() { return maxContentLength; } /* * Creates and returns a new {@link RestClientConfiguration} from the given {@link Configuration}. * * @param config configuration from which the REST client endpoint configuration should be created from * @return REST client endpoint configuration * @throws ConfigurationException if SSL was configured incorrectly */ public static RestClientConfiguration fromConfiguration(Configuration config) throws ConfigurationException { Preconditions.checkNotNull(config); final SSLHandlerFactory sslHandlerFactory; if (SSLUtils.isRestSSLEnabled(config)) { try { sslHandlerFactory = SSLUtils.createRestClientSSLEngineFactory(config); } catch (Exception e) { throw new ConfigurationException(“Failed to initialize SSLContext for the REST client”, e); } } else { sslHandlerFactory = null; } final long connectionTimeout = config.getLong(RestOptions.CONNECTION_TIMEOUT); final long idlenessTimeout = config.getLong(RestOptions.IDLENESS_TIMEOUT); int maxContentLength = config.getInteger(RestOptions.CLIENT_MAX_CONTENT_LENGTH); return new RestClientConfiguration(sslHandlerFactory, connectionTimeout, idlenessTimeout, maxContentLength); }}RestClientConfiguration有四个属性,分别是sslHandlerFactory、connectionTimeout、idlenessTimeout、maxContentLengthfromConfiguration方法从Configuration中创建SSLHandlerFactory,其读取的是相关配置有security.ssl.rest.enabled,默认为false;security.ssl.protocol,默认为TLSv1.2;security.ssl.algorithms,默认为TLS_RSA_WITH_AES_128_CBC_SHA;security.ssl.rest.authentication-enabled,默认为falseconnectionTimeout读取的是rest.connection-timeout配置,默认是15000毫秒;idlenessTimeout读取的是rest.idleness-timeout配置,默认5分钟;maxContentLength读取的是rest.client.max-content-length配置,默认是104_857_600RestClientflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rest/RestClient.javapublic class RestClient implements AutoCloseableAsync { private static final Logger LOG = LoggerFactory.getLogger(RestClient.class); private static final ObjectMapper objectMapper = RestMapperUtils.getStrictObjectMapper(); // used to open connections to a rest server endpoint private 
final Executor executor; private final Bootstrap bootstrap; private final CompletableFuture<Void> terminationFuture; private final AtomicBoolean isRunning = new AtomicBoolean(true); public RestClient(RestClientConfiguration configuration, Executor executor) { Preconditions.checkNotNull(configuration); this.executor = Preconditions.checkNotNull(executor); this.terminationFuture = new CompletableFuture<>(); final SSLHandlerFactory sslHandlerFactory = configuration.getSslHandlerFactory(); ChannelInitializer<SocketChannel> initializer = new ChannelInitializer<SocketChannel>() { @Override protected void initChannel(SocketChannel socketChannel) { try { // SSL should be the first handler in the pipeline if (sslHandlerFactory != null) { socketChannel.pipeline().addLast(“ssl”, sslHandlerFactory.createNettySSLHandler()); } socketChannel.pipeline() .addLast(new HttpClientCodec()) .addLast(new HttpObjectAggregator(configuration.getMaxContentLength())) .addLast(new ChunkedWriteHandler()) // required for multipart-requests .addLast(new IdleStateHandler(configuration.getIdlenessTimeout(), configuration.getIdlenessTimeout(), configuration.getIdlenessTimeout(), TimeUnit.MILLISECONDS)) .addLast(new ClientHandler()); } catch (Throwable t) { t.printStackTrace(); ExceptionUtils.rethrow(t); } } }; NioEventLoopGroup group = new NioEventLoopGroup(1, new ExecutorThreadFactory(“flink-rest-client-netty”)); bootstrap = new Bootstrap(); bootstrap .option(ChannelOption.CONNECT_TIMEOUT_MILLIS, Math.toIntExact(configuration.getConnectionTimeout())) .group(group) .channel(NioSocketChannel.class) .handler(initializer); LOG.info(“Rest client endpoint started.”); } @Override public CompletableFuture<Void> closeAsync() { return shutdownInternally(Time.seconds(10L)); } public void shutdown(Time timeout) { final CompletableFuture<Void> shutDownFuture = shutdownInternally(timeout); try { shutDownFuture.get(timeout.toMilliseconds(), TimeUnit.MILLISECONDS); LOG.info(“Rest endpoint shutdown complete.”); } catch (Exception e) { LOG.warn(“Rest endpoint shutdown failed.”, e); } } private CompletableFuture<Void> shutdownInternally(Time timeout) { if (isRunning.compareAndSet(true, false)) { LOG.info(“Shutting down rest endpoint.”); if (bootstrap != null) { if (bootstrap.group() != null) { bootstrap.group().shutdownGracefully(0L, timeout.toMilliseconds(), TimeUnit.MILLISECONDS) .addListener(finished -> { if (finished.isSuccess()) { terminationFuture.complete(null); } else { terminationFuture.completeExceptionally(finished.cause()); } }); } } } return terminationFuture; } 
//……}
The RestClient constructor takes a RestClientConfiguration and an Executor. Inside the constructor it builds a netty Bootstrap: ChannelOption.CONNECT_TIMEOUT_MILLIS is set from configuration.getConnectionTimeout(); the IdleStateHandler's readerIdleTime, writerIdleTime and allIdleTime come from configuration.getIdlenessTimeout(); the HttpObjectAggregator's maxContentLength comes from configuration.getMaxContentLength(); and the SSL handler comes from configuration.getSslHandlerFactory().
Summary: RestClientConfiguration has four properties: sslHandlerFactory, connectionTimeout, idlenessTimeout and maxContentLength. The fromConfiguration method creates the SSLHandlerFactory from the Configuration; the related settings are security.ssl.rest.enabled (default false), security.ssl.protocol (default TLSv1.2), security.ssl.algorithms (default TLS_RSA_WITH_AES_128_CBC_SHA) and security.ssl.rest.authentication-enabled (default false). connectionTimeout reads rest.connection-timeout, default 15000 milliseconds; idlenessTimeout reads rest.idleness-timeout, default 5 minutes; maxContentLength reads rest.client.max-content-length, default 104_857_600. The RestClient constructor takes a RestClientConfiguration and an Executor and builds a netty Bootstrap, wiring ChannelOption.CONNECT_TIMEOUT_MILLIS to configuration.getConnectionTimeout(), the IdleStateHandler's readerIdleTime/writerIdleTime/allIdleTime to configuration.getIdlenessTimeout(), the HttpObjectAggregator's maxContentLength to configuration.getMaxContentLength(), and the SSL handler to configuration.getSslHandlerFactory().
doc: RestClientConfiguration
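The sketch below (mine, not from the post) wires the three client settings through RestClientConfiguration.fromConfiguration and builds a RestClient with a single-threaded executor, assuming flink-runtime 1.7.x on the classpath; the values are arbitrary.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.flink.api.common.time.Time;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.rest.RestClient;
import org.apache.flink.runtime.rest.RestClientConfiguration;

public class RestClientExample {
    public static void main(String[] args) throws Exception {
        Configuration config = new Configuration();
        config.setLong("rest.connection-timeout", 30_000L);      // default: 15000 ms
        config.setLong("rest.idleness-timeout", 10 * 60_000L);   // default: 5 minutes
        config.setInteger("rest.client.max-content-length", 32 * 1024 * 1024); // default: 104857600

        ExecutorService executor = Executors.newSingleThreadExecutor();
        RestClientConfiguration clientConfig = RestClientConfiguration.fromConfiguration(config);
        RestClient restClient = new RestClient(clientConfig, executor); // builds the netty Bootstrap
        restClient.shutdown(Time.seconds(5));
        executor.shutdown();
    }
}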

March 7, 2019

基于实时计算(Flink)与高斯模型构建实时异常检测系统

Case & solution index page: Alibaba Cloud Realtime Compute cases & solutions.
1. Overview
Anomaly detection refers to identifying items, events or observations that do not conform to an expected pattern or to the other items in a dataset. Practical applications include intrusion detection, fraud detection, fault detection, system health monitoring, sensor-network event detection and ecosystem disturbance detection. I previously described another approach in "A Near-Real-Time Anomaly Detection System", but in that architecture Flink mainly handled post-detection analysis, while the actual detection was pushed upstream into the business systems. In this article I introduce a solution that performs real-time anomaly detection directly in Flink.
2. Anomaly detection algorithms
2.1 Types of anomalies
Anomalies (outliers) fall into three types: global outliers, the most basic kind, i.e. a single point far away from the rest; contextual (conditional) outliers, points that are not anomalous globally but are anomalous in a particular context, for example being male is not an anomaly, but it is if the context is a women's restroom; and collective outliers, where a single point is not anomalous but a whole series of them together is, for example an occasional service delay is normal, but most services in the system being delayed at once is not. This article focuses on the basic principle, so it uses the simplest case, the global outlier, i.e. it only checks whether a single event deviates from normal. A complete taxonomy of anomalies can be found in the reference linked in the original post.
2.2 The detection algorithm
There are many anomaly detection algorithms; for the theory see the "Novelty and Outlier Detection" chapter of scikit-learn. This article picks the simplest one, anomaly detection based on the Gaussian distribution. Suppose we already have a set of normal samples x(1), x(2), ..., x(m). For a new data point x we decide whether it is normal by estimating how likely x is under the normal data: if the probability of x is above some threshold it is normal, otherwise it is an anomaly. This approach is called density estimation. We assume the data follow a Gaussian (normal) distribution, so values near the middle of the distribution are normal and values in the tails are likely anomalous. If a variable x follows the Gaussian distribution x ~ N(μ, σ²), its probability density function is
p(x; μ, σ²) = 1 / (sqrt(2π) · σ) · exp(−(x − μ)² / (2σ²)).
The steps of the algorithm are: for the given dataset x(1), x(2), ..., x(m), estimate μ and σ² for each feature j as
μj = (1/m) · Σi xj(i)  and  σj² = (1/m) · Σi (xj(i) − μj)².
Once we have the mean and variance estimates for every feature, for a new instance x we compute the probability of each feature under its Gaussian and multiply them to get the overall probability:
p(x) = Πj p(xj; μj, σj²).
Note: if the event you want to check has only one feature, there is obviously nothing to multiply. Finally, choose a threshold ε and use p(x) = ε as the decision boundary: when p(x) > ε the data point is predicted to be normal, otherwise it is an anomaly, and the detection is complete. Note: ε can simply be estimated by hand or obtained with a little training; the training procedure is not covered here (see the reference linked in the original post). To summarize, the whole model is just the mean and variance of each feature computed on normal data plus the overall threshold, so the model is tiny and can be computed and shipped together with the code. (For better decoupling it is of course preferable to store it independently and publish it via registration or configuration.)
3. A real-time anomaly detection system based on Flink and the Gaussian model
With the basic algorithm covered, this section designs a real-time anomaly detection system on top of Flink and the Gaussian distribution. Suppose you are an operations engineer responsible for the company's IT resources and, to keep things stable and catch host or system problems early, you design the following real-time anomaly detection system. The system uses the Kappa architecture; for an introduction see "Data warehouses and a real-time data warehouse case". (The original post shows a diagram of the system architecture and the chosen software here.) The data sources have two parts: host metrics collected by collectd and system logs collected by Filebeat, both pushed to Kafka. Data flows through Kafka, which supports real-time layering during the Flink computation. The results are finally stored in Elasticsearch and visualized with Kibana. The anomaly detection itself is done by Flink, and the pipeline is simple: data cleansing, normalizing the raw records; feature extraction, computing the features of the chosen event, for example the frequency at which a service writes log entries is a feature, and since failed calls produce failure records, a rising frequency of failure records suggests the system may be in trouble; feature statistics, i.e. fitting the Gaussian for the feature (the mean and variance fully determine it), here computed online, which has the advantage that it can be updated at any time without an explicit training phase, at the cost of being sensitive to anomalous data (the alternative is to select normal data offline and fit the Gaussian there); anomaly detection, applying the algorithm from section 2.2 to flag anomalous events; and output, writing the detected anomalies downstream. With that, a simple real-time anomaly detection system is complete.
4. Conclusion
In this article, adding a simple anomaly detection algorithm on top of the Kappa architecture yields a simple but effective real-time anomaly detection system. The architecture scales well: the Flink-based Kappa architecture lets the system handle very large data streams and finish the processing while the data is in flight. Although the detection algorithm is embedded directly in the Flink logic here, in practice it is easy to decouple the system through an algorithm API: the algorithm team trains and exposes a service that decides whether an event or feature is anomalous, and the Flink job calls that service after computing the feature values. The solution comes from a real case; for further reading see: A Near-Real-Time Anomaly Detection System; Data warehouses and a real-time data warehouse case; Real-time fraud detection (risk control); Novelty and Outlier Detection; Applying the Kappa architecture in the telco industry; Kappa architecture and Bayesian models yield quick, accurate analytics in cloud monitoring systems; Anomaly detection; Gaussian distribution and anomaly detection; Outlier Analysis: A Quick Guide to the Different Types of Outliers.
Author: 付空. This article is original content from the Yunqi community (云栖社区) and may not be reproduced without permission.
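To make section 2.2 concrete, here is a self-contained, single-feature Java sketch of the density-estimation idea (my own illustration, not code from the article); in the Flink job this logic would live in the anomaly-detection step after feature extraction, and ε would be the tuned threshold.

import java.util.Arrays;

/** Minimal single-feature Gaussian anomaly detector, following section 2.2. */
public class GaussianAnomalyDetector {
    private final double mu;      // estimated mean of the "normal" samples
    private final double sigma2;  // estimated variance of the "normal" samples

    GaussianAnomalyDetector(double[] normalSamples) {
        this.mu = Arrays.stream(normalSamples).average().orElse(0.0);
        this.sigma2 = Arrays.stream(normalSamples).map(x -> (x - mu) * (x - mu)).average().orElse(1.0);
    }

    /** Gaussian density p(x) = 1 / (sqrt(2*pi) * sigma) * exp(-(x - mu)^2 / (2 * sigma^2)). */
    double density(double x) {
        return Math.exp(-(x - mu) * (x - mu) / (2 * sigma2)) / Math.sqrt(2 * Math.PI * sigma2);
    }

    /** x is flagged as anomalous when its density falls below the threshold epsilon. */
    boolean isAnomaly(double x, double epsilon) {
        return density(x) < epsilon;
    }

    public static void main(String[] args) {
        // e.g. failure-log frequency per minute observed while the system was healthy
        double[] normal = {2, 3, 2, 4, 3, 2, 3, 4, 3, 2};
        GaussianAnomalyDetector detector = new GaussianAnomalyDetector(normal);
        System.out.println(detector.isAnomaly(3, 0.02));  // false: close to the mean
        System.out.println(detector.isAnomaly(20, 0.02)); // true: far out in the tail
    }
}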

March 6, 2019

聊聊flink的DualKeyMap

序本文主要研究一下flink的DualKeyMap实例 @Test public void testKeySets() { final Random random = new Random(); final int capacity = 10; final Set<Tuple2<Integer, Integer>> keys = new HashSet<>(capacity); for (int i = 0; i < capacity; i++) { int keyA = random.nextInt(); int keyB = random.nextInt(); keys.add(Tuple2.of(keyA, keyB)); } final DualKeyMap<Integer, Integer, String> dualKeyMap = new DualKeyMap<>(capacity); for (Tuple2<Integer, Integer> key : keys) { dualKeyMap.put(key.f0, key.f1, “foobar”); } assertThat(dualKeyMap.keySetA(), Matchers.equalTo(keys.stream().map(t -> t.f0).collect(Collectors.toSet()))); assertThat(dualKeyMap.keySetB(), Matchers.equalTo(keys.stream().map(t -> t.f1).collect(Collectors.toSet()))); }DualKeyMap有两个key,put值的时候,需要指定keyA及keyBDualKeyMapflink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/jobmaster/slotpool/DualKeyMap.javapublic class DualKeyMap<A, B, V> { private final HashMap<A, Tuple2<B, V>> aMap; private final HashMap<B, A> bMap; private transient Collection<V> values; public DualKeyMap(int initialCapacity) { this.aMap = new HashMap<>(initialCapacity); this.bMap = new HashMap<>(initialCapacity); } public int size() { return aMap.size(); } public V getKeyA(A aKey) { final Tuple2<B, V> value = aMap.get(aKey); if (value != null) { return value.f1; } else { return null; } } public V getKeyB(B bKey) { final A aKey = bMap.get(bKey); if (aKey != null) { return aMap.get(aKey).f1; } else { return null; } } public V put(A aKey, B bKey, V value) { Tuple2<B, V> aValue = aMap.put(aKey, Tuple2.of(bKey, value)); bMap.put(bKey, aKey); if (aValue != null) { return aValue.f1; } else { return null; } } public boolean containsKeyA(A aKey) { return aMap.containsKey(aKey); } public boolean containsKeyB(B bKey) { return bMap.containsKey(bKey); } public V removeKeyA(A aKey) { Tuple2<B, V> aValue = aMap.remove(aKey); if (aValue != null) { bMap.remove(aValue.f0); return aValue.f1; } else { return null; } } public V removeKeyB(B bKey) { A aKey = bMap.remove(bKey); if (aKey != null) { Tuple2<B, V> aValue = aMap.remove(aKey); if (aValue != null) { return aValue.f1; } else { return null; } } else { return null; } } public Collection<V> values() { Collection<V> vs = values; if (vs == null) { vs = new Values(); values = vs; } return vs; } public Set<A> keySetA() { return aMap.keySet(); } public Set<B> keySetB() { return bMap.keySet(); } public void clear() { aMap.clear(); bMap.clear(); } // ———————————————————————– // Inner classes // ———————————————————————– /** * Collection which contains the values of the dual key map. / private final class Values extends AbstractCollection<V> { @Override public Iterator<V> iterator() { return new ValueIterator(); } @Override public int size() { return aMap.size(); } } /* * Iterator which iterates over the values of the dual key map. 
*/ private final class ValueIterator implements Iterator<V> { private final Iterator<Tuple2<B, V>> iterator = aMap.values().iterator(); @Override public boolean hasNext() { return iterator.hasNext(); } @Override public V next() { Tuple2<B, V> value = iterator.next(); return value.f1; } }}
DualKeyMap declares three type parameters, A, B and V, i.e. the types of keyA, keyB and the value. It maintains two HashMaps: in aMap the key is keyA and the value is a Tuple2<B, V>, while in bMap the key is keyB and the value is keyA. DualKeyMap provides the getKeyA, getKeyB, containsKeyA, containsKeyB, removeKeyA, removeKeyB, keySetA, keySetB, size, put, values and clear methods. The values method returns a Values instance, which extends AbstractCollection and whose iterator method returns a ValueIterator; ValueIterator implements the Iterator interface on top of aMap.values().iterator().
Summary: DualKeyMap declares three type parameters, A, B and V, the types of keyA, keyB and the value; it maintains two HashMaps, aMap keyed by keyA with Tuple2<B, V> values, and bMap keyed by keyB with keyA as value. DualKeyMap provides getKeyA, getKeyB, containsKeyA, containsKeyB, removeKeyA, removeKeyB, keySetA, keySetB, size, put, values and clear; a put always requires both keyA and keyB. The values method returns Values, which extends AbstractCollection and whose iterator is a ValueIterator; ValueIterator implements Iterator on top of aMap.values().iterator().
doc: DualKeyMap
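For illustration, a small usage sketch of my own based on the API shown above (DualKeyMap is an internal flink-runtime class, so this assumes flink-runtime on the classpath); the key and value literals are made up.

import org.apache.flink.runtime.jobmaster.slotpool.DualKeyMap;

public class DualKeyMapExample {
    public static void main(String[] args) {
        DualKeyMap<String, Integer, String> map = new DualKeyMap<>(4);
        // a put always needs both keys
        map.put("slot-request-1", 42, "allocated");
        // the same value is reachable through either key
        System.out.println(map.getKeyA("slot-request-1")); // allocated
        System.out.println(map.getKeyB(42));                // allocated
        // removing by one key drops the entry for the other key as well
        map.removeKeyB(42);
        System.out.println(map.containsKeyA("slot-request-1")); // false
    }
}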

March 6, 2019

聊聊flink的slot.idle.timeout配置

序本文主要研究一下flink的slot.idle.timeout配置JobManagerOptionsflink-release-1.7.2/flink-core/src/main/java/org/apache/flink/configuration/JobManagerOptions.java@PublicEvolvingpublic class JobManagerOptions { //…… /** * The timeout in milliseconds for a idle slot in Slot Pool. / public static final ConfigOption<Long> SLOT_IDLE_TIMEOUT = key(“slot.idle.timeout”) // default matches heartbeat.timeout so that sticky allocation is not lost on timeouts for local recovery .defaultValue(HeartbeatManagerOptions.HEARTBEAT_TIMEOUT.defaultValue()) .withDescription(“The timeout in milliseconds for a idle slot in Slot Pool.”); //……}slot.idle.timeout默认为HeartbeatManagerOptions.HEARTBEAT_TIMEOUT.defaultValue(),即50000L毫秒SlotPoolflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/jobmaster/slotpool/SlotPool.javapublic class SlotPool extends RpcEndpoint implements SlotPoolGateway, AllocatedSlotActions { /* The interval (in milliseconds) in which the SlotPool writes its slot distribution on debug level. / private static final int STATUS_LOG_INTERVAL_MS = 60_000; private final JobID jobId; private final SchedulingStrategy schedulingStrategy; private final ProviderAndOwner providerAndOwner; /* All registered TaskManagers, slots will be accepted and used only if the resource is registered. / private final HashSet<ResourceID> registeredTaskManagers; /* The book-keeping of all allocated slots. / private final AllocatedSlots allocatedSlots; /* The book-keeping of all available slots. / private final AvailableSlots availableSlots; /* All pending requests waiting for slots. / private final DualKeyMap<SlotRequestId, AllocationID, PendingRequest> pendingRequests; /* The requests that are waiting for the resource manager to be connected. / private final HashMap<SlotRequestId, PendingRequest> waitingForResourceManager; /* Timeout for external request calls (e.g. to the ResourceManager or the TaskExecutor). / private final Time rpcTimeout; /* Timeout for releasing idle slots. / private final Time idleSlotTimeout; private final Clock clock; /* Managers for the different slot sharing groups. / protected final Map<SlotSharingGroupId, SlotSharingManager> slotSharingManagers; /* the fencing token of the job manager. / private JobMasterId jobMasterId; /* The gateway to communicate with resource manager. / private ResourceManagerGateway resourceManagerGateway; private String jobManagerAddress; //…… /* * Start the slot pool to accept RPC calls. * * @param jobMasterId The necessary leader id for running the job. * @param newJobManagerAddress for the slot requests which are sent to the resource manager / public void start(JobMasterId jobMasterId, String newJobManagerAddress) throws Exception { this.jobMasterId = checkNotNull(jobMasterId); this.jobManagerAddress = checkNotNull(newJobManagerAddress); // TODO - start should not throw an exception try { super.start(); } catch (Exception e) { throw new RuntimeException(“This should never happen”, e); } scheduleRunAsync(this::checkIdleSlot, idleSlotTimeout); if (log.isDebugEnabled()) { scheduleRunAsync(this::scheduledLogStatus, STATUS_LOG_INTERVAL_MS, TimeUnit.MILLISECONDS); } } /* * Check the available slots, release the slot that is idle for a long time. 
/ private void checkIdleSlot() { // The timestamp in SlotAndTimestamp is relative final long currentRelativeTimeMillis = clock.relativeTimeMillis(); final List<AllocatedSlot> expiredSlots = new ArrayList<>(availableSlots.size()); for (SlotAndTimestamp slotAndTimestamp : availableSlots.availableSlots.values()) { if (currentRelativeTimeMillis - slotAndTimestamp.timestamp > idleSlotTimeout.toMilliseconds()) { expiredSlots.add(slotAndTimestamp.slot); } } final FlinkException cause = new FlinkException(“Releasing idle slot.”); for (AllocatedSlot expiredSlot : expiredSlots) { final AllocationID allocationID = expiredSlot.getAllocationId(); if (availableSlots.tryRemove(allocationID) != null) { log.info(“Releasing idle slot [{}].”, allocationID); final CompletableFuture<Acknowledge> freeSlotFuture = expiredSlot.getTaskManagerGateway().freeSlot( allocationID, cause, rpcTimeout); freeSlotFuture.whenCompleteAsync( (Acknowledge ignored, Throwable throwable) -> { if (throwable != null) { if (registeredTaskManagers.contains(expiredSlot.getTaskManagerId())) { log.debug(“Releasing slot [{}] of registered TaskExecutor {} failed. " + “Trying to fulfill a different slot request.”, allocationID, expiredSlot.getTaskManagerId(), throwable); tryFulfillSlotRequestOrMakeAvailable(expiredSlot); } else { log.debug(“Releasing slot [{}] failed and owning TaskExecutor {} is no " + “longer registered. Discarding slot.”, allocationID, expiredSlot.getTaskManagerId()); } } }, getMainThreadExecutor()); } } scheduleRunAsync(this::checkIdleSlot, idleSlotTimeout); } //……}SlotPool在start方法里头,调用scheduleRunAsync方法,延时idleSlotTimeout调度执行checkIdleSlot;checkIdleSlot方法会挨个检查availableSlots的SlotAndTimestamp,判断当前时间与slotAndTimestamp.timestamp的时间差是否超过idleSlotTimeout,超过的话,则放入expiredSlots,之后对expiredSlots挨个进行availableSlots.tryRemove,然后调用TaskManagerGateway.freeSlot进行释放,之后再次调用scheduleRunAsync(this::checkIdleSlot, idleSlotTimeout)进行下一次的延时调度检测RpcEndpointflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/rpc/RpcEndpoint.javapublic abstract class RpcEndpoint implements RpcGateway { //…… /* * Execute the runnable in the main thread of the underlying RPC endpoint, with * a delay of the given number of milliseconds. * * @param runnable Runnable to be executed * @param delay The delay after which the runnable will be executed / protected void scheduleRunAsync(Runnable runnable, Time delay) { scheduleRunAsync(runnable, delay.getSize(), delay.getUnit()); } /* * Execute the runnable in the main thread of the underlying RPC endpoint, with * a delay of the given number of milliseconds. * * @param runnable Runnable to be executed * @param delay The delay after which the runnable will be executed */ protected void scheduleRunAsync(Runnable runnable, long delay, TimeUnit unit) { rpcServer.scheduleRunAsync(runnable, unit.toMillis(delay)); } //……}RpcEndpoint提供了scheduleRunAsync,其最后调用的是rpcServer.scheduleRunAsync小结slot.idle.timeout默认为HeartbeatManagerOptions.HEARTBEAT_TIMEOUT.defaultValue(),即50000L毫秒SlotPool在start方法里头,调用scheduleRunAsync方法,延时idleSlotTimeout调度执行checkIdleSlot;checkIdleSlot方法会挨个检查availableSlots的SlotAndTimestamp,判断当前时间与slotAndTimestamp.timestamp的时间差是否超过idleSlotTimeout,超过的话,则放入expiredSlots,之后对expiredSlots挨个进行availableSlots.tryRemove,然后调用TaskManagerGateway.freeSlot进行释放,之后再次调用scheduleRunAsync(this::checkIdleSlot, idleSlotTimeout)进行下一次的延时调度检测RpcEndpoint提供了scheduleRunAsync,其最后调用的是rpcServer.scheduleRunAsyncdocslot-idle-timeout ...
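As a quick illustration (my own sketch, not from the post), the option can be overridden on a Configuration with its string key and read back through JobManagerOptions.SLOT_IDLE_TIMEOUT, which is how the SlotPool side obtains it; the 2-minute value is arbitrary.

import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.JobManagerOptions;

public class SlotIdleTimeoutExample {
    public static void main(String[] args) {
        Configuration config = new Configuration();
        // default is HeartbeatManagerOptions.HEARTBEAT_TIMEOUT, i.e. 50000 ms
        config.setLong("slot.idle.timeout", 2 * 60 * 1000L);
        System.out.println(config.getLong(JobManagerOptions.SLOT_IDLE_TIMEOUT)); // 120000
    }
}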

March 5, 2019

聊聊flink的slot.request.timeout配置

序本文主要研究一下flink的slot.request.timeout配置JobManagerOptionsflink-release-1.7.2/flink-core/src/main/java/org/apache/flink/configuration/JobManagerOptions.java@PublicEvolvingpublic class JobManagerOptions { //…… /** * The timeout in milliseconds for requesting a slot from Slot Pool. / public static final ConfigOption<Long> SLOT_REQUEST_TIMEOUT = key(“slot.request.timeout”) .defaultValue(5L * 60L * 1000L) .withDescription(“The timeout in milliseconds for requesting a slot from Slot Pool.”); //……}slot.request.timeout默认为5分钟SlotManagerConfigurationflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/resourcemanager/slotmanager/SlotManagerConfiguration.javapublic class SlotManagerConfiguration { private static final Logger LOGGER = LoggerFactory.getLogger(SlotManagerConfiguration.class); private final Time taskManagerRequestTimeout; private final Time slotRequestTimeout; private final Time taskManagerTimeout; public SlotManagerConfiguration( Time taskManagerRequestTimeout, Time slotRequestTimeout, Time taskManagerTimeout) { this.taskManagerRequestTimeout = Preconditions.checkNotNull(taskManagerRequestTimeout); this.slotRequestTimeout = Preconditions.checkNotNull(slotRequestTimeout); this.taskManagerTimeout = Preconditions.checkNotNull(taskManagerTimeout); } public Time getTaskManagerRequestTimeout() { return taskManagerRequestTimeout; } public Time getSlotRequestTimeout() { return slotRequestTimeout; } public Time getTaskManagerTimeout() { return taskManagerTimeout; } public static SlotManagerConfiguration fromConfiguration(Configuration configuration) throws ConfigurationException { final String strTimeout = configuration.getString(AkkaOptions.ASK_TIMEOUT); final Time rpcTimeout; try { rpcTimeout = Time.milliseconds(Duration.apply(strTimeout).toMillis()); } catch (NumberFormatException e) { throw new ConfigurationException(“Could not parse the resource manager’s timeout " + “value " + AkkaOptions.ASK_TIMEOUT + ‘.’, e); } final Time slotRequestTimeout = getSlotRequestTimeout(configuration); final Time taskManagerTimeout = Time.milliseconds( configuration.getLong(ResourceManagerOptions.TASK_MANAGER_TIMEOUT)); return new SlotManagerConfiguration(rpcTimeout, slotRequestTimeout, taskManagerTimeout); } private static Time getSlotRequestTimeout(final Configuration configuration) { final long slotRequestTimeoutMs; if (configuration.contains(ResourceManagerOptions.SLOT_REQUEST_TIMEOUT)) { LOGGER.warn(“Config key {} is deprecated; use {} instead.”, ResourceManagerOptions.SLOT_REQUEST_TIMEOUT, JobManagerOptions.SLOT_REQUEST_TIMEOUT); slotRequestTimeoutMs = configuration.getLong(ResourceManagerOptions.SLOT_REQUEST_TIMEOUT); } else { slotRequestTimeoutMs = configuration.getLong(JobManagerOptions.SLOT_REQUEST_TIMEOUT); } return Time.milliseconds(slotRequestTimeoutMs); }}SlotManagerConfiguration的getSlotRequestTimeout方法会从配置文件读取JobManagerOptions.SLOT_REQUEST_TIMEOUTSlotManagerflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/resourcemanager/slotmanager/SlotManager.javapublic class SlotManager implements AutoCloseable { private static final Logger LOG = LoggerFactory.getLogger(SlotManager.class); /* Scheduled executor for timeouts. / private final ScheduledExecutor scheduledExecutor; /* Timeout for slot requests to the task manager. / private final Time taskManagerRequestTimeout; /* Timeout after which an allocation is discarded. / private final Time slotRequestTimeout; /* Timeout after which an unused TaskManager is released. 
/ private final Time taskManagerTimeout; /* Map for all registered slots. / private final HashMap<SlotID, TaskManagerSlot> slots; /* Index of all currently free slots. / private final LinkedHashMap<SlotID, TaskManagerSlot> freeSlots; /* All currently registered task managers. / private final HashMap<InstanceID, TaskManagerRegistration> taskManagerRegistrations; /* Map of fulfilled and active allocations for request deduplication purposes. / private final HashMap<AllocationID, SlotID> fulfilledSlotRequests; /* Map of pending/unfulfilled slot allocation requests. / private final HashMap<AllocationID, PendingSlotRequest> pendingSlotRequests; private final HashMap<TaskManagerSlotId, PendingTaskManagerSlot> pendingSlots; /* ResourceManager’s id. / private ResourceManagerId resourceManagerId; /* Executor for future callbacks which have to be “synchronized”. / private Executor mainThreadExecutor; /* Callbacks for resource (de-)allocations. / private ResourceActions resourceActions; private ScheduledFuture<?> taskManagerTimeoutCheck; private ScheduledFuture<?> slotRequestTimeoutCheck; /* True iff the component has been started. / private boolean started; public SlotManager( ScheduledExecutor scheduledExecutor, Time taskManagerRequestTimeout, Time slotRequestTimeout, Time taskManagerTimeout) { this.scheduledExecutor = Preconditions.checkNotNull(scheduledExecutor); this.taskManagerRequestTimeout = Preconditions.checkNotNull(taskManagerRequestTimeout); this.slotRequestTimeout = Preconditions.checkNotNull(slotRequestTimeout); this.taskManagerTimeout = Preconditions.checkNotNull(taskManagerTimeout); slots = new HashMap<>(16); freeSlots = new LinkedHashMap<>(16); taskManagerRegistrations = new HashMap<>(4); fulfilledSlotRequests = new HashMap<>(16); pendingSlotRequests = new HashMap<>(16); pendingSlots = new HashMap<>(16); resourceManagerId = null; resourceActions = null; mainThreadExecutor = null; taskManagerTimeoutCheck = null; slotRequestTimeoutCheck = null; started = false; } public void start(ResourceManagerId newResourceManagerId, Executor newMainThreadExecutor, ResourceActions newResourceActions) { LOG.info(“Starting the SlotManager.”); this.resourceManagerId = Preconditions.checkNotNull(newResourceManagerId); mainThreadExecutor = Preconditions.checkNotNull(newMainThreadExecutor); resourceActions = Preconditions.checkNotNull(newResourceActions); started = true; taskManagerTimeoutCheck = scheduledExecutor.scheduleWithFixedDelay( () -> mainThreadExecutor.execute( () -> checkTaskManagerTimeouts()), 0L, taskManagerTimeout.toMilliseconds(), TimeUnit.MILLISECONDS); slotRequestTimeoutCheck = scheduledExecutor.scheduleWithFixedDelay( () -> mainThreadExecutor.execute( () -> checkSlotRequestTimeouts()), 0L, slotRequestTimeout.toMilliseconds(), TimeUnit.MILLISECONDS); } /* * Suspends the component. This clears the internal state of the slot manager. 
*/ public void suspend() { LOG.info(“Suspending the SlotManager.”); // stop the timeout checks for the TaskManagers and the SlotRequests if (taskManagerTimeoutCheck != null) { taskManagerTimeoutCheck.cancel(false); taskManagerTimeoutCheck = null; } if (slotRequestTimeoutCheck != null) { slotRequestTimeoutCheck.cancel(false); slotRequestTimeoutCheck = null; } for (PendingSlotRequest pendingSlotRequest : pendingSlotRequests.values()) { cancelPendingSlotRequest(pendingSlotRequest); } pendingSlotRequests.clear(); ArrayList<InstanceID> registeredTaskManagers = new ArrayList<>(taskManagerRegistrations.keySet()); for (InstanceID registeredTaskManager : registeredTaskManagers) { unregisterTaskManager(registeredTaskManager); } resourceManagerId = null; resourceActions = null; started = false; } public boolean registerSlotRequest(SlotRequest slotRequest) throws SlotManagerException { checkInit(); if (checkDuplicateRequest(slotRequest.getAllocationId())) { LOG.debug(“Ignoring a duplicate slot request with allocation id {}.”, slotRequest.getAllocationId()); return false; } else { PendingSlotRequest pendingSlotRequest = new PendingSlotRequest(slotRequest); pendingSlotRequests.put(slotRequest.getAllocationId(), pendingSlotRequest); try { internalRequestSlot(pendingSlotRequest); } catch (ResourceManagerException e) { // requesting the slot failed –> remove pending slot request pendingSlotRequests.remove(slotRequest.getAllocationId()); throw new SlotManagerException(“Could not fulfill slot request " + slotRequest.getAllocationId() + ‘.’, e); } return true; } } private void checkSlotRequestTimeouts() { if (!pendingSlotRequests.isEmpty()) { long currentTime = System.currentTimeMillis(); Iterator<Map.Entry<AllocationID, PendingSlotRequest>> slotRequestIterator = pendingSlotRequests.entrySet().iterator(); while (slotRequestIterator.hasNext()) { PendingSlotRequest slotRequest = slotRequestIterator.next().getValue(); if (currentTime - slotRequest.getCreationTimestamp() >= slotRequestTimeout.toMilliseconds()) { slotRequestIterator.remove(); if (slotRequest.isAssigned()) { cancelPendingSlotRequest(slotRequest); } resourceActions.notifyAllocationFailure( slotRequest.getJobId(), slotRequest.getAllocationId(), new TimeoutException(“The allocation could not be fulfilled in time.”)); } } } } 
//……}
The SlotManager constructor takes the slotRequestTimeout parameter; it maintains the pendingSlotRequests map. The start method registers slotRequestTimeoutCheck, which is scheduled once every slotRequestTimeout and runs the checkSlotRequestTimeouts method; the suspend method cancels these pending slot requests and then clears the pendingSlotRequests map. The registerSlotRequest method first runs checkDuplicateRequest to check for duplicates; if there is none, it puts the slotRequest into pendingSlotRequests and calls internalRequestSlot to allocate a slot, and if that throws it removes the request from pendingSlotRequests again and throws a SlotManagerException. checkSlotRequestTimeouts iterates over pendingSlotRequests and, comparing slotRequest.getCreationTimestamp() with the current time, checks whether the elapsed time is greater than or equal to slotRequestTimeout; requests that have timed out are removed from pendingSlotRequests, cancelled, and reported via resourceActions.notifyAllocationFailure.
Summary: SlotManagerConfiguration's getSlotRequestTimeout method reads JobManagerOptions.SLOT_REQUEST_TIMEOUT from the configuration; slot.request.timeout defaults to 5 minutes. The SlotManager constructor takes the slotRequestTimeout parameter and maintains the pendingSlotRequests map; start registers slotRequestTimeoutCheck, scheduled once every slotRequestTimeout to run checkSlotRequestTimeouts; suspend cancels the pending slot requests and then clears the pendingSlotRequests map. registerSlotRequest first runs checkDuplicateRequest; if there is no duplicate it puts the slotRequest into pendingSlotRequests and calls internalRequestSlot, and on failure removes it from pendingSlotRequests again and throws a SlotManagerException. checkSlotRequestTimeouts iterates over pendingSlotRequests, compares slotRequest.getCreationTimestamp() with the current time against slotRequestTimeout, and for timed-out requests removes them from pendingSlotRequests, cancels them, and triggers resourceActions.notifyAllocationFailure.
doc: slot-request-timeout
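The following standalone sketch (mine, not Flink code) mirrors the sweep that checkSlotRequestTimeouts performs, using plain strings and creation timestamps in place of AllocationID and PendingSlotRequest.

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

public class SlotRequestTimeoutSweep {
    public static void main(String[] args) {
        final long slotRequestTimeoutMs = 5 * 60 * 1000L; // slot.request.timeout default: 5 minutes
        final long now = System.currentTimeMillis();

        Map<String, Long> pendingRequestCreationTime = new HashMap<>();
        pendingRequestCreationTime.put("allocation-1", now - 6 * 60 * 1000L); // older than the timeout
        pendingRequestCreationTime.put("allocation-2", now);                  // still fresh

        Iterator<Map.Entry<String, Long>> it = pendingRequestCreationTime.entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry<String, Long> entry = it.next();
            if (now - entry.getValue() >= slotRequestTimeoutMs) {
                // the real SlotManager also cancels the request and calls resourceActions.notifyAllocationFailure
                it.remove();
                System.out.println("timed out: " + entry.getKey());
            }
        }
        System.out.println("still pending: " + pendingRequestCreationTime.keySet());
    }
}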

March 4, 2019

聊聊flink的BlobStoreService

序本文主要研究一下flink的BlobStoreServiceBlobViewflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/blob/BlobView.javapublic interface BlobView { /** * Copies a blob to a local file. * * @param jobId ID of the job this blob belongs to (or <tt>null</tt> if job-unrelated) * @param blobKey The blob ID * @param localFile The local file to copy to * * @return whether the file was copied (<tt>true</tt>) or not (<tt>false</tt>) * @throws IOException If the copy fails / boolean get(JobID jobId, BlobKey blobKey, File localFile) throws IOException;}BlobView定义了get方法,将指定的blob拷贝到localFileBlobStoreflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/blob/BlobStore.javapublic interface BlobStore extends BlobView { /* * Copies the local file to the blob store. * * @param localFile The file to copy * @param jobId ID of the job this blob belongs to (or <tt>null</tt> if job-unrelated) * @param blobKey The ID for the file in the blob store * * @return whether the file was copied (<tt>true</tt>) or not (<tt>false</tt>) * @throws IOException If the copy fails / boolean put(File localFile, JobID jobId, BlobKey blobKey) throws IOException; /* * Tries to delete a blob from storage. * * <p>NOTE: This also tries to delete any created directories if empty.</p> * * @param jobId ID of the job this blob belongs to (or <tt>null</tt> if job-unrelated) * @param blobKey The blob ID * * @return <tt>true</tt> if the given blob is successfully deleted or non-existing; * <tt>false</tt> otherwise / boolean delete(JobID jobId, BlobKey blobKey); /* * Tries to delete all blobs for the given job from storage. * * <p>NOTE: This also tries to delete any created directories if empty.</p> * * @param jobId The JobID part of all blobs to delete * * @return <tt>true</tt> if the job directory is successfully deleted or non-existing; * <tt>false</tt> otherwise / boolean deleteAll(JobID jobId);}BlobStore继承了BlobView,它定义了put、delete、deleteAll方法BlobStoreServiceflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/blob/BlobStoreService.javapublic interface BlobStoreService extends BlobStore, Closeable { /* * Closes and cleans up the store. This entails the deletion of all blobs. / void closeAndCleanupAllData();}BlobStoreService继承了BlobStore及Closeable接口,它定义了closeAndCleanupAllData方法;它有两个实现类,分别是VoidBlobStore、FileSystemBlobStoreVoidBlobStoreflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/blob/VoidBlobStore.javapublic class VoidBlobStore implements BlobStoreService { @Override public boolean put(File localFile, JobID jobId, BlobKey blobKey) throws IOException { return false; } @Override public boolean get(JobID jobId, BlobKey blobKey, File localFile) throws IOException { return false; } @Override public boolean delete(JobID jobId, BlobKey blobKey) { return true; } @Override public boolean deleteAll(JobID jobId) { return true; } @Override public void closeAndCleanupAllData() {} @Override public void close() throws IOException {}}VoidBlobStore实现了BlobStoreService接口,它执行空操作FileSystemBlobStoreflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/blob/FileSystemBlobStore.javapublic class FileSystemBlobStore implements BlobStoreService { private static final Logger LOG = LoggerFactory.getLogger(FileSystemBlobStore.class); /* The file system in which blobs are stored. / private final FileSystem fileSystem; /* The base path of the blob store. 
*/ private final String basePath; public FileSystemBlobStore(FileSystem fileSystem, String storagePath) throws IOException { this.fileSystem = checkNotNull(fileSystem); this.basePath = checkNotNull(storagePath) + “/blob”; LOG.info(“Creating highly available BLOB storage directory at {}”, basePath); fileSystem.mkdirs(new Path(basePath)); LOG.debug(“Created highly available BLOB storage directory at {}”, basePath); } // - Put —————————————————————— @Override public boolean put(File localFile, JobID jobId, BlobKey blobKey) throws IOException { return put(localFile, BlobUtils.getStorageLocationPath(basePath, jobId, blobKey)); } private boolean put(File fromFile, String toBlobPath) throws IOException { try (OutputStream os = fileSystem.create(new Path(toBlobPath), FileSystem.WriteMode.OVERWRITE)) { LOG.debug(“Copying from {} to {}.”, fromFile, toBlobPath); Files.copy(fromFile, os); } return true; } // - Get —————————————————————— @Override public boolean get(JobID jobId, BlobKey blobKey, File localFile) throws IOException { return get(BlobUtils.getStorageLocationPath(basePath, jobId, blobKey), localFile, blobKey); } private boolean get(String fromBlobPath, File toFile, BlobKey blobKey) throws IOException { checkNotNull(fromBlobPath, “Blob path”); checkNotNull(toFile, “File”); checkNotNull(blobKey, “Blob key”); if (!toFile.exists() && !toFile.createNewFile()) { throw new IOException(“Failed to create target file to copy to”); } final Path fromPath = new Path(fromBlobPath); MessageDigest md = BlobUtils.createMessageDigest(); final int buffSize = 4096; // like IOUtils#BLOCKSIZE, for chunked file copying boolean success = false; try (InputStream is = fileSystem.open(fromPath); FileOutputStream fos = new FileOutputStream(toFile)) { LOG.debug(“Copying from {} to {}.”, fromBlobPath, toFile); // not using IOUtils.copyBytes(is, fos) here to be able to create a hash on-the-fly final byte[] buf = new byte[buffSize]; int bytesRead = is.read(buf); while (bytesRead >= 0) { fos.write(buf, 0, bytesRead); md.update(buf, 0, bytesRead); bytesRead = is.read(buf); } // verify that file contents are correct final byte[] computedKey = md.digest(); if (!Arrays.equals(computedKey, blobKey.getHash())) { throw new IOException(“Detected data corruption during transfer”); } success = true; } finally { // if the copy fails, we need to remove the target file because // outside code relies on a correct file as long as it exists if (!success) { try { toFile.delete(); } catch (Throwable ignored) {} } } return true; // success is always true here } // - Delete ————————————————————— @Override public boolean delete(JobID jobId, BlobKey blobKey) { return delete(BlobUtils.getStorageLocationPath(basePath, jobId, blobKey)); } @Override public boolean deleteAll(JobID jobId) { return delete(BlobUtils.getStorageLocationPath(basePath, jobId)); } private boolean delete(String blobPath) { try { LOG.debug(“Deleting {}.”, blobPath); Path path = new Path(blobPath); boolean result = fileSystem.delete(path, true); // send a call to delete the directory containing the file. This will // fail (and be ignored) when some files still exist. 
try { fileSystem.delete(path.getParent(), false); fileSystem.delete(new Path(basePath), false); } catch (IOException ignored) {} return result; } catch (Exception e) { LOG.warn(“Failed to delete blob at " + blobPath); return false; } } @Override public void closeAndCleanupAllData() { try { LOG.debug(“Cleaning up {}.”, basePath); fileSystem.delete(new Path(basePath), true); } catch (Exception e) { LOG.error(“Failed to clean up recovery directory.”, e); } } @Override public void close() throws IOException { // nothing to do for the FileSystemBlobStore }}FileSystemBlobStore实现了BlobStoreService,它的构造器要求传入fileSystem及storagePath;put方法通过fileSystem.create来创建目标OutputStream,然后通过Files.copy把localFile拷贝到toBlobPath;get方法通过fileSystem.open打开要读取的blob,然后写入到localFile;delete及deleteAll方法通过BlobUtils.getStorageLocationPath获取blobPath,然后调用fileSystem.delete来删除;closeAndCleanupAllData方法直接调用fileSystem.delete来递归删除整个storagePath小结BlobView定义了get方法,将指定的blob拷贝到localFile;BlobStore继承了BlobView,它定义了put、delete、deleteAll方法BlobStoreService继承了BlobStore及Closeable接口,它定义了closeAndCleanupAllData方法;它有两个实现类,分别是VoidBlobStore、FileSystemBlobStoreVoidBlobStore实现了BlobStoreService接口,它执行空操作;FileSystemBlobStore实现了BlobStoreService,它的构造器要求传入fileSystem及storagePath;put方法通过fileSystem.create来创建目标OutputStream,然后通过Files.copy把localFile拷贝到toBlobPath;get方法通过fileSystem.open打开要读取的blob,然后写入到localFile;delete及deleteAll方法通过BlobUtils.getStorageLocationPath获取blobPath,然后调用fileSystem.delete来删除;closeAndCleanupAllData方法直接调用fileSystem.delete来递归删除整个storagePathdocBlobStoreService ...
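To make the storage layout tangible, here is a small sketch of my own that reproduces the path scheme of BlobUtils.getStorageLocationPath with plain strings (the real code takes a JobID and a BlobKey; the sample values below are made up).

public class BlobPathExample {
    static String storageLocationPath(String basePath, String jobId, String blobKey) {
        // job-unrelated blobs: $base/no_job/blob_$key
        // job-related blobs:   $base/job_$jobId/blob_$key
        return jobId == null
            ? String.format("%s/no_job/blob_%s", basePath, blobKey)
            : String.format("%s/job_%s/blob_%s", basePath, jobId, blobKey);
    }

    public static void main(String[] args) {
        System.out.println(storageLocationPath("/flink/ha/blob", "a1b2c3d4e5f6", "p-0123abcd"));
        System.out.println(storageLocationPath("/flink/ha/blob", null, "t-0123abcd"));
    }
}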

March 1, 2019

Blink 真香

Blink 开源了有一段时间了,竟然没发现有人写相关的博客,其实我已经在我的知识星球里开始写了,今天来看看 Blink 为什么香?我们先看看 Blink 黑色版本:对比下 Flink 版本你就知道黑色版本多好看了。你上传 jar 包的时候是这样的:我们来看看 Blink 运行的 job 长啥样?再来对比一下 Flink 的样子:查看 Job Task 的详情,可以看到开始时间、接收记录、并行度、duration、Queue in/out、TPS查看 subTask,这里可以直接点击这个日志就可以查看 task 日志:查看背压:查看 task metric,可以手动添加,支持的有很多,这点很重要,可以根据每个算子的监控以及时对每个算子进行调优:查看 job 运行时间段的情况:查看 running 的 job:查看已经完成的 job:查看 Task Manager:Task Manager 分配的资源详情:Task Manager metric 监控信息详情:Task Manager log 文件详情,包含运行产生的日志和 GC 日志:Task Manager 日志详情,支持高亮和分页,特别友好,妈妈再也不担心我看不见 “刷刷刷” 的日志了。总结介绍了 Flink 的 Blink 分支编译后运行的界面情况,总体来说很棒,期待后面 Blink 合并到 Flink!本文原创地址是: http://www.54tianzhisheng.cn/2019/02/28/blink/ , 未经允许禁止转载。关注我微信公众号:zhisheng另外我自己整理了些 Flink 的学习资料,目前已经全部放到微信公众号了。你可以加我的微信:zhisheng_tian,然后回复关键字:Flink 即可无条件获取到。更多私密资料请加入知识星球!Github 代码仓库https://github.com/zhisheng17/flink-learning/以后这个项目的所有代码都将放在这个仓库里,包含了自己学习 flink 的一些 demo 和博客。相关文章1、《从0到1学习Flink》—— Apache Flink 介绍2、《从0到1学习Flink》—— Mac 上搭建 Flink 1.6.0 环境并构建运行简单程序入门3、《从0到1学习Flink》—— Flink 配置文件详解4、《从0到1学习Flink》—— Data Source 介绍5、《从0到1学习Flink》—— 如何自定义 Data Source ?6、《从0到1学习Flink》—— Data Sink 介绍7、《从0到1学习Flink》—— 如何自定义 Data Sink ?8、《从0到1学习Flink》—— Flink Data transformation(转换)9、《从0到1学习Flink》—— 介绍Flink中的Stream Windows10、《从0到1学习Flink》—— Flink 中的几种 Time 详解11、《从0到1学习Flink》—— Flink 写入数据到 ElasticSearch12、《从0到1学习Flink》—— Flink 项目如何运行?13、《从0到1学习Flink》—— Flink 写入数据到 Kafka14、《从0到1学习Flink》—— Flink JobManager 高可用性配置15、《从0到1学习Flink》—— Flink parallelism 和 Slot 介绍16、《从0到1学习Flink》—— Flink 读取 Kafka 数据批量写入到 MySQL ...

March 1, 2019

聊聊flink的BlobWriter

序本文主要研究一下flink的BlobWriterBlobWriterflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/blob/BlobWriter.java/** * BlobWriter is used to upload data to the BLOB store. /public interface BlobWriter { Logger LOG = LoggerFactory.getLogger(BlobWriter.class); /* * Uploads the data of the given byte array for the given job to the BLOB server and makes it * a permanent BLOB. * * @param jobId * the ID of the job the BLOB belongs to * @param value * the buffer to upload * * @return the computed BLOB key identifying the BLOB on the server * * @throws IOException * thrown if an I/O error occurs while writing it to a local file, or uploading it to the HA * store / PermanentBlobKey putPermanent(JobID jobId, byte[] value) throws IOException; /* * Uploads the data from the given input stream for the given job to the BLOB server and makes it * a permanent BLOB. * * @param jobId * ID of the job this blob belongs to * @param inputStream * the input stream to read the data from * * @return the computed BLOB key identifying the BLOB on the server * * @throws IOException * thrown if an I/O error occurs while reading the data from the input stream, writing it to a * local file, or uploading it to the HA store / PermanentBlobKey putPermanent(JobID jobId, InputStream inputStream) throws IOException; /* * Returns the min size before data will be offloaded to the BLOB store. * * @return minimum offloading size / int getMinOffloadingSize(); /* * Serializes the given value and offloads it to the BlobServer if its size exceeds the minimum * offloading size of the BlobServer. * * @param value to serialize * @param jobId to which the value belongs. * @param blobWriter to use to offload the serialized value * @param <T> type of the value to serialize * @return Either the serialized value or the stored blob key * @throws IOException if the data cannot be serialized / static <T> Either<SerializedValue<T>, PermanentBlobKey> serializeAndTryOffload( T value, JobID jobId, BlobWriter blobWriter) throws IOException { Preconditions.checkNotNull(value); Preconditions.checkNotNull(jobId); Preconditions.checkNotNull(blobWriter); final SerializedValue<T> serializedValue = new SerializedValue<>(value); if (serializedValue.getByteArray().length < blobWriter.getMinOffloadingSize()) { return Either.Left(new SerializedValue<>(value)); } else { try { final PermanentBlobKey permanentBlobKey = blobWriter.putPermanent(jobId, serializedValue.getByteArray()); return Either.Right(permanentBlobKey); } catch (IOException e) { LOG.warn(“Failed to offload value {} for job {} to BLOB store.”, value, jobId, e); return Either.Left(serializedValue); } } }}BlobWriter定义了putPermanent、getMinOffloadingSize方法,同时还提供了serializeAndTryOffload静态方法用于序列化指定value并在其大小超过minimum offloading size时调用blobWriter.putPermanent存放到BlobServerBlobServerflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/blob/BlobServer.java/* * This class implements the BLOB server. The BLOB server is responsible for listening for incoming requests and * spawning threads to handle these requests. Furthermore, it takes care of creating the directory structure to store * the BLOBs or temporarily cache them. 
/public class BlobServer extends Thread implements BlobService, BlobWriter, PermanentBlobService, TransientBlobService { //…… @Override public PermanentBlobKey putPermanent(JobID jobId, byte[] value) throws IOException { checkNotNull(jobId); return (PermanentBlobKey) putBuffer(jobId, value, PERMANENT_BLOB); } @Override public PermanentBlobKey putPermanent(JobID jobId, InputStream inputStream) throws IOException { checkNotNull(jobId); return (PermanentBlobKey) putInputStream(jobId, inputStream, PERMANENT_BLOB); } /* * Returns the configuration used by the BLOB server. * * @return configuration / @Override public final int getMinOffloadingSize() { return blobServiceConfiguration.getInteger(BlobServerOptions.OFFLOAD_MINSIZE); } /* * Uploads the data of the given byte array for the given job to the BLOB server. * * @param jobId * the ID of the job the BLOB belongs to * @param value * the buffer to upload * @param blobType * whether to make the data permanent or transient * * @return the computed BLOB key identifying the BLOB on the server * * @throws IOException * thrown if an I/O error occurs while writing it to a local file, or uploading it to the HA * store / private BlobKey putBuffer(@Nullable JobID jobId, byte[] value, BlobKey.BlobType blobType) throws IOException { if (LOG.isDebugEnabled()) { LOG.debug(“Received PUT call for BLOB of job {}.”, jobId); } File incomingFile = createTemporaryFilename(); MessageDigest md = BlobUtils.createMessageDigest(); BlobKey blobKey = null; try (FileOutputStream fos = new FileOutputStream(incomingFile)) { md.update(value); fos.write(value); } catch (IOException ioe) { // delete incomingFile from a failed download if (!incomingFile.delete() && incomingFile.exists()) { LOG.warn(“Could not delete the staging file {} for job {}.”, incomingFile, jobId); } throw ioe; } try { // persist file blobKey = moveTempFileToStore(incomingFile, jobId, md.digest(), blobType); return blobKey; } finally { // delete incomingFile from a failed download if (!incomingFile.delete() && incomingFile.exists()) { LOG.warn(“Could not delete the staging file {} for blob key {} and job {}.”, incomingFile, blobKey, jobId); } } } /* * Uploads the data from the given input stream for the given job to the BLOB server. 
* * @param jobId * the ID of the job the BLOB belongs to * @param inputStream * the input stream to read the data from * @param blobType * whether to make the data permanent or transient * * @return the computed BLOB key identifying the BLOB on the server * * @throws IOException * thrown if an I/O error occurs while reading the data from the input stream, writing it to a * local file, or uploading it to the HA store / private BlobKey putInputStream( @Nullable JobID jobId, InputStream inputStream, BlobKey.BlobType blobType) throws IOException { if (LOG.isDebugEnabled()) { LOG.debug(“Received PUT call for BLOB of job {}.”, jobId); } File incomingFile = createTemporaryFilename(); MessageDigest md = BlobUtils.createMessageDigest(); BlobKey blobKey = null; try (FileOutputStream fos = new FileOutputStream(incomingFile)) { // read stream byte[] buf = new byte[BUFFER_SIZE]; while (true) { final int bytesRead = inputStream.read(buf); if (bytesRead == -1) { // done break; } fos.write(buf, 0, bytesRead); md.update(buf, 0, bytesRead); } // persist file blobKey = moveTempFileToStore(incomingFile, jobId, md.digest(), blobType); return blobKey; } finally { // delete incomingFile from a failed download if (!incomingFile.delete() && incomingFile.exists()) { LOG.warn(“Could not delete the staging file {} for blob key {} and job {}.”, incomingFile, blobKey, jobId); } } } /* * Moves the temporary <tt>incomingFile</tt> to its permanent location where it is available for * use. * * @param incomingFile * temporary file created during transfer * @param jobId * ID of the job this blob belongs to or <tt>null</tt> if job-unrelated * @param digest * BLOB content digest, i.e. hash * @param blobType * whether this file is a permanent or transient BLOB * * @return unique BLOB key that identifies the BLOB on the server * * @throws IOException * thrown if an I/O error occurs while moving the file or uploading it to the HA store / BlobKey moveTempFileToStore( File incomingFile, @Nullable JobID jobId, byte[] digest, BlobKey.BlobType blobType) throws IOException { int retries = 10; int attempt = 0; while (true) { // add unique component independent of the BLOB content BlobKey blobKey = BlobKey.createKey(blobType, digest); File storageFile = BlobUtils.getStorageLocation(storageDir, jobId, blobKey); // try again until the key is unique (put the existence check into the lock!) readWriteLock.writeLock().lock(); try { if (!storageFile.exists()) { BlobUtils.moveTempFileToStore( incomingFile, jobId, blobKey, storageFile, LOG, blobKey instanceof PermanentBlobKey ? blobStore : null); // add TTL for transient BLOBs: if (blobKey instanceof TransientBlobKey) { // must be inside read or write lock to add a TTL blobExpiryTimes .put(Tuple2.of(jobId, (TransientBlobKey) blobKey), System.currentTimeMillis() + cleanupInterval); } return blobKey; } } finally { readWriteLock.writeLock().unlock(); } ++attempt; if (attempt >= retries) { String message = “Failed to find a unique key for BLOB of job " + jobId + " (last tried " + storageFile.getAbsolutePath() + “.”; LOG.error(message + " No retries left.”); throw new IOException(message); } else { if (LOG.isDebugEnabled()) { LOG.debug(“Trying to find a unique key for BLOB of job {} (retry {}, last tried {})”, jobId, attempt, storageFile.getAbsolutePath()); } } } } /* * Returns a temporary file inside the BLOB server’s incoming directory. 
* * @return a temporary file inside the BLOB server’s incoming directory * * @throws IOException * if creating the directory fails / File createTemporaryFilename() throws IOException { return new File(BlobUtils.getIncomingDirectory(storageDir), String.format(“temp-%08d”, tempFileCounter.getAndIncrement())); } //……}BlobServer实现了BlobWriter接口,putPermanent方法分别用到了putBuffer及putInputStream方法,而getMinOffloadingSize方法则从blobServiceConfiguration获取BlobServerOptions.OFFLOAD_MINSIZE配置,默认是1MputBuffer方法接收byte[]参数,它先把byte[]写入到临时文件,之后调用moveTempFileToStore方法进行持久化;putInputStream方法接收InputStream参数,它也是先把InputStream写入到临时文件,然后调用moveTempFileToStore方法进行持久化moveTempFileToStore方法调用了BlobUtils.moveTempFileToStore将本地临时文件转移到permanent location;其中storageDir由BlobUtils.initLocalStorageDirectory(config)来初始化,而storageFile通过BlobUtils.getStorageLocation(storageDir, jobId, blobKey)来获取BlobUtilsflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/blob/BlobUtils.java/* * Utility class to work with blob data. /public class BlobUtils { //…… /* * Creates a local storage directory for a blob service under the configuration parameter given * by {@link BlobServerOptions#STORAGE_DIRECTORY}. If this is <tt>null</tt> or empty, we will * fall back to Flink’s temp directories (given by * {@link org.apache.flink.configuration.CoreOptions#TMP_DIRS}) and choose one among them at * random. * * @param config * Flink configuration * * @return a new local storage directory * * @throws IOException * thrown if the local file storage cannot be created or is not usable / static File initLocalStorageDirectory(Configuration config) throws IOException { String basePath = config.getString(BlobServerOptions.STORAGE_DIRECTORY); File baseDir; if (StringUtils.isNullOrWhitespaceOnly(basePath)) { final String[] tmpDirPaths = ConfigurationUtils.parseTempDirectories(config); baseDir = new File(tmpDirPaths[RANDOM.nextInt(tmpDirPaths.length)]); } else { baseDir = new File(basePath); } File storageDir; // NOTE: although we will be using UUIDs, there may be collisions int maxAttempts = 10; for (int attempt = 0; attempt < maxAttempts; attempt++) { storageDir = new File(baseDir, String.format( “blobStore-%s”, UUID.randomUUID().toString())); // Create the storage dir if it doesn’t exist. Only return it when the operation was // successful. if (storageDir.mkdirs()) { return storageDir; } } // max attempts exceeded to find a storage directory throw new IOException(“Could not create storage directory for BLOB store in ‘” + baseDir + “’.”); } /* * Returns the (designated) physical storage location of the BLOB with the given key. * * @param storageDir * storage directory used be the BLOB service * @param key * the key identifying the BLOB * @param jobId * ID of the job for the incoming files (or <tt>null</tt> if job-unrelated) * * @return the (designated) physical storage location of the BLOB * * @throws IOException * if creating the directory fails / static File getStorageLocation( File storageDir, @Nullable JobID jobId, BlobKey key) throws IOException { File file = new File(getStorageLocationPath(storageDir.getAbsolutePath(), jobId, key)); Files.createDirectories(file.getParentFile().toPath()); return file; } /* * Returns the path for the given blob key. * * <p>The returned path can be used with the (local or HA) BLOB store file system back-end for * recovery purposes and follows the same scheme as {@link #getStorageLocation(File, JobID, * BlobKey)}. 
* * @param storageDir * storage directory used be the BLOB service * @param key * the key identifying the BLOB * @param jobId * ID of the job for the incoming files * * @return the path to the given BLOB / static String getStorageLocationPath( String storageDir, @Nullable JobID jobId, BlobKey key) { if (jobId == null) { // format: $base/no_job/blob_$key return String.format("%s/%s/%s%s", storageDir, NO_JOB_DIR_PREFIX, BLOB_FILE_PREFIX, key.toString()); } else { // format: $base/job_$jobId/blob_$key return String.format("%s/%s%s/%s%s", storageDir, JOB_DIR_PREFIX, jobId.toString(), BLOB_FILE_PREFIX, key.toString()); } } /* * Moves the temporary <tt>incomingFile</tt> to its permanent location where it is available for * use (not thread-safe!). * * @param incomingFile * temporary file created during transfer * @param jobId * ID of the job this blob belongs to or <tt>null</tt> if job-unrelated * @param blobKey * BLOB key identifying the file * @param storageFile * (local) file where the blob is/should be stored * @param log * logger for debug information * @param blobStore * HA store (or <tt>null</tt> if unavailable) * * @throws IOException * thrown if an I/O error occurs while moving the file or uploading it to the HA store */ static void moveTempFileToStore( File incomingFile, @Nullable JobID jobId, BlobKey blobKey, File storageFile, Logger log, @Nullable BlobStore blobStore) throws IOException { try { // first check whether the file already exists if (!storageFile.exists()) { try { // only move the file if it does not yet exist Files.move(incomingFile.toPath(), storageFile.toPath()); incomingFile = null; } catch (FileAlreadyExistsException ignored) { log.warn(“Detected concurrent file modifications. This should only happen if multiple” + “BlobServer use the same storage directory.”); // we cannot be sure at this point whether the file has already been uploaded to the blob // store or not. Even if the blobStore might shortly be in an inconsistent state, we have // to persist the blob. Otherwise we might not be able to recover the job. } if (blobStore != null) { // only the one moving the incoming file to its final destination is allowed to upload the // file to the blob store blobStore.put(storageFile, jobId, blobKey); } } else { log.warn(“File upload for an existing file with key {} for job {}. This may indicate a duplicate upload or a hash collision. 
Ignoring newest upload.”, blobKey, jobId); } storageFile = null; } finally { // we failed to either create the local storage file or to upload it –> try to delete the local file // while still having the write lock if (storageFile != null && !storageFile.delete() && storageFile.exists()) { log.warn(“Could not delete the storage file {}.”, storageFile); } if (incomingFile != null && !incomingFile.delete() && incomingFile.exists()) { log.warn(“Could not delete the staging file {} for blob key {} and job {}.”, incomingFile, blobKey, jobId); } } } //……}initLocalStorageDirectory方法从配置文件读取BlobServerOptions.STORAGE_DIRECTORY配置(blob.storage.directory),如果没有配置,则通过ConfigurationUtils.parseTempDirectories来获取tmpDirPaths,然后随机选一个作为baseDir,而storageDir目录则是baseDir的子目录,其目录名前缀为blobStoregetStorageLocation方法则在storageDir的基础上根据JobID及BlobKey构造具体的存储路径,其格式为$base/no_job/blob_$key或者$base/job_$jobId/blob_$keymoveTempFileToStore方法则在目标文件不存在的场景下使用Files.move将incomingFile转移到storageFile,如果blobStore不为null,还会将storageFile放入到BlobStore小结BlobWriter定义了putPermanent、getMinOffloadingSize方法,同时还提供了serializeAndTryOffload静态方法用于序列化指定value并在其大小超过minimum offloading size时调用blobWriter.putPermanent存放到BlobServerBlobServer实现了BlobWriter接口,putPermanent方法分别用到了putBuffer及putInputStream方法,而getMinOffloadingSize方法则从blobServiceConfiguration获取BlobServerOptions.OFFLOAD_MINSIZE配置,默认是1M;putBuffer方法接收byte[]参数,它先把byte[]写入到临时文件,之后调用moveTempFileToStore方法进行持久化;putInputStream方法接收InputStream参数,它也是先把InputStream写入到临时文件,然后调用moveTempFileToStore方法进行持久化;moveTempFileToStore方法调用了BlobUtils.moveTempFileToStore将本地临时文件转移到permanent location;其中storageDir由BlobUtils.initLocalStorageDirectory(config)来初始化,而storageFile通过BlobUtils.getStorageLocation(storageDir, jobId, blobKey)来获取BlobUtils的initLocalStorageDirectory方法从配置文件读取BlobServerOptions.STORAGE_DIRECTORY配置(blob.storage.directory),如果没有配置,则通过ConfigurationUtils.parseTempDirectories来获取tmpDirPaths,然后随机选一个作为baseDir,而storageDir目录则是baseDir的子目录,其目录名前缀为blobStore;getStorageLocation方法则在storageDir的基础上根据JobID及BlobKey构造具体的存储路径,其格式为$base/no_job/blob_$key或者$base/job_$jobId/blob_$key;moveTempFileToStore方法则在目标文件不存在的场景下使用Files.move将incomingFile转移到storageFile,如果blobStore不为null,还会将storageFile放入到BlobStoredocBlobWriter ...
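下面补充一个独立的小例子(非 Flink 源码,类名与示例取值均为笔者虚构),仅用于演示上文 getStorageLocationPath 所描述的两种存储路径格式($base/no_job/blob_$key 与 $base/job_$jobId/blob_$key),其中 no_job、job_、blob_ 这几个前缀取自源码注释里的格式说明:

import java.io.File;

// Standalone sketch (not Flink code): mirrors the path layout described by
// BlobUtils.getStorageLocationPath above. The prefix values below are taken from
// the format comments in the source ("$base/no_job/blob_$key", "$base/job_$jobId/blob_$key").
public class BlobPathSketch {

    private static final String NO_JOB_DIR_PREFIX = "no_job";
    private static final String JOB_DIR_PREFIX = "job_";
    private static final String BLOB_FILE_PREFIX = "blob_";

    // builds $base/no_job/blob_$key or $base/job_$jobId/blob_$key
    static String storageLocationPath(String storageDir, String jobId, String blobKey) {
        if (jobId == null) {
            return String.format("%s/%s/%s%s", storageDir, NO_JOB_DIR_PREFIX, BLOB_FILE_PREFIX, blobKey);
        } else {
            return String.format("%s/%s%s/%s%s", storageDir, JOB_DIR_PREFIX, jobId, BLOB_FILE_PREFIX, blobKey);
        }
    }

    public static void main(String[] args) {
        // hypothetical storageDir created the way initLocalStorageDirectory does (blobStore-<uuid>)
        String base = new File("/tmp/blobStore-550e8400").getPath();
        // job-unrelated BLOB
        System.out.println(storageLocationPath(base, null, "0a1b2c3d"));
        // job-related BLOB
        System.out.println(storageLocationPath(base, "5b7cdeadbeef", "0a1b2c3d"));
    }
}

可以看到目录结构只取决于是否携带 JobID,具体的 blob 文件名则由 BlobKey 的字符串形式决定。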

February 28, 2019

聊聊flink的BlobService

序本文主要研究一下flink的BlobServiceBlobServiceflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/blob/BlobService.java/** * A simple store and retrieve binary large objects (BLOBs). /public interface BlobService extends Closeable { /* * Returns a BLOB service for accessing permanent BLOBs. * * @return BLOB service / PermanentBlobService getPermanentBlobService(); /* * Returns a BLOB service for accessing transient BLOBs. * * @return BLOB service / TransientBlobService getTransientBlobService(); /* * Returns the port of the BLOB server that this BLOB service is working with. * * @return the port the blob server. / int getPort();}BlobService定义了getPermanentBlobService方法用于获取PermanentBlobService;getTransientBlobService方法用于获取TransientBlobServicePermanentBlobServiceflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/blob/PermanentBlobService.java/* * A service to retrieve permanent binary large objects (BLOBs). * * <p>These may include per-job BLOBs that are covered by high-availability (HA) mode, e.g. a job’s * JAR files or (parts of) an off-loaded {@link org.apache.flink.runtime.deployment.TaskDeploymentDescriptor} * or files in the {@link org.apache.flink.api.common.cache.DistributedCache}. /public interface PermanentBlobService extends Closeable { /* * Returns the path to a local copy of the file associated with the provided job ID and blob * key. * * @param jobId * ID of the job this blob belongs to * @param key * BLOB key associated with the requested file * * @return The path to the file. * * @throws java.io.FileNotFoundException * if the BLOB does not exist; * @throws IOException * if any other error occurs when retrieving the file / File getFile(JobID jobId, PermanentBlobKey key) throws IOException;}PermanentBlobService提供了getFile方法,它根据JobID及PermanentBlobKey来获取FileTransientBlobServiceflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/blob/TransientBlobService.java/* * A service to retrieve transient binary large objects (BLOBs) which are deleted on the * {@link BlobServer} when they are retrieved. * * <p>These may include per-job BLOBs like files in the {@link * org.apache.flink.api.common.cache.DistributedCache}, for example. * * <p>Note: None of these BLOBs is highly available (HA). This case is covered by BLOBs in the * {@link PermanentBlobService}. * * <p>TODO: change API to not rely on local files but return {@link InputStream} objects /public interface TransientBlobService extends Closeable { // ——————————————————————————————– // GET // ——————————————————————————————– /* * Returns the path to a local copy of the (job-unrelated) file associated with the provided * blob key. * * @param key * blob key associated with the requested file * * @return The path to the file. * * @throws java.io.FileNotFoundException * when the path does not exist; * @throws IOException * if any other error occurs when retrieving the file / File getFile(TransientBlobKey key) throws IOException; /* * Returns the path to a local copy of the file associated with the provided job ID and blob * key. * * @param jobId * ID of the job this blob belongs to * @param key * blob key associated with the requested file * * @return The path to the file. 
* * @throws java.io.FileNotFoundException * when the path does not exist; * @throws IOException * if any other error occurs when retrieving the file / File getFile(JobID jobId, TransientBlobKey key) throws IOException; // ——————————————————————————————– // PUT // ——————————————————————————————– /* * Uploads the (job-unrelated) data of the given byte array to the BLOB server. * * @param value * the buffer to upload * * @return the computed BLOB key identifying the BLOB on the server * * @throws IOException * thrown if an I/O error occurs while uploading the data to the BLOB server / TransientBlobKey putTransient(byte[] value) throws IOException; /* * Uploads the data of the given byte array for the given job to the BLOB server. * * @param jobId * the ID of the job the BLOB belongs to * @param value * the buffer to upload * * @return the computed BLOB key identifying the BLOB on the server * * @throws IOException * thrown if an I/O error occurs while uploading the data to the BLOB server / TransientBlobKey putTransient(JobID jobId, byte[] value) throws IOException; /* * Uploads the (job-unrelated) data from the given input stream to the BLOB server. * * @param inputStream * the input stream to read the data from * * @return the computed BLOB key identifying the BLOB on the server * * @throws IOException * thrown if an I/O error occurs while reading the data from the input stream or uploading the * data to the BLOB server / TransientBlobKey putTransient(InputStream inputStream) throws IOException; /* * Uploads the data from the given input stream for the given job to the BLOB server. * * @param jobId * ID of the job this blob belongs to * @param inputStream * the input stream to read the data from * * @return the computed BLOB key identifying the BLOB on the server * * @throws IOException * thrown if an I/O error occurs while reading the data from the input stream or uploading the * data to the BLOB server / TransientBlobKey putTransient(JobID jobId, InputStream inputStream) throws IOException; // ——————————————————————————————– // DELETE // ——————————————————————————————– /* * Deletes the (job-unrelated) file associated with the provided blob key from the local cache. * * @param key * associated with the file to be deleted * * @return <tt>true</tt> if the given blob is successfully deleted or non-existing; * <tt>false</tt> otherwise / boolean deleteFromCache(TransientBlobKey key); /* * Deletes the file associated with the provided job ID and blob key from the local cache. * * @param jobId * ID of the job this blob belongs to * @param key * associated with the file to be deleted * * @return <tt>true</tt> if the given blob is successfully deleted or non-existing; * <tt>false</tt> otherwise / boolean deleteFromCache(JobID jobId, TransientBlobKey key);}TransientBlobService用于获取transient binary large objects (BLOBs),这些blobs在获取时就会在BlobServer上删掉;它提供了getFile、putTransient、deleteFromCache方法BlobKeyflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/blob/BlobKey.java/* * A BLOB key uniquely identifies a BLOB. /public abstract class BlobKey implements Serializable, Comparable<BlobKey> { private static final long serialVersionUID = 3847117712521785209L; /* Size of the internal BLOB key in bytes. / public static final int SIZE = 20; /* The byte buffer storing the actual key data. / private final byte[] key; /* * (Internal) BLOB type - to be reflected by the inheriting sub-class. / private final BlobType type; /* * BLOB type, i.e. permanent or transient. 
/ enum BlobType { /* * Indicates a permanent BLOB whose lifecycle is that of a job and which is made highly * available. / PERMANENT_BLOB, /* * Indicates a transient BLOB whose lifecycle is managed by the user and which is not made * highly available. / TRANSIENT_BLOB } /* * Random component of the key. / private final AbstractID random; /* * Constructs a new BLOB key. * * @param type * whether the referenced BLOB is permanent or transient / protected BlobKey(BlobType type) { this.type = checkNotNull(type); this.key = new byte[SIZE]; this.random = new AbstractID(); } /* * Constructs a new BLOB key from the given byte array. * * @param type * whether the referenced BLOB is permanent or transient * @param key * the actual key data / protected BlobKey(BlobType type, byte[] key) { if (key == null || key.length != SIZE) { throw new IllegalArgumentException(“BLOB key must have a size of " + SIZE + " bytes”); } this.type = checkNotNull(type); this.key = key; this.random = new AbstractID(); } /* * Constructs a new BLOB key from the given byte array. * * @param type * whether the referenced BLOB is permanent or transient * @param key * the actual key data * @param random * the random component of the key / protected BlobKey(BlobType type, byte[] key, byte[] random) { if (key == null || key.length != SIZE) { throw new IllegalArgumentException(“BLOB key must have a size of " + SIZE + " bytes”); } this.type = checkNotNull(type); this.key = key; this.random = new AbstractID(random); } /* * Returns the right {@link BlobKey} subclass for the given parameters. * * @param type * whether the referenced BLOB is permanent or transient * * @return BlobKey subclass / @VisibleForTesting static BlobKey createKey(BlobType type) { if (type == PERMANENT_BLOB) { return new PermanentBlobKey(); } else { return new TransientBlobKey(); } } /* * Returns the right {@link BlobKey} subclass for the given parameters. * * @param type * whether the referenced BLOB is permanent or transient * @param key * the actual key data * * @return BlobKey subclass / static BlobKey createKey(BlobType type, byte[] key) { if (type == PERMANENT_BLOB) { return new PermanentBlobKey(key); } else { return new TransientBlobKey(key); } } /* * Returns the right {@link BlobKey} subclass for the given parameters. * * @param type * whether the referenced BLOB is permanent or transient * @param key * the actual key data * @param random * the random component of the key * * @return BlobKey subclass / static BlobKey createKey(BlobType type, byte[] key, byte[] random) { if (type == PERMANENT_BLOB) { return new PermanentBlobKey(key, random); } else { return new TransientBlobKey(key, random); } } /* * Returns the hash component of this key. * * @return a 20 bit hash of the contents the key refers to / @VisibleForTesting public byte[] getHash() { return key; } /* * Returns the (internal) BLOB type which is reflected by the inheriting sub-class. * * @return BLOB type, i.e. permanent or transient / BlobType getType() { return type; } /* * Adds the BLOB key to the given {@link MessageDigest}. 
* * @param md * the message digest to add the BLOB key to / public void addToMessageDigest(MessageDigest md) { md.update(this.key); } @Override public boolean equals(final Object obj) { if (!(obj instanceof BlobKey)) { return false; } final BlobKey bk = (BlobKey) obj; return Arrays.equals(this.key, bk.key) && this.type == bk.type && this.random.equals(bk.random); } @Override public int hashCode() { int result = Arrays.hashCode(this.key); result = 37 * result + this.type.hashCode(); result = 37 * result + this.random.hashCode(); return result; } @Override public String toString() { final String typeString; switch (this.type) { case TRANSIENT_BLOB: typeString = “t-”; break; case PERMANENT_BLOB: typeString = “p-”; break; default: // this actually never happens! throw new IllegalStateException(“Invalid BLOB type”); } return typeString + StringUtils.byteToHexString(this.key) + “-” + random.toString(); } @Override public int compareTo(BlobKey o) { // compare the hashes first final byte[] aarr = this.key; final byte[] barr = o.key; final int len = Math.min(aarr.length, barr.length); for (int i = 0; i < len; ++i) { final int a = (aarr[i] & 0xff); final int b = (barr[i] & 0xff); if (a != b) { return a - b; } } if (aarr.length == barr.length) { // same hash contents - compare the BLOB types int typeCompare = this.type.compareTo(o.type); if (typeCompare == 0) { // same type - compare random components return this.random.compareTo(o.random); } else { return typeCompare; } } else { return aarr.length - barr.length; } } // ——————————————————————————————– /* * Auxiliary method to read a BLOB key from an input stream. * * @param inputStream * the input stream to read the BLOB key from * @return the read BLOB key * @throws IOException * throw if an I/O error occurs while reading from the input stream / static BlobKey readFromInputStream(InputStream inputStream) throws IOException { final byte[] key = new byte[BlobKey.SIZE]; final byte[] random = new byte[AbstractID.SIZE]; int bytesRead = 0; // read key while (bytesRead < key.length) { final int read = inputStream.read(key, bytesRead, key.length - bytesRead); if (read < 0) { throw new EOFException(“Read an incomplete BLOB key”); } bytesRead += read; } // read BLOB type final BlobType blobType; { final int read = inputStream.read(); if (read < 0) { throw new EOFException(“Read an incomplete BLOB type”); } else if (read == TRANSIENT_BLOB.ordinal()) { blobType = TRANSIENT_BLOB; } else if (read == PERMANENT_BLOB.ordinal()) { blobType = PERMANENT_BLOB; } else { throw new IOException(“Invalid data received for the BLOB type: " + read); } } // read random component bytesRead = 0; while (bytesRead < AbstractID.SIZE) { final int read = inputStream.read(random, bytesRead, AbstractID.SIZE - bytesRead); if (read < 0) { throw new EOFException(“Read an incomplete BLOB key”); } bytesRead += read; } return createKey(blobType, key, random); } /* * Auxiliary method to write this BLOB key to an output stream. 
* * @param outputStream * the output stream to write the BLOB key to * @throws IOException * thrown if an I/O error occurs while writing the BLOB key / void writeToOutputStream(final OutputStream outputStream) throws IOException { outputStream.write(this.key); outputStream.write(this.type.ordinal()); outputStream.write(this.random.getBytes()); }}BlobKey是个抽象类,它有key、BlobType、AbstractID三个属性,其中BlobType分为PERMANENT_BLOB及TRANSIENT_BLOB;它定义了createKey静态方法,用于根据BlobType创建BlobKey;readFromInputStream方法用于从InputStream反序列化为BlobKey;writeToOutputStream方法用于将BlobKey序列化到OutputStream;它有两个子类,分别为PermanentBlobKey及TransientBlobKeyPermanentBlobKeyflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/blob/PermanentBlobKey.java/* * BLOB key referencing permanent BLOB files. /public final class PermanentBlobKey extends BlobKey { /* * Constructs a new BLOB key. / @VisibleForTesting public PermanentBlobKey() { super(BlobType.PERMANENT_BLOB); } /* * Constructs a new BLOB key from the given byte array. * * @param key * the actual key data / PermanentBlobKey(byte[] key) { super(BlobType.PERMANENT_BLOB, key); } /* * Constructs a new BLOB key from the given byte array. * * @param key * the actual key data * @param random * the random component of the key / PermanentBlobKey(byte[] key, byte[] random) { super(BlobType.PERMANENT_BLOB, key, random); }}PermanentBlobKey继承了BlobKey,它的BlobType为BlobType.PERMANENT_BLOBTransientBlobKeyflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/blob/TransientBlobKey.java/* * BLOB key referencing transient BLOB files. /public final class TransientBlobKey extends BlobKey { /* * Constructs a new BLOB key. / @VisibleForTesting public TransientBlobKey() { super(BlobType.TRANSIENT_BLOB); } /* * Constructs a new BLOB key from the given byte array. * * @param key * the actual key data / TransientBlobKey(byte[] key) { super(BlobType.TRANSIENT_BLOB, key); } /* * Constructs a new BLOB key from the given byte array. * * @param key * the actual key data * @param random * the random component of the key */ TransientBlobKey(byte[] key, byte[] random) { super(BlobType.TRANSIENT_BLOB, key, random); }}TransientBlobKey继承了BlobKey,它的BlobType为BlobType.TRANSIENT_BLOB小结BlobService定义了getPermanentBlobService方法用于获取PermanentBlobService;getTransientBlobService方法用于获取TransientBlobServicePermanentBlobService提供了getFile方法,它根据JobID及PermanentBlobKey来获取File;TransientBlobService用于获取transient binary large objects (BLOBs),这些blobs在获取时就会在BlobServer上删掉;它提供了getFile、putTransient、deleteFromCache方法BlobKey是个抽象类,它有key、BlobType、AbstractID三个属性,其中BlobType分为PERMANENT_BLOB及TRANSIENT_BLOB;它定义了createKey静态方法,用于根据BlobType创建BlobKey;readFromInputStream方法用于从InputStream反序列化为BlobKey;writeToOutputStream方法用于将BlobKey序列化到OutputStream;它有两个子类,分别为PermanentBlobKey及TransientBlobKey;PermanentBlobKey继承了BlobKey,它的BlobType为BlobType.PERMANENT_BLOB;TransientBlobKey继承了BlobKey,它的BlobType为BlobType.TRANSIENT_BLOBdocBlob Server ...
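为了更直观地理解 BlobKey 的序列化布局(writeToOutputStream 依次写入 20 字节的内容 hash、1 字节的 BlobType ordinal、以及 random 组件的字节),下面给出一个独立的示意例子(非 Flink 源码);其中把 random 组件(AbstractID)按 16 字节处理属于笔者的假设,上文源码并未直接给出该值:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;

// Standalone sketch (not Flink code) of the wire layout used by
// BlobKey.writeToOutputStream / readFromInputStream shown above:
// [20-byte hash][1-byte BlobType ordinal][random component bytes].
public class BlobKeyWireFormatSketch {

    static final int HASH_SIZE = 20;    // BlobKey.SIZE in the source
    static final int RANDOM_SIZE = 16;  // assumed size of AbstractID (not shown in the source above)

    public static void main(String[] args) throws IOException {
        byte[] hash = new byte[HASH_SIZE];      // normally the content digest of the BLOB
        byte[] random = new byte[RANDOM_SIZE];  // normally the random component of the key

        // write: hash, then type ordinal (0 = PERMANENT_BLOB, 1 = TRANSIENT_BLOB as declared), then random
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        out.write(hash);
        out.write(0);
        out.write(random);
        byte[] serialized = out.toByteArray();
        System.out.println("serialized BlobKey length = " + serialized.length); // 20 + 1 + 16 = 37

        // read back in the same order
        ByteArrayInputStream in = new ByteArrayInputStream(serialized);
        byte[] readHash = new byte[HASH_SIZE];
        int n = in.read(readHash, 0, HASH_SIZE);
        int typeOrdinal = in.read();
        byte[] readRandom = new byte[RANDOM_SIZE];
        int m = in.read(readRandom, 0, RANDOM_SIZE);
        System.out.println("read " + n + " hash bytes, type ordinal = " + typeOrdinal + ", " + m + " random bytes");
    }
}

readFromInputStream 正是按同样的顺序把这三段读回来,再通过 createKey 还原出 PermanentBlobKey 或 TransientBlobKey。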

February 27, 2019

聊聊flink的ConnectionManager

序本文主要研究一下flink的ConnectionManagerConnectionManagerflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/io/network/ConnectionManager.javapublic interface ConnectionManager { void start(ResultPartitionProvider partitionProvider, TaskEventDispatcher taskEventDispatcher) throws IOException; /** * Creates a {@link PartitionRequestClient} instance for the given {@link ConnectionID}. / PartitionRequestClient createPartitionRequestClient(ConnectionID connectionId) throws IOException, InterruptedException; /* * Closes opened ChannelConnections in case of a resource release. / void closeOpenChannelConnections(ConnectionID connectionId); int getNumberOfActiveConnections(); int getDataPort(); void shutdown() throws IOException;}ConnectionManager定义了start、shutdown、closeOpenChannelConnections等方法用于管理physical connections;它有两个子类,一个是LocalConnectionManager,一个是NettyConnectionManagerLocalConnectionManagerflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/io/network/LocalConnectionManager.javapublic class LocalConnectionManager implements ConnectionManager { @Override public void start(ResultPartitionProvider partitionProvider, TaskEventDispatcher taskEventDispatcher) { } @Override public PartitionRequestClient createPartitionRequestClient(ConnectionID connectionId) { return null; } @Override public void closeOpenChannelConnections(ConnectionID connectionId) {} @Override public int getNumberOfActiveConnections() { return 0; } @Override public int getDataPort() { return -1; } @Override public void shutdown() {}}LocalConnectionManager实现了ConnectionManager接口,不过它的实现基本是空操作NettyConnectionManagerflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/io/network/netty/NettyConnectionManager.javapublic class NettyConnectionManager implements ConnectionManager { private final NettyServer server; private final NettyClient client; private final NettyBufferPool bufferPool; private final PartitionRequestClientFactory partitionRequestClientFactory; public NettyConnectionManager(NettyConfig nettyConfig) { this.server = new NettyServer(nettyConfig); this.client = new NettyClient(nettyConfig); this.bufferPool = new NettyBufferPool(nettyConfig.getNumberOfArenas()); this.partitionRequestClientFactory = new PartitionRequestClientFactory(client); } @Override public void start(ResultPartitionProvider partitionProvider, TaskEventDispatcher taskEventDispatcher) throws IOException { NettyProtocol partitionRequestProtocol = new NettyProtocol( partitionProvider, taskEventDispatcher, client.getConfig().isCreditBasedEnabled()); client.init(partitionRequestProtocol, bufferPool); server.init(partitionRequestProtocol, bufferPool); } @Override public PartitionRequestClient createPartitionRequestClient(ConnectionID connectionId) throws IOException, InterruptedException { return partitionRequestClientFactory.createPartitionRequestClient(connectionId); } @Override public void closeOpenChannelConnections(ConnectionID connectionId) { partitionRequestClientFactory.closeOpenChannelConnections(connectionId); } @Override public int getNumberOfActiveConnections() { return partitionRequestClientFactory.getNumberOfActiveClients(); } @Override public int getDataPort() { if (server != null && server.getLocalAddress() != null) { return server.getLocalAddress().getPort(); } else { return -1; } } @Override public void shutdown() { client.shutdown(); server.shutdown(); } NettyClient getClient() { return client; } NettyServer getServer() { return server; } NettyBufferPool getBufferPool() { return bufferPool; 
}}NettyConnectionManager实现了ConnectionManager接口;它的构造器使用NettyConfig创建了NettyServer、NettyClient、NettyBufferPool,同时使用NettyClient创建了PartitionRequestClientFactorystart方法创建了NettyProtocol,同时初始化NettyClient、NettyServer;shutdown方法则关闭NettyClient、NettyServer;closeOpenChannelConnections则是使用partitionRequestClientFactory.closeOpenChannelConnections来关闭指定的connectionIdcreatePartitionRequestClient方法通过partitionRequestClientFactory.createPartitionRequestClient来创建PartitionRequestClientPartitionRequestClientFactoryflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/io/network/netty/PartitionRequestClientFactory.javaclass PartitionRequestClientFactory { private final NettyClient nettyClient; private final ConcurrentMap<ConnectionID, Object> clients = new ConcurrentHashMap<ConnectionID, Object>(); PartitionRequestClientFactory(NettyClient nettyClient) { this.nettyClient = nettyClient; } /* * Atomically establishes a TCP connection to the given remote address and * creates a {@link PartitionRequestClient} instance for this connection. / PartitionRequestClient createPartitionRequestClient(ConnectionID connectionId) throws IOException, InterruptedException { Object entry; PartitionRequestClient client = null; while (client == null) { entry = clients.get(connectionId); if (entry != null) { // Existing channel or connecting channel if (entry instanceof PartitionRequestClient) { client = (PartitionRequestClient) entry; } else { ConnectingChannel future = (ConnectingChannel) entry; client = future.waitForChannel(); clients.replace(connectionId, future, client); } } else { // No channel yet. Create one, but watch out for a race. // We create a “connecting future” and atomically add it to the map. // Only the thread that really added it establishes the channel. // The others need to wait on that original establisher’s future. ConnectingChannel connectingChannel = new ConnectingChannel(connectionId, this); Object old = clients.putIfAbsent(connectionId, connectingChannel); if (old == null) { nettyClient.connect(connectionId.getAddress()).addListener(connectingChannel); client = connectingChannel.waitForChannel(); clients.replace(connectionId, connectingChannel, client); } else if (old instanceof ConnectingChannel) { client = ((ConnectingChannel) old).waitForChannel(); clients.replace(connectionId, old, client); } else { client = (PartitionRequestClient) old; } } // Make sure to increment the reference count before handing a client // out to ensure correct bookkeeping for channel closing. if (!client.incrementReferenceCounter()) { destroyPartitionRequestClient(connectionId, client); client = null; } } return client; } public void closeOpenChannelConnections(ConnectionID connectionId) { Object entry = clients.get(connectionId); if (entry instanceof ConnectingChannel) { ConnectingChannel channel = (ConnectingChannel) entry; if (channel.dispose()) { clients.remove(connectionId, channel); } } } int getNumberOfActiveClients() { return clients.size(); } /* * Removes the client for the given {@link ConnectionID}. 
*/ void destroyPartitionRequestClient(ConnectionID connectionId, PartitionRequestClient client) { clients.remove(connectionId, client); } //……}PartitionRequestClientFactory的构造器需要一个NettyClient;它使用ConcurrentHashMap在内存维护了一个ConnectionID与PartitionRequestClient或ConnectingChannel的映射关系createPartitionRequestClient方法会先从ConcurrentHashMap查找是否有对应ConnectionID的PartitionRequestClient或ConnectingChannel,如果存在且是PartitionRequestClient实例则返回,如果存在且是ConnectingChannel实例则调用ConnectingChannel.waitForChannel等待PartitionRequestClient,然后替换对应ConnectionID在ConcurrentHashMap的值为PartitionRequestClient;如果ConcurrentHashMap没有对应ConnectionID的值,则会创建一个ConnectingChannel,然后放入到ConcurrentHashMap中,同时获取old object,如果old为null,则使用nettyClient.connect进行连接,然后获取PartitionRequestClient,之后替换ConcurrentHashMap中的值;如果old是ConnectingChannel则调用ConnectingChannel.waitForChannel等待PartitionRequestClient,然后替换ConcurrentHashMap中的值;在返回PartitionRequestClient之前会通过client.incrementReferenceCounter()来递增引用,如果递增不成功则调用destroyPartitionRequestClient,返回null,递增成功则返回PartitionRequestClientcloseOpenChannelConnections方法则判断,如果是ConnectingChannel,则调用ConnectingChannel.dispose,成功之后从ConcurrentHashMap中移除ConnectingChannelflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/io/network/netty/PartitionRequestClientFactory.java private static final class ConnectingChannel implements ChannelFutureListener { private final Object connectLock = new Object(); private final ConnectionID connectionId; private final PartitionRequestClientFactory clientFactory; private boolean disposeRequestClient = false; public ConnectingChannel(ConnectionID connectionId, PartitionRequestClientFactory clientFactory) { this.connectionId = connectionId; this.clientFactory = clientFactory; } private boolean dispose() { boolean result; synchronized (connectLock) { if (partitionRequestClient != null) { result = partitionRequestClient.disposeIfNotUsed(); } else { disposeRequestClient = true; result = true; } connectLock.notifyAll(); } return result; } private void handInChannel(Channel channel) { synchronized (connectLock) { try { NetworkClientHandler clientHandler = channel.pipeline().get(NetworkClientHandler.class); partitionRequestClient = new PartitionRequestClient( channel, clientHandler, connectionId, clientFactory); if (disposeRequestClient) { partitionRequestClient.disposeIfNotUsed(); } connectLock.notifyAll(); } catch (Throwable t) { notifyOfError(t); } } } private volatile PartitionRequestClient partitionRequestClient; private volatile Throwable error; private PartitionRequestClient waitForChannel() throws IOException, InterruptedException { synchronized (connectLock) { while (error == null && partitionRequestClient == null) { connectLock.wait(2000); } } if (error != null) { throw new IOException(“Connecting the channel failed: " + error.getMessage(), error); } return partitionRequestClient; } private void notifyOfError(Throwable error) { synchronized (connectLock) { this.error = error; connectLock.notifyAll(); } } @Override public void operationComplete(ChannelFuture future) throws Exception { if (future.isSuccess()) { handInChannel(future.channel()); } else if (future.cause() != null) { notifyOfError(new RemoteTransportException( “Connecting to remote task manager + ‘” + connectionId.getAddress() + “’ has failed. 
This might indicate that the remote task " + “manager has been lost.”, connectionId.getAddress(), future.cause())); } else { notifyOfError(new LocalTransportException( String.format( “Connecting to remote task manager ‘%s’ has been cancelled.”, connectionId.getAddress()), null)); } } }ConnectingChannel实现了netty的ChannelFutureListener接口,它的operationComplete方法在ChannelFuture是success的时候会调用handInChannel方法,该方法会创建PartitionRequestClient;waitForChannel方法则会等待partitionRequestClient创建成功然后返回小结ConnectionManager定义了start、shutdown、closeOpenChannelConnections等方法用于管理physical connections;它有两个子类,一个是LocalConnectionManager,一个是NettyConnectionManagerLocalConnectionManager实现了ConnectionManager接口,不过它的实现基本是空操作;NettyConnectionManager实现了ConnectionManager接口,它的构造器使用NettyConfig创建了NettyServer、NettyClient、NettyBufferPool,同时使用NettyClient创建了PartitionRequestClientFactory,start方法创建了NettyProtocol,同时初始化NettyClient、NettyServer,shutdown方法则关闭NettyClient、NettyServer,closeOpenChannelConnections则是使用partitionRequestClientFactory.closeOpenChannelConnections来关闭指定的connectionId,createPartitionRequestClient方法通过partitionRequestClientFactory.createPartitionRequestClient来创建PartitionRequestClientPartitionRequestClientFactory的构造器需要一个NettyClient;它使用ConcurrentHashMap在内存维护了一个ConnectionID与PartitionRequestClient或ConnectingChannel的映射关系;ConnectingChannel实现了netty的ChannelFutureListener接口,它的operationComplete方法在ChannelFuture是success的时候会调用handInChannel方法,该方法会创建PartitionRequestClient;waitForChannel方法则会等待partitionRequestClient创建成功然后返回docConnectionManager ...
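PartitionRequestClientFactory 中只有成功 putIfAbsent 的线程负责建立连接、其余线程等待同一个结果,这是一个常见的并发模式;下面用一个独立的示意例子(非 Flink 源码,用 CompletableFuture 代替 ConnectingChannel、用字符串代替 PartitionRequestClient)来演示这个思路:

import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

// Standalone sketch (not Flink code): demonstrates the "connect once per key,
// other threads wait on the same result" pattern used by PartitionRequestClientFactory.
public class ConnectOncePerKeySketch {

    private final ConcurrentMap<String, CompletableFuture<String>> clients = new ConcurrentHashMap<>();

    // returns the shared "client" for the given connection id, creating it at most once
    String getOrCreateClient(String connectionId) {
        CompletableFuture<String> newFuture = new CompletableFuture<>();
        CompletableFuture<String> existing = clients.putIfAbsent(connectionId, newFuture);
        if (existing == null) {
            // this thread won the race: it is the only one that "establishes the connection"
            newFuture.complete("client-for-" + connectionId);
            return newFuture.join();
        } else {
            // another thread is (or was) connecting: just wait for its result
            return existing.join();
        }
    }

    public static void main(String[] args) {
        ConnectOncePerKeySketch sketch = new ConnectOncePerKeySketch();
        System.out.println(sketch.getOrCreateClient("tm-1:6121"));
        System.out.println(sketch.getOrCreateClient("tm-1:6121")); // same shared result, no second "connect"
    }
}

真实实现还需要处理连接失败(notifyOfError)与引用计数(incrementReferenceCounter)等细节,这里为突出竞态的处理方式而省略。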

February 26, 2019

聊聊flink的NetworkBufferPool

序本文主要研究一下flink的NetworkBufferPoolBufferPoolFactoryflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/io/network/buffer/BufferPoolFactory.javapublic interface BufferPoolFactory { /** * Tries to create a buffer pool, which is guaranteed to provide at least the number of required * buffers. * * <p>The buffer pool is of dynamic size with at least <tt>numRequiredBuffers</tt> buffers. * * @param numRequiredBuffers * minimum number of network buffers in this pool * @param maxUsedBuffers * maximum number of network buffers this pool offers / BufferPool createBufferPool(int numRequiredBuffers, int maxUsedBuffers) throws IOException; /* * Tries to create a buffer pool with an optional owner, which is guaranteed to provide at least the * number of required buffers. * * <p>The buffer pool is of dynamic size with at least <tt>numRequiredBuffers</tt> buffers. * * @param numRequiredBuffers * minimum number of network buffers in this pool * @param maxUsedBuffers * maximum number of network buffers this pool offers * @param owner * the optional owner of this buffer pool to release memory when needed / BufferPool createBufferPool(int numRequiredBuffers, int maxUsedBuffers, Optional<BufferPoolOwner> owner) throws IOException; /* * Destroy callback for updating factory book keeping. / void destroyBufferPool(BufferPool bufferPool) throws IOException;}BufferPoolFactory定义了createBufferPool、destroyBufferPool方法;其中createBufferPool支持numRequiredBuffers、maxUsedBuffers、owner参数NetworkBufferPoolflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/io/network/buffer/NetworkBufferPool.javapublic class NetworkBufferPool implements BufferPoolFactory { //…… private static final Logger LOG = LoggerFactory.getLogger(NetworkBufferPool.class); private final int totalNumberOfMemorySegments; private final int memorySegmentSize; private final ArrayBlockingQueue<MemorySegment> availableMemorySegments; private volatile boolean isDestroyed; // —- Managed buffer pools ———————————————- private final Object factoryLock = new Object(); private final Set<LocalBufferPool> allBufferPools = new HashSet<>(); private int numTotalRequiredBuffers; /* * Allocates all {@link MemorySegment} instances managed by this pool. / public NetworkBufferPool(int numberOfSegmentsToAllocate, int segmentSize) { this.totalNumberOfMemorySegments = numberOfSegmentsToAllocate; this.memorySegmentSize = segmentSize; final long sizeInLong = (long) segmentSize; try { this.availableMemorySegments = new ArrayBlockingQueue<>(numberOfSegmentsToAllocate); } catch (OutOfMemoryError err) { throw new OutOfMemoryError(“Could not allocate buffer queue of length " + numberOfSegmentsToAllocate + " - " + err.getMessage()); } try { for (int i = 0; i < numberOfSegmentsToAllocate; i++) { availableMemorySegments.add(MemorySegmentFactory.allocateUnpooledOffHeapMemory(segmentSize, null)); } } catch (OutOfMemoryError err) { int allocated = availableMemorySegments.size(); // free some memory availableMemorySegments.clear(); long requiredMb = (sizeInLong * numberOfSegmentsToAllocate) >> 20; long allocatedMb = (sizeInLong * allocated) >> 20; long missingMb = requiredMb - allocatedMb; throw new OutOfMemoryError(“Could not allocate enough memory segments for NetworkBufferPool " + “(required (Mb): " + requiredMb + “, allocated (Mb): " + allocatedMb + “, missing (Mb): " + missingMb + “). 
Cause: " + err.getMessage()); } long allocatedMb = (sizeInLong * availableMemorySegments.size()) >> 20; LOG.info(“Allocated {} MB for network buffer pool (number of memory segments: {}, bytes per segment: {}).”, allocatedMb, availableMemorySegments.size(), segmentSize); } @Override public BufferPool createBufferPool(int numRequiredBuffers, int maxUsedBuffers) throws IOException { return createBufferPool(numRequiredBuffers, maxUsedBuffers, Optional.empty()); } @Override public BufferPool createBufferPool(int numRequiredBuffers, int maxUsedBuffers, Optional<BufferPoolOwner> owner) throws IOException { // It is necessary to use a separate lock from the one used for buffer // requests to ensure deadlock freedom for failure cases. synchronized (factoryLock) { if (isDestroyed) { throw new IllegalStateException(“Network buffer pool has already been destroyed.”); } // Ensure that the number of required buffers can be satisfied. // With dynamic memory management this should become obsolete. if (numTotalRequiredBuffers + numRequiredBuffers > totalNumberOfMemorySegments) { throw new IOException(String.format(“Insufficient number of network buffers: " + “required %d, but only %d available. The total number of network " + “buffers is currently set to %d of %d bytes each. You can increase this " + “number by setting the configuration keys ‘%s’, ‘%s’, and ‘%s’.”, numRequiredBuffers, totalNumberOfMemorySegments - numTotalRequiredBuffers, totalNumberOfMemorySegments, memorySegmentSize, TaskManagerOptions.NETWORK_BUFFERS_MEMORY_FRACTION.key(), TaskManagerOptions.NETWORK_BUFFERS_MEMORY_MIN.key(), TaskManagerOptions.NETWORK_BUFFERS_MEMORY_MAX.key())); } this.numTotalRequiredBuffers += numRequiredBuffers; // We are good to go, create a new buffer pool and redistribute // non-fixed size buffers. LocalBufferPool localBufferPool = new LocalBufferPool(this, numRequiredBuffers, maxUsedBuffers, owner); allBufferPools.add(localBufferPool); try { redistributeBuffers(); } catch (IOException e) { try { destroyBufferPool(localBufferPool); } catch (IOException inner) { e.addSuppressed(inner); } ExceptionUtils.rethrowIOException(e); } return localBufferPool; } } @Override public void destroyBufferPool(BufferPool bufferPool) throws IOException { if (!(bufferPool instanceof LocalBufferPool)) { throw new IllegalArgumentException(“bufferPool is no LocalBufferPool”); } synchronized (factoryLock) { if (allBufferPools.remove(bufferPool)) { numTotalRequiredBuffers -= bufferPool.getNumberOfRequiredMemorySegments(); redistributeBuffers(); } } } //……}NetworkBufferPool实现了BufferPoolFactory接口,它的构造器接收numberOfSegmentsToAllocate、segmentSize两个参数;构造器里头根据numberOfSegmentsToAllocate创建了availableMemorySegments这个ArrayBlockingQueue,然后通过MemorySegmentFactory.allocateUnpooledOffHeapMemory挨个创建MemorySegment添加到availableMemorySegmentscreateBufferPool方法创建的是LocalBufferPool(传递了自身NetworkBufferPool实例进去),然后添加到allBufferPools这个set中,同时增加numTotalRequiredBuffers;destroyBufferPool方法则从allBufferPools移除该bufferPool,同时减少numTotalRequiredBufferscreateBufferPool方法及destroyBufferPool方法会调用到redistributeBuffers方法,通过调用LocalBufferPool的setNumBuffers方法来调整buffer pool的大小LocalBufferPoolflink-release-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/io/network/buffer/LocalBufferPool.javaclass LocalBufferPool implements BufferPool { private static final Logger LOG = LoggerFactory.getLogger(LocalBufferPool.class); /* Global network buffer pool to get buffers from. / private final NetworkBufferPool networkBufferPool; /* The minimum number of required segments for this pool. 
/ private final int numberOfRequiredMemorySegments; /* * The currently available memory segments. These are segments, which have been requested from * the network buffer pool and are currently not handed out as Buffer instances. * * <p><strong>BEWARE:</strong> Take special care with the interactions between this lock and * locks acquired before entering this class vs. locks being acquired during calls to external * code inside this class, e.g. with * {@link org.apache.flink.runtime.io.network.partition.consumer.RemoteInputChannel#bufferQueue} * via the {@link #registeredListeners} callback. / private final ArrayDeque<MemorySegment> availableMemorySegments = new ArrayDeque<MemorySegment>(); /* * Buffer availability listeners, which need to be notified when a Buffer becomes available. * Listeners can only be registered at a time/state where no Buffer instance was available. / private final ArrayDeque<BufferListener> registeredListeners = new ArrayDeque<>(); /* Maximum number of network buffers to allocate. / private final int maxNumberOfMemorySegments; /* The current size of this pool. / private int currentPoolSize; /* * Number of all memory segments, which have been requested from the network buffer pool and are * somehow referenced through this pool (e.g. wrapped in Buffer instances or as available segments). / private int numberOfRequestedMemorySegments; private boolean isDestroyed; private final Optional<BufferPoolOwner> owner; /* * Local buffer pool based on the given <tt>networkBufferPool</tt> with a minimal number of * network buffers being available. * * @param networkBufferPool * global network buffer pool to get buffers from * @param numberOfRequiredMemorySegments * minimum number of network buffers / LocalBufferPool(NetworkBufferPool networkBufferPool, int numberOfRequiredMemorySegments) { this(networkBufferPool, numberOfRequiredMemorySegments, Integer.MAX_VALUE, Optional.empty()); } /* * Local buffer pool based on the given <tt>networkBufferPool</tt> with a minimal and maximal * number of network buffers being available. * * @param networkBufferPool * global network buffer pool to get buffers from * @param numberOfRequiredMemorySegments * minimum number of network buffers * @param maxNumberOfMemorySegments * maximum number of network buffers to allocate / LocalBufferPool(NetworkBufferPool networkBufferPool, int numberOfRequiredMemorySegments, int maxNumberOfMemorySegments) { this(networkBufferPool, numberOfRequiredMemorySegments, maxNumberOfMemorySegments, Optional.empty()); } /* * Local buffer pool based on the given <tt>networkBufferPool</tt> and <tt>bufferPoolOwner</tt> * with a minimal and maximal number of network buffers being available. 
* * @param networkBufferPool * global network buffer pool to get buffers from * @param numberOfRequiredMemorySegments * minimum number of network buffers * @param maxNumberOfMemorySegments * maximum number of network buffers to allocate * @param owner * the optional owner of this buffer pool to release memory when needed */ LocalBufferPool( NetworkBufferPool networkBufferPool, int numberOfRequiredMemorySegments, int maxNumberOfMemorySegments, Optional<BufferPoolOwner> owner) { checkArgument(maxNumberOfMemorySegments >= numberOfRequiredMemorySegments, “Maximum number of memory segments (%s) should not be smaller than minimum (%s).”, maxNumberOfMemorySegments, numberOfRequiredMemorySegments); checkArgument(maxNumberOfMemorySegments > 0, “Maximum number of memory segments (%s) should be larger than 0.”, maxNumberOfMemorySegments); LOG.debug(“Using a local buffer pool with {}-{} buffers”, numberOfRequiredMemorySegments, maxNumberOfMemorySegments); this.networkBufferPool = networkBufferPool; this.numberOfRequiredMemorySegments = numberOfRequiredMemorySegments; this.currentPoolSize = numberOfRequiredMemorySegments; this.maxNumberOfMemorySegments = maxNumberOfMemorySegments; this.owner = owner; } //…… @Override public void recycle(MemorySegment segment) { BufferListener listener; NotificationResult notificationResult = NotificationResult.BUFFER_NOT_USED; while (!notificationResult.isBufferUsed()) { synchronized (availableMemorySegments) { if (isDestroyed || numberOfRequestedMemorySegments > currentPoolSize) { returnMemorySegment(segment); return; } else { listener = registeredListeners.poll(); if (listener == null) { availableMemorySegments.add(segment); availableMemorySegments.notify(); return; } } } notificationResult = fireBufferAvailableNotification(listener, segment); } } private MemorySegment requestMemorySegment(boolean isBlocking) throws InterruptedException, IOException { synchronized (availableMemorySegments) { returnExcessMemorySegments(); boolean askToRecycle = owner.isPresent(); // fill availableMemorySegments with at least one element, wait if required while (availableMemorySegments.isEmpty()) { if (isDestroyed) { throw new IllegalStateException(“Buffer pool is destroyed.”); } if (numberOfRequestedMemorySegments < currentPoolSize) { final MemorySegment segment = networkBufferPool.requestMemorySegment(); if (segment != null) { numberOfRequestedMemorySegments++; return segment; } } if (askToRecycle) { owner.get().releaseMemory(1); } if (isBlocking) { availableMemorySegments.wait(2000); } else { return null; } } return availableMemorySegments.poll(); } } //……}LocalBufferPool的构造器要求传入NetworkBufferPool,而其内部的requestMemorySegment方法,在availableMemorySegments为空且numberOfRequestedMemorySegments < currentPoolSize时,会调用networkBufferPool.requestMemorySegment()来申请MemorySegment;而recycle方法在numberOfRequestedMemorySegments > 
currentPoolSize时会归还MemorySegment到networkBufferPool,否则在BufferListener为null的时候会归还到availableMemorySegments小结BufferPoolFactory定义了createBufferPool、destroyBufferPool方法;其中createBufferPool支持numRequiredBuffers、maxUsedBuffers、owner参数;NetworkBufferPool实现了BufferPoolFactory接口,它的构造器接收numberOfSegmentsToAllocate、segmentSize两个参数;构造器里头根据numberOfSegmentsToAllocate创建了availableMemorySegments这个ArrayBlockingQueue,然后通过MemorySegmentFactory.allocateUnpooledOffHeapMemory挨个创建MemorySegment添加到availableMemorySegmentsNetworkBufferPool的createBufferPool方法创建的是LocalBufferPool(传递了自身NetworkBufferPool实例进去),然后添加到allBufferPools这个set中,同时增加numTotalRequiredBuffers;destroyBufferPool方法则从allBufferPools移除该bufferPool,同时减少numTotalRequiredBuffers;createBufferPool方法及destroyBufferPool方法会调用到redistributeBuffers方法,通过调用LocalBufferPool的setNumBuffers方法来调整buffer pool的大小LocalBufferPool的构造器要求传入NetworkBufferPool,而其内部的requestMemorySegment方法,在availableMemorySegments为空且numberOfRequestedMemorySegments < currentPoolSize时,会调用networkBufferPool.requestMemorySegment()来申请MemorySegment;而recycle方法在numberOfRequestedMemorySegments > currentPoolSize时会归还MemorySegment到networkBufferPool,否则在BufferListener为null的时候会归还到availableMemorySegmentsdocNetworkBufferPool ...
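基于上文贴出的构造器与方法签名(flink-runtime 1.7.2),NetworkBufferPool 与 LocalBufferPool 的典型用法大致如下(示意代码,只演示创建与销毁的调用方式,省略了真正申请和回收 buffer 的部分):

import java.io.IOException;

import org.apache.flink.runtime.io.network.buffer.BufferPool;
import org.apache.flink.runtime.io.network.buffer.NetworkBufferPool;

// Usage sketch based on the constructor/method signatures quoted above (flink-runtime 1.7.2).
public class NetworkBufferPoolUsageSketch {
    public static void main(String[] args) throws IOException {
        // allocate 2048 off-heap segments of 32KB each (64MB in total) up front
        NetworkBufferPool networkBufferPool = new NetworkBufferPool(2048, 32 * 1024);

        // derive a dynamically sized LocalBufferPool that guarantees at least 8 buffers
        BufferPool localBufferPool = networkBufferPool.createBufferPool(8, Integer.MAX_VALUE);

        // ... request/recycle buffers through localBufferPool ...

        // return the pool to the factory; as described above this triggers redistributeBuffers()
        networkBufferPool.destroyBufferPool(localBufferPool);
    }
}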

February 25, 2019

《从0到1学习Flink》—— Flink 读取 Kafka 数据批量写入到 MySQL

<!– more –>前言之前其实在 《从0到1学习Flink》—— 如何自定义 Data Sink ? 文章中其实已经写了点将数据写入到 MySQL,但是一些配置化的东西当时是写死的,不能够通用,最近知识星球里有朋友叫我: 写个从 kafka 中读取数据,经过 Flink 做个预聚合,然后创建数据库连接池将数据批量写入到 mysql 的例子。于是才有了这篇文章,更多提问和想要我写的文章可以在知识星球里像我提问,我会根据提问及时回答和尽可能作出文章的修改。准备你需要将这两个依赖添加到 pom.xml 中<dependency> <groupId>mysql</groupId> <artifactId>mysql-connector-java</artifactId> <version>5.1.34</version></dependency>读取 kafka 数据这里我依旧用的以前的 student 类,自己本地起了 kafka 然后造一些测试数据,这里我们测试发送一条数据则 sleep 10s,意味着往 kafka 中一分钟发 6 条数据。package com.zhisheng.connectors.mysql.utils;import com.zhisheng.common.utils.GsonUtil;import com.zhisheng.connectors.mysql.model.Student;import org.apache.kafka.clients.producer.KafkaProducer;import org.apache.kafka.clients.producer.ProducerRecord;import java.util.Properties;/** * Desc: 往kafka中写数据,可以使用这个main函数进行测试 * Created by zhisheng on 2019-02-17 * Blog: http://www.54tianzhisheng.cn/tags/Flink/ /public class KafkaUtil { public static final String broker_list = “localhost:9092”; public static final String topic = “student”; //kafka topic 需要和 flink 程序用同一个 topic public static void writeToKafka() throws InterruptedException { Properties props = new Properties(); props.put(“bootstrap.servers”, broker_list); props.put(“key.serializer”, “org.apache.kafka.common.serialization.StringSerializer”); props.put(“value.serializer”, “org.apache.kafka.common.serialization.StringSerializer”); KafkaProducer producer = new KafkaProducer<String, String>(props); for (int i = 1; i <= 100; i++) { Student student = new Student(i, “zhisheng” + i, “password” + i, 18 + i); ProducerRecord record = new ProducerRecord<String, String>(topic, null, null, GsonUtil.toJson(student)); producer.send(record); System.out.println(“发送数据: " + GsonUtil.toJson(student)); Thread.sleep(10 * 1000); //发送一条数据 sleep 10s,相当于 1 分钟 6 条 } producer.flush(); } public static void main(String[] args) throws InterruptedException { writeToKafka(); }}从 kafka 中读取数据,然后序列化成 student 对象。final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();Properties props = new Properties();props.put(“bootstrap.servers”, “localhost:9092”);props.put(“zookeeper.connect”, “localhost:2181”);props.put(“group.id”, “metric-group”);props.put(“key.deserializer”, “org.apache.kafka.common.serialization.StringDeserializer”);props.put(“value.deserializer”, “org.apache.kafka.common.serialization.StringDeserializer”);props.put(“auto.offset.reset”, “latest”);SingleOutputStreamOperator<Student> student = env.addSource(new FlinkKafkaConsumer011<>( “student”, //这个 kafka topic 需要和上面的工具类的 topic 一致 new SimpleStringSchema(), props)).setParallelism(1) .map(string -> GsonUtil.fromJson(string, Student.class)); //,解析字符串成 student 对象因为 RichSinkFunction 中如果 sink 一条数据到 mysql 中就会调用 invoke 方法一次,所以如果要实现批量写的话,我们最好在 sink 之前就把数据聚合一下。那这里我们开个一分钟的窗口去聚合 Student 数据。student.timeWindowAll(Time.minutes(1)).apply(new AllWindowFunction<Student, List<Student>, TimeWindow>() { @Override public void apply(TimeWindow window, Iterable<Student> values, Collector<List<Student>> out) throws Exception { ArrayList<Student> students = Lists.newArrayList(values); if (students.size() > 0) { System.out.println(“1 分钟内收集到 student 的数据条数是:” + students.size()); out.collect(students); } }});写入数据库这里使用 DBCP 连接池连接数据库 mysql,pom.xml 中添加依赖:<dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-dbcp2</artifactId> <version>2.1.1</version></dependency>如果你想使用其他的数据库连接池请加入对应的依赖。这里将数据写入到 MySQL 中,依旧是和之前文章一样继承 RichSinkFunction 类,重写里面的方法:package com.zhisheng.connectors.mysql.sinks;import 
com.zhisheng.connectors.mysql.model.Student;import org.apache.commons.dbcp2.BasicDataSource;import org.apache.flink.configuration.Configuration;import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;import javax.sql.DataSource;import java.sql.Connection;import java.sql.DriverManager;import java.sql.PreparedStatement;import java.util.List;/* * Desc: 数据批量 sink 数据到 mysql * Created by zhisheng_tian on 2019-02-17 * Blog: http://www.54tianzhisheng.cn/tags/Flink/ /public class SinkToMySQL extends RichSinkFunction<List<Student>> { PreparedStatement ps; BasicDataSource dataSource; private Connection connection; /* * open() 方法中建立连接,这样不用每次 invoke 的时候都要建立连接和释放连接 * * @param parameters * @throws Exception / @Override public void open(Configuration parameters) throws Exception { super.open(parameters); dataSource = new BasicDataSource(); connection = getConnection(dataSource); String sql = “insert into Student(id, name, password, age) values(?, ?, ?, ?);”; ps = this.connection.prepareStatement(sql); } @Override public void close() throws Exception { super.close(); //关闭连接和释放资源 if (connection != null) { connection.close(); } if (ps != null) { ps.close(); } } /* * 每条数据的插入都要调用一次 invoke() 方法 * * @param value * @param context * @throws Exception */ @Override public void invoke(List<Student> value, Context context) throws Exception { //遍历数据集合 for (Student student : value) { ps.setInt(1, student.getId()); ps.setString(2, student.getName()); ps.setString(3, student.getPassword()); ps.setInt(4, student.getAge()); ps.addBatch(); } int[] count = ps.executeBatch();//批量后执行 System.out.println(“成功了插入了” + count.length + “行数据”); } private static Connection getConnection(BasicDataSource dataSource) { dataSource.setDriverClassName(“com.mysql.jdbc.Driver”); //注意,替换成自己本地的 mysql 数据库地址和用户名、密码 dataSource.setUrl(“jdbc:mysql://localhost:3306/test”); dataSource.setUsername(“root”); dataSource.setPassword(“root123456”); //设置连接池的一些参数 dataSource.setInitialSize(10); dataSource.setMaxTotal(50); dataSource.setMinIdle(2); Connection con = null; try { con = dataSource.getConnection(); System.out.println(“创建连接池:” + con); } catch (Exception e) { System.out.println(”———–mysql get connection has exception , msg = " + e.getMessage()); } return con; }}核心类 Main核心程序如下:public class Main { public static void main(String[] args) throws Exception{ final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); Properties props = new Properties(); props.put(“bootstrap.servers”, “localhost:9092”); props.put(“zookeeper.connect”, “localhost:2181”); props.put(“group.id”, “metric-group”); props.put(“key.deserializer”, “org.apache.kafka.common.serialization.StringDeserializer”); props.put(“value.deserializer”, “org.apache.kafka.common.serialization.StringDeserializer”); props.put(“auto.offset.reset”, “latest”); SingleOutputStreamOperator<Student> student = env.addSource(new FlinkKafkaConsumer011<>( “student”, //这个 kafka topic 需要和上面的工具类的 topic 一致 new SimpleStringSchema(), props)).setParallelism(1) .map(string -> GsonUtil.fromJson(string, Student.class)); // student.timeWindowAll(Time.minutes(1)).apply(new AllWindowFunction<Student, List<Student>, TimeWindow>() { @Override public void apply(TimeWindow window, Iterable<Student> values, Collector<List<Student>> out) throws Exception { ArrayList<Student> students = Lists.newArrayList(values); if (students.size() > 0) { System.out.println(“1 分钟内收集到 student 的数据条数是:” + students.size()); out.collect(students); } } }).addSink(new SinkToMySQL()); env.execute(“flink learning 
connectors kafka”); }}运行项目运行 Main 类后再运行 KafkaUtils.java 类!下图是往 Kafka 中发送的数据:下图是运行 Main 类的日志,会创建 4 个连接池是因为默认的 4 个并行度,你如果在 addSink 这个算子设置并行度为 1 的话就会创建一个连接池:下图是批量插入数据库的结果:总结本文从知识星球一位朋友的疑问来写的,应该都满足了他的条件(批量/数据库连接池/写入mysql),的确网上很多的例子都是简单的 demo 形式,都是单条数据就创建数据库连接插入 MySQL,如果要写的数据量很大的话,会对 MySQL 的写有很大的压力。这也是我之前在 《从0到1学习Flink》—— Flink 写入数据到 ElasticSearch 中,数据写 ES 强调过的,如果要提高性能必定要批量的写。就拿我们现在这篇文章来说,如果数据量大的话,聚合一分钟数据达万条,那么这样批量写会比来一条写一条性能提高不知道有多少。本文原创地址是: http://www.54tianzhisheng.cn/2019/01/15/Flink-MySQL-sink/ , 未经允许禁止转载。关注我微信公众号:zhisheng另外我自己整理了些 Flink 的学习资料,目前已经全部放到微信公众号了。你可以加我的微信:zhisheng_tian,然后回复关键字:Flink 即可无条件获取到。更多私密资料请加入知识星球!Github 代码仓库https://github.com/zhisheng17/flink-learning/以后这个项目的所有代码都将放在这个仓库里,包含了自己学习 flink 的一些 demo 和博客。本文的项目代码在 https://github.com/zhisheng17/flink-learning/tree/master/flink-learning-connectors/flink-learning-connectors-mysql相关文章1、《从0到1学习Flink》—— Apache Flink 介绍2、《从0到1学习Flink》—— Mac 上搭建 Flink 1.6.0 环境并构建运行简单程序入门3、《从0到1学习Flink》—— Flink 配置文件详解4、《从0到1学习Flink》—— Data Source 介绍5、《从0到1学习Flink》—— 如何自定义 Data Source ?6、《从0到1学习Flink》—— Data Sink 介绍7、《从0到1学习Flink》—— 如何自定义 Data Sink ?8、《从0到1学习Flink》—— Flink Data transformation(转换)9、《从0到1学习Flink》—— 介绍Flink中的Stream Windows10、《从0到1学习Flink》—— Flink 中的几种 Time 详解11、《从0到1学习Flink》—— Flink 写入数据到 ElasticSearch12、《从0到1学习Flink》—— Flink 项目如何运行?13、《从0到1学习Flink》—— Flink 写入数据到 Kafka14、《从0到1学习Flink》—— Flink JobManager 高可用性配置15、《从0到1学习Flink》—— Flink parallelism 和 Slot 介绍16、《从0到1学习Flink》—— Flink 读取 Kafka 数据批量写入到 MySQL ...

February 24, 2019 · 4 min · jiezi

聊聊flink的NetworkEnvironmentConfiguration

序本文主要研究一下flink的NetworkEnvironmentConfigurationNetworkEnvironmentConfigurationflink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/taskmanager/NetworkEnvironmentConfiguration.javapublic class NetworkEnvironmentConfiguration { private final float networkBufFraction; private final long networkBufMin; private final long networkBufMax; private final int networkBufferSize; private final IOMode ioMode; private final int partitionRequestInitialBackoff; private final int partitionRequestMaxBackoff; private final int networkBuffersPerChannel; private final int floatingNetworkBuffersPerGate; private final NettyConfig nettyConfig; /** * Constructor for a setup with purely local communication (no netty). / public NetworkEnvironmentConfiguration( float networkBufFraction, long networkBufMin, long networkBufMax, int networkBufferSize, IOMode ioMode, int partitionRequestInitialBackoff, int partitionRequestMaxBackoff, int networkBuffersPerChannel, int floatingNetworkBuffersPerGate) { this(networkBufFraction, networkBufMin, networkBufMax, networkBufferSize, ioMode, partitionRequestInitialBackoff, partitionRequestMaxBackoff, networkBuffersPerChannel, floatingNetworkBuffersPerGate, null); } public NetworkEnvironmentConfiguration( float networkBufFraction, long networkBufMin, long networkBufMax, int networkBufferSize, IOMode ioMode, int partitionRequestInitialBackoff, int partitionRequestMaxBackoff, int networkBuffersPerChannel, int floatingNetworkBuffersPerGate, @Nullable NettyConfig nettyConfig) { this.networkBufFraction = networkBufFraction; this.networkBufMin = networkBufMin; this.networkBufMax = networkBufMax; this.networkBufferSize = networkBufferSize; this.ioMode = ioMode; this.partitionRequestInitialBackoff = partitionRequestInitialBackoff; this.partitionRequestMaxBackoff = partitionRequestMaxBackoff; this.networkBuffersPerChannel = networkBuffersPerChannel; this.floatingNetworkBuffersPerGate = floatingNetworkBuffersPerGate; this.nettyConfig = nettyConfig; } // ———————————————————————— public float networkBufFraction() { return networkBufFraction; } public long networkBufMin() { return networkBufMin; } public long networkBufMax() { return networkBufMax; } public int networkBufferSize() { return networkBufferSize; } public IOMode ioMode() { return ioMode; } public int partitionRequestInitialBackoff() { return partitionRequestInitialBackoff; } public int partitionRequestMaxBackoff() { return partitionRequestMaxBackoff; } public int networkBuffersPerChannel() { return networkBuffersPerChannel; } public int floatingNetworkBuffersPerGate() { return floatingNetworkBuffersPerGate; } public NettyConfig nettyConfig() { return nettyConfig; } // ———————————————————————— @Override public int hashCode() { int result = 1; result = 31 * result + networkBufferSize; result = 31 * result + ioMode.hashCode(); result = 31 * result + partitionRequestInitialBackoff; result = 31 * result + partitionRequestMaxBackoff; result = 31 * result + networkBuffersPerChannel; result = 31 * result + floatingNetworkBuffersPerGate; result = 31 * result + (nettyConfig != null ? 
nettyConfig.hashCode() : 0); return result; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } else if (obj == null || getClass() != obj.getClass()) { return false; } else { final NetworkEnvironmentConfiguration that = (NetworkEnvironmentConfiguration) obj; return this.networkBufFraction == that.networkBufFraction && this.networkBufMin == that.networkBufMin && this.networkBufMax == that.networkBufMax && this.networkBufferSize == that.networkBufferSize && this.partitionRequestInitialBackoff == that.partitionRequestInitialBackoff && this.partitionRequestMaxBackoff == that.partitionRequestMaxBackoff && this.networkBuffersPerChannel == that.networkBuffersPerChannel && this.floatingNetworkBuffersPerGate == that.floatingNetworkBuffersPerGate && this.ioMode == that.ioMode && (nettyConfig != null ? nettyConfig.equals(that.nettyConfig) : that.nettyConfig == null); } } @Override public String toString() { return “NetworkEnvironmentConfiguration{” + “networkBufFraction=” + networkBufFraction + “, networkBufMin=” + networkBufMin + “, networkBufMax=” + networkBufMax + “, networkBufferSize=” + networkBufferSize + “, ioMode=” + ioMode + “, partitionRequestInitialBackoff=” + partitionRequestInitialBackoff + “, partitionRequestMaxBackoff=” + partitionRequestMaxBackoff + “, networkBuffersPerChannel=” + networkBuffersPerChannel + “, floatingNetworkBuffersPerGate=” + floatingNetworkBuffersPerGate + “, nettyConfig=” + nettyConfig + ‘}’; }}NetworkEnvironmentConfiguration主要是flink network的相关配置,里头有networkBufFraction、networkBufMin、networkBufMax、networkBufferSize、ioMode、partitionRequestInitialBackoff、partitionRequestMaxBackoff、networkBuffersPerChannel、floatingNetworkBuffersPerGate、nettyConfig属性TaskManagerServicesConfigurationflink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/taskexecutor/TaskManagerServicesConfiguration.javapublic class TaskManagerServicesConfiguration { //…… /* * Creates the {@link NetworkEnvironmentConfiguration} from the given {@link Configuration}. 
* * @param configuration to create the network environment configuration from * @param localTaskManagerCommunication true if task manager communication is local * @param taskManagerAddress address of the task manager * @param slots to start the task manager with * @return Network environment configuration / @SuppressWarnings(“deprecation”) private static NetworkEnvironmentConfiguration parseNetworkEnvironmentConfiguration( Configuration configuration, boolean localTaskManagerCommunication, InetAddress taskManagerAddress, int slots) throws Exception { // —-> hosts / ports for communication and data exchange int dataport = configuration.getInteger(TaskManagerOptions.DATA_PORT); checkConfigParameter(dataport >= 0, dataport, TaskManagerOptions.DATA_PORT.key(), “Leave config parameter empty or use 0 to let the system choose a port automatically.”); checkConfigParameter(slots >= 1, slots, TaskManagerOptions.NUM_TASK_SLOTS.key(), “Number of task slots must be at least one.”); final int pageSize = checkedDownCast(MemorySize.parse(configuration.getString(TaskManagerOptions.MEMORY_SEGMENT_SIZE)).getBytes()); // check page size of for minimum size checkConfigParameter(pageSize >= MemoryManager.MIN_PAGE_SIZE, pageSize, TaskManagerOptions.MEMORY_SEGMENT_SIZE.key(), “Minimum memory segment size is " + MemoryManager.MIN_PAGE_SIZE); // check page size for power of two checkConfigParameter(MathUtils.isPowerOf2(pageSize), pageSize, TaskManagerOptions.MEMORY_SEGMENT_SIZE.key(), “Memory segment size must be a power of 2.”); // network buffer memory fraction float networkBufFraction = configuration.getFloat(TaskManagerOptions.NETWORK_BUFFERS_MEMORY_FRACTION); long networkBufMin = MemorySize.parse(configuration.getString(TaskManagerOptions.NETWORK_BUFFERS_MEMORY_MIN)).getBytes(); long networkBufMax = MemorySize.parse(configuration.getString(TaskManagerOptions.NETWORK_BUFFERS_MEMORY_MAX)).getBytes(); checkNetworkBufferConfig(pageSize, networkBufFraction, networkBufMin, networkBufMax); // fallback: number of network buffers final int numNetworkBuffers = configuration.getInteger(TaskManagerOptions.NETWORK_NUM_BUFFERS); checkNetworkConfigOld(numNetworkBuffers); if (!hasNewNetworkBufConf(configuration)) { // map old config to new one: networkBufMin = networkBufMax = ((long) numNetworkBuffers) * pageSize; } else { if (configuration.contains(TaskManagerOptions.NETWORK_NUM_BUFFERS)) { LOG.info(“Ignoring old (but still present) network buffer configuration via {}.”, TaskManagerOptions.NETWORK_NUM_BUFFERS.key()); } } final NettyConfig nettyConfig; if (!localTaskManagerCommunication) { final InetSocketAddress taskManagerInetSocketAddress = new InetSocketAddress(taskManagerAddress, dataport); nettyConfig = new NettyConfig(taskManagerInetSocketAddress.getAddress(), taskManagerInetSocketAddress.getPort(), pageSize, slots, configuration); } else { nettyConfig = null; } // Default spill I/O mode for intermediate results final String syncOrAsync = configuration.getString( ConfigConstants.TASK_MANAGER_NETWORK_DEFAULT_IO_MODE, ConfigConstants.DEFAULT_TASK_MANAGER_NETWORK_DEFAULT_IO_MODE); final IOManager.IOMode ioMode; if (syncOrAsync.equals(“async”)) { ioMode = IOManager.IOMode.ASYNC; } else { ioMode = IOManager.IOMode.SYNC; } int initialRequestBackoff = configuration.getInteger( TaskManagerOptions.NETWORK_REQUEST_BACKOFF_INITIAL); int maxRequestBackoff = configuration.getInteger( TaskManagerOptions.NETWORK_REQUEST_BACKOFF_MAX); int buffersPerChannel = configuration.getInteger( TaskManagerOptions.NETWORK_BUFFERS_PER_CHANNEL); 
int extraBuffersPerGate = configuration.getInteger( TaskManagerOptions.NETWORK_EXTRA_BUFFERS_PER_GATE); return new NetworkEnvironmentConfiguration( networkBufFraction, networkBufMin, networkBufMax, pageSize, ioMode, initialRequestBackoff, maxRequestBackoff, buffersPerChannel, extraBuffersPerGate, nettyConfig); } //……}TaskManagerServicesConfiguration有个私有方法parseNetworkEnvironmentConfiguration,用于创建NetworkEnvironmentConfiguration;它会读取TaskManagerOptions.MEMORY_SEGMENT_SIZE、TaskManagerOptions.NETWORK_BUFFERS_MEMORY_FRACTION、TaskManagerOptions.NETWORK_BUFFERS_MEMORY_MIN、TaskManagerOptions.NETWORK_BUFFERS_MEMORY_MAX、TaskManagerOptions.NETWORK_NUM_BUFFERS、ConfigConstants.TASK_MANAGER_NETWORK_DEFAULT_IO_MODE、TaskManagerOptions.NETWORK_REQUEST_BACKOFF_INITIAL、TaskManagerOptions.NETWORK_REQUEST_BACKOFF_MAX、TaskManagerOptions.NETWORK_BUFFERS_PER_CHANNEL、TaskManagerOptions.NETWORK_EXTRA_BUFFERS_PER_GATE等配置TaskManagerOptionsflink-1.7.2/flink-core/src/main/java/org/apache/flink/configuration/TaskManagerOptions.java@PublicEvolvingpublic class TaskManagerOptions { //…… /* * Size of memory buffers used by the network stack and the memory manager. / public static final ConfigOption<String> MEMORY_SEGMENT_SIZE = key(“taskmanager.memory.segment-size”) .defaultValue(“32kb”) .withDescription(“Size of memory buffers used by the network stack and the memory manager.”); /* * Fraction of JVM memory to use for network buffers. / public static final ConfigOption<Float> NETWORK_BUFFERS_MEMORY_FRACTION = key(“taskmanager.network.memory.fraction”) .defaultValue(0.1f) .withDescription(“Fraction of JVM memory to use for network buffers. This determines how many streaming” + " data exchange channels a TaskManager can have at the same time and how well buffered the channels” + " are. If a job is rejected or you get a warning that the system has not enough buffers available," + " increase this value or the min/max values below. Also note, that "taskmanager.network.memory.min"" + “` and "taskmanager.network.memory.max" may override this fraction.”); /* * Minimum memory size for network buffers. / public static final ConfigOption<String> NETWORK_BUFFERS_MEMORY_MIN = key(“taskmanager.network.memory.min”) .defaultValue(“64mb”) .withDescription(“Minimum memory size for network buffers.”); /* * Maximum memory size for network buffers. / public static final ConfigOption<String> NETWORK_BUFFERS_MEMORY_MAX = key(“taskmanager.network.memory.max”) .defaultValue(“1gb”) .withDescription(“Maximum memory size for network buffers.”); /* * Number of buffers used in the network stack. This defines the number of possible tasks and * shuffles. * * @deprecated use {@link #NETWORK_BUFFERS_MEMORY_FRACTION}, {@link #NETWORK_BUFFERS_MEMORY_MIN}, * and {@link #NETWORK_BUFFERS_MEMORY_MAX} instead / @Deprecated public static final ConfigOption<Integer> NETWORK_NUM_BUFFERS = key(“taskmanager.network.numberOfBuffers”) .defaultValue(2048); /* * Minimum backoff for partition requests of input channels. / public static final ConfigOption<Integer> NETWORK_REQUEST_BACKOFF_INITIAL = key(“taskmanager.network.request-backoff.initial”) .defaultValue(100) .withDeprecatedKeys(“taskmanager.net.request-backoff.initial”) .withDescription(“Minimum backoff in milliseconds for partition requests of input channels.”); /* * Maximum backoff for partition requests of input channels. 
/ public static final ConfigOption<Integer> NETWORK_REQUEST_BACKOFF_MAX = key(“taskmanager.network.request-backoff.max”) .defaultValue(10000) .withDeprecatedKeys(“taskmanager.net.request-backoff.max”) .withDescription(“Maximum backoff in milliseconds for partition requests of input channels.”); /* * Number of network buffers to use for each outgoing/incoming channel (subpartition/input channel). * * <p>Reasoning: 1 buffer for in-flight data in the subpartition + 1 buffer for parallel serialization. / public static final ConfigOption<Integer> NETWORK_BUFFERS_PER_CHANNEL = key(“taskmanager.network.memory.buffers-per-channel”) .defaultValue(2) .withDescription(“Maximum number of network buffers to use for each outgoing/incoming channel (subpartition/input channel).” + “In credit-based flow control mode, this indicates how many credits are exclusive in each input channel. It should be” + " configured at least 2 for good performance. 1 buffer is for receiving in-flight data in the subpartition and 1 buffer is" + " for parallel serialization."); /* * Number of extra network buffers to use for each outgoing/incoming gate (result partition/input gate). */ public static final ConfigOption<Integer> NETWORK_EXTRA_BUFFERS_PER_GATE = key(“taskmanager.network.memory.floating-buffers-per-gate”) .defaultValue(8) .withDescription(“Number of extra network buffers to use for each outgoing/incoming gate (result partition/input gate).” + " In credit-based flow control mode, this indicates how many floating credits are shared among all the input channels." + " The floating buffers are distributed based on backlog (real-time output buffers in the subpartition) feedback, and can" + " help relieve back-pressure caused by unbalanced data distribution among the subpartitions. 
This value should be" + " increased in case of higher round trip times between nodes and/or larger number of machines in the cluster."); //……}taskmanager.memory.segment-size指定memory segment的大小,默认为32kb;taskmanager.network.memory.fraction指定network buffers使用的memory的比例,默认为0.1;taskmanager.network.memory.min指定network buffers使用的最小内存,默认为64mb;taskmanager.network.memory.max指定network buffers使用的最大内存,默认为1gb;taskmanager.network.numberOfBuffers指定network使用的buffers数量,默认为2048,该配置已经被废弃,使用taskmanager.network.memory.fraction、taskmanager.network.memory.min、taskmanager.network.memory.max这几个配置来替代taskmanager.network.request-backoff.initial指定input channels的partition requests的最小backoff时间(毫秒),默认为100;taskmanager.network.request-backoff.max指定input channels的partition requests的最大backoff时间(毫秒),默认为10000taskmanager.network.memory.buffers-per-channel指定每个outgoing/incoming channel使用buffers数量,默认为2;taskmanager.network.memory.floating-buffers-per-gate指定每个outgoing/incoming gate使用buffers数量,默认为8NettyConfigflink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/io/network/netty/NettyConfig.javapublic class NettyConfig { private static final Logger LOG = LoggerFactory.getLogger(NettyConfig.class); // - Config keys ———————————————————- public static final ConfigOption<Integer> NUM_ARENAS = ConfigOptions .key(“taskmanager.network.netty.num-arenas”) .defaultValue(-1) .withDeprecatedKeys(“taskmanager.net.num-arenas”) .withDescription(“The number of Netty arenas.”); public static final ConfigOption<Integer> NUM_THREADS_SERVER = ConfigOptions .key(“taskmanager.network.netty.server.numThreads”) .defaultValue(-1) .withDeprecatedKeys(“taskmanager.net.server.numThreads”) .withDescription(“The number of Netty server threads.”); public static final ConfigOption<Integer> NUM_THREADS_CLIENT = ConfigOptions .key(“taskmanager.network.netty.client.numThreads”) .defaultValue(-1) .withDeprecatedKeys(“taskmanager.net.client.numThreads”) .withDescription(“The number of Netty client threads.”); public static final ConfigOption<Integer> CONNECT_BACKLOG = ConfigOptions .key(“taskmanager.network.netty.server.backlog”) .defaultValue(0) // default: 0 => Netty’s default .withDeprecatedKeys(“taskmanager.net.server.backlog”) .withDescription(“The netty server connection backlog.”); public static final ConfigOption<Integer> CLIENT_CONNECT_TIMEOUT_SECONDS = ConfigOptions .key(“taskmanager.network.netty.client.connectTimeoutSec”) .defaultValue(120) // default: 120s = 2min .withDeprecatedKeys(“taskmanager.net.client.connectTimeoutSec”) .withDescription(“The Netty client connection timeout.”); public static final ConfigOption<Integer> SEND_RECEIVE_BUFFER_SIZE = ConfigOptions .key(“taskmanager.network.netty.sendReceiveBufferSize”) .defaultValue(0) // default: 0 => Netty’s default .withDeprecatedKeys(“taskmanager.net.sendReceiveBufferSize”) .withDescription(“The Netty send and receive buffer size. 
This defaults to the system buffer size” + " (cat /proc/sys/net/ipv4/tcp_[rw]mem) and is 4 MiB in modern Linux."); public static final ConfigOption<String> TRANSPORT_TYPE = ConfigOptions .key(“taskmanager.network.netty.transport”) .defaultValue(“nio”) .withDeprecatedKeys(“taskmanager.net.transport”) .withDescription(“The Netty transport type, either "nio" or "epoll"”); // ———————————————————————— enum TransportType { NIO, EPOLL, AUTO } static final String SERVER_THREAD_GROUP_NAME = “Flink Netty Server”; static final String CLIENT_THREAD_GROUP_NAME = “Flink Netty Client”; private final InetAddress serverAddress; private final int serverPort; private final int memorySegmentSize; private final int numberOfSlots; private final Configuration config; // optional configuration public NettyConfig( InetAddress serverAddress, int serverPort, int memorySegmentSize, int numberOfSlots, Configuration config) { this.serverAddress = checkNotNull(serverAddress); checkArgument(serverPort >= 0 && serverPort <= 65536, “Invalid port number.”); this.serverPort = serverPort; checkArgument(memorySegmentSize > 0, “Invalid memory segment size.”); this.memorySegmentSize = memorySegmentSize; checkArgument(numberOfSlots > 0, “Number of slots”); this.numberOfSlots = numberOfSlots; this.config = checkNotNull(config); LOG.info(this.toString()); } InetAddress getServerAddress() { return serverAddress; } int getServerPort() { return serverPort; } int getMemorySegmentSize() { return memorySegmentSize; } public int getNumberOfSlots() { return numberOfSlots; } // ———————————————————————— // Getters // ———————————————————————— public int getServerConnectBacklog() { return config.getInteger(CONNECT_BACKLOG); } public int getNumberOfArenas() { // default: number of slots final int configValue = config.getInteger(NUM_ARENAS); return configValue == -1 ? numberOfSlots : configValue; } public int getServerNumThreads() { // default: number of task slots final int configValue = config.getInteger(NUM_THREADS_SERVER); return configValue == -1 ? numberOfSlots : configValue; } public int getClientNumThreads() { // default: number of task slots final int configValue = config.getInteger(NUM_THREADS_CLIENT); return configValue == -1 ? numberOfSlots : configValue; } public int getClientConnectTimeoutSeconds() { return config.getInteger(CLIENT_CONNECT_TIMEOUT_SECONDS); } public int getSendAndReceiveBufferSize() { return config.getInteger(SEND_RECEIVE_BUFFER_SIZE); } public TransportType getTransportType() { String transport = config.getString(TRANSPORT_TYPE); switch (transport) { case “nio”: return TransportType.NIO; case “epoll”: return TransportType.EPOLL; default: return TransportType.AUTO; } } @Nullable public SSLHandlerFactory createClientSSLEngineFactory() throws Exception { return getSSLEnabled() ? SSLUtils.createInternalClientSSLEngineFactory(config) : null; } @Nullable public SSLHandlerFactory createServerSSLEngineFactory() throws Exception { return getSSLEnabled() ? 
SSLUtils.createInternalServerSSLEngineFactory(config) : null; } public boolean getSSLEnabled() { return config.getBoolean(TaskManagerOptions.DATA_SSL_ENABLED) && SSLUtils.isInternalSSLEnabled(config); } public boolean isCreditBasedEnabled() { return config.getBoolean(TaskManagerOptions.NETWORK_CREDIT_MODEL); } public Configuration getConfig() { return config; } @Override public String toString() { String format = “NettyConfig [” + “server address: %s, " + “server port: %d, " + “ssl enabled: %s, " + “memory segment size (bytes): %d, " + “transport type: %s, " + “number of server threads: %d (%s), " + “number of client threads: %d (%s), " + “server connect backlog: %d (%s), " + “client connect timeout (sec): %d, " + “send/receive buffer size (bytes): %d (%s)]”; String def = “use Netty’s default”; String man = “manual”; return String.format(format, serverAddress, serverPort, getSSLEnabled() ? “true” : “false”, memorySegmentSize, getTransportType(), getServerNumThreads(), getServerNumThreads() == 0 ? def : man, getClientNumThreads(), getClientNumThreads() == 0 ? def : man, getServerConnectBacklog(), getServerConnectBacklog() == 0 ? def : man, getClientConnectTimeoutSeconds(), getSendAndReceiveBufferSize(), getSendAndReceiveBufferSize() == 0 ? def : man); }}NettyConfig的构造器接收serverAddress、serverPort、memorySegmentSize、numberOfSlots、config这几个参数;它还提供了getServerConnectBacklog、getNumberOfArenas、getServerNumThreads、getClientNumThreads、getClientConnectTimeoutSeconds、getSendAndReceiveBufferSize、getTransportType等方法用于从config读取配置taskmanager.network.netty.server.backlog用于指定netty server的connection backlog,默认值为0即使用netty默认的配置;taskmanager.network.netty.client.connectTimeoutSec指定netty client的connection timeout,默认为120(单位秒);taskmanager.network.netty.sendReceiveBufferSize指定netty send/receive buffer大小,默认为0即使用netty的默认配置,默认是使用system buffer size,即/proc/sys/net/ipv4/tcp_[rw]mem的配置;taskmanager.network.netty.transport指定的是netty transport的类型,默认是niotaskmanager.network.netty.num-arenas指定的是netty arenas的数量,默认为-1;taskmanager.network.netty.server.numThreads指定的是netty server的threads数量,默认为-1;taskmanager.network.netty.client.numThreads指定的是netty client的threads数量,默认为-1;这几个配置当配置值为-1的时候,对应get方法返回的是numberOfSlots值小结NetworkEnvironmentConfiguration主要是flink network的相关配置,里头有networkBufFraction、networkBufMin、networkBufMax、networkBufferSize、ioMode、partitionRequestInitialBackoff、partitionRequestMaxBackoff、networkBuffersPerChannel、floatingNetworkBuffersPerGate、nettyConfig属性TaskManagerServicesConfiguration有个私有方法parseNetworkEnvironmentConfiguration,用于创建NetworkEnvironmentConfiguration;它会读取TaskManagerOptions.MEMORY_SEGMENT_SIZE、TaskManagerOptions.NETWORK_BUFFERS_MEMORY_FRACTION、TaskManagerOptions.NETWORK_BUFFERS_MEMORY_MIN、TaskManagerOptions.NETWORK_BUFFERS_MEMORY_MAX、TaskManagerOptions.NETWORK_NUM_BUFFERS、ConfigConstants.TASK_MANAGER_NETWORK_DEFAULT_IO_MODE、TaskManagerOptions.NETWORK_REQUEST_BACKOFF_INITIAL、TaskManagerOptions.NETWORK_REQUEST_BACKOFF_MAX、TaskManagerOptions.NETWORK_BUFFERS_PER_CHANNEL、TaskManagerOptions.NETWORK_EXTRA_BUFFERS_PER_GATE等配置NettyConfig的构造器接收serverAddress、serverPort、memorySegmentSize、numberOfSlots、config这几个参数;它还提供了getServerConnectBacklog、getNumberOfArenas、getServerNumThreads、getClientNumThreads、getClientConnectTimeoutSeconds、getSendAndReceiveBufferSize、getTransportType等方法用于从config读取配置doctaskmanager-network-memory-fraction ...
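As a recap of the options that parseNetworkEnvironmentConfiguration reads, the sketch below (not Flink code; the class name and values are illustrative) builds a Configuration with the same keys set to the defaults quoted above, and mirrors the legacy fallback from the source: when only the deprecated taskmanager.network.numberOfBuffers is present, networkBufMin and networkBufMax both collapse to numberOfBuffers * pageSize.

import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.MemorySize;

public class NetworkConfigSketch {
    public static void main(String[] args) {
        // The keys parsed into NetworkEnvironmentConfiguration, set here to their documented defaults.
        Configuration conf = new Configuration();
        conf.setString("taskmanager.memory.segment-size", "32kb");
        conf.setFloat("taskmanager.network.memory.fraction", 0.1f);
        conf.setString("taskmanager.network.memory.min", "64mb");
        conf.setString("taskmanager.network.memory.max", "1gb");
        conf.setInteger("taskmanager.network.memory.buffers-per-channel", 2);
        conf.setInteger("taskmanager.network.memory.floating-buffers-per-gate", 8);
        conf.setInteger("taskmanager.network.request-backoff.initial", 100);
        conf.setInteger("taskmanager.network.request-backoff.max", 10000);

        // Legacy mapping mirrored from parseNetworkEnvironmentConfiguration: if only the
        // deprecated taskmanager.network.numberOfBuffers (default 2048) is configured,
        // min and max are both set to numberOfBuffers * pageSize.
        int numNetworkBuffers = 2048;
        long pageSize = MemorySize.parse(conf.getString("taskmanager.memory.segment-size", "32kb")).getBytes();
        long networkBufMin = (long) numNetworkBuffers * pageSize;
        long networkBufMax = networkBufMin;
        System.out.println("legacy mapping -> min = max = " + networkBufMin + " bytes"); // 67108864 bytes = 64 MiB
    }
}

The same keys go into flink-conf.yaml verbatim; the programmatic form is used here only so the legacy mapping can be shown as runnable code.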

February 24, 2019 · 9 min · jiezi

A look at flink taskmanager's jvm-exit-on-oom configuration

序本文主要研究一下flink taskmanager的jvm-exit-on-oom配置taskmanager.jvm-exit-on-oomflink-1.7.2/flink-core/src/main/java/org/apache/flink/configuration/TaskManagerOptions.java@PublicEvolvingpublic class TaskManagerOptions { //…… /** * Whether to kill the TaskManager when the task thread throws an OutOfMemoryError. / public static final ConfigOption<Boolean> KILL_ON_OUT_OF_MEMORY = key(“taskmanager.jvm-exit-on-oom”) .defaultValue(false) .withDescription(“Whether to kill the TaskManager when the task thread throws an OutOfMemoryError.”); //……}taskmanager.jvm-exit-on-oom配置默认为false,用于指定当task线程抛出OutOfMemoryError的时候,是否需要kill掉TaskManagerTaskManagerConfigurationflink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/taskexecutor/TaskManagerConfiguration.javapublic class TaskManagerConfiguration implements TaskManagerRuntimeInfo { private static final Logger LOG = LoggerFactory.getLogger(TaskManagerConfiguration.class); private final int numberSlots; private final String[] tmpDirectories; private final Time timeout; // null indicates an infinite duration @Nullable private final Time maxRegistrationDuration; private final Time initialRegistrationPause; private final Time maxRegistrationPause; private final Time refusedRegistrationPause; private final UnmodifiableConfiguration configuration; private final boolean exitJvmOnOutOfMemory; private final FlinkUserCodeClassLoaders.ResolveOrder classLoaderResolveOrder; private final String[] alwaysParentFirstLoaderPatterns; @Nullable private final String taskManagerLogPath; @Nullable private final String taskManagerStdoutPath; public TaskManagerConfiguration( int numberSlots, String[] tmpDirectories, Time timeout, @Nullable Time maxRegistrationDuration, Time initialRegistrationPause, Time maxRegistrationPause, Time refusedRegistrationPause, Configuration configuration, boolean exitJvmOnOutOfMemory, FlinkUserCodeClassLoaders.ResolveOrder classLoaderResolveOrder, String[] alwaysParentFirstLoaderPatterns, @Nullable String taskManagerLogPath, @Nullable String taskManagerStdoutPath) { this.numberSlots = numberSlots; this.tmpDirectories = Preconditions.checkNotNull(tmpDirectories); this.timeout = Preconditions.checkNotNull(timeout); this.maxRegistrationDuration = maxRegistrationDuration; this.initialRegistrationPause = Preconditions.checkNotNull(initialRegistrationPause); this.maxRegistrationPause = Preconditions.checkNotNull(maxRegistrationPause); this.refusedRegistrationPause = Preconditions.checkNotNull(refusedRegistrationPause); this.configuration = new UnmodifiableConfiguration(Preconditions.checkNotNull(configuration)); this.exitJvmOnOutOfMemory = exitJvmOnOutOfMemory; this.classLoaderResolveOrder = classLoaderResolveOrder; this.alwaysParentFirstLoaderPatterns = alwaysParentFirstLoaderPatterns; this.taskManagerLogPath = taskManagerLogPath; this.taskManagerStdoutPath = taskManagerStdoutPath; } public int getNumberSlots() { return numberSlots; } public Time getTimeout() { return timeout; } @Nullable public Time getMaxRegistrationDuration() { return maxRegistrationDuration; } public Time getInitialRegistrationPause() { return initialRegistrationPause; } @Nullable public Time getMaxRegistrationPause() { return maxRegistrationPause; } public Time getRefusedRegistrationPause() { return refusedRegistrationPause; } @Override public Configuration getConfiguration() { return configuration; } @Override public String[] getTmpDirectories() { return tmpDirectories; } @Override public boolean shouldExitJvmOnOutOfMemoryError() { return exitJvmOnOutOfMemory; } public 
FlinkUserCodeClassLoaders.ResolveOrder getClassLoaderResolveOrder() { return classLoaderResolveOrder; } public String[] getAlwaysParentFirstLoaderPatterns() { return alwaysParentFirstLoaderPatterns; } @Nullable public String getTaskManagerLogPath() { return taskManagerLogPath; } @Nullable public String getTaskManagerStdoutPath() { return taskManagerStdoutPath; } // ——————————————————————————————– // Static factory methods // ——————————————————————————————– public static TaskManagerConfiguration fromConfiguration(Configuration configuration) { int numberSlots = configuration.getInteger(TaskManagerOptions.NUM_TASK_SLOTS, 1); if (numberSlots == -1) { numberSlots = 1; } final String[] tmpDirPaths = ConfigurationUtils.parseTempDirectories(configuration); final Time timeout; try { timeout = Time.milliseconds(AkkaUtils.getTimeout(configuration).toMillis()); } catch (Exception e) { throw new IllegalArgumentException( “Invalid format for ‘” + AkkaOptions.ASK_TIMEOUT.key() + “’.Use formats like ‘50 s’ or ‘1 min’ to specify the timeout.”); } LOG.info(“Messages have a max timeout of " + timeout); final Time finiteRegistrationDuration; try { Duration maxRegistrationDuration = Duration.create(configuration.getString(TaskManagerOptions.REGISTRATION_TIMEOUT)); if (maxRegistrationDuration.isFinite()) { finiteRegistrationDuration = Time.milliseconds(maxRegistrationDuration.toMillis()); } else { finiteRegistrationDuration = null; } } catch (NumberFormatException e) { throw new IllegalArgumentException(“Invalid format for parameter " + TaskManagerOptions.REGISTRATION_TIMEOUT.key(), e); } final Time initialRegistrationPause; try { Duration pause = Duration.create(configuration.getString(TaskManagerOptions.INITIAL_REGISTRATION_BACKOFF)); if (pause.isFinite()) { initialRegistrationPause = Time.milliseconds(pause.toMillis()); } else { throw new IllegalArgumentException(“The initial registration pause must be finite: " + pause); } } catch (NumberFormatException e) { throw new IllegalArgumentException(“Invalid format for parameter " + TaskManagerOptions.INITIAL_REGISTRATION_BACKOFF.key(), e); } final Time maxRegistrationPause; try { Duration pause = Duration.create(configuration.getString( TaskManagerOptions.REGISTRATION_MAX_BACKOFF)); if (pause.isFinite()) { maxRegistrationPause = Time.milliseconds(pause.toMillis()); } else { throw new IllegalArgumentException(“The maximum registration pause must be finite: " + pause); } } catch (NumberFormatException e) { throw new IllegalArgumentException(“Invalid format for parameter " + TaskManagerOptions.INITIAL_REGISTRATION_BACKOFF.key(), e); } final Time refusedRegistrationPause; try { Duration pause = Duration.create(configuration.getString(TaskManagerOptions.REFUSED_REGISTRATION_BACKOFF)); if (pause.isFinite()) { refusedRegistrationPause = Time.milliseconds(pause.toMillis()); } else { throw new IllegalArgumentException(“The refused registration pause must be finite: " + pause); } } catch (NumberFormatException e) { throw new IllegalArgumentException(“Invalid format for parameter " + TaskManagerOptions.INITIAL_REGISTRATION_BACKOFF.key(), e); } final boolean exitOnOom = configuration.getBoolean(TaskManagerOptions.KILL_ON_OUT_OF_MEMORY); final String classLoaderResolveOrder = configuration.getString(CoreOptions.CLASSLOADER_RESOLVE_ORDER); final String[] alwaysParentFirstLoaderPatterns = CoreOptions.getParentFirstLoaderPatterns(configuration); final String taskManagerLogPath = configuration.getString(ConfigConstants.TASK_MANAGER_LOG_PATH_KEY, System.getProperty(“log.file”)); 
final String taskManagerStdoutPath; if (taskManagerLogPath != null) { final int extension = taskManagerLogPath.lastIndexOf(’.’); if (extension > 0) { taskManagerStdoutPath = taskManagerLogPath.substring(0, extension) + “.out”; } else { taskManagerStdoutPath = null; } } else { taskManagerStdoutPath = null; } return new TaskManagerConfiguration( numberSlots, tmpDirPaths, timeout, finiteRegistrationDuration, initialRegistrationPause, maxRegistrationPause, refusedRegistrationPause, configuration, exitOnOom, FlinkUserCodeClassLoaders.ResolveOrder.fromString(classLoaderResolveOrder), alwaysParentFirstLoaderPatterns, taskManagerLogPath, taskManagerStdoutPath); }}TaskManagerConfiguration的静态方法fromConfiguration通过configuration.getBoolean(TaskManagerOptions.KILL_ON_OUT_OF_MEMORY)读取exitOnOom,然后传到构造器中的exitJvmOnOutOfMemory属性;同时提供了shouldExitJvmOnOutOfMemoryError方法来读取exitJvmOnOutOfMemory属性Taskflink-1.7.2/flink-runtime/src/main/java/org/apache/flink/runtime/taskmanager/Task.javapublic class Task implements Runnable, TaskActions, CheckpointListener { //…… @Override public void run() { // —————————- // Initial State transition // —————————- //…… // all resource acquisitions and registrations from here on // need to be undone in the end Map<String, Future<Path>> distributedCacheEntries = new HashMap<>(); AbstractInvokable invokable = null; try { //…… // —————————————————————- // call the user code initialization methods // —————————————————————- TaskKvStateRegistry kvStateRegistry = network.createKvStateTaskRegistry(jobId, getJobVertexId()); Environment env = new RuntimeEnvironment( jobId, vertexId, executionId, executionConfig, taskInfo, jobConfiguration, taskConfiguration, userCodeClassLoader, memoryManager, ioManager, broadcastVariableManager, taskStateManager, accumulatorRegistry, kvStateRegistry, inputSplitProvider, distributedCacheEntries, producedPartitions, inputGates, network.getTaskEventDispatcher(), checkpointResponder, taskManagerConfig, metrics, this); // now load and instantiate the task’s invokable code invokable = loadAndInstantiateInvokable(userCodeClassLoader, nameOfInvokableClass, env); // —————————————————————- // actual task core work // —————————————————————- // we must make strictly sure that the invokable is accessible to the cancel() call // by the time we switched to running. this.invokable = invokable; // switch to the RUNNING state, if that fails, we have been canceled/failed in the meantime if (!transitionState(ExecutionState.DEPLOYING, ExecutionState.RUNNING)) { throw new CancelTaskException(); } // notify everyone that we switched to running taskManagerActions.updateTaskExecutionState(new TaskExecutionState(jobId, executionId, ExecutionState.RUNNING)); // make sure the user code classloader is accessible thread-locally executingThread.setContextClassLoader(userCodeClassLoader); // run the invokable invokable.invoke(); // make sure, we enter the catch block if the task leaves the invoke() method due // to the fact that it has been canceled if (isCanceledOrFailed()) { throw new CancelTaskException(); } // —————————————————————- // finalization of a successful execution // —————————————————————- // finish the produced partitions. if this fails, we consider the execution failed. 
for (ResultPartition partition : producedPartitions) { if (partition != null) { partition.finish(); } } // try to mark the task as finished // if that fails, the task was canceled/failed in the meantime if (!transitionState(ExecutionState.RUNNING, ExecutionState.FINISHED)) { throw new CancelTaskException(); } } catch (Throwable t) { // unwrap wrapped exceptions to make stack traces more compact if (t instanceof WrappingRuntimeException) { t = ((WrappingRuntimeException) t).unwrap(); } // —————————————————————- // the execution failed. either the invokable code properly failed, or // an exception was thrown as a side effect of cancelling // —————————————————————- try { // check if the exception is unrecoverable if (ExceptionUtils.isJvmFatalError(t) || (t instanceof OutOfMemoryError && taskManagerConfig.shouldExitJvmOnOutOfMemoryError())) { // terminate the JVM immediately // don’t attempt a clean shutdown, because we cannot expect the clean shutdown to complete try { LOG.error(“Encountered fatal error {} - terminating the JVM”, t.getClass().getName(), t); } finally { Runtime.getRuntime().halt(-1); } } // transition into our final state. we should be either in DEPLOYING, RUNNING, CANCELING, or FAILED // loop for multiple retries during concurrent state changes via calls to cancel() or // to failExternally() while (true) { ExecutionState current = this.executionState; if (current == ExecutionState.RUNNING || current == ExecutionState.DEPLOYING) { if (t instanceof CancelTaskException) { if (transitionState(current, ExecutionState.CANCELED)) { cancelInvokable(invokable); break; } } else { if (transitionState(current, ExecutionState.FAILED, t)) { // proper failure of the task. record the exception as the root cause failureCause = t; cancelInvokable(invokable); break; } } } else if (current == ExecutionState.CANCELING) { if (transitionState(current, ExecutionState.CANCELED)) { break; } } else if (current == ExecutionState.FAILED) { // in state failed already, no transition necessary any more break; } // unexpected state, go to failed else if (transitionState(current, ExecutionState.FAILED, t)) { LOG.error(“Unexpected state in task {} ({}) during an exception: {}.”, taskNameWithSubtask, executionId, current); break; } // else fall through the loop and } } catch (Throwable tt) { String message = String.format(“FATAL - exception in exception handler of task %s (%s).”, taskNameWithSubtask, executionId); LOG.error(message, tt); notifyFatalError(message, tt); } } finally { //…… } } //……}Task实现了Runnable接口,其run方法对invokable.invoke()进行了try catch,在catch的时候会判断,如果是ExceptionUtils.isJvmFatalError(t)或者(t instanceof OutOfMemoryError && taskManagerConfig.shouldExitJvmOnOutOfMemoryError()),则会调用Runtime.getRuntime().halt(-1)来停止JVMExceptionUtils.isJvmFatalErrorflink-1.7.2/flink-core/src/main/java/org/apache/flink/util/ExceptionUtils.java@Internalpublic final class ExceptionUtils { //…… /* * Checks whether the given exception indicates a situation that may leave the * JVM in a corrupted state, meaning a state where continued normal operation can only be * guaranteed via clean process restart. * * <p>Currently considered fatal exceptions are Virtual Machine errors indicating * that the JVM is corrupted, like {@link InternalError}, {@link UnknownError}, * and {@link java.util.zip.ZipError} (a special case of InternalError). * The {@link ThreadDeath} exception is also treated as a fatal error, because when * a thread is forcefully stopped, there is a high chance that parts of the system * are in an inconsistent state. 
* * @param t The exception to check. * @return True, if the exception is considered fatal to the JVM, false otherwise. / public static boolean isJvmFatalError(Throwable t) { return (t instanceof InternalError) || (t instanceof UnknownError) || (t instanceof ThreadDeath); } //……}isJvmFatalError方法判断Throwable是否是InternalError或者UnknownError或者ThreadDeath,如果是则返回trueRuntime.getRuntime().haltjava.base/java/lang/Runtime.javapublic class Runtime { //…… private static final Runtime currentRuntime = new Runtime(); /* * Returns the runtime object associated with the current Java application. * Most of the methods of class {@code Runtime} are instance * methods and must be invoked with respect to the current runtime object. * * @return the {@code Runtime} object associated with the current * Java application. / public static Runtime getRuntime() { return currentRuntime; } /* * Forcibly terminates the currently running Java virtual machine. This * method never returns normally. * * <p> This method should be used with extreme caution. Unlike the * {@link #exit exit} method, this method does not cause shutdown * hooks to be started. If the shutdown sequence has already been * initiated then this method does not wait for any running * shutdown hooks to finish their work. * * @param status * Termination status. By convention, a nonzero status code * indicates abnormal termination. If the {@link Runtime#exit exit} * (equivalently, {@link System#exit(int) System.exit}) method * has already been invoked then this status code * will override the status code passed to that method. * * @throws SecurityException * If a security manager is present and its * {@link SecurityManager#checkExit checkExit} method * does not permit an exit with the specified status * * @see #exit * @see #addShutdownHook * @see #removeShutdownHook * @since 1.3 */ public void halt(int status) { SecurityManager sm = System.getSecurityManager(); if (sm != null) { sm.checkExit(status); } Shutdown.beforeHalt(); Shutdown.halt(status); } //……}halt方法在SecurityManager不为null是会调用SecurityManager.checkExit;然后调用Shutdown.beforeHalt()以及Shutdown.halt(status)来停止JVM小结taskmanager.jvm-exit-on-oom配置默认为false,用于指定当task线程抛出OutOfMemoryError的时候,是否需要kill掉TaskManagerTaskManagerConfiguration的静态方法fromConfiguration通过configuration.getBoolean(TaskManagerOptions.KILL_ON_OUT_OF_MEMORY)读取exitOnOom,然后传到构造器中的exitJvmOnOutOfMemory属性;同时提供了shouldExitJvmOnOutOfMemoryError方法来读取exitJvmOnOutOfMemory属性Task实现了Runnable接口,其run方法对invokable.invoke()进行了try catch,在catch的时候会判断,如果是ExceptionUtils.isJvmFatalError(t)或者(t instanceof OutOfMemoryError && taskManagerConfig.shouldExitJvmOnOutOfMemoryError()),则会调用Runtime.getRuntime().halt(-1)来停止JVM;isJvmFatalError方法判断Throwable是否是InternalError或者UnknownError或者ThreadDeath,如果是则返回true;halt方法在SecurityManager不为null是会调用SecurityManager.checkExit;然后调用Shutdown.beforeHalt()以及Shutdown.halt(status)来停止JVMdoctaskmanager.jvm-exit-on-oom ...
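The decision made in the catch block of Task.run() boils down to a small predicate. The standalone sketch below reproduces it outside of Flink (the class and method names are made up): isJvmFatalError mirrors ExceptionUtils.isJvmFatalError, and the boolean flag stands for the value read from taskmanager.jvm-exit-on-oom via shouldExitJvmOnOutOfMemoryError().

public class FatalErrorCheck {

    // Mirrors ExceptionUtils.isJvmFatalError(t) from the source above.
    static boolean isJvmFatalError(Throwable t) {
        return t instanceof InternalError
                || t instanceof UnknownError
                || t instanceof ThreadDeath;
    }

    // Mirrors the condition in Task.run(): fatal JVM error, or OutOfMemoryError
    // combined with the taskmanager.jvm-exit-on-oom flag.
    static boolean shouldHaltJvm(Throwable t, boolean exitJvmOnOutOfMemory) {
        return isJvmFatalError(t)
                || (t instanceof OutOfMemoryError && exitJvmOnOutOfMemory);
    }

    public static void main(String[] args) {
        Throwable oom = new OutOfMemoryError("Java heap space");
        System.out.println(shouldHaltJvm(oom, false)); // false: the task fails, the TaskManager survives
        System.out.println(shouldHaltJvm(oom, true));  // true: Task would call Runtime.getRuntime().halt(-1)
    }
}

When the predicate is true, Flink calls Runtime.getRuntime().halt(-1) rather than System.exit(-1): as the Runtime javadoc quoted above notes, halt does not run shutdown hooks, which is exactly what is wanted when the JVM may already be in a corrupted state.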

February 23, 2019 · 8 min · jiezi

A look at flink's MemorySegment

序本文主要研究一下flink的MemorySegmentMemorySegmentflink-release-1.7.2/flink-core/src/main/java/org/apache/flink/core/memory/MemorySegment.java@Internalpublic abstract class MemorySegment { @SuppressWarnings(“restriction”) protected static final sun.misc.Unsafe UNSAFE = MemoryUtils.UNSAFE; @SuppressWarnings(“restriction”) protected static final long BYTE_ARRAY_BASE_OFFSET = UNSAFE.arrayBaseOffset(byte[].class); private static final boolean LITTLE_ENDIAN = (ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN); // ———————————————————————— protected final byte[] heapMemory; protected long address; protected final long addressLimit; protected final int size; private final Object owner; MemorySegment(byte[] buffer, Object owner) { if (buffer == null) { throw new NullPointerException(“buffer”); } this.heapMemory = buffer; this.address = BYTE_ARRAY_BASE_OFFSET; this.size = buffer.length; this.addressLimit = this.address + this.size; this.owner = owner; } MemorySegment(long offHeapAddress, int size, Object owner) { if (offHeapAddress <= 0) { throw new IllegalArgumentException(“negative pointer or size”); } if (offHeapAddress >= Long.MAX_VALUE - Integer.MAX_VALUE) { // this is necessary to make sure the collapsed checks are safe against numeric overflows throw new IllegalArgumentException(“Segment initialized with too large address: " + offHeapAddress + " ; Max allowed address is " + (Long.MAX_VALUE - Integer.MAX_VALUE - 1)); } this.heapMemory = null; this.address = offHeapAddress; this.addressLimit = this.address + size; this.size = size; this.owner = owner; } // ———————————————————————— // Memory Segment Operations // ———————————————————————— public int size() { return size; } public boolean isFreed() { return address > addressLimit; } public void free() { // this ensures we can place no more data and trigger // the checks for the freed segment address = addressLimit + 1; } public boolean isOffHeap() { return heapMemory == null; } public byte[] getArray() { if (heapMemory != null) { return heapMemory; } else { throw new IllegalStateException(“Memory segment does not represent heap memory”); } } public long getAddress() { if (heapMemory == null) { return address; } else { throw new IllegalStateException(“Memory segment does not represent off heap memory”); } } public abstract ByteBuffer wrap(int offset, int length); public Object getOwner() { return owner; } // ———————————————————————— // Random Access get() and put() methods // ———————————————————————— //———————————————————————— // Notes on the implementation: We try to collapse as many checks as // possible. We need to obey the following rules to make this safe // against segfaults: // // - Grab mutable fields onto the stack before checking and using. 
This // guards us against concurrent modifications which invalidate the // pointers // - Use subtractions for range checks, as they are tolerant //———————————————————————— public abstract byte get(int index); public abstract void put(int index, byte b); public abstract void get(int index, byte[] dst); public abstract void put(int index, byte[] src); public abstract void get(int index, byte[] dst, int offset, int length); public abstract void put(int index, byte[] src, int offset, int length); public abstract boolean getBoolean(int index); public abstract void putBoolean(int index, boolean value); @SuppressWarnings(“restriction”) public final char getChar(int index) { final long pos = address + index; if (index >= 0 && pos <= addressLimit - 2) { return UNSAFE.getChar(heapMemory, pos); } else if (address > addressLimit) { throw new IllegalStateException(“This segment has been freed.”); } else { // index is in fact invalid throw new IndexOutOfBoundsException(); } } public final char getCharLittleEndian(int index) { if (LITTLE_ENDIAN) { return getChar(index); } else { return Character.reverseBytes(getChar(index)); } } public final char getCharBigEndian(int index) { if (LITTLE_ENDIAN) { return Character.reverseBytes(getChar(index)); } else { return getChar(index); } } @SuppressWarnings(“restriction”) public final void putChar(int index, char value) { final long pos = address + index; if (index >= 0 && pos <= addressLimit - 2) { UNSAFE.putChar(heapMemory, pos, value); } else if (address > addressLimit) { throw new IllegalStateException(“segment has been freed”); } else { // index is in fact invalid throw new IndexOutOfBoundsException(); } } public final void putCharLittleEndian(int index, char value) { if (LITTLE_ENDIAN) { putChar(index, value); } else { putChar(index, Character.reverseBytes(value)); } } public final void putCharBigEndian(int index, char value) { if (LITTLE_ENDIAN) { putChar(index, Character.reverseBytes(value)); } else { putChar(index, value); } } public final short getShort(int index) { final long pos = address + index; if (index >= 0 && pos <= addressLimit - 2) { return UNSAFE.getShort(heapMemory, pos); } else if (address > addressLimit) { throw new IllegalStateException(“segment has been freed”); } else { // index is in fact invalid throw new IndexOutOfBoundsException(); } } public final short getShortLittleEndian(int index) { if (LITTLE_ENDIAN) { return getShort(index); } else { return Short.reverseBytes(getShort(index)); } } public final short getShortBigEndian(int index) { if (LITTLE_ENDIAN) { return Short.reverseBytes(getShort(index)); } else { return getShort(index); } } public final void putShort(int index, short value) { final long pos = address + index; if (index >= 0 && pos <= addressLimit - 2) { UNSAFE.putShort(heapMemory, pos, value); } else if (address > addressLimit) { throw new IllegalStateException(“segment has been freed”); } else { // index is in fact invalid throw new IndexOutOfBoundsException(); } } public final void putShortLittleEndian(int index, short value) { if (LITTLE_ENDIAN) { putShort(index, value); } else { putShort(index, Short.reverseBytes(value)); } } public final void putShortBigEndian(int index, short value) { if (LITTLE_ENDIAN) { putShort(index, Short.reverseBytes(value)); } else { putShort(index, value); } } public final int getInt(int index) { final long pos = address + index; if (index >= 0 && pos <= addressLimit - 4) { return UNSAFE.getInt(heapMemory, pos); } else if (address > addressLimit) { throw new IllegalStateException(“segment has 
been freed”); } else { // index is in fact invalid throw new IndexOutOfBoundsException(); } } public final int getIntLittleEndian(int index) { if (LITTLE_ENDIAN) { return getInt(index); } else { return Integer.reverseBytes(getInt(index)); } } public final int getIntBigEndian(int index) { if (LITTLE_ENDIAN) { return Integer.reverseBytes(getInt(index)); } else { return getInt(index); } } public final void putInt(int index, int value) { final long pos = address + index; if (index >= 0 && pos <= addressLimit - 4) { UNSAFE.putInt(heapMemory, pos, value); } else if (address > addressLimit) { throw new IllegalStateException(“segment has been freed”); } else { // index is in fact invalid throw new IndexOutOfBoundsException(); } } public final void putIntLittleEndian(int index, int value) { if (LITTLE_ENDIAN) { putInt(index, value); } else { putInt(index, Integer.reverseBytes(value)); } } public final void putIntBigEndian(int index, int value) { if (LITTLE_ENDIAN) { putInt(index, Integer.reverseBytes(value)); } else { putInt(index, value); } } public final long getLong(int index) { final long pos = address + index; if (index >= 0 && pos <= addressLimit - 8) { return UNSAFE.getLong(heapMemory, pos); } else if (address > addressLimit) { throw new IllegalStateException(“segment has been freed”); } else { // index is in fact invalid throw new IndexOutOfBoundsException(); } } public final long getLongLittleEndian(int index) { if (LITTLE_ENDIAN) { return getLong(index); } else { return Long.reverseBytes(getLong(index)); } } public final long getLongBigEndian(int index) { if (LITTLE_ENDIAN) { return Long.reverseBytes(getLong(index)); } else { return getLong(index); } } public final void putLong(int index, long value) { final long pos = address + index; if (index >= 0 && pos <= addressLimit - 8) { UNSAFE.putLong(heapMemory, pos, value); } else if (address > addressLimit) { throw new IllegalStateException(“segment has been freed”); } else { // index is in fact invalid throw new IndexOutOfBoundsException(); } } public final void putLongLittleEndian(int index, long value) { if (LITTLE_ENDIAN) { putLong(index, value); } else { putLong(index, Long.reverseBytes(value)); } } public final void putLongBigEndian(int index, long value) { if (LITTLE_ENDIAN) { putLong(index, Long.reverseBytes(value)); } else { putLong(index, value); } } public final float getFloat(int index) { return Float.intBitsToFloat(getInt(index)); } public final float getFloatLittleEndian(int index) { return Float.intBitsToFloat(getIntLittleEndian(index)); } public final float getFloatBigEndian(int index) { return Float.intBitsToFloat(getIntBigEndian(index)); } public final void putFloat(int index, float value) { putInt(index, Float.floatToRawIntBits(value)); } public final void putFloatLittleEndian(int index, float value) { putIntLittleEndian(index, Float.floatToRawIntBits(value)); } public final void putFloatBigEndian(int index, float value) { putIntBigEndian(index, Float.floatToRawIntBits(value)); } public final double getDouble(int index) { return Double.longBitsToDouble(getLong(index)); } public final double getDoubleLittleEndian(int index) { return Double.longBitsToDouble(getLongLittleEndian(index)); } public final double getDoubleBigEndian(int index) { return Double.longBitsToDouble(getLongBigEndian(index)); } public final void putDouble(int index, double value) { putLong(index, Double.doubleToRawLongBits(value)); } public final void putDoubleLittleEndian(int index, double value) { putLongLittleEndian(index, 
Double.doubleToRawLongBits(value)); } public final void putDoubleBigEndian(int index, double value) { putLongBigEndian(index, Double.doubleToRawLongBits(value)); } // ————————————————————————- // Bulk Read and Write Methods // ————————————————————————- public abstract void get(DataOutput out, int offset, int length) throws IOException; public abstract void put(DataInput in, int offset, int length) throws IOException; public abstract void get(int offset, ByteBuffer target, int numBytes); public abstract void put(int offset, ByteBuffer source, int numBytes); public final void copyTo(int offset, MemorySegment target, int targetOffset, int numBytes) { final byte[] thisHeapRef = this.heapMemory; final byte[] otherHeapRef = target.heapMemory; final long thisPointer = this.address + offset; final long otherPointer = target.address + targetOffset; if ((numBytes | offset | targetOffset) >= 0 && thisPointer <= this.addressLimit - numBytes && otherPointer <= target.addressLimit - numBytes) { UNSAFE.copyMemory(thisHeapRef, thisPointer, otherHeapRef, otherPointer, numBytes); } else if (this.address > this.addressLimit) { throw new IllegalStateException(“this memory segment has been freed.”); } else if (target.address > target.addressLimit) { throw new IllegalStateException(“target memory segment has been freed.”); } else { throw new IndexOutOfBoundsException( String.format(“offset=%d, targetOffset=%d, numBytes=%d, address=%d, targetAddress=%d”, offset, targetOffset, numBytes, this.address, target.address)); } } // ————————————————————————- // Comparisons & Swapping // ————————————————————————- public final int compare(MemorySegment seg2, int offset1, int offset2, int len) { while (len >= 8) { long l1 = this.getLongBigEndian(offset1); long l2 = seg2.getLongBigEndian(offset2); if (l1 != l2) { return (l1 < l2) ^ (l1 < 0) ^ (l2 < 0) ? 
-1 : 1; } offset1 += 8; offset2 += 8; len -= 8; } while (len > 0) { int b1 = this.get(offset1) & 0xff; int b2 = seg2.get(offset2) & 0xff; int cmp = b1 - b2; if (cmp != 0) { return cmp; } offset1++; offset2++; len–; } return 0; } public final void swapBytes(byte[] tempBuffer, MemorySegment seg2, int offset1, int offset2, int len) { if ((offset1 | offset2 | len | (tempBuffer.length - len)) >= 0) { final long thisPos = this.address + offset1; final long otherPos = seg2.address + offset2; if (thisPos <= this.addressLimit - len && otherPos <= seg2.addressLimit - len) { // this -> temp buffer UNSAFE.copyMemory(this.heapMemory, thisPos, tempBuffer, BYTE_ARRAY_BASE_OFFSET, len); // other -> this UNSAFE.copyMemory(seg2.heapMemory, otherPos, this.heapMemory, thisPos, len); // temp buffer -> other UNSAFE.copyMemory(tempBuffer, BYTE_ARRAY_BASE_OFFSET, seg2.heapMemory, otherPos, len); return; } else if (this.address > this.addressLimit) { throw new IllegalStateException(“this memory segment has been freed.”); } else if (seg2.address > seg2.addressLimit) { throw new IllegalStateException(“other memory segment has been freed.”); } } // index is in fact invalid throw new IndexOutOfBoundsException( String.format(“offset1=%d, offset2=%d, len=%d, bufferSize=%d, address1=%d, address2=%d”, offset1, offset2, len, tempBuffer.length, this.address, seg2.address)); }}MemorySegment有点类似java.nio.ByteBuffer;它有一个byte[]类型的heapMemory属性;它有两个构造器,带有byte[]类型参数的构造器会将byte[]赋给heapMemory,不带byte[]类型参数的构造器则heapMemory为null;isOffHeap方法则用于判断当前的memory segment是heap还是off-heap,它根据heapMemory是否为null来判断,如果为null则是off-heap;另外提供了compare、swapBytes、copyTo方法;还显示提供了BigEndian及LittleEndian的get、put方法BigEndian的相关方法有:get/putCharBigEndian、get/putShortBigEndian、get/putIntBigEndian、get/putLongBigEndian、get/putFloatBigEndian、get/putDoubleBigEndian;LittleEndian的相关方法有:get/putCharLittleEndian、get/putShortLittleEndian、get/putIntLittleEndian、get/putLongLittleEndian、get/putFloatLittleEndian、get/putDoubleLittleEndianMemorySegment定义了free、wrap、get、put、getBoolean、putBoolean抽象方法,要求子类去实现;MemorySegment有两个子类,分别是HeapMemorySegment、HybridMemorySegmentHeapMemorySegmentflink-release-1.7.2/flink-core/src/main/java/org/apache/flink/core/memory/HeapMemorySegment.java@SuppressWarnings(“unused”)@Internalpublic final class HeapMemorySegment extends MemorySegment { private byte[] memory; HeapMemorySegment(byte[] memory) { this(memory, null); } HeapMemorySegment(byte[] memory, Object owner) { super(Objects.requireNonNull(memory), owner); this.memory = memory; } // ————————————————————————- // MemorySegment operations // ————————————————————————- @Override public void free() { super.free(); this.memory = null; } @Override public ByteBuffer wrap(int offset, int length) { try { return ByteBuffer.wrap(this.memory, offset, length); } catch (NullPointerException e) { throw new IllegalStateException(“segment has been freed”); } } public byte[] getArray() { return this.heapMemory; } // ———————————————————————— // Random Access get() and put() methods // ———————————————————————— @Override public final byte get(int index) { return this.memory[index]; } @Override public final void put(int index, byte b) { this.memory[index] = b; } @Override public final void get(int index, byte[] dst) { get(index, dst, 0, dst.length); } @Override public final void put(int index, byte[] src) { put(index, src, 0, src.length); } @Override public final void get(int index, byte[] dst, int offset, int length) { // system arraycopy does the boundary checks anyways, no need to check extra System.arraycopy(this.memory, 
index, dst, offset, length); } @Override public final void put(int index, byte[] src, int offset, int length) { // system arraycopy does the boundary checks anyways, no need to check extra System.arraycopy(src, offset, this.memory, index, length); } @Override public final boolean getBoolean(int index) { return this.memory[index] != 0; } @Override public final void putBoolean(int index, boolean value) { this.memory[index] = (byte) (value ? 1 : 0); } // ————————————————————————- // Bulk Read and Write Methods // ————————————————————————- @Override public final void get(DataOutput out, int offset, int length) throws IOException { out.write(this.memory, offset, length); } @Override public final void put(DataInput in, int offset, int length) throws IOException { in.readFully(this.memory, offset, length); } @Override public final void get(int offset, ByteBuffer target, int numBytes) { // ByteBuffer performs the boundary checks target.put(this.memory, offset, numBytes); } @Override public final void put(int offset, ByteBuffer source, int numBytes) { // ByteBuffer performs the boundary checks source.get(this.memory, offset, numBytes); } // ————————————————————————- // Factoring // ————————————————————————- /** * A memory segment factory that produces heap memory segments. Note that this factory does not * support to allocate off-heap memory. / public static final class HeapMemorySegmentFactory { public HeapMemorySegment wrap(byte[] memory) { return new HeapMemorySegment(memory); } public HeapMemorySegment allocateUnpooledSegment(int size, Object owner) { return new HeapMemorySegment(new byte[size], owner); } public HeapMemorySegment wrapPooledHeapMemory(byte[] memory, Object owner) { return new HeapMemorySegment(memory, owner); } /* * Prevent external instantiation. / HeapMemorySegmentFactory() {} } public static final HeapMemorySegmentFactory FACTORY = new HeapMemorySegmentFactory();}HeapMemorySegment继承了MemorySegment,它有一个byte[]的memory属性,free操作会将memory设置为null,wrap方法使用的是memory属性;它的构造器要求传入的memory不能为null,然后赋给父类的heapMemory属性及自己定义的memory属性(引用);它还定义了HeapMemorySegmentFactory,提供了wrap、allocateUnpooledSegment、wrapPooledHeapMemory方法HybridMemorySegmentflink-release-1.7.2/flink-core/src/main/java/org/apache/flink/core/memory/HybridMemorySegment.java@Internalpublic final class HybridMemorySegment extends MemorySegment { /* * The direct byte buffer that allocated the off-heap memory. This memory segment holds a * reference to that buffer, so as long as this memory segment lives, the memory will not be * released. / private final ByteBuffer offHeapBuffer; /* * Creates a new memory segment that represents the memory backing the given direct byte buffer. * Note that the given ByteBuffer must be direct {@link java.nio.ByteBuffer#allocateDirect(int)}, * otherwise this method with throw an IllegalArgumentException. * * <p>The owner referenced by this memory segment is null. * * @param buffer The byte buffer whose memory is represented by this memory segment. * @throws IllegalArgumentException Thrown, if the given ByteBuffer is not direct. / HybridMemorySegment(ByteBuffer buffer) { this(buffer, null); } /* * Creates a new memory segment that represents the memory backing the given direct byte buffer. * Note that the given ByteBuffer must be direct {@link java.nio.ByteBuffer#allocateDirect(int)}, * otherwise this method with throw an IllegalArgumentException. * * <p>The memory segment references the given owner. * * @param buffer The byte buffer whose memory is represented by this memory segment. 
* @param owner The owner references by this memory segment. * @throws IllegalArgumentException Thrown, if the given ByteBuffer is not direct. / HybridMemorySegment(ByteBuffer buffer, Object owner) { super(checkBufferAndGetAddress(buffer), buffer.capacity(), owner); this.offHeapBuffer = buffer; } /* * Creates a new memory segment that represents the memory of the byte array. * * <p>The owner referenced by this memory segment is null. * * @param buffer The byte array whose memory is represented by this memory segment. / HybridMemorySegment(byte[] buffer) { this(buffer, null); } /* * Creates a new memory segment that represents the memory of the byte array. * * <p>The memory segment references the given owner. * * @param buffer The byte array whose memory is represented by this memory segment. * @param owner The owner references by this memory segment. / HybridMemorySegment(byte[] buffer, Object owner) { super(buffer, owner); this.offHeapBuffer = null; } // ————————————————————————- // MemorySegment operations // ————————————————————————- /* * Gets the buffer that owns the memory of this memory segment. * * @return The byte buffer that owns the memory of this memory segment. */ public ByteBuffer getOffHeapBuffer() { if (offHeapBuffer != null) { return offHeapBuffer; } else { throw new IllegalStateException(“Memory segment does not represent off heap memory”); } } @Override public ByteBuffer wrap(int offset, int length) { if (address <= addressLimit) { if (heapMemory != null) { return ByteBuffer.wrap(heapMemory, offset, length); } else { try { ByteBuffer wrapper = offHeapBuffer.duplicate(); wrapper.limit(offset + length); wrapper.position(offset); return wrapper; } catch (IllegalArgumentException e) { throw new IndexOutOfBoundsException(); } } } else { throw new IllegalStateException(“segment has been freed”); } } // ———————————————————————— // Random Access get() and put() methods // ———————————————————————— @Override public final byte get(int index) { final long pos = address + index; if (index >= 0 && pos < addressLimit) { return UNSAFE.getByte(heapMemory, pos); } else if (address > addressLimit) { throw new IllegalStateException(“segment has been freed”); } else { // index is in fact invalid throw new IndexOutOfBoundsException(); } } @Override public final void put(int index, byte b) { final long pos = address + index; if (index >= 0 && pos < addressLimit) { UNSAFE.putByte(heapMemory, pos, b); } else if (address > addressLimit) { throw new IllegalStateException(“segment has been freed”); } else { // index is in fact invalid throw new IndexOutOfBoundsException(); } } @Override public final void get(int index, byte[] dst) { get(index, dst, 0, dst.length); } @Override public final void put(int index, byte[] src) { put(index, src, 0, src.length); } @Override public final void get(int index, byte[] dst, int offset, int length) { // check the byte array offset and length and the status if ((offset | length | (offset + length) | (dst.length - (offset + length))) < 0) { throw new IndexOutOfBoundsException(); } final long pos = address + index; if (index >= 0 && pos <= addressLimit - length) { final long arrayAddress = BYTE_ARRAY_BASE_OFFSET + offset; UNSAFE.copyMemory(heapMemory, pos, dst, arrayAddress, length); } else if (address > addressLimit) { throw new IllegalStateException(“segment has been freed”); } else { // index is in fact invalid throw new IndexOutOfBoundsException(); } } @Override public final void put(int index, byte[] src, int offset, int length) { // check the byte array offset 
and length if ((offset | length | (offset + length) | (src.length - (offset + length))) < 0) { throw new IndexOutOfBoundsException(); } final long pos = address + index; if (index >= 0 && pos <= addressLimit - length) { final long arrayAddress = BYTE_ARRAY_BASE_OFFSET + offset; UNSAFE.copyMemory(src, arrayAddress, heapMemory, pos, length); } else if (address > addressLimit) { throw new IllegalStateException(“segment has been freed”); } else { // index is in fact invalid throw new IndexOutOfBoundsException(); } } @Override public final boolean getBoolean(int index) { return get(index) != 0; } @Override public final void putBoolean(int index, boolean value) { put(index, (byte) (value ? 1 : 0)); } // ————————————————————————- // Bulk Read and Write Methods // ————————————————————————- @Override public final void get(DataOutput out, int offset, int length) throws IOException { if (address <= addressLimit) { if (heapMemory != null) { out.write(heapMemory, offset, length); } else { while (length >= 8) { out.writeLong(getLongBigEndian(offset)); offset += 8; length -= 8; } while (length > 0) { out.writeByte(get(offset)); offset++; length–; } } } else { throw new IllegalStateException(“segment has been freed”); } } @Override public final void put(DataInput in, int offset, int length) throws IOException { if (address <= addressLimit) { if (heapMemory != null) { in.readFully(heapMemory, offset, length); } else { while (length >= 8) { putLongBigEndian(offset, in.readLong()); offset += 8; length -= 8; } while (length > 0) { put(offset, in.readByte()); offset++; length–; } } } else { throw new IllegalStateException(“segment has been freed”); } } @Override public final void get(int offset, ByteBuffer target, int numBytes) { // check the byte array offset and length if ((offset | numBytes | (offset + numBytes)) < 0) { throw new IndexOutOfBoundsException(); } final int targetOffset = target.position(); final int remaining = target.remaining(); if (remaining < numBytes) { throw new BufferOverflowException(); } if (target.isDirect()) { if (target.isReadOnly()) { throw new ReadOnlyBufferException(); } // copy to the target memory directly final long targetPointer = getAddress(target) + targetOffset; final long sourcePointer = address + offset; if (sourcePointer <= addressLimit - numBytes) { UNSAFE.copyMemory(heapMemory, sourcePointer, null, targetPointer, numBytes); target.position(targetOffset + numBytes); } else if (address > addressLimit) { throw new IllegalStateException(“segment has been freed”); } else { throw new IndexOutOfBoundsException(); } } else if (target.hasArray()) { // move directly into the byte array get(offset, target.array(), targetOffset + target.arrayOffset(), numBytes); // this must be after the get() call to ensue that the byte buffer is not // modified in case the call fails target.position(targetOffset + numBytes); } else { // neither heap buffer nor direct buffer while (target.hasRemaining()) { target.put(get(offset++)); } } } @Override public final void put(int offset, ByteBuffer source, int numBytes) { // check the byte array offset and length if ((offset | numBytes | (offset + numBytes)) < 0) { throw new IndexOutOfBoundsException(); } final int sourceOffset = source.position(); final int remaining = source.remaining(); if (remaining < numBytes) { throw new BufferUnderflowException(); } if (source.isDirect()) { // copy to the target memory directly final long sourcePointer = getAddress(source) + sourceOffset; final long targetPointer = address + offset; if (targetPointer <= 
addressLimit - numBytes) { UNSAFE.copyMemory(null, sourcePointer, heapMemory, targetPointer, numBytes); source.position(sourceOffset + numBytes); } else if (address > addressLimit) { throw new IllegalStateException(“segment has been freed”); } else { throw new IndexOutOfBoundsException(); } } else if (source.hasArray()) { // move directly into the byte array put(offset, source.array(), sourceOffset + source.arrayOffset(), numBytes); // this must be after the get() call to ensue that the byte buffer is not // modified in case the call fails source.position(sourceOffset + numBytes); } else { // neither heap buffer nor direct buffer while (source.hasRemaining()) { put(offset++, source.get()); } } } //……}HybridMemorySegment继承了MemorySegment,它有一个ByteBuffer类型的offHeapBuffer属性,由于父类本身已经有一个byte[]类型的heapMemory属性了,因而HybridMemorySegment管理的memory可以是on-heap的(使用带有byte[]类型参数的构造器)也可以是off-heap的(使用带有ByteBuffer类型参数的构造器);wrap方法会判断,如果heapMemory不为null,则使用heapMemory,否则使用offHeapBuffer小结MemorySegment有点类似java.nio.ByteBuffer;它有一个byte[]类型的heapMemory属性;它有两个构造器,带有byte[]类型参数的构造器会将byte[]赋给heapMemory,不带byte[]类型参数的构造器则heapMemory为null;isOffHeap方法则用于判断当前的memory segment是heap还是off-heap,它根据heapMemory是否为null来判断,如果为null则是off-heap;另外提供了compare、swapBytes、copyTo方法;还显示提供了BigEndian及LittleEndian的get、put方法;MemorySegment定义了free、wrap、get、put、getBoolean、putBoolean抽象方法,要求子类去实现;MemorySegment有两个子类,分别是HeapMemorySegment、HybridMemorySegmentHeapMemorySegment继承了MemorySegment,它有一个byte[]的memory属性,free操作会将memory设置为null,wrap方法使用的是memory属性;它的构造器要求传入的memory不能为null,然后赋给父类的heapMemory属性及自己定义的memory属性(引用);它还定义了HeapMemorySegmentFactory,提供了wrap、allocateUnpooledSegment、wrapPooledHeapMemory方法HybridMemorySegment继承了MemorySegment,它有一个ByteBuffer类型的offHeapBuffer属性,由于父类本身已经有一个byte[]类型的heapMemory属性了,因而HybridMemorySegment管理的memory可以是on-heap的(使用带有byte[]类型参数的构造器)也可以是off-heap的(使用带有ByteBuffer类型参数的构造器);wrap方法会判断,如果heapMemory不为null,则使用heapMemory,否则使用offHeapBufferdocMemorySegmentHeapMemorySegmentHybridMemorySegment ...
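To make the heap vs. off-heap distinction concrete, here is a minimal sketch (assuming flink-core 1.7.x is on the classpath; class and variable names are only illustrative) that obtains one segment of each kind from MemorySegmentFactory and exercises the put/get, wrap and free methods discussed above:

import org.apache.flink.core.memory.MemorySegment;
import org.apache.flink.core.memory.MemorySegmentFactory;

public class MemorySegmentDemo {
    public static void main(String[] args) {
        // on-heap: backed by a byte[], so heapMemory != null and isOffHeap() returns false
        MemorySegment heapSeg = MemorySegmentFactory.wrap(new byte[128]);
        // off-heap: backed by a direct ByteBuffer, so heapMemory == null and isOffHeap() returns true
        MemorySegment offHeapSeg = MemorySegmentFactory.allocateUnpooledOffHeapMemory(128, null);

        for (MemorySegment seg : new MemorySegment[] {heapSeg, offHeapSeg}) {
            seg.putInt(0, 42); // random-access put
            System.out.printf("offHeap=%b size=%d value=%d buffer=%s%n",
                    seg.isOffHeap(), seg.size(), seg.getInt(0),
                    seg.wrap(0, 4)); // heap-backed segments wrap the byte[], off-heap ones duplicate the direct buffer
            seg.free();              // after free() any further access throws IllegalStateException
        }
    }
}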

February 22, 2019 · 17 min · jiezi

A Look at Flink's MemoryPool

序本文主要研究一下flink的MemoryPoolMemoryPoolflink-runtime_2.11-1.7.2-sources.jar!/org/apache/flink/runtime/memory/MemoryManager.java abstract static class MemoryPool { abstract int getNumberOfAvailableMemorySegments(); abstract MemorySegment allocateNewSegment(Object owner); abstract MemorySegment requestSegmentFromPool(Object owner); abstract void returnSegmentToPool(MemorySegment segment); abstract void clear(); }MemoryPool定义了getNumberOfAvailableMemorySegments、allocateNewSegment、requestSegmentFromPool、returnSegmentToPool、clear这几个抽象方法;它有HybridHeapMemoryPool、HybridOffHeapMemoryPool这两个子类HybridHeapMemoryPoolflink-runtime_2.11-1.7.2-sources.jar!/org/apache/flink/runtime/memory/MemoryManager.java static final class HybridHeapMemoryPool extends MemoryPool { /** The collection of available memory segments. / private final ArrayDeque<byte[]> availableMemory; private final int segmentSize; HybridHeapMemoryPool(int numInitialSegments, int segmentSize) { this.availableMemory = new ArrayDeque<>(numInitialSegments); this.segmentSize = segmentSize; for (int i = 0; i < numInitialSegments; i++) { this.availableMemory.add(new byte[segmentSize]); } } @Override MemorySegment allocateNewSegment(Object owner) { return MemorySegmentFactory.allocateUnpooledSegment(segmentSize, owner); } @Override MemorySegment requestSegmentFromPool(Object owner) { byte[] buf = availableMemory.remove(); return MemorySegmentFactory.wrapPooledHeapMemory(buf, owner); } @Override void returnSegmentToPool(MemorySegment segment) { if (segment.getClass() == HybridMemorySegment.class) { HybridMemorySegment heapSegment = (HybridMemorySegment) segment; availableMemory.add(heapSegment.getArray()); heapSegment.free(); } else { throw new IllegalArgumentException(“Memory segment is not a " + HybridMemorySegment.class.getSimpleName()); } } @Override protected int getNumberOfAvailableMemorySegments() { return availableMemory.size(); } @Override void clear() { availableMemory.clear(); } }HybridHeapMemoryPool继承了MemoryPool,它使用的是jvm的heap内存;构造器接收numInitialSegments、segmentSize两个参数用于初始化availableMemory这个ArrayDeque,该queue的元素类型为byte[]allocateNewSegment方法调用的是MemorySegmentFactory.allocateUnpooledSegment,用于分配unpooled memory;requestSegmentFromPool方法调用的是availableMemory.remove(),然后调用MemorySegmentFactory.wrapPooledHeapMemory包装为MemorySegment,这个方法没有判断ArrayDeque的大小就直接remove,需要注意returnSegmentToPool方法只对HybridMemorySegment类型进行处理,首先将它的byte[]归还到availableMemory,之后调用heapSegment.free()释放;getNumberOfAvailableMemorySegments方法返回的是availableMemory.size();clear方法调用的是availableMemory.clear()HybridOffHeapMemoryPoolflink-runtime_2.11-1.7.2-sources.jar!/org/apache/flink/runtime/memory/MemoryManager.java static final class HybridOffHeapMemoryPool extends MemoryPool { /* The collection of available memory segments. 
*/ private final ArrayDeque<ByteBuffer> availableMemory; private final int segmentSize; HybridOffHeapMemoryPool(int numInitialSegments, int segmentSize) { this.availableMemory = new ArrayDeque<>(numInitialSegments); this.segmentSize = segmentSize; for (int i = 0; i < numInitialSegments; i++) { this.availableMemory.add(ByteBuffer.allocateDirect(segmentSize)); } } @Override MemorySegment allocateNewSegment(Object owner) { return MemorySegmentFactory.allocateUnpooledOffHeapMemory(segmentSize, owner); } @Override MemorySegment requestSegmentFromPool(Object owner) { ByteBuffer buf = availableMemory.remove(); return MemorySegmentFactory.wrapPooledOffHeapMemory(buf, owner); } @Override void returnSegmentToPool(MemorySegment segment) { if (segment.getClass() == HybridMemorySegment.class) { HybridMemorySegment hybridSegment = (HybridMemorySegment) segment; ByteBuffer buf = hybridSegment.getOffHeapBuffer(); availableMemory.add(buf); hybridSegment.free(); } else { throw new IllegalArgumentException(“Memory segment is not a " + HybridMemorySegment.class.getSimpleName()); } } @Override protected int getNumberOfAvailableMemorySegments() { return availableMemory.size(); } @Override void clear() { availableMemory.clear(); } }HybridOffHeapMemoryPool继承了MemoryPool,它使用的是OffHeap;构造器接收numInitialSegments、segmentSize两个参数用于初始化availableMemory这个ArrayDeque,该queue的元素类型为ByteBufferallocateNewSegment方法调用的是MemorySegmentFactory.allocateUnpooledOffHeapMemory,用于分配unpooled off-heap memory;requestSegmentFromPool方法调用的是availableMemory.remove(),然后调用MemorySegmentFactory.wrapPooledOffHeapMemory包装为MemorySegment,这个方法没有判断ArrayDeque的大小就直接remove,需要注意returnSegmentToPool方法只对HybridMemorySegment类型进行处理,首先将它的ByteBuffer归还到availableMemory,之后调用heapSegment.free()释放;getNumberOfAvailableMemorySegments方法返回的是availableMemory.size();clear方法调用的是availableMemory.clear()小结MemoryPool定义了getNumberOfAvailableMemorySegments、allocateNewSegment、requestSegmentFromPool、returnSegmentToPool、clear这几个抽象方法;它有HybridHeapMemoryPool、HybridOffHeapMemoryPool这两个子类HybridHeapMemoryPool继承了MemoryPool,它使用的是jvm的heap内存;构造器接收numInitialSegments、segmentSize两个参数用于初始化availableMemory这个ArrayDeque,该queue的元素类型为byte[];allocateNewSegment方法调用的是MemorySegmentFactory.allocateUnpooledSegment,用于分配unpooled memory;requestSegmentFromPool方法调用的是availableMemory.remove(),然后调用MemorySegmentFactory.wrapPooledHeapMemory包装为MemorySegment,这个方法没有判断ArrayDeque的大小就直接remove,需要注意;returnSegmentToPool方法只对HybridMemorySegment类型进行处理,首先将它的byte[]归还到availableMemory,之后调用heapSegment.free()释放;getNumberOfAvailableMemorySegments方法返回的是availableMemory.size();clear方法调用的是availableMemory.clear()HybridOffHeapMemoryPool继承了MemoryPool,它使用的是OffHeap;构造器接收numInitialSegments、segmentSize两个参数用于初始化availableMemory这个ArrayDeque,该queue的元素类型为ByteBuffer;allocateNewSegment方法调用的是MemorySegmentFactory.allocateUnpooledOffHeapMemory,用于分配unpooled off-heap memory;requestSegmentFromPool方法调用的是availableMemory.remove(),然后调用MemorySegmentFactory.wrapPooledOffHeapMemory包装为MemorySegment,这个方法没有判断ArrayDeque的大小就直接remove,需要注意;returnSegmentToPool方法只对HybridMemorySegment类型进行处理,首先将它的ByteBuffer归还到availableMemory,之后调用heapSegment.free()释放;getNumberOfAvailableMemorySegments方法返回的是availableMemory.size();clear方法调用的是availableMemory.clear()docMemoryManager ...
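Since MemoryPool and its two subclasses are package-private inner classes of MemoryManager, they cannot be used directly from user code; the standalone sketch below (a hypothetical TinyHeapPool, plain JDK only) mirrors the request/return contract of HybridHeapMemoryPool and demonstrates the caveat noted above: requestSegmentFromPool calls ArrayDeque.remove() without checking the size, so an exhausted pool surfaces as a NoSuchElementException.

import java.util.ArrayDeque;
import java.util.NoSuchElementException;

// Simplified stand-in for MemoryManager's package-private HybridHeapMemoryPool:
// pre-allocates byte[] chunks and hands them out without checking the deque size.
public class TinyHeapPool {
    private final ArrayDeque<byte[]> availableMemory;
    private final int segmentSize;

    TinyHeapPool(int numInitialSegments, int segmentSize) {
        this.availableMemory = new ArrayDeque<>(numInitialSegments);
        this.segmentSize = segmentSize;
        for (int i = 0; i < numInitialSegments; i++) {
            availableMemory.add(new byte[segmentSize]);
        }
    }

    byte[] requestSegmentFromPool() {
        // like the original, remove() is called unconditionally:
        // an empty pool surfaces as NoSuchElementException
        return availableMemory.remove();
    }

    void returnSegmentToPool(byte[] segment) {
        availableMemory.add(segment);
    }

    public static void main(String[] args) {
        TinyHeapPool pool = new TinyHeapPool(1, 32 * 1024);
        byte[] seg = pool.requestSegmentFromPool();   // ok, pool goes from 1 to 0 segments
        try {
            pool.requestSegmentFromPool();            // pool is empty, so this throws
        } catch (NoSuchElementException e) {
            System.out.println("pool exhausted: " + e);
        }
        pool.returnSegmentToPool(seg);                // size back to 1
    }
}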

February 21, 2019 · 2 min · jiezi

Stream Processing With Flink (7): Stateful Operators and User Functions

状态函数的实现状态函数通过运行上下文存储和访问状态键状态类似于分布式Map 每个状态函数实例维护一段范围的键状态使用键状态的状态函数必须应用于KeyedStream(已按键分区后的流)键状态类型 包括单值 列表 Map和聚合类型1.1 在RuntimeContext中定义键状态(keyed State)static class StateMachineMapper extends RichFlatMapFunction<Event, Alert> { /** The state for the current key. / private ValueState<Integer> currentState; @Override public void open(Configuration conf) { // get access to the state object currentState = getRuntimeContext().getState(new ValueStateDescriptor<>(“state”, Integer.class)); } @Override public void flatMap(Event evt, Collector<Alert> out) throws Exception { // get the current state for the key (source address) // if no state exists, yet, the state must be the state machine’s initial state Integer state = currentState.value(); if(state==null){ currentState.update(1); }else { System.out.println(“key: “+evt.sourceAddress()+” state:"+state); currentState.update(state + 1); } }}1.2 在用户函数中实现算子状态算子状态(operator state)维护在每个单独的算子实例中算子状态包括List State,List Union State和BroadCast State用户函数通过实现ListCheckpointed接口来操作List State算子状态static class StateMachineMapper extends RichFlatMapFunction<Event, Alert> implements ListCheckpointed<Integer> { /* The state for the current key. */ private Integer currentState=0; @Override public void flatMap(Event evt, Collector<Alert> out) throws Exception { // get the current state for the key (source address) // if no state exists, yet, the state must be the state machine’s initial state System.out.println(currentState); currentState=currentState+1; }//Flink运行检查点时会执行该方法 对状态进行存储 @Override public List<Integer> snapshotState(long checkpointId, long timestamp) throws Exception { return Lists.newArrayList(currentState); }//当作业启动或失败时会执行该方法用于状态的初始化 @Override public void restoreState(List<Integer> state) throws Exception { currentState=state.get(0); }}算子状态类型为List结构是用于应对状态算子并行度的改变 当增加或减少状态算子并行度时 那算子状态就需要在并行实例中进行重分配 这需要要求能够合并或分割算子状态Broadcast State算子状态是能够在所有状态算子间共享的状态用户函数通过继承CheckpointedFunction接口可同时操作键状态和算子状态用户函数通过继承CheckpointListener接口获取所有状态算子完成将其状态回写远程存储的通知2.状态应用的鲁棒和性能状态后端和检查点算法的选择影响状态应用的鲁棒和性能2.1 状态后端(state backend)状态后端负责维护每个算子实例的状态 且当检查点运行时负责将状态发送给远程持久化存储设备状态后端是插件化实现的 Flink提供三种状态后端实现 包括基于内存 基于磁盘和基于RocksDBStateBackend是用于实现用户自定义状态后端的接口//配置RocksDBStateBackend为Flink应用的状态后端final String checkpointDir = params.get(“checkpoint-dir”);boolean incrementalCheckpoints = params.getBoolean(“incremental-checkpoints”, false);env.setStateBackend(new RocksDBStateBackend(checkpointDir, incrementalCheckpoints));2.2 检查点(Checkpointing)开启流应用失败不应该影响计算正确性流应用失败不应该丢失状态 因为其可能是不可恢复的检查点机制指的是在流应用运行的某个时间点 对应用中所有内置状态和状态函数进行快照检查点机制和状态恢复机制保证对流应用的状态的有且仅有一次的一致性保证检查点开启需要设置一个运行周期 决定正常流处理中检查点运行的开销和失败后恢复的时间val env = StreamExecutionEnvironment.getExecutionEnvironment// set checkpointing interval to 10 seconds (10000 milliseconds)env.enableCheckpointing(10000L)2.3 状态算子的更新保存点(savepoint)机制保证不会因更新状态算子而停止的应用在重启时丢失状态2.4 调节状态应用性能2.5 避免状态泄漏3. 
Queryable State: keyed state can be exposed to external systems in a read-only key-value form. 3.1 Components of the queryable state service: QueryableStateClient is the client that external systems use to access keyed state. QueryableStateClientProxy accepts and answers client requests; one instance runs per TM. Since keyed state is distributed across all operator instances, the proxy must find out which TM maintains the state for a given key, and that information is kept in the JM. QueryableStateServer serves the requests coming from the client proxy; one instance runs per TM and accesses the keyed state in the local state backend. 3.2 Exposing queryable state. Mark keyed state as queryable in the open method: override def open(parameters: Configuration): Unit = { // create state descriptor val lastTempDescriptor = new ValueStateDescriptor[Double]("lastTemp", classOf[Double]) // enable queryable state and set its external identifier lastTempDescriptor.setQueryable("lastTemperature") // obtain the state handle lastTempState = getRuntimeContext.getState[Double](lastTempDescriptor)} Or write a stream into a queryable state sink: tenSecsMaxTemps .keyBy(_._1) .asQueryableState("maxTemperature") 3.3 Accessing queryable state from an external system. Add the dependency that provides the QueryableStateClient classes: <dependency> <groupId>org.apache.flink</groupId> <artifactId>flink-queryable-state-client-java_2.11</artifactId> <version>1.5.0</version></dependency> Then create the client used to access the queryable state: // tmHostname is the IP address of any TM val client: QueryableStateClient = new QueryableStateClient(tmHostname, proxyPort)
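For a rough end-to-end picture of the client side, the sketch below (written in Java rather than the book's Scala; the host, job id, key and the 9069 default proxy port are assumptions) queries the "lastTemperature" value state exposed via setQueryable above, assuming it is keyed by a String sensor id:

import java.util.concurrent.CompletableFuture;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.queryablestate.client.QueryableStateClient;

public class QueryLastTemperature {
    public static void main(String[] args) throws Exception {
        // any TaskManager host; 9069 is the default client proxy port (assumption)
        QueryableStateClient client = new QueryableStateClient("tm-host", 9069);

        JobID jobId = JobID.fromHexString("feedbabefeedbabefeedbabefeedbabe"); // id of the running job (placeholder)
        CompletableFuture<ValueState<Double>> future = client.getKvState(
                jobId,
                "lastTemperature",                               // external identifier set via setQueryable(...)
                "sensor_1",                                      // the key to look up (hypothetical)
                Types.STRING,
                new ValueStateDescriptor<>("lastTemp", Double.class));

        System.out.println("last temperature = " + future.get().value());
    }
}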

February 20, 2019 · 2 min · jiezi

A Look at Flink TaskManager's Off-Heap Memory

序本文主要研究一下flink TaskManager的offHeapTaskManagerOptionsflink-core-1.7.2-sources.jar!/org/apache/flink/configuration/TaskManagerOptions.java@PublicEvolvingpublic class TaskManagerOptions { //…… /** * JVM heap size for the TaskManagers with memory size. / @Documentation.CommonOption(position = Documentation.CommonOption.POSITION_MEMORY) public static final ConfigOption<String> TASK_MANAGER_HEAP_MEMORY = key(“taskmanager.heap.size”) .defaultValue(“1024m”) .withDescription(“JVM heap size for the TaskManagers, which are the parallel workers of” + " the system. On YARN setups, this value is automatically configured to the size of the TaskManager’s" + " YARN container, minus a certain tolerance value."); /* * Amount of memory to be allocated by the task manager’s memory manager. If not * set, a relative fraction will be allocated, as defined by {@link #MANAGED_MEMORY_FRACTION}. / public static final ConfigOption<String> MANAGED_MEMORY_SIZE = key(“taskmanager.memory.size”) .defaultValue(“0”) .withDescription(“Amount of memory to be allocated by the task manager’s memory manager.” + " If not set, a relative fraction will be allocated."); /* * Fraction of free memory allocated by the memory manager if {@link #MANAGED_MEMORY_SIZE} is * not set. / public static final ConfigOption<Float> MANAGED_MEMORY_FRACTION = key(“taskmanager.memory.fraction”) .defaultValue(0.7f) .withDescription(“The relative amount of memory (after subtracting the amount of memory used by network” + " buffers) that the task manager reserves for sorting, hash tables, and caching of intermediate results." + " For example, a value of 0.8 means that a task manager reserves 80% of its memory" + " for internal data buffers, leaving 20% of free memory for the task manager’s heap for objects" + " created by user-defined functions. This parameter is only evaluated, if " + MANAGED_MEMORY_SIZE.key() + " is not set."); /* * Memory allocation method (JVM heap or off-heap), used for managed memory of the TaskManager * as well as the network buffers. / public static final ConfigOption<Boolean> MEMORY_OFF_HEAP = key(“taskmanager.memory.off-heap”) .defaultValue(false) .withDescription(“Memory allocation method (JVM heap or off-heap), used for managed memory of the” + " TaskManager as well as the network buffers."); / * Whether TaskManager managed memory should be pre-allocated when the TaskManager is starting. / public static final ConfigOption<Boolean> MANAGED_MEMORY_PRE_ALLOCATE = key(“taskmanager.memory.preallocate”) .defaultValue(false) .withDescription(“Whether TaskManager managed memory should be pre-allocated when the TaskManager is starting.”); //……}taskmanager.memory.size设置的是由task manager memory manager管理的内存大小(基于offHeap,主要用于sorting,hashing及caching),默认为0TaskManagerServices.calculateHeapSizeMBflink-runtime_2.11-1.7.2-sources.jar!/org/apache/flink/runtime/taskexecutor/TaskManagerServices.javapublic class TaskManagerServices { //…… /* * Calculates the amount of heap memory to use (to set via <tt>-Xmx</tt> and <tt>-Xms</tt>) * based on the total memory to use and the given configuration parameters. 
* * @param totalJavaMemorySizeMB * overall available memory to use (heap and off-heap) * @param config * configuration object * * @return heap memory to use (in megabytes) / public static long calculateHeapSizeMB(long totalJavaMemorySizeMB, Configuration config) { Preconditions.checkArgument(totalJavaMemorySizeMB > 0); // subtract the Java memory used for network buffers (always off-heap) final long networkBufMB = calculateNetworkBufferMemory( totalJavaMemorySizeMB << 20, // megabytes to bytes config) >> 20; // bytes to megabytes final long remainingJavaMemorySizeMB = totalJavaMemorySizeMB - networkBufMB; // split the available Java memory between heap and off-heap final boolean useOffHeap = config.getBoolean(TaskManagerOptions.MEMORY_OFF_HEAP); final long heapSizeMB; if (useOffHeap) { long offHeapSize; String managedMemorySizeDefaultVal = TaskManagerOptions.MANAGED_MEMORY_SIZE.defaultValue(); if (!config.getString(TaskManagerOptions.MANAGED_MEMORY_SIZE).equals(managedMemorySizeDefaultVal)) { try { offHeapSize = MemorySize.parse(config.getString(TaskManagerOptions.MANAGED_MEMORY_SIZE), MEGA_BYTES).getMebiBytes(); } catch (IllegalArgumentException e) { throw new IllegalConfigurationException( “Could not read " + TaskManagerOptions.MANAGED_MEMORY_SIZE.key(), e); } } else { offHeapSize = Long.valueOf(managedMemorySizeDefaultVal); } if (offHeapSize <= 0) { // calculate off-heap section via fraction double fraction = config.getFloat(TaskManagerOptions.MANAGED_MEMORY_FRACTION); offHeapSize = (long) (fraction * remainingJavaMemorySizeMB); } TaskManagerServicesConfiguration .checkConfigParameter(offHeapSize < remainingJavaMemorySizeMB, offHeapSize, TaskManagerOptions.MANAGED_MEMORY_SIZE.key(), “Managed memory size too large for " + networkBufMB + " MB network buffer memory and a total of " + totalJavaMemorySizeMB + " MB JVM memory”); heapSizeMB = remainingJavaMemorySizeMB - offHeapSize; } else { heapSizeMB = remainingJavaMemorySizeMB; } return heapSizeMB; } //……}taskmanager.memory.size值小于等于0的话,则会根据taskmanager.memory.fraction配置来分配,默认为0.7taskmanager.heap.size设置的是taskmanager的heap及offHeap的memory;而taskmanager.memory.fraction是按taskmanager.heap.size口减掉networkBufMB之后的值来算,得出的值作为task manager memory manager管理的offHeapSizetaskmanager.memory.size及taskmanager.memory.fraction只有在taskmanager.memory.off-heap值为true的时候才生效,而taskmanager.memory.fraction的值在taskmanager.memory.size为0的时候才使用;如果开启了taskmanager.memory.off-heap,则taskmanager的Xmx值为taskmanager.heap.size - networkBufMB - offHeapSizeTaskManagerServices.createMemoryManagerflink-runtime_2.11-1.7.2-sources.jar!/org/apache/flink/runtime/taskexecutor/TaskManagerServices.javapublic class TaskManagerServices { //…… /* * Creates a {@link MemoryManager} from the given {@link TaskManagerServicesConfiguration}. 
* * @param taskManagerServicesConfiguration to create the memory manager from * @param freeHeapMemoryWithDefrag an estimate of the size of the free heap memory * @param maxJvmHeapMemory the maximum JVM heap size * @return Memory manager * @throws Exception / private static MemoryManager createMemoryManager( TaskManagerServicesConfiguration taskManagerServicesConfiguration, long freeHeapMemoryWithDefrag, long maxJvmHeapMemory) throws Exception { // computing the amount of memory to use depends on how much memory is available // it strictly needs to happen AFTER the network stack has been initialized // check if a value has been configured long configuredMemory = taskManagerServicesConfiguration.getConfiguredMemory(); MemoryType memType = taskManagerServicesConfiguration.getMemoryType(); final long memorySize; boolean preAllocateMemory = taskManagerServicesConfiguration.isPreAllocateMemory(); if (configuredMemory > 0) { if (preAllocateMemory) { LOG.info(“Using {} MB for managed memory.” , configuredMemory); } else { LOG.info(“Limiting managed memory to {} MB, memory will be allocated lazily.” , configuredMemory); } memorySize = configuredMemory << 20; // megabytes to bytes } else { // similar to #calculateNetworkBufferMemory(TaskManagerServicesConfiguration tmConfig) float memoryFraction = taskManagerServicesConfiguration.getMemoryFraction(); if (memType == MemoryType.HEAP) { // network buffers allocated off-heap -> use memoryFraction of the available heap: long relativeMemSize = (long) (freeHeapMemoryWithDefrag * memoryFraction); if (preAllocateMemory) { LOG.info(“Using {} of the currently free heap space for managed heap memory ({} MB).” , memoryFraction , relativeMemSize >> 20); } else { LOG.info(“Limiting managed memory to {} of the currently free heap space ({} MB), " + “memory will be allocated lazily.” , memoryFraction , relativeMemSize >> 20); } memorySize = relativeMemSize; } else if (memType == MemoryType.OFF_HEAP) { // The maximum heap memory has been adjusted according to the fraction (see // calculateHeapSizeMB(long totalJavaMemorySizeMB, Configuration config)), i.e. 
// maxJvmHeap = jvmTotalNoNet - jvmTotalNoNet * memoryFraction = jvmTotalNoNet * (1 - memoryFraction) // directMemorySize = jvmTotalNoNet * memoryFraction long directMemorySize = (long) (maxJvmHeapMemory / (1.0 - memoryFraction) * memoryFraction); if (preAllocateMemory) { LOG.info(“Using {} of the maximum memory size for managed off-heap memory ({} MB).” , memoryFraction, directMemorySize >> 20); } else { LOG.info(“Limiting managed memory to {} of the maximum memory size ({} MB),” + " memory will be allocated lazily.”, memoryFraction, directMemorySize >> 20); } memorySize = directMemorySize; } else { throw new RuntimeException(“No supported memory type detected.”); } } // now start the memory manager final MemoryManager memoryManager; try { memoryManager = new MemoryManager( memorySize, taskManagerServicesConfiguration.getNumberOfSlots(), taskManagerServicesConfiguration.getNetworkConfig().networkBufferSize(), memType, preAllocateMemory); } catch (OutOfMemoryError e) { if (memType == MemoryType.HEAP) { throw new Exception(“OutOfMemory error (” + e.getMessage() + “) while allocating the TaskManager heap memory (” + memorySize + " bytes).”, e); } else if (memType == MemoryType.OFF_HEAP) { throw new Exception(“OutOfMemory error (” + e.getMessage() + “) while allocating the TaskManager off-heap memory (” + memorySize + " bytes).Try increasing the maximum direct memory (-XX:MaxDirectMemorySize)", e); } else { throw e; } } return memoryManager; } //……}TaskManagerServices提供了私有静态方法createMemoryManager用于根据配置创建MemoryManager;这里根据MemoryType来重新计算memorySize,然后传递给MemoryManager的构造器,创建MemoryManagerTaskManagerServicesConfigurationflink-runtime_2.11-1.7.2-sources.jar!/org/apache/flink/runtime/taskexecutor/TaskManagerServicesConfiguration.javapublic class TaskManagerServicesConfiguration { //…… /* * Utility method to extract TaskManager config parameters from the configuration and to * sanity check them. * * @param configuration The configuration. * @param remoteAddress identifying the IP address under which the TaskManager will be accessible * @param localCommunication True, to skip initializing the network stack. * Use only in cases where only one task manager runs. * @return TaskExecutorConfiguration that wrappers InstanceConnectionInfo, NetworkEnvironmentConfiguration, etc. */ public static TaskManagerServicesConfiguration fromConfiguration( Configuration configuration, InetAddress remoteAddress, boolean localCommunication) throws Exception { // we need this because many configs have been written with a “-1” entry int slots = configuration.getInteger(TaskManagerOptions.NUM_TASK_SLOTS, 1); if (slots == -1) { slots = 1; } final String[] tmpDirs = ConfigurationUtils.parseTempDirectories(configuration); String[] localStateRootDir = ConfigurationUtils.parseLocalStateDirectories(configuration); if (localStateRootDir.length == 0) { // default to temp dirs. 
localStateRootDir = tmpDirs; } boolean localRecoveryMode = configuration.getBoolean( CheckpointingOptions.LOCAL_RECOVERY.key(), CheckpointingOptions.LOCAL_RECOVERY.defaultValue()); final NetworkEnvironmentConfiguration networkConfig = parseNetworkEnvironmentConfiguration( configuration, localCommunication, remoteAddress, slots); final QueryableStateConfiguration queryableStateConfig = parseQueryableStateConfiguration(configuration); // extract memory settings long configuredMemory; String managedMemorySizeDefaultVal = TaskManagerOptions.MANAGED_MEMORY_SIZE.defaultValue(); if (!configuration.getString(TaskManagerOptions.MANAGED_MEMORY_SIZE).equals(managedMemorySizeDefaultVal)) { try { configuredMemory = MemorySize.parse(configuration.getString(TaskManagerOptions.MANAGED_MEMORY_SIZE), MEGA_BYTES).getMebiBytes(); } catch (IllegalArgumentException e) { throw new IllegalConfigurationException( “Could not read " + TaskManagerOptions.MANAGED_MEMORY_SIZE.key(), e); } } else { configuredMemory = Long.valueOf(managedMemorySizeDefaultVal); } checkConfigParameter( configuration.getString(TaskManagerOptions.MANAGED_MEMORY_SIZE).equals(TaskManagerOptions.MANAGED_MEMORY_SIZE.defaultValue()) || configuredMemory > 0, configuredMemory, TaskManagerOptions.MANAGED_MEMORY_SIZE.key(), “MemoryManager needs at least one MB of memory. " + “If you leave this config parameter empty, the system automatically " + “pick a fraction of the available memory.”); // check whether we use heap or off-heap memory final MemoryType memType; if (configuration.getBoolean(TaskManagerOptions.MEMORY_OFF_HEAP)) { memType = MemoryType.OFF_HEAP; } else { memType = MemoryType.HEAP; } boolean preAllocateMemory = configuration.getBoolean(TaskManagerOptions.MANAGED_MEMORY_PRE_ALLOCATE); float memoryFraction = configuration.getFloat(TaskManagerOptions.MANAGED_MEMORY_FRACTION); checkConfigParameter(memoryFraction > 0.0f && memoryFraction < 1.0f, memoryFraction, TaskManagerOptions.MANAGED_MEMORY_FRACTION.key(), “MemoryManager fraction of the free memory must be between 0.0 and 1.0”); long timerServiceShutdownTimeout = AkkaUtils.getTimeout(configuration).toMillis(); return new TaskManagerServicesConfiguration( remoteAddress, tmpDirs, localStateRootDir, localRecoveryMode, networkConfig, queryableStateConfig, slots, configuredMemory, memType, preAllocateMemory, memoryFraction, timerServiceShutdownTimeout, ConfigurationUtils.getSystemResourceMetricsProbingInterval(configuration)); } //……}TaskManagerServicesConfiguration提供了一个静态方法fromConfiguration,用于从Configuration创建TaskManagerServicesConfiguration;其中memType是依据taskmanager.memory.off-heap的配置来,如果为true则为MemoryType.OFF_HEAP,否则为MemoryType.HEAP小结taskmanager.memory.size设置的是由task manager memory manager管理的内存大小(基于offHeap,主要用于sorting,hashing及caching),默认为0;taskmanager.memory.size值小于等于0的话,则会根据taskmanager.memory.fraction配置来分配,默认为0.7;taskmanager.heap.size设置的是taskmanager的heap及offHeap的memory;而taskmanager.memory.fraction是按taskmanager.heap.size口减掉networkBufMB之后的值来算,得出的值作为task manager memory manager管理的offHeapSizetaskmanager.memory.size及taskmanager.memory.fraction只有在taskmanager.memory.off-heap值为true的时候才生效,而taskmanager.memory.fraction的值在taskmanager.memory.size为0的时候才使用;如果开启了taskmanager.memory.off-heap,则taskmanager的Xmx值为taskmanager.heap.size - networkBufMB - 
offHeapSize. TaskManagerServices provides the private static method createMemoryManager, which creates a MemoryManager from the configuration: it recomputes memorySize according to the MemoryType and then passes it to the MemoryManager constructor. TaskManagerServicesConfiguration provides the static method fromConfiguration for building a TaskManagerServicesConfiguration from a Configuration; its memType follows the taskmanager.memory.off-heap setting, MemoryType.OFF_HEAP when true and MemoryType.HEAP otherwise. doc: taskmanager-memory-size ...
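As a back-of-the-envelope check of the split described above, the hypothetical helper below redoes the arithmetic of calculateHeapSizeMB in megabytes (the real code works on bytes and MemorySize objects), assuming the defaults taskmanager.network.memory.fraction=0.1 (min 64 MB, max 1 GB), taskmanager.memory.fraction=0.7 and taskmanager.memory.size unset:

// A back-of-the-envelope re-implementation of the split performed by
// TaskManagerServices.calculateHeapSizeMB, under the default settings stated above.
public class TmMemorySplit {
    static long heapSizeMB(long totalJavaMemorySizeMB, boolean offHeapEnabled) {
        long networkBufMB = Math.min(1024, Math.max(64, (long) (0.1f * totalJavaMemorySizeMB)));
        long remainingMB = totalJavaMemorySizeMB - networkBufMB;
        if (!offHeapEnabled) {
            return remainingMB;                       // all remaining memory stays on the JVM heap
        }
        long offHeapMB = (long) (0.7f * remainingMB); // managed memory carved out of the remainder
        return remainingMB - offHeapMB;               // what ends up as -Xms/-Xmx
    }

    public static void main(String[] args) {
        // taskmanager.heap.size = 1024m, taskmanager.memory.off-heap = true:
        // network buffers ~102 MB, managed off-heap ~645 MB, JVM heap ~277 MB
        System.out.println(heapSizeMB(1024, true));
        // with off-heap disabled the whole remainder (~922 MB) becomes the JVM heap
        System.out.println(heapSizeMB(1024, false));
    }
}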

February 20, 2019 · 7 min · jiezi

A Look at Flink TaskManager Memory Sizing

序本文主要研究一下flink TaskManager的memory大小设置flink-conf.yamlflink-release-1.7.2/flink-dist/src/main/resources/flink-conf.yaml# The heap size for the TaskManager JVMtaskmanager.heap.size: 1024m# The number of task slots that each TaskManager offers. Each slot runs one parallel pipeline.taskmanager.numberOfTaskSlots: 1# Specify whether TaskManager’s managed memory should be allocated when starting# up (true) or when memory is requested.## We recommend to set this value to ’true’ only in setups for pure batch# processing (DataSet API). Streaming setups currently do not use the TaskManager’s# managed memory: The ‘rocksdb’ state backend uses RocksDB’s own memory management,# while the ‘memory’ and ‘filesystem’ backends explicitly keep data as objects# to save on serialization cost.## taskmanager.memory.preallocate: false# The amount of memory going to the network stack. These numbers usually need # no tuning. Adjusting them may be necessary in case of an “Insufficient number# of network buffers” error. The default min is 64MB, teh default max is 1GB.# # taskmanager.network.memory.fraction: 0.1# taskmanager.network.memory.min: 64mb# taskmanager.network.memory.max: 1gbflink-conf.yaml提供了taskmanager.heap.size来设置taskmanager的memory(heap及offHeap)大小提供了taskmanager.memory相关配置(taskmanager.memory.fraction、taskmanager.memory.off-heap、taskmanager.memory.preallocate、taskmanager.memory.segment-size、taskmanager.memory.size)用于设置memory提供了taskmanager.network.memory相关配置(taskmanager.network.detailed-metrics、taskmanager.network.memory.buffers-per-channel、taskmanager.network.memory.floating-buffers-per-gate、taskmanager.network.memory.fraction、taskmanager.network.memory.max、taskmanager.network.memory.min)用于设置taskmanager的network stack的内存config.shflink-release-1.7.2/flink-dist/src/main/flink-bin/bin/config.sh#!/usr/bin/env bash# WARNING !!! 
, these values are only used if there is nothing else is specified in# conf/flink-conf.yamlDEFAULT_ENV_PID_DIR="/tmp" # Directory to store .pid files toDEFAULT_ENV_LOG_MAX=5 # Maximum number of old log files to keepDEFAULT_ENV_JAVA_OPTS="" # Optional JVM argsDEFAULT_ENV_JAVA_OPTS_JM="" # Optional JVM args (JobManager)DEFAULT_ENV_JAVA_OPTS_TM="" # Optional JVM args (TaskManager)DEFAULT_ENV_JAVA_OPTS_HS="" # Optional JVM args (HistoryServer)DEFAULT_ENV_SSH_OPTS="" # Optional SSH parameters running in cluster modeDEFAULT_YARN_CONF_DIR="" # YARN Configuration Directory, if necessaryDEFAULT_HADOOP_CONF_DIR="" # Hadoop Configuration Directory, if necessaryKEY_TASKM_MEM_SIZE=“taskmanager.heap.size"KEY_TASKM_MEM_MB=“taskmanager.heap.mb"KEY_TASKM_MEM_MANAGED_SIZE=“taskmanager.memory.size"KEY_TASKM_MEM_MANAGED_FRACTION=“taskmanager.memory.fraction"KEY_TASKM_OFFHEAP=“taskmanager.memory.off-heap"KEY_TASKM_MEM_PRE_ALLOCATE=“taskmanager.memory.preallocate"KEY_TASKM_NET_BUF_FRACTION=“taskmanager.network.memory.fraction"KEY_TASKM_NET_BUF_MIN=“taskmanager.network.memory.min"KEY_TASKM_NET_BUF_MAX=“taskmanager.network.memory.max"KEY_TASKM_NET_BUF_NR=“taskmanager.network.numberOfBuffers” # fallbackKEY_TASKM_COMPUTE_NUMA=“taskmanager.compute.numa”# Define FLINK_TM_HEAP if it is not already setif [ -z “${FLINK_TM_HEAP}” ]; then FLINK_TM_HEAP=$(readFromConfig ${KEY_TASKM_MEM_SIZE} 0 “${YAML_CONF}")fi# Try read old config key, if new key not existsif [ “${FLINK_TM_HEAP}” == 0 ]; then FLINK_TM_HEAP_MB=$(readFromConfig ${KEY_TASKM_MEM_MB} 0 “${YAML_CONF}")fi# Define FLINK_TM_MEM_MANAGED_SIZE if it is not already setif [ -z “${FLINK_TM_MEM_MANAGED_SIZE}” ]; then FLINK_TM_MEM_MANAGED_SIZE=$(readFromConfig ${KEY_TASKM_MEM_MANAGED_SIZE} 0 “${YAML_CONF}”) if hasUnit ${FLINK_TM_MEM_MANAGED_SIZE}; then FLINK_TM_MEM_MANAGED_SIZE=$(getMebiBytes $(parseBytes ${FLINK_TM_MEM_MANAGED_SIZE})) else FLINK_TM_MEM_MANAGED_SIZE=$(getMebiBytes $(parseBytes ${FLINK_TM_MEM_MANAGED_SIZE}“m”)) fifi# Define FLINK_TM_MEM_MANAGED_FRACTION if it is not already setif [ -z “${FLINK_TM_MEM_MANAGED_FRACTION}” ]; then FLINK_TM_MEM_MANAGED_FRACTION=$(readFromConfig ${KEY_TASKM_MEM_MANAGED_FRACTION} 0.7 “${YAML_CONF}")fi# Define FLINK_TM_OFFHEAP if it is not already setif [ -z “${FLINK_TM_OFFHEAP}” ]; then FLINK_TM_OFFHEAP=$(readFromConfig ${KEY_TASKM_OFFHEAP} “false” “${YAML_CONF}")fi# Define FLINK_TM_MEM_PRE_ALLOCATE if it is not already setif [ -z “${FLINK_TM_MEM_PRE_ALLOCATE}” ]; then FLINK_TM_MEM_PRE_ALLOCATE=$(readFromConfig ${KEY_TASKM_MEM_PRE_ALLOCATE} “false” “${YAML_CONF}")fi# Define FLINK_TM_NET_BUF_FRACTION if it is not already setif [ -z “${FLINK_TM_NET_BUF_FRACTION}” ]; then FLINK_TM_NET_BUF_FRACTION=$(readFromConfig ${KEY_TASKM_NET_BUF_FRACTION} 0.1 “${YAML_CONF}")fi# Define FLINK_TM_NET_BUF_MIN and FLINK_TM_NET_BUF_MAX if not already set (as a fallback)if [ -z “${FLINK_TM_NET_BUF_MIN}” -a -z “${FLINK_TM_NET_BUF_MAX}” ]; then FLINK_TM_NET_BUF_MIN=$(readFromConfig ${KEY_TASKM_NET_BUF_NR} -1 “${YAML_CONF}”) if [ $FLINK_TM_NET_BUF_MIN != -1 ]; then FLINK_TM_NET_BUF_MIN=$(parseBytes ${FLINK_TM_NET_BUF_MIN}) FLINK_TM_NET_BUF_MAX=${FLINK_TM_NET_BUF_MIN} fifi# Define FLINK_TM_NET_BUF_MIN if it is not already setif [ -z “${FLINK_TM_NET_BUF_MIN}” -o “${FLINK_TM_NET_BUF_MIN}” = “-1” ]; then # default: 64MB = 67108864 bytes (same as the previous default with 2048 buffers of 32k each) FLINK_TM_NET_BUF_MIN=$(readFromConfig ${KEY_TASKM_NET_BUF_MIN} 67108864 “${YAML_CONF}”) FLINK_TM_NET_BUF_MIN=$(parseBytes ${FLINK_TM_NET_BUF_MIN})fi# Define 
FLINK_TM_NET_BUF_MAX if it is not already setif [ -z “${FLINK_TM_NET_BUF_MAX}” -o “${FLINK_TM_NET_BUF_MAX}” = “-1” ]; then # default: 1GB = 1073741824 bytes FLINK_TM_NET_BUF_MAX=$(readFromConfig ${KEY_TASKM_NET_BUF_MAX} 1073741824 “${YAML_CONF}”) FLINK_TM_NET_BUF_MAX=$(parseBytes ${FLINK_TM_NET_BUF_MAX})ficonfig.sh在相关变量没有设置的前提下,初始化了FLINK_TM_HEAP、FLINK_TM_MEM_MANAGED_SIZE、FLINK_TM_MEM_MANAGED_FRACTION、FLINK_TM_OFFHEAP、FLINK_TM_MEM_PRE_ALLOCATE、FLINK_TM_NET_BUF_FRACTION等变量taskmanager.shflink-release-1.7.2/flink-dist/src/main/flink-bin/bin/taskmanager.sh#!/usr/bin/env bash# Start/stop a Flink TaskManager.USAGE=“Usage: taskmanager.sh (start|start-foreground|stop|stop-all)“STARTSTOP=$1ARGS=("${@:2}")if [[ $STARTSTOP != “start” ]] && [[ $STARTSTOP != “start-foreground” ]] && [[ $STARTSTOP != “stop” ]] && [[ $STARTSTOP != “stop-all” ]]; then echo $USAGE exit 1fibin=dirname "$0"bin=cd "$bin"; pwd. “$bin”/config.shENTRYPOINT=taskexecutorif [[ $STARTSTOP == “start” ]] || [[ $STARTSTOP == “start-foreground” ]]; then # if memory allocation mode is lazy and no other JVM options are set, # set the ‘Concurrent Mark Sweep GC’ if [[ $FLINK_TM_MEM_PRE_ALLOCATE == “false” ]] && [ -z “${FLINK_ENV_JAVA_OPTS}” ] && [ -z “${FLINK_ENV_JAVA_OPTS_TM}” ]; then export JVM_ARGS="$JVM_ARGS -XX:+UseG1GC” fi if [ ! -z “${FLINK_TM_HEAP_MB}” ] && [ “${FLINK_TM_HEAP}” == 0 ]; then echo “used deprecated key `${KEY_TASKM_MEM_MB}`, please replace with key `${KEY_TASKM_MEM_SIZE}`” else flink_tm_heap_bytes=$(parseBytes ${FLINK_TM_HEAP}) FLINK_TM_HEAP_MB=$(getMebiBytes ${flink_tm_heap_bytes}) fi if [[ ! ${FLINK_TM_HEAP_MB} =~ ${IS_NUMBER} ]] || [[ “${FLINK_TM_HEAP_MB}” -lt “0” ]]; then echo “[ERROR] Configured TaskManager JVM heap size is not a number. Please set ‘${KEY_TASKM_MEM_SIZE}’ in ${FLINK_CONF_FILE}.” exit 1 fi if [ “${FLINK_TM_HEAP_MB}” -gt “0” ]; then TM_HEAP_SIZE=$(calculateTaskManagerHeapSizeMB) # Long.MAX_VALUE in TB: This is an upper bound, much less direct memory will be used TM_MAX_OFFHEAP_SIZE=“8388607T” export JVM_ARGS="${JVM_ARGS} -Xms${TM_HEAP_SIZE}M -Xmx${TM_HEAP_SIZE}M -XX:MaxDirectMemorySize=${TM_MAX_OFFHEAP_SIZE}” fi # Add TaskManager-specific JVM options export FLINK_ENV_JAVA_OPTS="${FLINK_ENV_JAVA_OPTS} ${FLINK_ENV_JAVA_OPTS_TM}” # Startup parameters ARGS+=(”–configDir” “${FLINK_CONF_DIR}")fiif [[ $STARTSTOP == “start-foreground” ]]; then exec “${FLINK_BIN_DIR}"/flink-console.sh $ENTRYPOINT “${ARGS[@]}“else if [[ $FLINK_TM_COMPUTE_NUMA == “false” ]]; then # Start a single TaskManager “${FLINK_BIN_DIR}"/flink-daemon.sh $STARTSTOP $ENTRYPOINT “${ARGS[@]}” else # Example output from numactl --show on an AWS c4.8xlarge: # policy: default # preferred node: current # physcpubind: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 # cpubind: 0 1 # nodebind: 0 1 # membind: 0 1 read -ra NODE_LIST <<< $(numactl –show | grep “^nodebind: “) for NODE_ID in “${NODE_LIST[@]:1}”; do # Start a TaskManager for each NUMA node numactl –membind=$NODE_ID –cpunodebind=$NODE_ID – “${FLINK_BIN_DIR}"/flink-daemon.sh $STARTSTOP $ENTRYPOINT “${ARGS[@]}” done 
fifitaskmanager.sh首先调用config.sh初始化相关变量,之后计算并export了JVM_ARGS及FLINK_ENV_JAVA_OPTS,最后调用flink-console.sh启动相关类如果FLINK_TM_MEM_PRE_ALLOCATE为false且FLINK_ENV_JAVA_OPTS及FLINK_ENV_JAVA_OPTS_TM都没有设置,则追加-XX:+UseG1GC到JVM_ARGS;之后读取FLINK_TM_HEAP到FLINK_TM_HEAP_MB;如果FLINK_TM_HEAP_MB大于0则通过calculateTaskManagerHeapSizeMB计算TM_HEAP_SIZE,然后以TM_HEAP_SIZE设置xms及Xmx,以TM_MAX_OFFHEAP_SIZE设置MaxDirectMemorySize,追加到JVM_ARGS中;而FLINK_ENV_JAVA_OPTS_TM则会追加到FLINK_ENV_JAVA_OPTScalculateTaskManagerHeapSizeMB在config.sh中有定义,另外其对应的java代码在TaskManagerServices.calculateHeapSizeMBTaskManagerServicesflink-runtime_2.11-1.7.2-sources.jar!/org/apache/flink/runtime/taskexecutor/TaskManagerServices.javapublic class TaskManagerServices { //…… /* * Calculates the amount of heap memory to use (to set via <tt>-Xmx</tt> and <tt>-Xms</tt>) * based on the total memory to use and the given configuration parameters. * * @param totalJavaMemorySizeMB * overall available memory to use (heap and off-heap) * @param config * configuration object * * @return heap memory to use (in megabytes) / public static long calculateHeapSizeMB(long totalJavaMemorySizeMB, Configuration config) { Preconditions.checkArgument(totalJavaMemorySizeMB > 0); // subtract the Java memory used for network buffers (always off-heap) final long networkBufMB = calculateNetworkBufferMemory( totalJavaMemorySizeMB << 20, // megabytes to bytes config) >> 20; // bytes to megabytes final long remainingJavaMemorySizeMB = totalJavaMemorySizeMB - networkBufMB; // split the available Java memory between heap and off-heap final boolean useOffHeap = config.getBoolean(TaskManagerOptions.MEMORY_OFF_HEAP); final long heapSizeMB; if (useOffHeap) { long offHeapSize; String managedMemorySizeDefaultVal = TaskManagerOptions.MANAGED_MEMORY_SIZE.defaultValue(); if (!config.getString(TaskManagerOptions.MANAGED_MEMORY_SIZE).equals(managedMemorySizeDefaultVal)) { try { offHeapSize = MemorySize.parse(config.getString(TaskManagerOptions.MANAGED_MEMORY_SIZE), MEGA_BYTES).getMebiBytes(); } catch (IllegalArgumentException e) { throw new IllegalConfigurationException( “Could not read " + TaskManagerOptions.MANAGED_MEMORY_SIZE.key(), e); } } else { offHeapSize = Long.valueOf(managedMemorySizeDefaultVal); } if (offHeapSize <= 0) { // calculate off-heap section via fraction double fraction = config.getFloat(TaskManagerOptions.MANAGED_MEMORY_FRACTION); offHeapSize = (long) (fraction * remainingJavaMemorySizeMB); } TaskManagerServicesConfiguration .checkConfigParameter(offHeapSize < remainingJavaMemorySizeMB, offHeapSize, TaskManagerOptions.MANAGED_MEMORY_SIZE.key(), “Managed memory size too large for " + networkBufMB + " MB network buffer memory and a total of " + totalJavaMemorySizeMB + " MB JVM memory”); heapSizeMB = remainingJavaMemorySizeMB - offHeapSize; } else { heapSizeMB = remainingJavaMemorySizeMB; } return heapSizeMB; } /* * Calculates the amount of memory used for network buffers based on the total memory to use and * the according configuration parameters. * * <p>The following configuration parameters are involved: * <ul> * <li>{@link TaskManagerOptions#NETWORK_BUFFERS_MEMORY_FRACTION},</li> * <li>{@link TaskManagerOptions#NETWORK_BUFFERS_MEMORY_MIN},</li> * <li>{@link TaskManagerOptions#NETWORK_BUFFERS_MEMORY_MAX}, and</li> * <li>{@link TaskManagerOptions#NETWORK_NUM_BUFFERS} (fallback if the ones above do not exist)</li> * </ul>. 
* * @param totalJavaMemorySize * overall available memory to use (heap and off-heap, in bytes) * @param config * configuration object * * @return memory to use for network buffers (in bytes); at least one memory segment */ @SuppressWarnings(“deprecation”) public static long calculateNetworkBufferMemory(long totalJavaMemorySize, Configuration config) { Preconditions.checkArgument(totalJavaMemorySize > 0); int segmentSize = checkedDownCast(MemorySize.parse(config.getString(TaskManagerOptions.MEMORY_SEGMENT_SIZE)).getBytes()); final long networkBufBytes; if (TaskManagerServicesConfiguration.hasNewNetworkBufConf(config)) { // new configuration based on fractions of available memory with selectable min and max float networkBufFraction = config.getFloat(TaskManagerOptions.NETWORK_BUFFERS_MEMORY_FRACTION); long networkBufMin = MemorySize.parse(config.getString(TaskManagerOptions.NETWORK_BUFFERS_MEMORY_MIN)).getBytes(); long networkBufMax = MemorySize.parse(config.getString(TaskManagerOptions.NETWORK_BUFFERS_MEMORY_MAX)).getBytes(); TaskManagerServicesConfiguration .checkNetworkBufferConfig(segmentSize, networkBufFraction, networkBufMin, networkBufMax); networkBufBytes = Math.min(networkBufMax, Math.max(networkBufMin, (long) (networkBufFraction * totalJavaMemorySize))); TaskManagerServicesConfiguration .checkConfigParameter(networkBufBytes < totalJavaMemorySize, “(” + networkBufFraction + “, " + networkBufMin + “, " + networkBufMax + “)”, “(” + TaskManagerOptions.NETWORK_BUFFERS_MEMORY_FRACTION.key() + “, " + TaskManagerOptions.NETWORK_BUFFERS_MEMORY_MIN.key() + “, " + TaskManagerOptions.NETWORK_BUFFERS_MEMORY_MAX.key() + “)”, “Network buffer memory size too large: " + networkBufBytes + " >= " + totalJavaMemorySize + " (total JVM memory size)”); TaskManagerServicesConfiguration .checkConfigParameter(networkBufBytes >= segmentSize, “(” + networkBufFraction + “, " + networkBufMin + “, " + networkBufMax + “)”, “(” + TaskManagerOptions.NETWORK_BUFFERS_MEMORY_FRACTION.key() + “, " + TaskManagerOptions.NETWORK_BUFFERS_MEMORY_MIN.key() + “, " + TaskManagerOptions.NETWORK_BUFFERS_MEMORY_MAX.key() + “)”, “Network buffer memory size too small: " + networkBufBytes + " < " + segmentSize + " (” + TaskManagerOptions.MEMORY_SEGMENT_SIZE.key() + “)”); } else { // use old (deprecated) network buffers parameter int numNetworkBuffers = config.getInteger(TaskManagerOptions.NETWORK_NUM_BUFFERS); networkBufBytes = (long) numNetworkBuffers * (long) segmentSize; TaskManagerServicesConfiguration.checkNetworkConfigOld(numNetworkBuffers); TaskManagerServicesConfiguration .checkConfigParameter(networkBufBytes < totalJavaMemorySize, networkBufBytes, TaskManagerOptions.NETWORK_NUM_BUFFERS.key(), “Network buffer memory size too large: " + networkBufBytes + " >= " + totalJavaMemorySize + " (total JVM memory size)”); TaskManagerServicesConfiguration .checkConfigParameter(networkBufBytes >= segmentSize, networkBufBytes, TaskManagerOptions.NETWORK_NUM_BUFFERS.key(), “Network buffer memory size too small: " + networkBufBytes + " < " + segmentSize + " (” + TaskManagerOptions.MEMORY_SEGMENT_SIZE.key() + “)”); } return networkBufBytes; } //……}FLINK_TM_HEAP设置的是taskmanager的memory(heap及offHeap)大小,而network 
buffers总是使用offHeap,因而这里首先要从FLINK_TM_HEAP扣减掉这部分offHeap然后重新计算Xms及XmxcalculateHeapSizeMB先调用calculateNetworkBufferMemory计算networkBufMB,然后从totalJavaMemorySizeMB扣减掉networkBufMB得到remainingJavaMemorySizeMB之后读取taskmanager.memory.off-heap设置,默认为false,则直接以remainingJavaMemorySizeMB返回;如果为true,则需要计算offHeapSize的值,然后从remainingJavaMemorySizeMB扣减offHeapSize再返回小结flink-conf.yaml提供了taskmanager.heap.size来设置taskmanager的memory(heap及offHeap)大小;提供了taskmanager.memory相关配置(taskmanager.memory.fraction、taskmanager.memory.off-heap、taskmanager.memory.preallocate、taskmanager.memory.segment-size、taskmanager.memory.size)用于设置memory;提供了taskmanager.network.memory相关配置(taskmanager.network.detailed-metrics、taskmanager.network.memory.buffers-per-channel、taskmanager.network.memory.floating-buffers-per-gate、taskmanager.network.memory.fraction、taskmanager.network.memory.max、taskmanager.network.memory.min)用于设置taskmanager的network stack的内存taskmanager.sh首先调用config.sh初始化相关变量,之后计算并export了JVM_ARGS及FLINK_ENV_JAVA_OPTS,最后调用flink-console.sh启动相关类;如果FLINK_TM_MEM_PRE_ALLOCATE为false且FLINK_ENV_JAVA_OPTS及FLINK_ENV_JAVA_OPTS_TM都没有设置,则追加-XX:+UseG1GC到JVM_ARGS;之后读取FLINK_TM_HEAP到FLINK_TM_HEAP_MB;如果FLINK_TM_HEAP_MB大于0则通过calculateTaskManagerHeapSizeMB计算TM_HEAP_SIZE,然后以TM_HEAP_SIZE设置xms及Xmx,以TM_MAX_OFFHEAP_SIZE设置MaxDirectMemorySize,追加到JVM_ARGS中;而FLINK_ENV_JAVA_OPTS_TM则会追加到FLINK_ENV_JAVA_OPTS;calculateTaskManagerHeapSizeMB在config.sh中有定义,另外其对应的java代码在TaskManagerServices.calculateHeapSizeMBFLINK_TM_HEAP设置的是taskmanager的memory(heap及offHeap)大小,而network buffers总是使用offHeap,因而这里首先要从FLINK_TM_HEAP扣减掉这部分offHeap然后重新计算Xms及Xmx;calculateHeapSizeMB先调用calculateNetworkBufferMemory计算networkBufMB,然后从totalJavaMemorySizeMB扣减掉networkBufMB得到remainingJavaMemorySizeMB;之后读取taskmanager.memory.off-heap设置,默认为false,则直接以remainingJavaMemorySizeMB返回;如果为true,则需要计算offHeapSize的值,然后从remainingJavaMemorySizeMB扣减offHeapSize再返回由此可见最后的jvm参数取决于JVM_ARGS及FLINK_ENV_JAVA_OPTS;其中注意不要设置内存相关参数到JVM_ARGS,因为taskmanager.sh在FLINK_TM_HEAP_MB大于0的时候,则使用该值计算TM_HEAP_SIZE设置Xms及Xmx追加到JVM_ARGS变量中,而FLINK_TM_HEAP_MB则取决于FLINK_TM_HEAP或者taskmanager.heap.size配置;FLINK_ENV_JAVA_OPTS的配置则取决于env.java.opts以及env.java.opts.taskmanager;因而要配置taskmanager的memory(heap及offHeap)大小,可以指定FLINK_TM_HEAP环境变量(比如FLINK_TM_HEAP=512m),或者在flink-conf.yaml中指定taskmanager.heap.size;而最终的Xms及Xmx则是FLINK_TM_HEAP扣减掉offHeap而来,确定使用offHeap为network buffers,其余的看是否开启taskmanager.memory.off-heap,默认为falsedocTaskManager ...
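Putting the shell side together, the hypothetical sketch below computes the TM_HEAP_SIZE that taskmanager.sh would derive from FLINK_TM_HEAP (i.e. taskmanager.heap.size) with off-heap enabled and the same default fractions as above, and prints the JVM flags the script would export:

// Sketch of the JVM arguments taskmanager.sh ends up exporting for a given
// taskmanager.heap.size (assumes taskmanager.memory.off-heap=true and default fractions).
public class TmJvmArgs {
    public static void main(String[] args) {
        long flinkTmHeapMB = 1024;                     // FLINK_TM_HEAP / taskmanager.heap.size
        long networkBufMB = Math.min(1024, Math.max(64, (long) (0.1f * flinkTmHeapMB)));
        long offHeapMB = (long) (0.7f * (flinkTmHeapMB - networkBufMB));
        long tmHeapSizeMB = flinkTmHeapMB - networkBufMB - offHeapMB;

        // TM_MAX_OFFHEAP_SIZE in the script is a huge upper bound, not the actual usage
        String jvmArgs = String.format(
                "-Xms%dM -Xmx%dM -XX:MaxDirectMemorySize=8388607T", tmHeapSizeMB, tmHeapSizeMB);
        System.out.println(jvmArgs);                   // -Xms277M -Xmx277M -XX:MaxDirectMemorySize=8388607T
    }
}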

February 19, 2019 · 7 min · jiezi

A Look at Flink JobManager Heap Sizing

序本文主要研究一下flink JobManager的heap大小设置JobManagerOptionsflink-core-1.7.1-sources.jar!/org/apache/flink/configuration/JobManagerOptions.java@PublicEvolvingpublic class JobManagerOptions { //…… /** * JVM heap size for the JobManager with memory size. / @Documentation.CommonOption(position = Documentation.CommonOption.POSITION_MEMORY) public static final ConfigOption<String> JOB_MANAGER_HEAP_MEMORY = key(“jobmanager.heap.size”) .defaultValue(“1024m”) .withDescription(“JVM heap size for the JobManager.”); /* * JVM heap size (in megabytes) for the JobManager. * @deprecated use {@link #JOB_MANAGER_HEAP_MEMORY} / @Deprecated public static final ConfigOption<Integer> JOB_MANAGER_HEAP_MEMORY_MB = key(“jobmanager.heap.mb”) .defaultValue(1024) .withDescription(“JVM heap size (in megabytes) for the JobManager.”); //……}jobmanager.heap.size配置用于指定JobManager的大小,默认是1024m;jobmanager.heap.mb配置已经被废弃ConfigurationUtilsflink-core-1.7.1-sources.jar!/org/apache/flink/configuration/ConfigurationUtils.javapublic class ConfigurationUtils { private static final String[] EMPTY = new String[0]; /* * Get job manager’s heap memory. This method will check the new key * {@link JobManagerOptions#JOB_MANAGER_HEAP_MEMORY} and * the old key {@link JobManagerOptions#JOB_MANAGER_HEAP_MEMORY_MB} for backwards compatibility. * * @param configuration the configuration object * @return the memory size of job manager’s heap memory. / public static MemorySize getJobManagerHeapMemory(Configuration configuration) { if (configuration.containsKey(JobManagerOptions.JOB_MANAGER_HEAP_MEMORY.key())) { return MemorySize.parse(configuration.getString(JobManagerOptions.JOB_MANAGER_HEAP_MEMORY)); } else if (configuration.containsKey(JobManagerOptions.JOB_MANAGER_HEAP_MEMORY_MB.key())) { return MemorySize.parse(configuration.getInteger(JobManagerOptions.JOB_MANAGER_HEAP_MEMORY_MB) + “m”); } else { //use default value return MemorySize.parse(configuration.getString(JobManagerOptions.JOB_MANAGER_HEAP_MEMORY)); } } //……}ConfigurationUtils的getJobManagerHeapMemory方法从Configuration中读取配置,然后解析为MemorySizeMemorySizeflink-core-1.7.1-sources.jar!/org/apache/flink/configuration/MemorySize.java@PublicEvolvingpublic class MemorySize implements java.io.Serializable { private static final long serialVersionUID = 1L; // ———————————————————————— /* The memory size, in bytes. / private final long bytes; /* * Constructs a new MemorySize. * * @param bytes The size, in bytes. Must be zero or larger. / public MemorySize(long bytes) { checkArgument(bytes >= 0, “bytes must be >= 0”); this.bytes = bytes; } // ———————————————————————— /* * Gets the memory size in bytes. / public long getBytes() { return bytes; } /* * Gets the memory size in Kibibytes (= 1024 bytes). / public long getKibiBytes() { return bytes >> 10; } /* * Gets the memory size in Mebibytes (= 1024 Kibibytes). / public int getMebiBytes() { return (int) (bytes >> 20); } /* * Gets the memory size in Gibibytes (= 1024 Mebibytes). / public long getGibiBytes() { return bytes >> 30; } /* * Gets the memory size in Tebibytes (= 1024 Gibibytes). 
/ public long getTebiBytes() { return bytes >> 40; } // ———————————————————————— @Override public int hashCode() { return (int) (bytes ^ (bytes >>> 32)); } @Override public boolean equals(Object obj) { return obj == this || (obj != null && obj.getClass() == this.getClass() && ((MemorySize) obj).bytes == this.bytes); } @Override public String toString() { return bytes + " bytes"; } // ———————————————————————— // Parsing // ———————————————————————— /* * Parses the given string as as MemorySize. * * @param text The string to parse * @return The parsed MemorySize * * @throws IllegalArgumentException Thrown, if the expression cannot be parsed. / public static MemorySize parse(String text) throws IllegalArgumentException { return new MemorySize(parseBytes(text)); } /* * Parses the given string with a default unit. * * @param text The string to parse. * @param defaultUnit specify the default unit. * @return The parsed MemorySize. * * @throws IllegalArgumentException Thrown, if the expression cannot be parsed. / public static MemorySize parse(String text, MemoryUnit defaultUnit) throws IllegalArgumentException { if (!hasUnit(text)) { return parse(text + defaultUnit.getUnits()[0]); } return parse(text); } /* * Parses the given string as bytes. * The supported expressions are listed under {@link MemorySize}. * * @param text The string to parse * @return The parsed size, in bytes. * * @throws IllegalArgumentException Thrown, if the expression cannot be parsed. / public static long parseBytes(String text) throws IllegalArgumentException { checkNotNull(text, “text”); final String trimmed = text.trim(); checkArgument(!trimmed.isEmpty(), “argument is an empty- or whitespace-only string”); final int len = trimmed.length(); int pos = 0; char current; while (pos < len && (current = trimmed.charAt(pos)) >= ‘0’ && current <= ‘9’) { pos++; } final String number = trimmed.substring(0, pos); final String unit = trimmed.substring(pos).trim().toLowerCase(Locale.US); if (number.isEmpty()) { throw new NumberFormatException(“text does not start with a number”); } final long value; try { value = Long.parseLong(number); // this throws a NumberFormatException on overflow } catch (NumberFormatException e) { throw new IllegalArgumentException(“The value ‘” + number + “’ cannot be re represented as 64bit number (numeric overflow).”); } final long multiplier; if (unit.isEmpty()) { multiplier = 1L; } else { if (matchesAny(unit, BYTES)) { multiplier = 1L; } else if (matchesAny(unit, KILO_BYTES)) { multiplier = 1024L; } else if (matchesAny(unit, MEGA_BYTES)) { multiplier = 1024L * 1024L; } else if (matchesAny(unit, GIGA_BYTES)) { multiplier = 1024L * 1024L * 1024L; } else if (matchesAny(unit, TERA_BYTES)) { multiplier = 1024L * 1024L * 1024L * 1024L; } else { throw new IllegalArgumentException(“Memory size unit ‘” + unit + “’ does not match any of the recognized units: " + MemoryUnit.getAllUnits()); } } final long result = value * multiplier; // check for overflow if (result / multiplier != value) { throw new IllegalArgumentException(“The value ‘” + text + “’ cannot be re represented as 64bit number of bytes (numeric overflow).”); } return result; } private static boolean matchesAny(String str, MemoryUnit unit) { for (String s : unit.getUnits()) { if (s.equals(str)) { return true; } } return false; } 
//……}MemorySize内部有个bytes字段,以bytes为单位,之后提供了getBytes、getKibiBytes、getMebiBytes、getGibiBytes、getTebiBytes方法用于快速换算;parse静态方法用于从文本中解析并创建MemorySize,其中parse方法可接收MemoryUnit参数用于文本中没有MemoryUnit时才使用的默认的MemoryUnit,最后都是调用的parseBytes方法MemoryUnitflink-core-1.7.1-sources.jar!/org/apache/flink/configuration/MemorySize.java /* * Enum which defines memory unit, mostly used to parse value from configuration file. * * <p>To make larger values more compact, the common size suffixes are supported: * * <ul> * <li>q or 1b or 1bytes (bytes) * <li>1k or 1kb or 1kibibytes (interpreted as kibibytes = 1024 bytes) * <li>1m or 1mb or 1mebibytes (interpreted as mebibytes = 1024 kibibytes) * <li>1g or 1gb or 1gibibytes (interpreted as gibibytes = 1024 mebibytes) * <li>1t or 1tb or 1tebibytes (interpreted as tebibytes = 1024 gibibytes) * </ul> * / public enum MemoryUnit { BYTES(new String[] { “b”, “bytes” }), KILO_BYTES(new String[] { “k”, “kb”, “kibibytes” }), MEGA_BYTES(new String[] { “m”, “mb”, “mebibytes” }), GIGA_BYTES(new String[] { “g”, “gb”, “gibibytes” }), TERA_BYTES(new String[] { “t”, “tb”, “tebibytes” }); private String[] units; MemoryUnit(String[] units) { this.units = units; } public String[] getUnits() { return units; } public static String getAllUnits() { return concatenateUnits(BYTES.getUnits(), KILO_BYTES.getUnits(), MEGA_BYTES.getUnits(), GIGA_BYTES.getUnits(), TERA_BYTES.getUnits()); } public static boolean hasUnit(String text) { checkNotNull(text, “text”); final String trimmed = text.trim(); checkArgument(!trimmed.isEmpty(), “argument is an empty- or whitespace-only string”); final int len = trimmed.length(); int pos = 0; char current; while (pos < len && (current = trimmed.charAt(pos)) >= ‘0’ && current <= ‘9’) { pos++; } final String unit = trimmed.substring(pos).trim().toLowerCase(Locale.US); return unit.length() > 0; } private static String concatenateUnits(final String[]… allUnits) { final StringBuilder builder = new StringBuilder(128); for (String[] units : allUnits) { builder.append(’(’); for (String unit : units) { builder.append(unit); builder.append(” | “); } builder.setLength(builder.length() - 3); builder.append(”) / “); } builder.setLength(builder.length() - 3); return builder.toString(); } }MemoryUnit枚举定义了BYTES、KILO_BYTES、MEGA_BYTES、GIGA_BYTES、TERA_BYTES;它有units属性,是一个string数组,用于指定每类单位的文本标识,最后匹配时都是转换为小写来匹配FlinkYarnSessionCliflink-1.7.1/flink-yarn/src/main/java/org/apache/flink/yarn/cli/FlinkYarnSessionCli.javapublic class FlinkYarnSessionCli extends AbstractCustomCommandLine<ApplicationId> { //…… private ClusterSpecification createClusterSpecification(Configuration configuration, CommandLine cmd) { if (cmd.hasOption(container.getOpt())) { // number of containers is required option! 
LOG.info(“The argument {} is deprecated in will be ignored.”, container.getOpt()); } // TODO: The number of task manager should be deprecated soon final int numberTaskManagers; if (cmd.hasOption(container.getOpt())) { numberTaskManagers = Integer.valueOf(cmd.getOptionValue(container.getOpt())); } else { numberTaskManagers = 1; } // JobManager Memory final int jobManagerMemoryMB = ConfigurationUtils.getJobManagerHeapMemory(configuration).getMebiBytes(); // Task Managers memory final int taskManagerMemoryMB = ConfigurationUtils.getTaskManagerHeapMemory(configuration).getMebiBytes(); int slotsPerTaskManager = configuration.getInteger(TaskManagerOptions.NUM_TASK_SLOTS); return new ClusterSpecification.ClusterSpecificationBuilder() .setMasterMemoryMB(jobManagerMemoryMB) .setTaskManagerMemoryMB(taskManagerMemoryMB) .setNumberTaskManagers(numberTaskManagers) .setSlotsPerTaskManager(slotsPerTaskManager) .createClusterSpecification(); } //……}FlinkYarnSessionCli的createClusterSpecification方法使用到了ConfigurationUtils.getJobManagerHeapMemory(configuration)来读取jobManagerMemoryMBconfig.shflink-1.7.1/flink-dist/src/main/flink-bin/bin/config.sh//……DEFAULT_ENV_PID_DIR="/tmp” # Directory to store .pid files toDEFAULT_ENV_LOG_MAX=5 # Maximum number of old log files to keepDEFAULT_ENV_JAVA_OPTS="" # Optional JVM argsDEFAULT_ENV_JAVA_OPTS_JM="" # Optional JVM args (JobManager)DEFAULT_ENV_JAVA_OPTS_TM="" # Optional JVM args (TaskManager)DEFAULT_ENV_JAVA_OPTS_HS="" # Optional JVM args (HistoryServer)DEFAULT_ENV_SSH_OPTS="" # Optional SSH parameters running in cluster modeDEFAULT_YARN_CONF_DIR="" # YARN Configuration Directory, if necessaryDEFAULT_HADOOP_CONF_DIR="" # Hadoop Configuration Directory, if necessary//……# Define FLINK_JM_HEAP if it is not already setif [ -z “${FLINK_JM_HEAP}” ]; then FLINK_JM_HEAP=$(readFromConfig ${KEY_JOBM_MEM_SIZE} 0 “${YAML_CONF}")fi# Try read old config key, if new key not existsif [ “${FLINK_JM_HEAP}” == 0 ]; then FLINK_JM_HEAP_MB=$(readFromConfig ${KEY_JOBM_MEM_MB} 0 “${YAML_CONF}")fi//……if [ -z “${FLINK_ENV_JAVA_OPTS}” ]; then FLINK_ENV_JAVA_OPTS=$(readFromConfig ${KEY_ENV_JAVA_OPTS} “${DEFAULT_ENV_JAVA_OPTS}” “${YAML_CONF}”) # Remove leading and ending double quotes (if present) of value FLINK_ENV_JAVA_OPTS="$( echo “${FLINK_ENV_JAVA_OPTS}” | sed -e ’s/^”//’ -e ’s/"$//’ )“fiif [ -z “${FLINK_ENV_JAVA_OPTS_JM}” ]; then FLINK_ENV_JAVA_OPTS_JM=$(readFromConfig ${KEY_ENV_JAVA_OPTS_JM} “${DEFAULT_ENV_JAVA_OPTS_JM}” “${YAML_CONF}”) # Remove leading and ending double quotes (if present) of value FLINK_ENV_JAVA_OPTS_JM="$( echo “${FLINK_ENV_JAVA_OPTS_JM}” | sed -e ’s/^”//’ -e ’s/"$//’ )“fi//……# Arguments for the JVM. Used for job and task manager JVMs.# DO NOT USE FOR MEMORY SETTINGS! 
Use conf/flink-conf.yaml with keys# KEY_JOBM_MEM_SIZE and KEY_TASKM_MEM_SIZE for that!if [ -z “${JVM_ARGS}” ]; then JVM_ARGS=““fi//……config.sh首先判断环境变量FLINK_JM_HEAP是否有设置,没有的话,则从flink-conf.yaml中读取jobmanager.heap.size配置到FLINK_JM_HEAP;如果FLINK_JM_HEAP为0,则读取jobmanager.heap.mb的配置到FLINK_JM_HEAP_MB如果没有设置FLINK_ENV_JAVA_OPTS,则从flink-conf.yaml中读取env.java.opts配置,如果没有该配置则使用DEFAULT_ENV_JAVA_OPTS,默认为空;如果没有设置FLINK_ENV_JAVA_OPTS_JM,则从flink-conf.yaml中读取env.java.opts.jobmanager配置,如果没有该配置则使用DEFAULT_ENV_JAVA_OPTS_JM,默认为空JVM_ARGS变量会被job及task manager使用,如果没有设置,则初始化为空;注意不要设置内存相关参数到JVM_ARGS,要使用flink-conf.yaml中的jobmanager.heap.size、taskmanager.heap.size来配置jobmanager.shflink-1.7.1/flink-dist/src/main/flink-bin/bin/jobmanager.sh#!/usr/bin/env bash################################################################################# Licensed to the Apache Software Foundation (ASF) under one# or more contributor license agreements. See the NOTICE file# distributed with this work for additional information# regarding copyright ownership. The ASF licenses this file# to you under the Apache License, Version 2.0 (the# “License”); you may not use this file except in compliance# with the License. You may obtain a copy of the License at## http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an “AS IS” BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.################################################################################# Start/stop a Flink JobManager.USAGE=“Usage: jobmanager.sh ((start|start-foreground) [host] [webui-port])|stop|stop-all"STARTSTOP=$1HOST=$2 # optional when starting multiple instancesWEBUIPORT=$3 # optional when starting multiple instancesif [[ $STARTSTOP != “start” ]] && [[ $STARTSTOP != “start-foreground” ]] && [[ $STARTSTOP != “stop” ]] && [[ $STARTSTOP != “stop-all” ]]; then echo $USAGE exit 1fibin=dirname "$0"bin=cd "$bin"; pwd. “$bin”/config.shENTRYPOINT=standalonesessionif [[ $STARTSTOP == “start” ]] || [[ $STARTSTOP == “start-foreground” ]]; then if [ ! -z “${FLINK_JM_HEAP_MB}” ] && [ “${FLINK_JM_HEAP}” == 0 ]; then echo “used deprecated key `${KEY_JOBM_MEM_MB}`, please replace with key `${KEY_JOBM_MEM_SIZE}`” else flink_jm_heap_bytes=$(parseBytes ${FLINK_JM_HEAP}) FLINK_JM_HEAP_MB=$(getMebiBytes ${flink_jm_heap_bytes}) fi if [[ ! ${FLINK_JM_HEAP_MB} =~ $IS_NUMBER ]] || [[ “${FLINK_JM_HEAP_MB}” -lt “0” ]]; then echo “[ERROR] Configured JobManager memory size is not a valid value. Please set ‘${KEY_JOBM_MEM_SIZE}’ in ${FLINK_CONF_FILE}.” exit 1 fi if [ “${FLINK_JM_HEAP_MB}” -gt “0” ]; then export JVM_ARGS="$JVM_ARGS -Xms”$FLINK_JM_HEAP_MB"m -Xmx”$FLINK_JM_HEAP_MB"m” fi # Add JobManager-specific JVM options export FLINK_ENV_JAVA_OPTS="${FLINK_ENV_JAVA_OPTS} ${FLINK_ENV_JAVA_OPTS_JM}” # Startup parameters args=(”–configDir" “${FLINK_CONF_DIR}” “–executionMode” “cluster”) if [ ! -z $HOST ]; then args+=("–host") args+=("${HOST}") fi if [ ! 
-z $WEBUIPORT ]; then args+=("–webui-port") args+=("${WEBUIPORT}") fifiif [[ $STARTSTOP == “start-foreground” ]]; then exec “${FLINK_BIN_DIR}"/flink-console.sh $ENTRYPOINT “${args[@]}“else “${FLINK_BIN_DIR}"/flink-daemon.sh $STARTSTOP $ENTRYPOINT “${args[@]}“fijobmanager.sh首先调用config.sh来初始化相关变量(FLINK_JM_HEAP、FLINK_JM_HEAP_MB、FLINK_ENV_JAVA_OPTS、FLINK_ENV_JAVA_OPTS_JM、JVM_ARGS)如果FLINK_JM_HEAP值大于0,则解析到FLINK_JM_HEAP_MB变量;如果FLINK_JM_HEAP_MB大于0,则使用该值设置Xms及Xmx追加到JVM_ARGS变量中;然后将FLINK_ENV_JAVA_OPTS_JM(依据env.java.opts.jobmanager配置)追加到FLINK_ENV_JAVA_OPTS(依据env.java.opts)中jobmanager.sh最后调用flink-console.sh来启动相关类flink-console.shflink-1.7.1/flink-dist/src/main/flink-bin/bin/flink-console.sh#!/usr/bin/env bash################################################################################# Licensed to the Apache Software Foundation (ASF) under one# or more contributor license agreements. See the NOTICE file# distributed with this work for additional information# regarding copyright ownership. The ASF licenses this file# to you under the Apache License, Version 2.0 (the# “License”); you may not use this file except in compliance# with the License. You may obtain a copy of the License at## http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an “AS IS” BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.################################################################################# Start a Flink service as a console application. Must be stopped with Ctrl-C# or with SIGTERM by kill or the controlling process.USAGE=“Usage: flink-console.sh (taskexecutor|zookeeper|historyserver|standalonesession|standalonejob) [args]“SERVICE=$1ARGS=("${@:2}”) # get remaining arguments as arraybin=dirname "$0"bin=cd "$bin"; pwd. “$bin”/config.shcase $SERVICE in (taskexecutor) CLASS_TO_RUN=org.apache.flink.runtime.taskexecutor.TaskManagerRunner ;; (historyserver) CLASS_TO_RUN=org.apache.flink.runtime.webmonitor.history.HistoryServer ;; (zookeeper) CLASS_TO_RUN=org.apache.flink.runtime.zookeeper.FlinkZooKeeperQuorumPeer ;; (standalonesession) CLASS_TO_RUN=org.apache.flink.runtime.entrypoint.StandaloneSessionClusterEntrypoint ;; (standalonejob) CLASS_TO_RUN=org.apache.flink.container.entrypoint.StandaloneJobClusterEntryPoint ;; () echo “Unknown service ‘${SERVICE}’. 
$USAGE.” exit 1 ;;esacFLINK_TM_CLASSPATH=constructFlinkClassPathlog_setting=("-Dlog4j.configuration=file:${FLINK_CONF_DIR}/log4j-console.properties” “-Dlogback.configurationFile=file:${FLINK_CONF_DIR}/logback-console.xml”)JAVA_VERSION=$(${JAVA_RUN} -version 2>&1 | sed ’s/.version “(.).(.)..*”/\1\2/; 1q’)# Only set JVM 8 arguments if we have correctly extracted the versionif [[ ${JAVA_VERSION} =~ ${IS_NUMBER} ]]; then if [ “$JAVA_VERSION” -lt 18 ]; then JVM_ARGS="$JVM_ARGS -XX:MaxPermSize=256m” fifiecho “Starting $SERVICE as a console application on host $HOSTNAME.“exec $JAVA_RUN $JVM_ARGS ${FLINK_ENV_JAVA_OPTS} “${log_setting[@]}” -classpath “manglePathList "$FLINK_TM_CLASSPATH:$INTERNAL_HADOOP_CLASSPATHS"” ${CLASS_TO_RUN} “${ARGS[@]}“flink-console.sh在java小于8版本时会给JVM_ARGS追加-XX:MaxPermSize=256m;之后使用JVM_ARGS及FLINK_ENV_JAVA_OPTS作为jvm参数启动CLASS_TO_RUN小结jobmanager.heap.size配置用于指定JobManager的大小,默认是1024m;jobmanager.heap.mb配置已经被废弃;ConfigurationUtils的getJobManagerHeapMemory方法从Configuration中读取配置,然后解析为MemorySize;MemorySize内部有个bytes字段,以bytes为单位,之后提供了getBytes、getKibiBytes、getMebiBytes、getGibiBytes、getTebiBytes方法用于快速换算;parse静态方法用于从文本中解析并创建MemorySize,其中parse方法可接收MemoryUnit参数用于文本中没有MemoryUnit时才使用的默认的MemoryUnit,最后都是调用的parseBytes方法FlinkYarnSessionCli的createClusterSpecification方法使用到了ConfigurationUtils.getJobManagerHeapMemory(configuration)来读取jobManagerMemoryMBconfig.sh首先判断环境变量FLINK_JM_HEAP是否有设置,没有的话,则从flink-conf.yaml中读取jobmanager.heap.size配置到FLINK_JM_HEAP;如果FLINK_JM_HEAP为0,则读取jobmanager.heap.mb的配置到FLINK_JM_HEAP_MB;如果没有设置FLINK_ENV_JAVA_OPTS,则从flink-conf.yaml中读取env.java.opts配置,如果没有该配置则使用DEFAULT_ENV_JAVA_OPTS,默认为空;如果没有设置FLINK_ENV_JAVA_OPTS_JM,则从flink-conf.yaml中读取env.java.opts.jobmanager配置,如果没有该配置则使用DEFAULT_ENV_JAVA_OPTS_JM,默认为空;JVM_ARGS变量会被job及task manager使用,如果没有设置,则初始化为空;注意不要设置内存相关参数到JVM_ARGS,要使用flink-conf.yaml中的jobmanager.heap.size、taskmanager.heap.size来配置jobmanager.sh首先调用config.sh来初始化相关变量(FLINK_JM_HEAP、FLINK_JM_HEAP_MB、FLINK_ENV_JAVA_OPTS、FLINK_ENV_JAVA_OPTS_JM、JVM_ARGS);如果FLINK_JM_HEAP值大于0,则解析到FLINK_JM_HEAP_MB变量,如果FLINK_JM_HEAP_MB大于0,则使用该值设置Xms及Xmx追加到JVM_ARGS变量中;它会将FLINK_ENV_JAVA_OPTS_JM(依据env.java.opts.jobmanager配置)追加到FLINK_ENV_JAVA_OPTS(依据env.java.opts)中;jobmanager.sh最后调用flink-console.sh来启动相关类flink-console.sh在java小于8版本时会给JVM_ARGS追加-XX:MaxPermSize=256m;之后使用JVM_ARGS及FLINK_ENV_JAVA_OPTS作为jvm参数启动CLASS_TO_RUN由此可见最后的jvm参数取决于JVM_ARGS及FLINK_ENV_JAVA_OPTS;其中注意不要设置内存相关参数到JVM_ARGS,因为jobmanager.sh在FLINK_JM_HEAP_MB大于0,则使用该值设置Xms及Xmx追加到JVM_ARGS变量中,而FLINK_JM_HEAP_MB则取决于FLINK_JM_HEAP或者jobmanager.heap.size配置;FLINK_ENV_JAVA_OPTS的配置则取决于env.java.opts以及env.java.opts.jobmanager;因而要配置jobmanager的heap大小的话,可以指定FLINK_JM_HEAP环境变量(比如FLINK_JM_HEAP=512m),或者在flink-conf.yaml中指定jobmanager.heap.sizedocjobmanager.heap.size ...
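补充一个内存换算的小示例(非Flink源码,只是按上文MemorySize、ConfigurationUtils的行为整理的示意;假设classpath里已经有flink-core 1.7.x,注释里的数值是按上述换算规则推算的期望结果):

import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.ConfigurationUtils;
import org.apache.flink.configuration.MemorySize;

public class JobManagerHeapDemo {
    public static void main(String[] args) {
        // 解析带单位的文本,等价于在flink-conf.yaml里配置 jobmanager.heap.size: 1024m
        MemorySize size = MemorySize.parse("1024m");
        System.out.println(size.getBytes());     // 预期 1073741824
        System.out.println(size.getMebiBytes()); // 预期 1024

        // 文本不带单位时,parse可以传入默认的MemoryUnit,这里按mebibytes解释
        MemorySize plain = MemorySize.parse("1024", MemorySize.MemoryUnit.MEGA_BYTES);
        System.out.println(plain.getGibiBytes()); // 预期 1

        // 模拟FlinkYarnSessionCli.createClusterSpecification读取JobManager内存的方式
        Configuration conf = new Configuration();
        conf.setString("jobmanager.heap.size", "512m");
        int jobManagerMemoryMB = ConfigurationUtils.getJobManagerHeapMemory(conf).getMebiBytes();
        System.out.println(jobManagerMemoryMB);  // 预期 512
    }
}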

February 18, 2019

聊聊flink的AbstractNonHaServices

序本文主要研究一下flink的AbstractNonHaServicesHighAvailabilityServicesflink-runtime_2.11-1.7.1-sources.jar!/org/apache/flink/runtime/highavailability/HighAvailabilityServices.javapublic interface HighAvailabilityServices extends AutoCloseable { // ———————————————————————— // Constants // ———————————————————————— /** * This UUID should be used when no proper leader election happens, but a simple * pre-configured leader is used. That is for example the case in non-highly-available * standalone setups. / UUID DEFAULT_LEADER_ID = new UUID(0, 0); /* * This JobID should be used to identify the old JobManager when using the * {@link HighAvailabilityServices}. With the new mode every JobMaster will have a * distinct JobID assigned. / JobID DEFAULT_JOB_ID = new JobID(0L, 0L); // ———————————————————————— // Services // ———————————————————————— /* * Gets the leader retriever for the cluster’s resource manager. / LeaderRetrievalService getResourceManagerLeaderRetriever(); /* * Gets the leader retriever for the dispatcher. This leader retrieval service * is not always accessible. / LeaderRetrievalService getDispatcherLeaderRetriever(); /* * Gets the leader retriever for the job JobMaster which is responsible for the given job * * @param jobID The identifier of the job. * @return Leader retrieval service to retrieve the job manager for the given job * @deprecated This method should only be used by the legacy code where the JobManager acts as the master. / @Deprecated LeaderRetrievalService getJobManagerLeaderRetriever(JobID jobID); /* * Gets the leader retriever for the job JobMaster which is responsible for the given job * * @param jobID The identifier of the job. * @param defaultJobManagerAddress JobManager address which will be returned by * a static leader retrieval service. * @return Leader retrieval service to retrieve the job manager for the given job / LeaderRetrievalService getJobManagerLeaderRetriever(JobID jobID, String defaultJobManagerAddress); LeaderRetrievalService getWebMonitorLeaderRetriever(); /* * Gets the leader election service for the cluster’s resource manager. * * @return Leader election service for the resource manager leader election / LeaderElectionService getResourceManagerLeaderElectionService(); /* * Gets the leader election service for the cluster’s dispatcher. * * @return Leader election service for the dispatcher leader election / LeaderElectionService getDispatcherLeaderElectionService(); /* * Gets the leader election service for the given job. * * @param jobID The identifier of the job running the election. * @return Leader election service for the job manager leader election / LeaderElectionService getJobManagerLeaderElectionService(JobID jobID); LeaderElectionService getWebMonitorLeaderElectionService(); /* * Gets the checkpoint recovery factory for the job manager * * @return Checkpoint recovery factory / CheckpointRecoveryFactory getCheckpointRecoveryFactory(); /* * Gets the submitted job graph store for the job manager * * @return Submitted job graph store * @throws Exception if the submitted job graph store could not be created / SubmittedJobGraphStore getSubmittedJobGraphStore() throws Exception; /* * Gets the registry that holds information about whether jobs are currently running. * * @return Running job registry to retrieve running jobs / RunningJobsRegistry getRunningJobsRegistry() throws Exception; /* * Creates the BLOB store in which BLOBs are stored in a highly-available fashion. 
* * @return Blob store * @throws IOException if the blob store could not be created / BlobStore createBlobStore() throws IOException; // ———————————————————————— // Shutdown and Cleanup // ———————————————————————— /* * Closes the high availability services, releasing all resources. * * <p>This method <b>does not delete or clean up</b> any data stored in external stores * (file systems, ZooKeeper, etc). Another instance of the high availability * services will be able to recover the job. * * <p>If an exception occurs during closing services, this method will attempt to * continue closing other services and report exceptions only after all services * have been attempted to be closed. * * @throws Exception Thrown, if an exception occurred while closing these services. / @Override void close() throws Exception; /* * Closes the high availability services (releasing all resources) and deletes * all data stored by these services in external stores. * * <p>After this method was called, the any job or session that was managed by * these high availability services will be unrecoverable. * * <p>If an exception occurs during cleanup, this method will attempt to * continue the cleanup and report exceptions only after all cleanup steps have * been attempted. * * @throws Exception Thrown, if an exception occurred while closing these services * or cleaning up data stored by them. / void closeAndCleanupAllData() throws Exception;}HighAvailabilityServices定义了highly-available所需的各种services的get方法,它有两个直接子类,一个是ZooKeeperHaServices,一个是AbstractNonHaServicesAbstractNonHaServicesflink-runtime_2.11-1.7.1-sources.jar!/org/apache/flink/runtime/highavailability/nonha/AbstractNonHaServices.javapublic abstract class AbstractNonHaServices implements HighAvailabilityServices { protected final Object lock = new Object(); private final RunningJobsRegistry runningJobsRegistry; private final VoidBlobStore voidBlobStore; private boolean shutdown; public AbstractNonHaServices() { this.runningJobsRegistry = new StandaloneRunningJobsRegistry(); this.voidBlobStore = new VoidBlobStore(); shutdown = false; } // ———————————————————————- // HighAvailabilityServices method implementations // ———————————————————————- @Override public CheckpointRecoveryFactory getCheckpointRecoveryFactory() { synchronized (lock) { checkNotShutdown(); return new StandaloneCheckpointRecoveryFactory(); } } @Override public SubmittedJobGraphStore getSubmittedJobGraphStore() throws Exception { synchronized (lock) { checkNotShutdown(); return new StandaloneSubmittedJobGraphStore(); } } @Override public RunningJobsRegistry getRunningJobsRegistry() throws Exception { synchronized (lock) { checkNotShutdown(); return runningJobsRegistry; } } @Override public BlobStore createBlobStore() throws IOException { synchronized (lock) { checkNotShutdown(); return voidBlobStore; } } @Override public void close() throws Exception { synchronized (lock) { if (!shutdown) { shutdown = true; } } } @Override public void closeAndCleanupAllData() throws Exception { // this stores no data, so this method is the same as ‘close()’ close(); } // ———————————————————————- // Helper methods // ———————————————————————- @GuardedBy(“lock”) protected void checkNotShutdown() { checkState(!shutdown, “high availability services are shut down”); } protected boolean isShutDown() { return shutdown; 
}}AbstractNonHaServices实现了HighAvailabilityServices的getCheckpointRecoveryFactory、getSubmittedJobGraphStore、getRunningJobsRegistry、createBlobStore、close、closeAndCleanupAllData方法;它有两个子类,分别是EmbeddedHaServices及StandaloneHaServicesEmbeddedHaServicesflink-runtime_2.11-1.7.1-sources.jar!/org/apache/flink/runtime/highavailability/nonha/embedded/EmbeddedHaServices.javapublic class EmbeddedHaServices extends AbstractNonHaServices { private final Executor executor; private final EmbeddedLeaderService resourceManagerLeaderService; private final EmbeddedLeaderService dispatcherLeaderService; private final HashMap<JobID, EmbeddedLeaderService> jobManagerLeaderServices; private final EmbeddedLeaderService webMonitorLeaderService; public EmbeddedHaServices(Executor executor) { this.executor = Preconditions.checkNotNull(executor); this.resourceManagerLeaderService = new EmbeddedLeaderService(executor); this.dispatcherLeaderService = new EmbeddedLeaderService(executor); this.jobManagerLeaderServices = new HashMap<>(); this.webMonitorLeaderService = new EmbeddedLeaderService(executor); } // ———————————————————————— // services // ———————————————————————— @Override public LeaderRetrievalService getResourceManagerLeaderRetriever() { return resourceManagerLeaderService.createLeaderRetrievalService(); } @Override public LeaderRetrievalService getDispatcherLeaderRetriever() { return dispatcherLeaderService.createLeaderRetrievalService(); } @Override public LeaderElectionService getResourceManagerLeaderElectionService() { return resourceManagerLeaderService.createLeaderElectionService(); } @Override public LeaderElectionService getDispatcherLeaderElectionService() { return dispatcherLeaderService.createLeaderElectionService(); } @Override public LeaderRetrievalService getJobManagerLeaderRetriever(JobID jobID) { checkNotNull(jobID); synchronized (lock) { checkNotShutdown(); EmbeddedLeaderService service = getOrCreateJobManagerService(jobID); return service.createLeaderRetrievalService(); } } @Override public LeaderRetrievalService getJobManagerLeaderRetriever(JobID jobID, String defaultJobManagerAddress) { return getJobManagerLeaderRetriever(jobID); } @Override public LeaderRetrievalService getWebMonitorLeaderRetriever() { return webMonitorLeaderService.createLeaderRetrievalService(); } @Override public LeaderElectionService getJobManagerLeaderElectionService(JobID jobID) { checkNotNull(jobID); synchronized (lock) { checkNotShutdown(); EmbeddedLeaderService service = getOrCreateJobManagerService(jobID); return service.createLeaderElectionService(); } } @Override public LeaderElectionService getWebMonitorLeaderElectionService() { return webMonitorLeaderService.createLeaderElectionService(); } // ———————————————————————— // internal // ———————————————————————— @GuardedBy(“lock”) private EmbeddedLeaderService getOrCreateJobManagerService(JobID jobID) { EmbeddedLeaderService service = jobManagerLeaderServices.get(jobID); if (service == null) { service = new EmbeddedLeaderService(executor); jobManagerLeaderServices.put(jobID, service); } return service; } // ———————————————————————— // shutdown // ———————————————————————— @Override public void close() throws Exception { synchronized (lock) { if (!isShutDown()) { // stop all job manager leader services for (EmbeddedLeaderService service : jobManagerLeaderServices.values()) { service.shutdown(); } jobManagerLeaderServices.clear(); resourceManagerLeaderService.shutdown(); webMonitorLeaderService.shutdown(); } super.close(); } 
}}EmbeddedHaServices继承了AbstractNonHaServices,它是对HighAvailabilityServices接口在ResourceManager, JobManagers, TaskManagers运行在同一个进程的non-high-availability场景下的实现,FlinkMiniCluster使用的就是EmbeddedHaServicesStandaloneHaServicesflink-runtime_2.11-1.7.1-sources.jar!/org/apache/flink/runtime/highavailability/nonha/standalone/StandaloneHaServices.javapublic class StandaloneHaServices extends AbstractNonHaServices { /* The constant name of the ResourceManager RPC endpoint / private static final String RESOURCE_MANAGER_RPC_ENDPOINT_NAME = “resource_manager”; /* The fix address of the ResourceManager / private final String resourceManagerAddress; /* The fix address of the Dispatcher / private final String dispatcherAddress; /* The fix address of the JobManager / private final String jobManagerAddress; private final String webMonitorAddress; /* * Creates a new services class for the fix pre-defined leaders. * * @param resourceManagerAddress The fix address of the ResourceManager * @param webMonitorAddress */ public StandaloneHaServices( String resourceManagerAddress, String dispatcherAddress, String jobManagerAddress, String webMonitorAddress) { this.resourceManagerAddress = checkNotNull(resourceManagerAddress, “resourceManagerAddress”); this.dispatcherAddress = checkNotNull(dispatcherAddress, “dispatcherAddress”); this.jobManagerAddress = checkNotNull(jobManagerAddress, “jobManagerAddress”); this.webMonitorAddress = checkNotNull(webMonitorAddress, webMonitorAddress); } // ———————————————————————— // Services // ———————————————————————— @Override public LeaderRetrievalService getResourceManagerLeaderRetriever() { synchronized (lock) { checkNotShutdown(); return new StandaloneLeaderRetrievalService(resourceManagerAddress, DEFAULT_LEADER_ID); } } @Override public LeaderRetrievalService getDispatcherLeaderRetriever() { synchronized (lock) { checkNotShutdown(); return new StandaloneLeaderRetrievalService(dispatcherAddress, DEFAULT_LEADER_ID); } } @Override public LeaderElectionService getResourceManagerLeaderElectionService() { synchronized (lock) { checkNotShutdown(); return new StandaloneLeaderElectionService(); } } @Override public LeaderElectionService getDispatcherLeaderElectionService() { synchronized (lock) { checkNotShutdown(); return new StandaloneLeaderElectionService(); } } @Override public LeaderRetrievalService getJobManagerLeaderRetriever(JobID jobID) { synchronized (lock) { checkNotShutdown(); return new StandaloneLeaderRetrievalService(jobManagerAddress, DEFAULT_LEADER_ID); } } @Override public LeaderRetrievalService getJobManagerLeaderRetriever(JobID jobID, String defaultJobManagerAddress) { synchronized (lock) { checkNotShutdown(); return new StandaloneLeaderRetrievalService(defaultJobManagerAddress, DEFAULT_LEADER_ID); } } @Override public LeaderElectionService getJobManagerLeaderElectionService(JobID jobID) { synchronized (lock) { checkNotShutdown(); return new StandaloneLeaderElectionService(); } } @Override public LeaderRetrievalService getWebMonitorLeaderRetriever() { synchronized (lock) { checkNotShutdown(); return new StandaloneLeaderRetrievalService(webMonitorAddress, DEFAULT_LEADER_ID); } } @Override public LeaderElectionService getWebMonitorLeaderElectionService() { synchronized (lock) { checkNotShutdown(); return new StandaloneLeaderElectionService(); } 
}}
StandaloneHaServices继承了AbstractNonHaServices,它是对HighAvailabilityServices接口在non-high-availability场景下的实现,ClusterEntrypoint在highAvailabilityMode为NONE的时候使用的是StandaloneHaServices

小结
HighAvailabilityServices定义了highly-available所需的各种services的get方法,它有两个直接子类,一个是ZooKeeperHaServices,一个是AbstractNonHaServices
AbstractNonHaServices实现了HighAvailabilityServices的getCheckpointRecoveryFactory、getSubmittedJobGraphStore、getRunningJobsRegistry、createBlobStore、close、closeAndCleanupAllData方法;它有两个子类,分别是EmbeddedHaServices及StandaloneHaServices
EmbeddedHaServices继承了AbstractNonHaServices,它是对HighAvailabilityServices接口在ResourceManager, JobManagers, TaskManagers运行在同一个进程的non-high-availability场景下的实现,FlinkMiniCluster使用的就是EmbeddedHaServices;StandaloneHaServices继承了AbstractNonHaServices,它是对HighAvailabilityServices接口在non-high-availability场景下的实现,ClusterEntrypoint在highAvailabilityMode为NONE的时候使用的是StandaloneHaServices

doc JobManager High Availability (HA) ...
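补充一个EmbeddedHaServices的使用示意(非源码,仅演示上文接口的调用方式;假设classpath里有flink-runtime 1.7.x,Executor用JDK线程池代替,FlinkMiniCluster内部的用法与此类似但更完整):

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.flink.api.common.JobID;
import org.apache.flink.runtime.highavailability.nonha.embedded.EmbeddedHaServices;
import org.apache.flink.runtime.leaderelection.LeaderElectionService;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;

public class EmbeddedHaServicesDemo {
    public static void main(String[] args) throws Exception {
        // EmbeddedHaServices需要一个Executor来执行leader相关回调,这里用JDK单线程线程池
        ExecutorService executor = Executors.newSingleThreadExecutor();
        EmbeddedHaServices haServices = new EmbeddedHaServices(executor);
        try {
            JobID jobId = new JobID();
            // 同一个jobId第一次获取时会懒创建对应的EmbeddedLeaderService
            LeaderElectionService electionService = haServices.getJobManagerLeaderElectionService(jobId);
            LeaderRetrievalService retrievalService = haServices.getJobManagerLeaderRetriever(jobId);
            System.out.println(electionService + " / " + retrievalService);
        } finally {
            // 内存实现不存外部数据,closeAndCleanupAllData等价于close
            haServices.closeAndCleanupAllData();
            executor.shutdown();
        }
    }
}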

February 17, 2019

Flink 1.7 文档翻译活动期待大家的参与 | ApacheCN

参与方式:https://github.com/apachecn/f…整体进度:https://github.com/apachecn/f…项目仓库:https://github.com/apachecn/f…贡献指南请您勇敢地去翻译和改进翻译。虽然我们追求卓越,但我们并不要求您做到十全十美,因此请不要担心因为翻译上犯错——在大部分情况下,我们的服务器已经记录所有的翻译,因此您不必担心会因为您的失误遭到无法挽回的破坏。(改编自维基百科)负责人:zyBourn:379991171章节列表Apache Flink DocumentationDataflow Programming ModelDistributed Runtime EnvironmentDataStream API TutorialLocal Setup TutorialRunning Flink on WindowsExamplesBatch ExamplesProject Template for JavaProject Template for ScalaConfiguring Dependencies, Connectors, LibrariesBasic API ConceptsScala API ExtensionsJava Lambda ExpressionsFlink DataStream API Programming GuideEvent TimeGenerating Timestamps / WatermarksPre-defined Timestamp Extractors / Watermark EmittersState & Fault ToleranceWorking with StateThe Broadcast State PatternCheckpointingQueryable State BetaState BackendsState Schema EvolutionCustom Serialization for Managed StateOperatorsWindowsJoiningProcess Function (Low-level Operations)Asynchronous I/O for External Data AccessStreaming ConnectorsFault Tolerance Guarantees of Data Sources and SinksApache Kafka ConnectorApache Cassandra ConnectorAmazon AWS Kinesis Streams ConnectorElasticsearch ConnectorHDFS ConnectorStreaming File SinkRabbitMQ ConnectorApache NiFi ConnectorTwitter ConnectorSide OutputsPython Programming Guide (Streaming) BetaTestingExperimental FeaturesFlink DataSet API Programming GuideDataSet TransformationsFault ToleranceIterationsZipping Elements in a DataSetConnectorsPython Programming Guide BetaHadoop Compatibility BetaLocal ExecutionCluster ExecutionTable API & SQLConcepts & Common APIStreaming ConceptsDynamic TablesTime AttributesJoins in Continuous QueriesTemporal TablesDetecting Patterns in Tables BetaQuery ConfigurationConnect to External SystemsTable APISQLBuilt-In FunctionsUser-defined Sources & SinksUser-defined FunctionsSQL Client BetaData Types & SerializationRegister a custom serializer for your Flink programExecution ConfigurationProgram Packaging and Distributed ExecutionParallel ExecutionExecution PlansRestart StrategiesFlinkCEP - Complex event processing for FlinkStorm Compatibility BetaGelly: Flink Graph APIGraph APIIterative Graph ProcessingLibrary MethodsGraph AlgorithmsGraph GeneratorsBipartite GraphFlinkML - Machine Learning for FlinkQuickstart GuideAlternating Least SquaresHow to ContributeCross ValidationDistance Metricsk-Nearest Neighbors JoinMinMax ScalerMultiple Linear RegressionLooking under the hood of pipelinesPolynomial FeaturesStochastic Outlier SelectionStandard ScalerSVM using CoCoABest PracticesAPI Migration GuidesStandalone ClusterYARN SetupMesos SetupDocker SetupKubernetes SetupAmazon Web Services (AWS)Google Compute Engine SetupMapR SetupHadoop IntegrationJobManager High Availability (HA)CheckpointsSavepointsState BackendsTuning Checkpoints and Large StateConfigurationProduction Readiness ChecklistCommand-Line InterfaceScala REPLKerberos Authentication Setup and ConfigurationSSL SetupFile SystemsUpgrading Applications and Flink VersionsMetricsHow to use loggingHistory ServerMonitoring CheckpointingMonitoring Back PressureMonitoring REST APIDebugging Windows & Event TimeDebugging ClassloadingApplication ProfilingImporting Flink into an IDEBuilding Flink from SourceComponent StackData Streaming Fault ToleranceJobs and SchedulingTask LifecycleFile Systems流程一、认领首先查看整体进度,确认没有人认领了你想认领的章节。然后回复 ISSUE,注明“章节 + QQ 号”(一定要留 QQ)。二、翻译可以合理利用翻译引擎(例如谷歌),但一定要把它变得可读!如果遇到格式问题,请随手把它改正。三、提交fork Github 项目将译文放在docs文件夹下pushpull request请见 Github 入门指南。 ...

February 16, 2019

聊聊flink JobManager的High Availability

序本文主要研究一下flink JobManager的High Availability配置flink-conf.yamlhigh-availability: zookeeperhigh-availability.zookeeper.quorum: zookeeper:2181high-availability.zookeeper.path.root: /flinkhigh-availability.cluster-id: /cluster_one # important: customize per clusterhigh-availability.storageDir: file:///sharehigh-availability的可选值为NONE或者zookeeper;high-availability.zookeeper.quorum用于指定zookeeper的peers;high-availability.zookeeper.path.root用于指定在zookeeper的root node路径;high-availability.cluster-id用于指定当前cluster的node名称,该cluster node位于root node下面;high-availability.storageDir用于指定JobManager metadata的存储路径masters文件localhost:8081localhost:8082masters文件用于指定jobmanager的地址HighAvailabilityModeflink-runtime_2.11-1.7.1-sources.jar!/org/apache/flink/runtime/jobmanager/HighAvailabilityMode.javapublic enum HighAvailabilityMode { NONE(false), ZOOKEEPER(true), FACTORY_CLASS(true); private final boolean haActive; HighAvailabilityMode(boolean haActive) { this.haActive = haActive; } /** * Return the configured {@link HighAvailabilityMode}. * * @param config The config to parse * @return Configured recovery mode or {@link HighAvailabilityMode#NONE} if not * configured. / public static HighAvailabilityMode fromConfig(Configuration config) { String haMode = config.getValue(HighAvailabilityOptions.HA_MODE); if (haMode == null) { return HighAvailabilityMode.NONE; } else if (haMode.equalsIgnoreCase(ConfigConstants.DEFAULT_RECOVERY_MODE)) { // Map old default to new default return HighAvailabilityMode.NONE; } else { try { return HighAvailabilityMode.valueOf(haMode.toUpperCase()); } catch (IllegalArgumentException e) { return FACTORY_CLASS; } } } /* * Returns true if the defined recovery mode supports high availability. * * @param configuration Configuration which contains the recovery mode * @return true if high availability is supported by the recovery mode, otherwise false / public static boolean isHighAvailabilityModeActivated(Configuration configuration) { HighAvailabilityMode mode = fromConfig(configuration); return mode.haActive; }}HighAvailabilityMode有三个枚举,分别是NONE、ZOOKEEPER、FACTORY_CLASS;这些枚举有一个属性haActive,用于表示是否支持HighAvailabilityHighAvailabilityOptionsflink-core-1.7.1-sources.jar!/org/apache/flink/configuration/HighAvailabilityOptions.java@PublicEvolving@ConfigGroups(groups = { @ConfigGroup(name = “HighAvailabilityZookeeper”, keyPrefix = “high-availability.zookeeper”)})public class HighAvailabilityOptions { // ———————————————————————— // Required High Availability Options // ———————————————————————— /* * Defines high-availability mode used for the cluster execution. * A value of “NONE” signals no highly available setup. * To enable high-availability, set this mode to “ZOOKEEPER”. * Can also be set to FQN of HighAvailability factory class. / @Documentation.CommonOption(position = Documentation.CommonOption.POSITION_HIGH_AVAILABILITY) public static final ConfigOption<String> HA_MODE = key(“high-availability”) .defaultValue(“NONE”) .withDeprecatedKeys(“recovery.mode”) .withDescription(“Defines high-availability mode used for the cluster execution.” + " To enable high-availability, set this mode to "ZOOKEEPER" or specify FQN of factory class."); /* * The ID of the Flink cluster, used to separate multiple Flink clusters * Needs to be set for standalone clusters, is automatically inferred in YARN and Mesos. 
/ public static final ConfigOption<String> HA_CLUSTER_ID = key(“high-availability.cluster-id”) .defaultValue("/default") .withDeprecatedKeys(“high-availability.zookeeper.path.namespace”, “recovery.zookeeper.path.namespace”) .withDescription(“The ID of the Flink cluster, used to separate multiple Flink clusters from each other.” + " Needs to be set for standalone clusters but is automatically inferred in YARN and Mesos."); /* * File system path (URI) where Flink persists metadata in high-availability setups. / @Documentation.CommonOption(position = Documentation.CommonOption.POSITION_HIGH_AVAILABILITY) public static final ConfigOption<String> HA_STORAGE_PATH = key(“high-availability.storageDir”) .noDefaultValue() .withDeprecatedKeys(“high-availability.zookeeper.storageDir”, “recovery.zookeeper.storageDir”) .withDescription(“File system path (URI) where Flink persists metadata in high-availability setups.”); // ———————————————————————— // Recovery Options // ———————————————————————— /* * Optional port (range) used by the job manager in high-availability mode. / public static final ConfigOption<String> HA_JOB_MANAGER_PORT_RANGE = key(“high-availability.jobmanager.port”) .defaultValue(“0”) .withDeprecatedKeys(“recovery.jobmanager.port”) .withDescription(“Optional port (range) used by the job manager in high-availability mode.”); /* * The time before a JobManager after a fail over recovers the current jobs. / public static final ConfigOption<String> HA_JOB_DELAY = key(“high-availability.job.delay”) .noDefaultValue() .withDeprecatedKeys(“recovery.job.delay”) .withDescription(“The time before a JobManager after a fail over recovers the current jobs.”); // ———————————————————————— // ZooKeeper Options // ———————————————————————— /* * The ZooKeeper quorum to use, when running Flink in a high-availability mode with ZooKeeper. / public static final ConfigOption<String> HA_ZOOKEEPER_QUORUM = key(“high-availability.zookeeper.quorum”) .noDefaultValue() .withDeprecatedKeys(“recovery.zookeeper.quorum”) .withDescription(“The ZooKeeper quorum to use, when running Flink in a high-availability mode with ZooKeeper.”); /* * The root path under which Flink stores its entries in ZooKeeper. / public static final ConfigOption<String> HA_ZOOKEEPER_ROOT = key(“high-availability.zookeeper.path.root”) .defaultValue("/flink") .withDeprecatedKeys(“recovery.zookeeper.path.root”) .withDescription(“The root path under which Flink stores its entries in ZooKeeper.”); public static final ConfigOption<String> HA_ZOOKEEPER_LATCH_PATH = key(“high-availability.zookeeper.path.latch”) .defaultValue("/leaderlatch") .withDeprecatedKeys(“recovery.zookeeper.path.latch”) .withDescription(“Defines the znode of the leader latch which is used to elect the leader.”); /* ZooKeeper root path (ZNode) for job graphs. / public static final ConfigOption<String> HA_ZOOKEEPER_JOBGRAPHS_PATH = key(“high-availability.zookeeper.path.jobgraphs”) .defaultValue("/jobgraphs") .withDeprecatedKeys(“recovery.zookeeper.path.jobgraphs”) .withDescription(“ZooKeeper root path (ZNode) for job graphs”); public static final ConfigOption<String> HA_ZOOKEEPER_LEADER_PATH = key(“high-availability.zookeeper.path.leader”) .defaultValue("/leader") .withDeprecatedKeys(“recovery.zookeeper.path.leader”) .withDescription(“Defines the znode of the leader which contains the URL to the leader and the current” + " leader session ID."); /* ZooKeeper root path (ZNode) for completed checkpoints. 
/ public static final ConfigOption<String> HA_ZOOKEEPER_CHECKPOINTS_PATH = key(“high-availability.zookeeper.path.checkpoints”) .defaultValue("/checkpoints") .withDeprecatedKeys(“recovery.zookeeper.path.checkpoints”) .withDescription(“ZooKeeper root path (ZNode) for completed checkpoints.”); /* ZooKeeper root path (ZNode) for checkpoint counters. / public static final ConfigOption<String> HA_ZOOKEEPER_CHECKPOINT_COUNTER_PATH = key(“high-availability.zookeeper.path.checkpoint-counter”) .defaultValue("/checkpoint-counter") .withDeprecatedKeys(“recovery.zookeeper.path.checkpoint-counter”) .withDescription(“ZooKeeper root path (ZNode) for checkpoint counters.”); /* ZooKeeper root path (ZNode) for Mesos workers. / @PublicEvolving public static final ConfigOption<String> HA_ZOOKEEPER_MESOS_WORKERS_PATH = key(“high-availability.zookeeper.path.mesos-workers”) .defaultValue("/mesos-workers") .withDeprecatedKeys(“recovery.zookeeper.path.mesos-workers”) .withDescription(Description.builder() .text(“The ZooKeeper root path for persisting the Mesos worker information.”) .build()); // ———————————————————————— // ZooKeeper Client Settings // ———————————————————————— public static final ConfigOption<Integer> ZOOKEEPER_SESSION_TIMEOUT = key(“high-availability.zookeeper.client.session-timeout”) .defaultValue(60000) .withDeprecatedKeys(“recovery.zookeeper.client.session-timeout”) .withDescription(“Defines the session timeout for the ZooKeeper session in ms.”); public static final ConfigOption<Integer> ZOOKEEPER_CONNECTION_TIMEOUT = key(“high-availability.zookeeper.client.connection-timeout”) .defaultValue(15000) .withDeprecatedKeys(“recovery.zookeeper.client.connection-timeout”) .withDescription(“Defines the connection timeout for ZooKeeper in ms.”); public static final ConfigOption<Integer> ZOOKEEPER_RETRY_WAIT = key(“high-availability.zookeeper.client.retry-wait”) .defaultValue(5000) .withDeprecatedKeys(“recovery.zookeeper.client.retry-wait”) .withDescription(“Defines the pause between consecutive retries in ms.”); public static final ConfigOption<Integer> ZOOKEEPER_MAX_RETRY_ATTEMPTS = key(“high-availability.zookeeper.client.max-retry-attempts”) .defaultValue(3) .withDeprecatedKeys(“recovery.zookeeper.client.max-retry-attempts”) .withDescription(“Defines the number of connection retries before the client gives up.”); public static final ConfigOption<String> ZOOKEEPER_RUNNING_JOB_REGISTRY_PATH = key(“high-availability.zookeeper.path.running-registry”) .defaultValue("/running_job_registry/"); public static final ConfigOption<String> ZOOKEEPER_CLIENT_ACL = key(“high-availability.zookeeper.client.acl”) .defaultValue(“open”) .withDescription(“Defines the ACL (open|creator) to be configured on ZK node. The configuration value can be” + " set to “creator” if the ZooKeeper server configuration has the “authProvider” property mapped to use" + " SASLAuthenticationProvider and the cluster is configured to run in secure mode (Kerberos)."); // ———————————————————————— /* Not intended to be instantiated. 
/ private HighAvailabilityOptions() {}}HighAvailabilityOptions定义了前缀为high-availability.zookeeper的配置项HighAvailabilityServicesUtilsflink-runtime_2.11-1.7.1-sources.jar!/org/apache/flink/runtime/highavailability/HighAvailabilityServicesUtils.javapublic class HighAvailabilityServicesUtils { public static HighAvailabilityServices createAvailableOrEmbeddedServices( Configuration config, Executor executor) throws Exception { HighAvailabilityMode highAvailabilityMode = LeaderRetrievalUtils.getRecoveryMode(config); switch (highAvailabilityMode) { case NONE: return new EmbeddedHaServices(executor); case ZOOKEEPER: BlobStoreService blobStoreService = BlobUtils.createBlobStoreFromConfig(config); return new ZooKeeperHaServices( ZooKeeperUtils.startCuratorFramework(config), executor, config, blobStoreService); case FACTORY_CLASS: return createCustomHAServices(config, executor); default: throw new Exception(“High availability mode " + highAvailabilityMode + " is not supported.”); } } public static HighAvailabilityServices createHighAvailabilityServices( Configuration configuration, Executor executor, AddressResolution addressResolution) throws Exception { HighAvailabilityMode highAvailabilityMode = LeaderRetrievalUtils.getRecoveryMode(configuration); switch (highAvailabilityMode) { case NONE: final Tuple2<String, Integer> hostnamePort = getJobManagerAddress(configuration); final String jobManagerRpcUrl = AkkaRpcServiceUtils.getRpcUrl( hostnamePort.f0, hostnamePort.f1, JobMaster.JOB_MANAGER_NAME, addressResolution, configuration); final String resourceManagerRpcUrl = AkkaRpcServiceUtils.getRpcUrl( hostnamePort.f0, hostnamePort.f1, ResourceManager.RESOURCE_MANAGER_NAME, addressResolution, configuration); final String dispatcherRpcUrl = AkkaRpcServiceUtils.getRpcUrl( hostnamePort.f0, hostnamePort.f1, Dispatcher.DISPATCHER_NAME, addressResolution, configuration); final String address = checkNotNull(configuration.getString(RestOptions.ADDRESS), “%s must be set”, RestOptions.ADDRESS.key()); final int port = configuration.getInteger(RestOptions.PORT); final boolean enableSSL = SSLUtils.isRestSSLEnabled(configuration); final String protocol = enableSSL ? “https://” : “http://”; return new StandaloneHaServices( resourceManagerRpcUrl, dispatcherRpcUrl, jobManagerRpcUrl, String.format("%s%s:%s", protocol, address, port)); case ZOOKEEPER: BlobStoreService blobStoreService = BlobUtils.createBlobStoreFromConfig(configuration); return new ZooKeeperHaServices( ZooKeeperUtils.startCuratorFramework(configuration), executor, configuration, blobStoreService); case FACTORY_CLASS: return createCustomHAServices(configuration, executor); default: throw new Exception(“Recovery mode " + highAvailabilityMode + " is not supported.”); } } /* * Returns the JobManager’s hostname and port extracted from the given * {@link Configuration}. 
* * @param configuration Configuration to extract the JobManager’s address from * @return The JobManager’s hostname and port * @throws ConfigurationException if the JobManager’s address cannot be extracted from the configuration / public static Tuple2<String, Integer> getJobManagerAddress(Configuration configuration) throws ConfigurationException { final String hostname = configuration.getString(JobManagerOptions.ADDRESS); final int port = configuration.getInteger(JobManagerOptions.PORT); if (hostname == null) { throw new ConfigurationException(“Config parameter ‘” + JobManagerOptions.ADDRESS + “’ is missing (hostname/address of JobManager to connect to).”); } if (port <= 0 || port >= 65536) { throw new ConfigurationException(“Invalid value for ‘” + JobManagerOptions.PORT + “’ (port of the JobManager actor system) : " + port + “. it must be greater than 0 and less than 65536.”); } return Tuple2.of(hostname, port); } private static HighAvailabilityServices createCustomHAServices(Configuration config, Executor executor) throws FlinkException { final ClassLoader classLoader = Thread.currentThread().getContextClassLoader(); final String haServicesClassName = config.getString(HighAvailabilityOptions.HA_MODE); final HighAvailabilityServicesFactory highAvailabilityServicesFactory; try { highAvailabilityServicesFactory = InstantiationUtil.instantiate( haServicesClassName, HighAvailabilityServicesFactory.class, classLoader); } catch (Exception e) { throw new FlinkException( String.format( “Could not instantiate the HighAvailabilityServicesFactory ‘%s’. Please make sure that this class is on your class path.”, haServicesClassName), e); } try { return highAvailabilityServicesFactory.createHAServices(config, executor); } catch (Exception e) { throw new FlinkException( String.format( “Could not create the ha services from the instantiated HighAvailabilityServicesFactory %s.”, haServicesClassName), e); } } /* * Enum specifying whether address resolution should be tried or not when creating the * {@link HighAvailabilityServices}. / public enum AddressResolution { TRY_ADDRESS_RESOLUTION, NO_ADDRESS_RESOLUTION }}HighAvailabilityServicesUtils提供了创建HighAvailabilityServices的静态方法,这些方法有createAvailableOrEmbeddedServices、createHighAvailabilityServices、createCustomHAServices其中createAvailableOrEmbeddedServices方法主要是给FlinkMiniCluster使用;createHighAvailabilityServices方法主要是给ClusterEntrypoint使用,它在highAvailabilityMode为NONE的时候创建的是StandaloneHaServices,在highAvailabilityMode为ZOOKEEPER创建的是ZooKeeperHaServices,在highAvailabilityMode为FACTORY_CLASS的时候使用createCustomHAServices方法来创建HighAvailabilityServicesUtils还提供了getJobManagerAddress静态方法,用于获取JobManager的hostname及portHighAvailabilityServicesflink-runtime_2.11-1.7.1-sources.jar!/org/apache/flink/runtime/highavailability/HighAvailabilityServices.java/* * The HighAvailabilityServices give access to all services needed for a highly-available * setup. In particular, the services provide access to highly available storage and * registries, as well as distributed counters and leader election. 
* * <ul> * <li>ResourceManager leader election and leader retrieval</li> * <li>JobManager leader election and leader retrieval</li> * <li>Persistence for checkpoint metadata</li> * <li>Registering the latest completed checkpoint(s)</li> * <li>Persistence for the BLOB store</li> * <li>Registry that marks a job’s status</li> * <li>Naming of RPC endpoints</li> * </ul> /public interface HighAvailabilityServices extends AutoCloseable { // ———————————————————————— // Constants // ———————————————————————— /* * This UUID should be used when no proper leader election happens, but a simple * pre-configured leader is used. That is for example the case in non-highly-available * standalone setups. / UUID DEFAULT_LEADER_ID = new UUID(0, 0); /* * This JobID should be used to identify the old JobManager when using the * {@link HighAvailabilityServices}. With the new mode every JobMaster will have a * distinct JobID assigned. / JobID DEFAULT_JOB_ID = new JobID(0L, 0L); // ———————————————————————— // Services // ———————————————————————— /* * Gets the leader retriever for the cluster’s resource manager. / LeaderRetrievalService getResourceManagerLeaderRetriever(); /* * Gets the leader retriever for the dispatcher. This leader retrieval service * is not always accessible. / LeaderRetrievalService getDispatcherLeaderRetriever(); /* * Gets the leader retriever for the job JobMaster which is responsible for the given job * * @param jobID The identifier of the job. * @return Leader retrieval service to retrieve the job manager for the given job * @deprecated This method should only be used by the legacy code where the JobManager acts as the master. / @Deprecated LeaderRetrievalService getJobManagerLeaderRetriever(JobID jobID); /* * Gets the leader retriever for the job JobMaster which is responsible for the given job * * @param jobID The identifier of the job. * @param defaultJobManagerAddress JobManager address which will be returned by * a static leader retrieval service. * @return Leader retrieval service to retrieve the job manager for the given job / LeaderRetrievalService getJobManagerLeaderRetriever(JobID jobID, String defaultJobManagerAddress); LeaderRetrievalService getWebMonitorLeaderRetriever(); /* * Gets the leader election service for the cluster’s resource manager. * * @return Leader election service for the resource manager leader election / LeaderElectionService getResourceManagerLeaderElectionService(); /* * Gets the leader election service for the cluster’s dispatcher. * * @return Leader election service for the dispatcher leader election / LeaderElectionService getDispatcherLeaderElectionService(); /* * Gets the leader election service for the given job. * * @param jobID The identifier of the job running the election. * @return Leader election service for the job manager leader election / LeaderElectionService getJobManagerLeaderElectionService(JobID jobID); LeaderElectionService getWebMonitorLeaderElectionService(); /* * Gets the checkpoint recovery factory for the job manager * * @return Checkpoint recovery factory / CheckpointRecoveryFactory getCheckpointRecoveryFactory(); /* * Gets the submitted job graph store for the job manager * * @return Submitted job graph store * @throws Exception if the submitted job graph store could not be created / SubmittedJobGraphStore getSubmittedJobGraphStore() throws Exception; /* * Gets the registry that holds information about whether jobs are currently running. 
* * @return Running job registry to retrieve running jobs / RunningJobsRegistry getRunningJobsRegistry() throws Exception; /* * Creates the BLOB store in which BLOBs are stored in a highly-available fashion. * * @return Blob store * @throws IOException if the blob store could not be created / BlobStore createBlobStore() throws IOException; // ———————————————————————— // Shutdown and Cleanup // ———————————————————————— /* * Closes the high availability services, releasing all resources. * * <p>This method <b>does not delete or clean up</b> any data stored in external stores * (file systems, ZooKeeper, etc). Another instance of the high availability * services will be able to recover the job. * * <p>If an exception occurs during closing services, this method will attempt to * continue closing other services and report exceptions only after all services * have been attempted to be closed. * * @throws Exception Thrown, if an exception occurred while closing these services. / @Override void close() throws Exception; /* * Closes the high availability services (releasing all resources) and deletes * all data stored by these services in external stores. * * <p>After this method was called, the any job or session that was managed by * these high availability services will be unrecoverable. * * <p>If an exception occurs during cleanup, this method will attempt to * continue the cleanup and report exceptions only after all cleanup steps have * been attempted. * * @throws Exception Thrown, if an exception occurred while closing these services * or cleaning up data stored by them. / void closeAndCleanupAllData() throws Exception;}HighAvailabilityServices定义了highly-available所需的各种services的get方法ZooKeeperHaServicesflink-runtime_2.11-1.7.1-sources.jar!/org/apache/flink/runtime/highavailability/zookeeper/ZooKeeperHaServices.java/* * An implementation of the {@link HighAvailabilityServices} using Apache ZooKeeper. * The services store data in ZooKeeper’s nodes as illustrated by teh following tree structure: * * <pre> * /flink * +/cluster_id_1/resource_manager_lock * | | * | +/job-id-1/job_manager_lock * | | /checkpoints/latest * | | /latest-1 * | | /latest-2 * | | * | +/job-id-2/job_manager_lock * | * +/cluster_id_2/resource_manager_lock * | * +/job-id-1/job_manager_lock * |/checkpoints/latest * | /latest-1 * |/persisted_job_graph * </pre> * * <p>The root path “/flink” is configurable via the option {@link HighAvailabilityOptions#HA_ZOOKEEPER_ROOT}. * This makes sure Flink stores its data under specific subtrees in ZooKeeper, for example to * accommodate specific permission. * * <p>The “cluster_id” part identifies the data stored for a specific Flink “cluster”. * This “cluster” can be either a standalone or containerized Flink cluster, or it can be job * on a framework like YARN or Mesos (in a “per-job-cluster” mode). * * <p>In case of a “per-job-cluster” on YARN or Mesos, the cluster-id is generated and configured * automatically by the client or dispatcher that submits the Job to YARN or Mesos. * * <p>In the case of a standalone cluster, that cluster-id needs to be configured via * {@link HighAvailabilityOptions#HA_CLUSTER_ID}. All nodes with the same cluster id will join the same * cluster and participate in the execution of the same set of jobs. 
/public class ZooKeeperHaServices implements HighAvailabilityServices { private static final Logger LOG = LoggerFactory.getLogger(ZooKeeperHaServices.class); private static final String RESOURCE_MANAGER_LEADER_PATH = “/resource_manager_lock”; private static final String DISPATCHER_LEADER_PATH = “/dispatcher_lock”; private static final String JOB_MANAGER_LEADER_PATH = “/job_manager_lock”; private static final String REST_SERVER_LEADER_PATH = “/rest_server_lock”; // ———————————————————————— /* The ZooKeeper client to use / private final CuratorFramework client; /* The executor to run ZooKeeper callbacks on / private final Executor executor; /* The runtime configuration / private final Configuration configuration; /* The zookeeper based running jobs registry / private final RunningJobsRegistry runningJobsRegistry; /* Store for arbitrary blobs / private final BlobStoreService blobStoreService; public ZooKeeperHaServices( CuratorFramework client, Executor executor, Configuration configuration, BlobStoreService blobStoreService) { this.client = checkNotNull(client); this.executor = checkNotNull(executor); this.configuration = checkNotNull(configuration); this.runningJobsRegistry = new ZooKeeperRunningJobsRegistry(client, configuration); this.blobStoreService = checkNotNull(blobStoreService); } // ———————————————————————— // Services // ———————————————————————— @Override public LeaderRetrievalService getResourceManagerLeaderRetriever() { return ZooKeeperUtils.createLeaderRetrievalService(client, configuration, RESOURCE_MANAGER_LEADER_PATH); } @Override public LeaderRetrievalService getDispatcherLeaderRetriever() { return ZooKeeperUtils.createLeaderRetrievalService(client, configuration, DISPATCHER_LEADER_PATH); } @Override public LeaderRetrievalService getJobManagerLeaderRetriever(JobID jobID) { return ZooKeeperUtils.createLeaderRetrievalService(client, configuration, getPathForJobManager(jobID)); } @Override public LeaderRetrievalService getJobManagerLeaderRetriever(JobID jobID, String defaultJobManagerAddress) { return getJobManagerLeaderRetriever(jobID); } @Override public LeaderRetrievalService getWebMonitorLeaderRetriever() { return ZooKeeperUtils.createLeaderRetrievalService(client, configuration, REST_SERVER_LEADER_PATH); } @Override public LeaderElectionService getResourceManagerLeaderElectionService() { return ZooKeeperUtils.createLeaderElectionService(client, configuration, RESOURCE_MANAGER_LEADER_PATH); } @Override public LeaderElectionService getDispatcherLeaderElectionService() { return ZooKeeperUtils.createLeaderElectionService(client, configuration, DISPATCHER_LEADER_PATH); } @Override public LeaderElectionService getJobManagerLeaderElectionService(JobID jobID) { return ZooKeeperUtils.createLeaderElectionService(client, configuration, getPathForJobManager(jobID)); } @Override public LeaderElectionService getWebMonitorLeaderElectionService() { return ZooKeeperUtils.createLeaderElectionService(client, configuration, REST_SERVER_LEADER_PATH); } @Override public CheckpointRecoveryFactory getCheckpointRecoveryFactory() { return new ZooKeeperCheckpointRecoveryFactory(client, configuration, executor); } @Override public SubmittedJobGraphStore getSubmittedJobGraphStore() throws Exception { return ZooKeeperUtils.createSubmittedJobGraphs(client, configuration); } @Override public RunningJobsRegistry getRunningJobsRegistry() { return runningJobsRegistry; } @Override public BlobStore createBlobStore() throws IOException { return blobStoreService; } // ———————————————————————— // Shutdown // 
———————————————————————— @Override public void close() throws Exception { Throwable exception = null; try { blobStoreService.close(); } catch (Throwable t) { exception = t; } internalClose(); if (exception != null) { ExceptionUtils.rethrowException(exception, “Could not properly close the ZooKeeperHaServices.”); } } @Override public void closeAndCleanupAllData() throws Exception { LOG.info(“Close and clean up all data for ZooKeeperHaServices.”); Throwable exception = null; try { blobStoreService.closeAndCleanupAllData(); } catch (Throwable t) { exception = t; } internalClose(); if (exception != null) { ExceptionUtils.rethrowException(exception, “Could not properly close and clean up all data of ZooKeeperHaServices.”); } } /* * Closes components which don’t distinguish between close and closeAndCleanupAllData / private void internalClose() { client.close(); } // ———————————————————————— // Utilities // ———————————————————————— private static String getPathForJobManager(final JobID jobID) { return “/” + jobID + JOB_MANAGER_LEADER_PATH; }}ZooKeeperHaServices实现了HighAvailabilityServices接口,它通过ZooKeeperUtils的各种create方法来创建所需的service,比如ZooKeeperUtils.createLeaderRetrievalService、ZooKeeperUtils.createLeaderElectionService、ZooKeeperUtils.createSubmittedJobGraphsJobClient.submitJobflink-runtime_2.11-1.7.1-sources.jar!/org/apache/flink/runtime/client/JobClient.javapublic class JobClient { private static final Logger LOG = LoggerFactory.getLogger(JobClient.class); //…… /* * Submits a job to a Flink cluster (non-blocking) and returns a JobListeningContext which can be * passed to {@code awaitJobResult} to get the result of the submission. * @return JobListeningContext which may be used to retrieve the JobExecutionResult via * {@code awaitJobResult(JobListeningContext context)}. */ public static JobListeningContext submitJob( ActorSystem actorSystem, Configuration config, HighAvailabilityServices highAvailabilityServices, JobGraph jobGraph, FiniteDuration timeout, boolean sysoutLogUpdates, ClassLoader classLoader) { checkNotNull(actorSystem, “The actorSystem must not be null.”); checkNotNull(highAvailabilityServices, “The high availability services must not be null.”); checkNotNull(jobGraph, “The jobGraph must not be null.”); checkNotNull(timeout, “The timeout must not be null.”); // for this job, we create a proxy JobClientActor that deals with all communication with // the JobManager. 
It forwards the job submission, checks the success/failure responses, logs // update messages, watches for disconnect between client and JobManager, … Props jobClientActorProps = JobSubmissionClientActor.createActorProps( highAvailabilityServices.getJobManagerLeaderRetriever(HighAvailabilityServices.DEFAULT_JOB_ID), timeout, sysoutLogUpdates, config); ActorRef jobClientActor = actorSystem.actorOf(jobClientActorProps); Future<Object> submissionFuture = Patterns.ask( jobClientActor, new JobClientMessages.SubmitJobAndWait(jobGraph), new Timeout(AkkaUtils.INF_TIMEOUT())); return new JobListeningContext( jobGraph.getJobID(), submissionFuture, jobClientActor, timeout, classLoader, highAvailabilityServices); } //……}像JobClient.submitJob方法就使用到了highAvailabilityServices.getJobManagerLeaderRetriever方法来获取JobManagerLeader的地址,用于提交job小结HighAvailabilityMode有三个枚举,分别是NONE、ZOOKEEPER、FACTORY_CLASS;这些枚举有一个属性haActive,用于表示是否支持HighAvailability;HighAvailabilityOptions定义了前缀为high-availability.zookeeper的配置项HighAvailabilityServicesUtils提供了创建HighAvailabilityServices的静态方法,这些方法有createAvailableOrEmbeddedServices、createHighAvailabilityServices、createCustomHAServices;其中createAvailableOrEmbeddedServices方法主要是给FlinkMiniCluster使用;createHighAvailabilityServices方法主要是给ClusterEntrypoint使用,它在highAvailabilityMode为NONE的时候创建的是StandaloneHaServices,在highAvailabilityMode为ZOOKEEPER创建的是ZooKeeperHaServices,在highAvailabilityMode为FACTORY_CLASS的时候使用createCustomHAServices方法来创建HighAvailabilityServices定义了highly-available所需的各种services的get方法;ZooKeeperHaServices实现了HighAvailabilityServices接口,它通过ZooKeeperUtils的各种create方法来创建所需的service,比如ZooKeeperUtils.createLeaderRetrievalService、ZooKeeperUtils.createLeaderElectionService、ZooKeeperUtils.createSubmittedJobGraphs;像JobClient.submitJob方法就使用到了highAvailabilityServices.getJobManagerLeaderRetriever方法来获取JobManagerLeader的地址,用于提交jobdocJobManager High Availability (HA) ...
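补充一个创建ZooKeeper模式HighAvailabilityServices的示意代码(非官方示例,只是按上文HighAvailabilityServicesUtils的逻辑整理;假设classpath里有flink-runtime 1.7.x,配置值沿用上文flink-conf.yaml里的演示值,真正使用时需要能连上对应的ZooKeeper,并且high-availability.storageDir指向的目录可写):

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.HighAvailabilityOptions;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.highavailability.HighAvailabilityServicesUtils;
import org.apache.flink.runtime.highavailability.HighAvailabilityServicesUtils.AddressResolution;
import org.apache.flink.runtime.jobmanager.HighAvailabilityMode;

public class ZooKeeperHaServicesDemo {
    public static void main(String[] args) throws Exception {
        // 等价于上文flink-conf.yaml中的high-availability相关配置(地址与路径均为演示值)
        Configuration conf = new Configuration();
        conf.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
        conf.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, "zookeeper:2181");
        conf.setString(HighAvailabilityOptions.HA_ZOOKEEPER_ROOT, "/flink");
        conf.setString(HighAvailabilityOptions.HA_CLUSTER_ID, "/cluster_one");
        conf.setString(HighAvailabilityOptions.HA_STORAGE_PATH, "file:///share");

        // fromConfig根据high-availability的取值返回对应的HighAvailabilityMode
        System.out.println(HighAvailabilityMode.fromConfig(conf)); // 预期 ZOOKEEPER

        ExecutorService executor = Executors.newSingleThreadExecutor();
        HighAvailabilityServices haServices =
                HighAvailabilityServicesUtils.createHighAvailabilityServices(
                        conf, executor, AddressResolution.NO_ADDRESS_RESOLUTION);
        try {
            // HA_MODE为zookeeper时,这里拿到的是ZooKeeperHaServices
            System.out.println(haServices.getClass().getSimpleName());
            System.out.println(haServices.getResourceManagerLeaderRetriever());
        } finally {
            haServices.close();
            executor.shutdown();
        }
    }
}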

February 16, 2019 · 14 min · jiezi

Official announcement: the plan for merging Alibaba's Blink into Flink is out

apache已公开合并计划,点击可阅读原文《Batch as a Special Case of Streaming and Alibaba’s contribution of Blink》,由AI前线进行了翻译。**春节前一周,经过社区内部讨论,阿里巴巴大数据引擎 Blink 作为 Flink 的分支 正式开源。今天,Apache Flink 官方网站发文对 Blink 贡献回 Flink 项目的意义作进一步说明,并公布了 Blink 和 Flink 的合并计划。社区的合并计划最初会将重点放在有界 / 批处理功能上,社区将对 SQL/Table API 模块进行重组,将 Blink 查询规划器(优化器)和运行时(操作符)合并为当前 SQL 运行时的附加查询处理器。经过一段过渡期之后,将开发新的查询处理器,而当前的处理器很可能会被弃用。为了合并 Blink 的调度增强功能和有界数据的作业恢复功能,Flink 社区也在努力重构当前的调度功能。前不久,经社区讨论,阿里巴巴决定将 Blink 贡献回 Flink 项目。为什么说这对 Flink 来说是一件大事?这对 Flink 的用户和社区来说意味着什么?这与 Flink 的整体愿景有着怎样的关系?让我们退后一步,一探究竟。针对 Blink 的贡献形式,Flink 社区讨论邮件如下:https://lists.apache.org/thre…统一的批处理和流式处理方法从早期开始,Flink 就有意采用统一的批处理和流式处理方法。其核心构建块是“持续处理无界的数据流”:如果可以做到这一点,还可以离线处理有界数据集(批处理),因为有界数据集就是在某个时刻结束的数据流。很多项目(例如 Flink、Beam 等)都支持“流式处理优先,将批处理视为流式处理的特殊情况”的理念,这个理念也经常被认为是构建跨实时和离线数据应用程序的强大方式,可以大大降低数据基础设施的复杂性。为什么批处理器仍然存在?“批处理只是流式处理的一个特例”并不意味着所有的流式处理器都能用于批处理——流式处理器的出现并没有让批处理器变得过时:纯流式处理系统在批处理工作负载时其实是很慢的。没有人会认为使用流式处理器来分析海量数据是个好主意。像 Apache Beam 这样的统一 API 通常会根据数据是持续的(无界)还是固定的(有界)将工作负载委托给不同的运行时。Flink 提供了一个流式 API,可以处理有界和无界的场景,同时仍然提供了单独的 DataSet API 和运行时用于批处理,因为速度会更快。那么“批处理只是流式处理的一个特例”这种想法出了什么问题?其实这种范式并没有错。统一批处理和流式处理 API 只是一个方面,我们还需要利用“有界数据”这个特殊情况的某些特征来应对批处理用例。毕竟,批处理器就是专门为这种特殊情况而准备的。建立在流式运行时之上的批处理我们始终认为,同时拥有一个可用于流式处理和批处理的运行时是可能的。一个流式处理优先的运行时也可以利用有界数据流的特殊属性进行快速的批处理,就像批处理器那样。而这就是 Flink 所采用的方法。Flink 包含了一个网络栈,支持低延迟 / 高吞吐的流式数据交换和高吞吐的批次 shuffle。它还提供了很多流式运行时操作符,也为有界输入提供了专门的操作符,如果你选择了 DataSet API 或 Table API,就可以使用这些操作符。因此,Flink 实际上在早期就已经展示出了一些令人印象深刻的批处理性能。下面的基准测试有点旧了,但在早期很好地验证了我们的架构方法。排序 3.2TB(80GB/ 节点)数据所使用的时间(以秒为单位)还差些什么?为了总结这个方法,并让 Flink 在有界数据(批处理)方面达到最新的水平,我们需要做出更多的增强。我们认为下面这些特性是实现我们愿景的关键:真正统一的运行时操作符栈:目前,有界和无界操作符具有不同的网络和线程模型,不会混在一起,也不匹配。最初是因为批处理操作符遵循的是“拉取模型”(为了方便批处理算法),而流式操作符遵循的是“推模型”(可以获得更好的延迟 / 吞吐量)。在统一的操作符栈中,持续流式操作符是基础。在操作有界数据时,如果没有延迟方面的约束,API 或查询优化器可以从更大的操作符集中选择合适的操作符。例如,优化器可以选择一个特殊的连接操作符,先完全读取第一个输入流,然后再读取第二个输入流。利用有界数据流来减小容错范围:如果输入数据是有界的,可以在 shuffle(内存或磁盘)期间缓冲数据,并在发生故障后重放数据。这样可以实现更细粒度的故障恢复,也更有效。利用有界数据流操作符的属性进行调度:持续无界的流式应用程序需要同时运行所有操作符。基于有界数据的应用程序可以根据其中一个操作符如何消费数据(例如,先构建哈希表,再探测哈希表)来调度另一个操作符。这样做可以提高资源效率。为 DataStream API 启用这些特殊优化:目前只有 Table API 在处理有界数据时激活了这些优化。SQL 的性能和覆盖范围:SQL 是事实上的标准数据语言,虽然它被用在持续流式处理种,但并不适用于有界 / 批处理的情况。为了与最佳批处理引擎展开竞争,Flink 需要提升 SQL 查询执行覆盖率和性能。虽然 Flink 的核心数据平面具有很高的性能,但 SQL 执行的速度在很大程度上取决于优化器规则、丰富的操作符和代码生成,等等。现在来说说 BlinkBlink 是 Flink 的一个分支,最初在阿里巴巴内部创建的,针对内部用例对 Flink 进行改进。Blink 添加了一系列改进和集成(https://github.com/apache/fli… ),其中有很多与有界数据 / 批处理和 SQL 有关。实际上,在上面的功能列表中,除了第 4 项外,Blink 在其他方面都迈出了重要的一步:统一的流式操作符:Blink 扩展了 Flink 的流式运行时操作符模型,支持选择性读取不同的输入源,同时保持推送模型的低延迟特性。这种对输入源的选择性读取可以更好地支持一些算法(例如相同操作符的混合散列连接)和线程模型(通过 RocksDB 的连续对称连接)。这些操作符为“侧边输入”(https://cwiki.apache.org/conf… )等新功能打下了基础。Table API 和 SQL 查询处理器:与最新的 Flink 主分支相比,SQL 查询处理器是演变得最多的一个组件:Flink 目前将查询转换为 DataSet 或 DataStream 程序(取决于输入的特性),而 Blink 会将查询转换为上述流式操作符的数据流。Blink 为常见的 SQL 操作添加了更多的运行时操作符,如半连接(semi-join)、反连接(anti-join)等。查询规划器(优化器)仍然是基于 Apache Calcite,但提供了更多的优化规则(包括连接重排序),并且使用了适当的成本模型。更加积极的流式操作符链接。扩展通用数据结构(分类器、哈希表)和序列化器,在操作二进制数据上更进一步,并减小了序列化开销。代码生成被用于行序列化器。改进的调度和故障恢复:最后,Blink 实现了对任务调度和容错的若干改进。调度策略通过利用操作符处理输入数据的方式来更好地使用资源。故障转移策略沿着持久 shuffle 的边界进行更细粒度的恢复。不需重新启动正在运行的应用程序就可以替换发生故障的 JobManager。Blink 的变化带来了大幅度的性能提升。以下数据由 Blink 开发者提供,给出了性能提升的粗略情况。在 TPC-H 基准测试中,Blink 与 Flink 1.6.0 的相对性能。Blink 性能平均提升 10 倍在 TPC-DS 基准测试中,Blink 与 Spark 的性能,将所有查询的总时间汇总在一起。Blink 和 Flink 的合并计划Blink 的代码目前已经作为 Flink 代码库的一个分支(https://github.com/apache/fli… )对外开放。合并这么多变更是一项艰巨的挑战,同时还要尽可能保持合并过程不要造成任何中断,并使公共 API 尽可能保持稳定。社区的合并计划最初将重点放在上述的有界 / 批处理功能上,并遵循以下方法以确保能够顺利集成:为了合并 Blink 的 SQL/Table API 查询处理器增强功能,我们利用了 Flink 和 Blink 都具有相同 API 的事实:SQL 和 Table API。在对 Table/SQL 
模块(https://cwiki.apache.org/conf…)进行一些重组之后,我们计划将 Blink 查询规划器(优化器)和运行时(操作符)合并为当前 SQL 运行时的附加查询处理器。可以将其视为同一 API 的两个不同的运行器。最开始,可以让用户选择要使用哪个查询处理器。经过一个过渡期之后,将开发新的查询处理器,而当前的处理器很可能会被弃用,并最终被丢弃。因为 SQL 是一个定义良好的接口,我们预计这种转换对用户来说几乎没有影响。为了合并 Blink 的调度增强功能和有界数据的作业恢复功能,Flink 社区已经在努力重构当前的调度功能,并添加对可插拔调度和故障转移策略的支持。在完成这项工作后,我们就可以将 Blink 的调度和恢复策略作为新查询处理器的调度策略。最后,我们计划将新的调度策略应用于有界 DataStream 程序。扩展的目录支持、DDL 支持以及对 Hive 目录和集成的支持目前正在进行单独的设计讨论。总 结我们相信未来的数据处理技术栈会以流式处理为基础:流式处理的优雅,能够以相同的方式对离线处理(批处理)、实时数据处理和事件驱动的应用程序进行建模,同时还能提供高性能和一致性,这些实在是太吸引人了。要让流式处理器实现与专用批处理器相同的性能,利用有界数据的某些属性是关键。Flink 支持批处理,但它的下一步是要构建统一的运行时,并成为一个可以与批处理系统相竞争的流式处理器。阿里巴巴贡献的 Blink 有助于 Flink 社区加快实现这一目标。本文作者:云学习小组阅读原文本文为云栖社区原创内容,未经允许不得转载。 ...

February 15, 2019 · 2 min · jiezi

A look at Flink's ParameterTool

序本文主要研究一下flink的ParameterTool实例fromPropertiesFileString propertiesFilePath = “/home/sam/flink/myjob.properties”;ParameterTool parameter = ParameterTool.fromPropertiesFile(propertiesFilePath);File propertiesFile = new File(propertiesFilePath);ParameterTool parameter = ParameterTool.fromPropertiesFile(propertiesFile);InputStream propertiesFileInputStream = new FileInputStream(file);ParameterTool parameter = ParameterTool.fromPropertiesFile(propertiesFileInputStream);使用ParameterTool.fromPropertiesFile从.properties文件创建ParameterToolfromArgspublic static void main(String[] args) { ParameterTool parameter = ParameterTool.fromArgs(args); // .. regular code ..}使用ParameterTool.fromArgs从命令行创建ParameterTool(比如–input hdfs:///mydata –elements 42)fromSystemPropertiesParameterTool parameter = ParameterTool.fromSystemProperties();使用ParameterTool.fromSystemProperties从system properties创建ParameterTool(比如-Dinput=hdfs:///mydata)获取参数值ParameterTool parameters = // …parameter.getRequired(“input”);parameter.get(“output”, “myDefaultValue”);parameter.getLong(“expectedCount”, -1L);parameter.getNumberOfParameters()// .. there are more methods available.可以使用ParameterTool的get、getRequired、getLong等方法获取参数值设置为globalenv.getConfig().setGlobalJobParameters(parameters);public static final class Tokenizer extends RichFlatMapFunction<String, Tuple2<String, Integer>> { @Override public void flatMap(String value, Collector<Tuple2<String, Integer>> out) { ParameterTool parameters = (ParameterTool) getRuntimeContext().getExecutionConfig().getGlobalJobParameters(); parameters.getRequired(“input”); // … do more …}使用env.getConfig().setGlobalJobParameters将ParameterTool的访问范围设置为globalGlobalJobParametersflink-core-1.7.1-sources.jar!/org/apache/flink/api/common/ExecutionConfig.java public static class GlobalJobParameters implements Serializable { private static final long serialVersionUID = 1L; /** * Convert UserConfig into a {@code Map<String, String>} representation. * This can be used by the runtime, for example for presenting the user config in the web frontend. 
* * @return Key/Value representation of the UserConfig / public Map<String, String> toMap() { return Collections.emptyMap(); } }GlobalJobParameters里头有一个toMap方法,返回Collections.emptyMap()ParameterToolflink-java-1.7.1-sources.jar!/org/apache/flink/api/java/utils/ParameterTool.java@Publicpublic class ParameterTool extends ExecutionConfig.GlobalJobParameters implements Serializable, Cloneable { private static final long serialVersionUID = 1L; protected static final String NO_VALUE_KEY = “__NO_VALUE_KEY”; protected static final String DEFAULT_UNDEFINED = “<undefined>”; //…… // —————— ParameterUtil ———————— protected final Map<String, String> data; // data which is only used on the client and does not need to be transmitted protected transient Map<String, String> defaultData; protected transient Set<String> unrequestedParameters; private ParameterTool(Map<String, String> data) { this.data = Collections.unmodifiableMap(new HashMap<>(data)); this.defaultData = new ConcurrentHashMap<>(data.size()); this.unrequestedParameters = Collections.newSetFromMap(new ConcurrentHashMap<>(data.size())); unrequestedParameters.addAll(data.keySet()); } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } ParameterTool that = (ParameterTool) o; return Objects.equals(data, that.data) && Objects.equals(defaultData, that.defaultData) && Objects.equals(unrequestedParameters, that.unrequestedParameters); } @Override public int hashCode() { return Objects.hash(data, defaultData, unrequestedParameters); } @Override public Map<String, String> toMap() { return data; } //…… /* * Returns {@link ParameterTool} for the given arguments. The arguments are keys followed by values. * Keys have to start with ‘-’ or ‘–’ * * <p><strong>Example arguments:</strong> * –key1 value1 –key2 value2 -key3 value3 * * @param args Input array arguments * @return A {@link ParameterTool} / public static ParameterTool fromArgs(String[] args) { final Map<String, String> map = new HashMap<>(args.length / 2); int i = 0; while (i < args.length) { final String key; if (args[i].startsWith("–")) { key = args[i].substring(2); } else if (args[i].startsWith("-")) { key = args[i].substring(1); } else { throw new IllegalArgumentException( String.format(“Error parsing arguments ‘%s’ on ‘%s’. Please prefix keys with – or -.”, Arrays.toString(args), args[i])); } if (key.isEmpty()) { throw new IllegalArgumentException( “The input " + Arrays.toString(args) + " contains an empty argument”); } i += 1; // try to find the value if (i >= args.length) { map.put(key, NO_VALUE_KEY); } else if (NumberUtils.isNumber(args[i])) { map.put(key, args[i]); i += 1; } else if (args[i].startsWith("–") || args[i].startsWith("-")) { // the argument cannot be a negative number because we checked earlier // -> the next argument is a parameter name map.put(key, NO_VALUE_KEY); } else { map.put(key, args[i]); i += 1; } } return fromMap(map); } /* * Returns {@link ParameterTool} for the given {@link Properties} file. * * @param path Path to the properties file * @return A {@link ParameterTool} * @throws IOException If the file does not exist * @see Properties / public static ParameterTool fromPropertiesFile(String path) throws IOException { File propertiesFile = new File(path); return fromPropertiesFile(propertiesFile); } /* * Returns {@link ParameterTool} for the given {@link Properties} file. 
* * @param file File object to the properties file * @return A {@link ParameterTool} * @throws IOException If the file does not exist * @see Properties / public static ParameterTool fromPropertiesFile(File file) throws IOException { if (!file.exists()) { throw new FileNotFoundException(“Properties file " + file.getAbsolutePath() + " does not exist”); } try (FileInputStream fis = new FileInputStream(file)) { return fromPropertiesFile(fis); } } /* * Returns {@link ParameterTool} for the given InputStream from {@link Properties} file. * * @param inputStream InputStream from the properties file * @return A {@link ParameterTool} * @throws IOException If the file does not exist * @see Properties / public static ParameterTool fromPropertiesFile(InputStream inputStream) throws IOException { Properties props = new Properties(); props.load(inputStream); return fromMap((Map) props); } /* * Returns {@link ParameterTool} for the given map. * * @param map A map of arguments. Both Key and Value have to be Strings * @return A {@link ParameterTool} / public static ParameterTool fromMap(Map<String, String> map) { Preconditions.checkNotNull(map, “Unable to initialize from empty map”); return new ParameterTool(map); } /* * Returns {@link ParameterTool} from the system properties. * Example on how to pass system properties: * -Dkey1=value1 -Dkey2=value2 * * @return A {@link ParameterTool} / public static ParameterTool fromSystemProperties() { return fromMap((Map) System.getProperties()); } //…… /* * Returns the String value for the given key. * If the key does not exist it will return null. / public String get(String key) { addToDefaults(key, null); unrequestedParameters.remove(key); return data.get(key); } /* * Returns the String value for the given key. * If the key does not exist it will throw a {@link RuntimeException}. / public String getRequired(String key) { addToDefaults(key, null); String value = get(key); if (value == null) { throw new RuntimeException(“No data for required key ‘” + key + “’”); } return value; } /* * Returns the String value for the given key. * If the key does not exist it will return the given default value. / public String get(String key, String defaultValue) { addToDefaults(key, defaultValue); String value = get(key); if (value == null) { return defaultValue; } else { return value; } } /* * Check if value is set. / public boolean has(String value) { addToDefaults(value, null); unrequestedParameters.remove(value); return data.containsKey(value); } // ————– Integer /* * Returns the Integer value for the given key. * The method fails if the key does not exist or the value is not an Integer. / public int getInt(String key) { addToDefaults(key, null); String value = getRequired(key); return Integer.parseInt(value); } /* * Returns the Integer value for the given key. If the key does not exists it will return the default value given. * The method fails if the value is not an Integer. / public int getInt(String key, int defaultValue) { addToDefaults(key, Integer.toString(defaultValue)); String value = get(key); if (value == null) { return defaultValue; } return Integer.parseInt(value); } // ————– LONG /* * Returns the Long value for the given key. * The method fails if the key does not exist. / public long getLong(String key) { addToDefaults(key, null); String value = getRequired(key); return Long.parseLong(value); } /* * Returns the Long value for the given key. If the key does not exists it will return the default value given. * The method fails if the value is not a Long. 
/ public long getLong(String key, long defaultValue) { addToDefaults(key, Long.toString(defaultValue)); String value = get(key); if (value == null) { return defaultValue; } return Long.parseLong(value); } // ————– FLOAT /* * Returns the Float value for the given key. * The method fails if the key does not exist. / public float getFloat(String key) { addToDefaults(key, null); String value = getRequired(key); return Float.valueOf(value); } /* * Returns the Float value for the given key. If the key does not exists it will return the default value given. * The method fails if the value is not a Float. / public float getFloat(String key, float defaultValue) { addToDefaults(key, Float.toString(defaultValue)); String value = get(key); if (value == null) { return defaultValue; } else { return Float.valueOf(value); } } // ————– DOUBLE /* * Returns the Double value for the given key. * The method fails if the key does not exist. / public double getDouble(String key) { addToDefaults(key, null); String value = getRequired(key); return Double.valueOf(value); } /* * Returns the Double value for the given key. If the key does not exists it will return the default value given. * The method fails if the value is not a Double. / public double getDouble(String key, double defaultValue) { addToDefaults(key, Double.toString(defaultValue)); String value = get(key); if (value == null) { return defaultValue; } else { return Double.valueOf(value); } } // ————– BOOLEAN /* * Returns the Boolean value for the given key. * The method fails if the key does not exist. / public boolean getBoolean(String key) { addToDefaults(key, null); String value = getRequired(key); return Boolean.valueOf(value); } /* * Returns the Boolean value for the given key. If the key does not exists it will return the default value given. * The method returns whether the string of the value is “true” ignoring cases. / public boolean getBoolean(String key, boolean defaultValue) { addToDefaults(key, Boolean.toString(defaultValue)); String value = get(key); if (value == null) { return defaultValue; } else { return Boolean.valueOf(value); } } // ————– SHORT /* * Returns the Short value for the given key. * The method fails if the key does not exist. / public short getShort(String key) { addToDefaults(key, null); String value = getRequired(key); return Short.valueOf(value); } /* * Returns the Short value for the given key. If the key does not exists it will return the default value given. * The method fails if the value is not a Short. / public short getShort(String key, short defaultValue) { addToDefaults(key, Short.toString(defaultValue)); String value = get(key); if (value == null) { return defaultValue; } else { return Short.valueOf(value); } } // ————– BYTE /* * Returns the Byte value for the given key. * The method fails if the key does not exist. / public byte getByte(String key) { addToDefaults(key, null); String value = getRequired(key); return Byte.valueOf(value); } /* * Returns the Byte value for the given key. If the key does not exists it will return the default value given. * The method fails if the value is not a Byte. 
*/ public byte getByte(String key, byte defaultValue) { addToDefaults(key, Byte.toString(defaultValue)); String value = get(key); if (value == null) { return defaultValue; } else { return Byte.valueOf(value); } } //……}ParameterTool里头有data、defaultData、unrequestedParameters等属性,toMap方法返回的是data属性ParameterTool提供了fromPropertiesFile、fromArgs、fromSystemProperties、fromMap静态方法用于创建ParameterToolParameterTool提供了get、getRequired、getInt、getLong、getFloat、getDouble、getBoolean、getShort、getByte等方法,每种类型的get均提供了一个支持defaultValue的方法小结ParameterTool提供了fromPropertiesFile、fromArgs、fromSystemProperties、fromMap静态方法用于创建ParameterToolParameterTool提供了get、getRequired、getInt、getLong、getFloat、getDouble、getBoolean、getShort、getByte等方法,每种类型的get均提供了一个支持defaultValue的方法ParameterTool继承了ExecutionConfig.GlobalJobParameters,其toMap方法返回的是data属性;使用env.getConfig().setGlobalJobParameters可以将ParameterTool的访问范围设置为globaldocParsing command line arguments and passing them around in your Flink application ...
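Putting the pieces above together, here is a minimal sketch of how ParameterTool is typically wired into a job: parse the arguments, register them as global job parameters, and read them back inside a rich function. The class names, the --threshold key and the default value are made up for illustration; only the ParameterTool and ExecutionConfig calls come from the source walked through above.

import org.apache.flink.api.common.functions.RichFilterFunction;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class ParameterToolDemo {

    public static void main(String[] args) throws Exception {
        // e.g. started with: --input hdfs:///mydata --threshold 2
        ParameterTool parameters = ParameterTool.fromArgs(args);

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // makes the parameters reachable from every rich function at runtime
        env.getConfig().setGlobalJobParameters(parameters);

        env.fromElements("a", "bb", "ccc")
            .filter(new LengthFilter())
            .print();

        env.execute("ParameterTool demo");
    }

    public static class LengthFilter extends RichFilterFunction<String> {
        @Override
        public boolean filter(String value) {
            ParameterTool parameters = (ParameterTool)
                getRuntimeContext().getExecutionConfig().getGlobalJobParameters();
            // falls back to the default when --threshold was not passed
            return value.length() >= parameters.getInt("threshold", 1);
        }
    }
}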

February 15, 2019 · 8 min · jiezi

A look at Flink's logback configuration

序本文主要研究一下flink的logback配置client端pom文件配置<dependencies> <!– Add the two required logback dependencies –> <dependency> <groupId>ch.qos.logback</groupId> <artifactId>logback-core</artifactId> <version>1.2.3</version> </dependency> <dependency> <groupId>ch.qos.logback</groupId> <artifactId>logback-classic</artifactId> <version>1.2.3</version> </dependency> <!– Add the log4j -> sfl4j (-> logback) bridge into the classpath Hadoop is logging to log4j! –> <dependency> <groupId>org.slf4j</groupId> <artifactId>log4j-over-slf4j</artifactId> <version>1.7.15</version> </dependency> <dependency> <groupId>org.apache.flink</groupId> <artifactId>flink-java</artifactId> <version>1.7.1</version> <exclusions> <exclusion> <groupId>log4j</groupId> <artifactId></artifactId> </exclusion> <exclusion> <groupId>org.slf4j</groupId> <artifactId>slf4j-log4j12</artifactId> </exclusion> </exclusions> </dependency> <dependency> <groupId>org.apache.flink</groupId> <artifactId>flink-streaming-java_2.11</artifactId> <version>1.7.1</version> <exclusions> <exclusion> <groupId>log4j</groupId> <artifactId></artifactId> </exclusion> <exclusion> <groupId>org.slf4j</groupId> <artifactId>slf4j-log4j12</artifactId> </exclusion> </exclusions> </dependency> <dependency> <groupId>org.apache.flink</groupId> <artifactId>flink-clients_2.11</artifactId> <version>1.7.1</version> <exclusions> <exclusion> <groupId>log4j</groupId> <artifactId></artifactId> </exclusion> <exclusion> <groupId>org.slf4j</groupId> <artifactId>slf4j-log4j12</artifactId> </exclusion> </exclusions> </dependency></dependencies>添加logback-core、logback-classic及log4j-over-slf4j依赖,之后对flink-java、flink-streaming-java_2.11、flink-clients_2.11等配置log4j及slf4j-log4j12的exclusions;最后通过mvn dependency:tree查看是否还有log4j12,以确认下是否都全部排除了服务端配置添加logback-classic.jar、logback-core.jar、log4j-over-slf4j.jar到flink的lib目录下(比如/opt/flink/lib)移除flink的lib目录下(比如/opt/flink/lib)log4j及slf4j-log4j12的jar(比如log4j-1.2.17.jar及slf4j-log4j12-1.7.15.jar)如果要自定义logback的配置的话,可以覆盖flink的conf目录下的logback.xml、logback-console.xml或者logback-yarn.xmlflink-daemon.shflink-release-1.7.1/flink-dist/src/main/flink-bin/bin/flink-daemon.sh#!/usr/bin/env bash################################################################################# Licensed to the Apache Software Foundation (ASF) under one# or more contributor license agreements. See the NOTICE file# distributed with this work for additional information# regarding copyright ownership. The ASF licenses this file# to you under the Apache License, Version 2.0 (the# “License”); you may not use this file except in compliance# with the License. You may obtain a copy of the License at## http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an “AS IS” BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.################################################################################# Start/stop a Flink daemon.USAGE=“Usage: flink-daemon.sh (start|stop|stop-all) (taskexecutor|zookeeper|historyserver|standalonesession|standalonejob) [args]“STARTSTOP=$1DAEMON=$2ARGS=("${@:3}”) # get remaining arguments as arraybin=dirname "$0"bin=cd "$bin"; pwd. 
“$bin”/config.shcase $DAEMON in (taskexecutor) CLASS_TO_RUN=org.apache.flink.runtime.taskexecutor.TaskManagerRunner ;; (zookeeper) CLASS_TO_RUN=org.apache.flink.runtime.zookeeper.FlinkZooKeeperQuorumPeer ;; (historyserver) CLASS_TO_RUN=org.apache.flink.runtime.webmonitor.history.HistoryServer ;; (standalonesession) CLASS_TO_RUN=org.apache.flink.runtime.entrypoint.StandaloneSessionClusterEntrypoint ;; (standalonejob) CLASS_TO_RUN=org.apache.flink.container.entrypoint.StandaloneJobClusterEntryPoint ;; () echo “Unknown daemon ‘${DAEMON}’. $USAGE.” exit 1 ;;esacif [ “$FLINK_IDENT_STRING” = "” ]; then FLINK_IDENT_STRING="$USER"fiFLINK_TM_CLASSPATH=constructFlinkClassPathpid=$FLINK_PID_DIR/flink-$FLINK_IDENT_STRING-$DAEMON.pidmkdir -p “$FLINK_PID_DIR”# Log files for daemons are indexed from the process ID’s position in the PID# file. The following lock prevents a race condition during daemon startup# when multiple daemons read, index, and write to the PID file concurrently.# The lock is created on the PID directory since a lock file cannot be safely# removed. The daemon is started with the lock closed and the lock remains# active in this script until the script exits.command -v flock >/dev/null 2>&1if [[ $? -eq 0 ]]; then exec 200<"$FLINK_PID_DIR" flock 200fi# Ascending ID depending on number of lines in pid file.# This allows us to start multiple daemon of each type.id=$([ -f “$pid” ] && echo $(wc -l < “$pid”) || echo “0”)FLINK_LOG_PREFIX="${FLINK_LOG_DIR}/flink-${FLINK_IDENT_STRING}-${DAEMON}-${id}-${HOSTNAME}“log="${FLINK_LOG_PREFIX}.log"out="${FLINK_LOG_PREFIX}.out"log_setting=("-Dlog.file=${log}” “-Dlog4j.configuration=file:${FLINK_CONF_DIR}/log4j.properties” “-Dlogback.configurationFile=file:${FLINK_CONF_DIR}/logback.xml”)JAVA_VERSION=$(${JAVA_RUN} -version 2>&1 | sed ’s/.version “(.).(.)..”/\1\2/; 1q’)# Only set JVM 8 arguments if we have correctly extracted the versionif [[ ${JAVA_VERSION} =~ ${IS_NUMBER} ]]; then if [ “$JAVA_VERSION” -lt 18 ]; then JVM_ARGS="$JVM_ARGS -XX:MaxPermSize=256m" fificase $STARTSTOP in (start) # Rotate log files rotateLogFilesWithPrefix “$FLINK_LOG_DIR” “$FLINK_LOG_PREFIX” # Print a warning if daemons are already running on host if [ -f “$pid” ]; then active=() while IFS=’’ read -r p || [[ -n “$p” ]]; do kill -0 $p >/dev/null 2>&1 if [ $? -eq 0 ]; then active+=($p) fi done < “${pid}” count="${#active[@]}" if [ ${count} -gt 0 ]; then echo “[INFO] $count instance(s) of $DAEMON are already running on $HOSTNAME.” fi fi # Evaluate user options for local variable expansion FLINK_ENV_JAVA_OPTS=$(eval echo ${FLINK_ENV_JAVA_OPTS}) echo “Starting $DAEMON daemon on host $HOSTNAME.” $JAVA_RUN $JVM_ARGS ${FLINK_ENV_JAVA_OPTS} “${log_setting[@]}” -classpath “manglePathList "$FLINK_TM_CLASSPATH:$INTERNAL_HADOOP_CLASSPATHS"” ${CLASS_TO_RUN} “${ARGS[@]}” > “$out” 200<&- 2>&1 < /dev/null & mypid=$! 
# Add to pid file if successful start if [[ ${mypid} =~ ${IS_NUMBER} ]] && kill -0 $mypid > /dev/null 2>&1 ; then echo $mypid >> “$pid” else echo “Error starting $DAEMON daemon.” exit 1 fi ;; (stop) if [ -f “$pid” ]; then # Remove last in pid file to_stop=$(tail -n 1 “$pid”) if [ -z $to_stop ]; then rm “$pid” # If all stopped, clean up pid file echo “No $DAEMON daemon to stop on host $HOSTNAME.” else sed $d “$pid” > “$pid.tmp” # all but last line # If all stopped, clean up pid file [ $(wc -l < “$pid.tmp”) -eq 0 ] && rm “$pid” “$pid.tmp” || mv “$pid.tmp” “$pid” if kill -0 $to_stop > /dev/null 2>&1; then echo “Stopping $DAEMON daemon (pid: $to_stop) on host $HOSTNAME.” kill $to_stop else echo “No $DAEMON daemon (pid: $to_stop) is running anymore on $HOSTNAME.” fi fi else echo “No $DAEMON daemon to stop on host $HOSTNAME.” fi ;; (stop-all) if [ -f “$pid” ]; then mv “$pid” “${pid}.tmp” while read to_stop; do if kill -0 $to_stop > /dev/null 2>&1; then echo “Stopping $DAEMON daemon (pid: $to_stop) on host $HOSTNAME.” kill $to_stop else echo “Skipping $DAEMON daemon (pid: $to_stop), because it is not running anymore on $HOSTNAME.” fi done < “${pid}.tmp” rm “${pid}.tmp” fi ;; () echo “Unexpected argument ‘$STARTSTOP’. $USAGE.” exit 1 ;;esac使用flink-daemon.sh启动的flink使用的logback配置文件是logback.xmlflink-console.shflink-release-1.7.1/flink-dist/src/main/flink-bin/bin/flink-console.sh#!/usr/bin/env bash################################################################################# Licensed to the Apache Software Foundation (ASF) under one# or more contributor license agreements. See the NOTICE file# distributed with this work for additional information# regarding copyright ownership. The ASF licenses this file# to you under the Apache License, Version 2.0 (the# “License”); you may not use this file except in compliance# with the License. You may obtain a copy of the License at## http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an “AS IS” BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.################################################################################# Start a Flink service as a console application. Must be stopped with Ctrl-C# or with SIGTERM by kill or the controlling process.USAGE=“Usage: flink-console.sh (taskexecutor|zookeeper|historyserver|standalonesession|standalonejob) [args]“SERVICE=$1ARGS=("${@:2}”) # get remaining arguments as arraybin=dirname "$0"bin=cd "$bin"; pwd. “$bin”/config.shcase $SERVICE in (taskexecutor) CLASS_TO_RUN=org.apache.flink.runtime.taskexecutor.TaskManagerRunner ;; (historyserver) CLASS_TO_RUN=org.apache.flink.runtime.webmonitor.history.HistoryServer ;; (zookeeper) CLASS_TO_RUN=org.apache.flink.runtime.zookeeper.FlinkZooKeeperQuorumPeer ;; (standalonesession) CLASS_TO_RUN=org.apache.flink.runtime.entrypoint.StandaloneSessionClusterEntrypoint ;; (standalonejob) CLASS_TO_RUN=org.apache.flink.container.entrypoint.StandaloneJobClusterEntryPoint ;; () echo “Unknown service ‘${SERVICE}’. 
$USAGE.” exit 1 ;;esacFLINK_TM_CLASSPATH=constructFlinkClassPathlog_setting=("-Dlog4j.configuration=file:${FLINK_CONF_DIR}/log4j-console.properties” “-Dlogback.configurationFile=file:${FLINK_CONF_DIR}/logback-console.xml”)JAVA_VERSION=$(${JAVA_RUN} -version 2>&1 | sed ’s/.version “(.).(.)..”/\1\2/; 1q’)# Only set JVM 8 arguments if we have correctly extracted the versionif [[ ${JAVA_VERSION} =~ ${IS_NUMBER} ]]; then if [ “$JAVA_VERSION” -lt 18 ]; then JVM_ARGS="$JVM_ARGS -XX:MaxPermSize=256m" fifiecho “Starting $SERVICE as a console application on host $HOSTNAME.“exec $JAVA_RUN $JVM_ARGS ${FLINK_ENV_JAVA_OPTS} “${log_setting[@]}” -classpath “manglePathList "$FLINK_TM_CLASSPATH:$INTERNAL_HADOOP_CLASSPATHS"” ${CLASS_TO_RUN} “${ARGS[@]}“使用flink-console.sh启动的flink使用的logback配置文件是logback-console.xmlyarn-session.shflink-release-1.7.1/flink-dist/src/main/flink-bin/yarn-bin/yarn-session.sh#!/usr/bin/env bash################################################################################# Licensed to the Apache Software Foundation (ASF) under one# or more contributor license agreements. See the NOTICE file# distributed with this work for additional information# regarding copyright ownership. The ASF licenses this file# to you under the Apache License, Version 2.0 (the# “License”); you may not use this file except in compliance# with the License. You may obtain a copy of the License at## http://www.apache.org/licenses/LICENSE-2.0## Unless required by applicable law or agreed to in writing, software# distributed under the License is distributed on an “AS IS” BASIS,# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.# See the License for the specific language governing permissions and# limitations under the License.################################################################################bin=dirname "$0"bin=cd "$bin"; pwd# get Flink config. “$bin”/config.shif [ “$FLINK_IDENT_STRING” = "” ]; then FLINK_IDENT_STRING="$USER"fiJVM_ARGS="$JVM_ARGS -Xmx512m"CC_CLASSPATH=manglePathList $(constructFlinkClassPath):$INTERNAL_HADOOP_CLASSPATHSlog=$FLINK_LOG_DIR/flink-$FLINK_IDENT_STRING-yarn-session-$HOSTNAME.loglog_setting="-Dlog.file="$log” -Dlog4j.configuration=file:"$FLINK_CONF_DIR”/log4j-yarn-session.properties -Dlogback.configurationFile=file:"$FLINK_CONF_DIR”/logback-yarn.xml"export FLINK_CONF_DIR$JAVA_RUN $JVM_ARGS -classpath “$CC_CLASSPATH” $log_setting org.apache.flink.yarn.cli.FlinkYarnSessionCli -j “$FLINK_LIB_DIR”/flink-dist*.jar “$@“使用yarn-session.sh启动的flink使用的logback配置文件是logback-yarn.xml小结client端使用logback的话,要在pom文件添加logback-core、logback-classic及log4j-over-slf4j依赖,之后对flink-java、flink-streaming-java_2.11、flink-clients_2.11等配置log4j及slf4j-log4j12的exclusions;最后通过mvn dependency:tree查看是否还有log4j12,以确认下是否都全部排除了服务端使用logback的话,要在添加logback-classic.jar、logback-core.jar、log4j-over-slf4j.jar到flink的lib目录下(比如/opt/flink/lib);移除flink的lib目录下(比如/opt/flink/lib)log4j及slf4j-log4j12的jar(比如log4j-1.2.17.jar及slf4j-log4j12-1.7.15.jar);如果要自定义logback的配置的话,可以覆盖flink的conf目录下的logback.xml、logback-console.xml或者logback-yarn.xml使用flink-daemon.sh启动的flink使用的logback配置文件是logback.xml;使用flink-console.sh启动的flink使用的logback配置文件是logback-console.xml;使用yarn-session.sh启动的flink使用的logback配置文件是logback-yarn.xmldocUsing Logback instead of Log4j ...
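If a custom logback configuration is needed, a minimal logback.xml sketch along the following lines could replace the one under the conf directory; the pattern and the logger name are illustrative and are not Flink's shipped defaults. ${log.file} is the system property that the startup scripts pass in via -Dlog.file=...:

<configuration>
    <appender name="file" class="ch.qos.logback.core.FileAppender">
        <file>${log.file}</file>
        <append>false</append>
        <encoder>
            <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{60} - %msg%n</pattern>
        </encoder>
    </appender>

    <root level="INFO">
        <appender-ref ref="file"/>
    </root>

    <!-- example: quiet down a chatty package -->
    <logger name="org.apache.zookeeper" level="WARN"/>
</configuration>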

February 14, 2019 · 6 min · jiezi

A look at Flink's Execution Plan Visualization

序本文主要研究一下flink的Execution Plan Visualization实例代码 @Test public void testExecutionPlan(){ final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); DataStream<Tuple2<String,Integer>> dataStream = env.fromElements(WORDS) .flatMap(new WordCountTest.Tokenizer()) .keyBy(0) .sum(1); dataStream.print(); System.out.println(env.getExecutionPlan()); }json{ “nodes”: [ { “id”: 1, “type”: “Source: Collection Source”, “pact”: “Data Source”, “contents”: “Source: Collection Source”, “parallelism”: 1 }, { “id”: 2, “type”: “Flat Map”, “pact”: “Operator”, “contents”: “Flat Map”, “parallelism”: 4, “predecessors”: [ { “id”: 1, “ship_strategy”: “REBALANCE”, “side”: “second” } ] }, { “id”: 4, “type”: “Keyed Aggregation”, “pact”: “Operator”, “contents”: “Keyed Aggregation”, “parallelism”: 4, “predecessors”: [ { “id”: 2, “ship_strategy”: “HASH”, “side”: “second” } ] }, { “id”: 5, “type”: “Sink: Print to Std. Out”, “pact”: “Data Sink”, “contents”: “Sink: Print to Std. Out”, “parallelism”: 4, “predecessors”: [ { “id”: 4, “ship_strategy”: “FORWARD”, “side”: “second” } ] } ]}可视化打开flink plan visualizer将上面的json,输入到文本框,点击Draw进行可视化如下:StreamExecutionEnvironment.getExecutionPlanflink-streaming-java_2.11-1.7.1-sources.jar!/org/apache/flink/streaming/api/environment/StreamExecutionEnvironment.java@Publicpublic abstract class StreamExecutionEnvironment { //…… /** * Creates the plan with which the system will execute the program, and * returns it as a String using a JSON representation of the execution data * flow graph. Note that this needs to be called, before the plan is * executed. * * @return The execution plan of the program, as a JSON String. / public String getExecutionPlan() { return getStreamGraph().getStreamingPlanAsJSON(); } /* * Getter of the {@link org.apache.flink.streaming.api.graph.StreamGraph} of the streaming job. * * @return The streamgraph representing the transformations */ @Internal public StreamGraph getStreamGraph() { if (transformations.size() <= 0) { throw new IllegalStateException(“No operators defined in streaming topology. 
Cannot execute.”); } return StreamGraphGenerator.generate(this, transformations); } //……}StreamExecutionEnvironment的getExecutionPlan方法调用了getStreamGraph方法;getStreamGraph方法使用StreamGraphGenerator.generate生成了StreamGraph;之后就是调用StreamGraph.getStreamingPlanAsJSON来获取json格式的execution planStreamGraph.getStreamingPlanAsJSONflink-streaming-java_2.11-1.7.1-sources.jar!/org/apache/flink/streaming/api/graph/StreamGraph.java@Internalpublic class StreamGraph extends StreamingPlan { private static final Logger LOG = LoggerFactory.getLogger(StreamGraph.class); private String jobName = StreamExecutionEnvironment.DEFAULT_JOB_NAME; private final StreamExecutionEnvironment environment; private final ExecutionConfig executionConfig; private final CheckpointConfig checkpointConfig; private boolean chaining; private Map<Integer, StreamNode> streamNodes; private Set<Integer> sources; private Set<Integer> sinks; private Map<Integer, Tuple2<Integer, List<String>>> virtualSelectNodes; private Map<Integer, Tuple2<Integer, OutputTag>> virtualSideOutputNodes; private Map<Integer, Tuple2<Integer, StreamPartitioner<?>>> virtualPartitionNodes; protected Map<Integer, String> vertexIDtoBrokerID; protected Map<Integer, Long> vertexIDtoLoopTimeout; private StateBackend stateBackend; private Set<Tuple2<StreamNode, StreamNode>> iterationSourceSinkPairs; //…… public String getStreamingPlanAsJSON() { try { return new JSONGenerator(this).getJSON(); } catch (Exception e) { throw new RuntimeException(“JSON plan creation failed”, e); } } //……}StreamGraph的getStreamingPlanAsJSON方法使用JSONGenerator来序列化自己,返回json格式的execution plan小结flink提供了flink plan visualizer的在线地址,用于进行execution plan的可视化,它接收json形式的execution planStreamExecutionEnvironment的getExecutionPlan方法调用了getStreamGraph方法;getStreamGraph方法使用StreamGraphGenerator.generate生成了StreamGraphStreamGraph的getStreamingPlanAsJSON方法使用JSONGenerator来序列化自己,返回json格式的execution plandocExecution Plansflink plan visualizer ...
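A self-contained variant of the test above (the class name and sample sentences are made up) that can be run as a normal main method; it prints the JSON plan, which can then be pasted into the flink plan visualizer. Note that getExecutionPlan() has to be called before the plan is executed:

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class ExecutionPlanDemo {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        env.fromElements("to be or not to be", "that is the question")
            // split each line into (word, 1) pairs
            .flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
                @Override
                public void flatMap(String line, Collector<Tuple2<String, Integer>> out) {
                    for (String word : line.split(" ")) {
                        out.collect(Tuple2.of(word, 1));
                    }
                }
            })
            .keyBy(0)
            .sum(1)
            .print();

        // prints the JSON representation of the data flow graph built so far
        System.out.println(env.getExecutionPlan());
    }
}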

February 13, 2019 · 2 min · jiezi

A look at Flink's Parallel Execution

序本文主要研究一下flink的Parallel Execution实例Operator Levelfinal StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();DataStream<String> text = […]DataStream<Tuple2<String, Integer>> wordCounts = text .flatMap(new LineSplitter()) .keyBy(0) .timeWindow(Time.seconds(5)) .sum(1).setParallelism(5);wordCounts.print();env.execute(“Word Count Example”);operators、data sources、data sinks都可以调用setParallelism()方法来设置parallelismExecution Environment Levelfinal StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();env.setParallelism(3);DataStream<String> text = […]DataStream<Tuple2<String, Integer>> wordCounts = […]wordCounts.print();env.execute(“Word Count Example”);在ExecutionEnvironment里头可以通过setParallelism来给operators、data sources、data sinks设置默认的parallelism;如果operators、data sources、data sinks自己有设置parallelism则会覆盖ExecutionEnvironment设置的parallelismClient Level./bin/flink run -p 10 ../examples/WordCount-java.jar或者try { PackagedProgram program = new PackagedProgram(file, args); InetSocketAddress jobManagerAddress = RemoteExecutor.getInetFromHostport(“localhost:6123”); Configuration config = new Configuration(); Client client = new Client(jobManagerAddress, config, program.getUserCodeClassLoader()); // set the parallelism to 10 here client.run(program, 10, true);} catch (ProgramInvocationException e) { e.printStackTrace();}使用CLI client,可以在命令行调用是用-p来指定,或者Java/Scala调用时在Client.run的参数中指定parallelismSystem Level# The parallelism used for programs that did not specify and other parallelism.parallelism.default: 1可以在flink-conf.yaml中通过parallelism.default配置项给所有execution environments指定系统级的默认parallelismExecutionEnvironmentflink-java-1.7.1-sources.jar!/org/apache/flink/api/java/ExecutionEnvironment.java@Publicpublic abstract class ExecutionEnvironment { //…… private final ExecutionConfig config = new ExecutionConfig(); /** * Sets the parallelism for operations executed through this environment. * Setting a parallelism of x here will cause all operators (such as join, map, reduce) to run with * x parallel instances. * * <p>This method overrides the default parallelism for this environment. * The {@link LocalEnvironment} uses by default a value equal to the number of hardware * contexts (CPU cores / threads). When executing the program via the command line client * from a JAR file, the default parallelism is the one configured for that setup. * * @param parallelism The parallelism / public void setParallelism(int parallelism) { config.setParallelism(parallelism); } @Internal public Plan createProgramPlan(String jobName, boolean clearSinks) { if (this.sinks.isEmpty()) { if (wasExecuted) { throw new RuntimeException(“No new data sinks have been defined since the " + “last execution. The last execution refers to the latest call to " + “’execute()’, ‘count()’, ‘collect()’, or ‘print()’.”); } else { throw new RuntimeException(“No data sinks have been created yet. " + “A program needs at least one sink that consumes data. " + “Examples are writing the data set or printing it.”); } } if (jobName == null) { jobName = getDefaultName(); } OperatorTranslation translator = new OperatorTranslation(); Plan plan = translator.translateToPlan(this.sinks, jobName); if (getParallelism() > 0) { plan.setDefaultParallelism(getParallelism()); } plan.setExecutionConfig(getConfig()); // Check plan for GenericTypeInfo’s and register the types at the serializers. 
if (!config.isAutoTypeRegistrationDisabled()) { plan.accept(new Visitor<org.apache.flink.api.common.operators.Operator<?>>() { private final Set<Class<?>> registeredTypes = new HashSet<>(); private final Set<org.apache.flink.api.common.operators.Operator<?>> visitedOperators = new HashSet<>(); @Override public boolean preVisit(org.apache.flink.api.common.operators.Operator<?> visitable) { if (!visitedOperators.add(visitable)) { return false; } OperatorInformation<?> opInfo = visitable.getOperatorInfo(); Serializers.recursivelyRegisterType(opInfo.getOutputType(), config, registeredTypes); return true; } @Override public void postVisit(org.apache.flink.api.common.operators.Operator<?> visitable) {} }); } try { registerCachedFilesWithPlan(plan); } catch (Exception e) { throw new RuntimeException(“Error while registering cached files: " + e.getMessage(), e); } // clear all the sinks such that the next execution does not redo everything if (clearSinks) { this.sinks.clear(); wasExecuted = true; } // All types are registered now. Print information. int registeredTypes = config.getRegisteredKryoTypes().size() + config.getRegisteredPojoTypes().size() + config.getRegisteredTypesWithKryoSerializerClasses().size() + config.getRegisteredTypesWithKryoSerializers().size(); int defaultKryoSerializers = config.getDefaultKryoSerializers().size() + config.getDefaultKryoSerializerClasses().size(); LOG.info(“The job has {} registered types and {} default Kryo serializers”, registeredTypes, defaultKryoSerializers); if (config.isForceKryoEnabled() && config.isForceAvroEnabled()) { LOG.warn(“In the ExecutionConfig, both Avro and Kryo are enforced. Using Kryo serializer”); } if (config.isForceKryoEnabled()) { LOG.info(“Using KryoSerializer for serializing POJOs”); } if (config.isForceAvroEnabled()) { LOG.info(“Using AvroSerializer for serializing POJOs”); } if (LOG.isDebugEnabled()) { LOG.debug(“Registered Kryo types: {}”, config.getRegisteredKryoTypes().toString()); LOG.debug(“Registered Kryo with Serializers types: {}”, config.getRegisteredTypesWithKryoSerializers().entrySet().toString()); LOG.debug(“Registered Kryo with Serializer Classes types: {}”, config.getRegisteredTypesWithKryoSerializerClasses().entrySet().toString()); LOG.debug(“Registered Kryo default Serializers: {}”, config.getDefaultKryoSerializers().entrySet().toString()); LOG.debug(“Registered Kryo default Serializers Classes {}”, config.getDefaultKryoSerializerClasses().entrySet().toString()); LOG.debug(“Registered POJO types: {}”, config.getRegisteredPojoTypes().toString()); // print information about static code analysis LOG.debug(“Static code analysis mode: {}”, config.getCodeAnalysisMode()); } return plan; } //……}ExecutionEnvironment提供了setParallelism方法,给ExecutionConfig指定parallelism;最后createProgramPlan方法创建Plan后会读取ExecutionConfig的parallelism,给Plan设置defaultParallelismLocalEnvironmentflink-java-1.7.1-sources.jar!/org/apache/flink/api/java/LocalEnvironment.java@Publicpublic class LocalEnvironment extends ExecutionEnvironment { //…… public JobExecutionResult execute(String jobName) throws Exception { if (executor == null) { startNewSession(); } Plan p = createProgramPlan(jobName); // Session management is disabled, revert this commit to enable //p.setJobId(jobID); //p.setSessionTimeout(sessionTimeout); JobExecutionResult result = executor.executePlan(p); this.lastJobExecutionResult = result; return result; } 
//……}LocalEnvironment的execute调用的是LocalExecutor的executePlanLocalExecutorflink-clients_2.11-1.7.1-sources.jar!/org/apache/flink/client/LocalExecutor.javapublic class LocalExecutor extends PlanExecutor { //…… @Override public JobExecutionResult executePlan(Plan plan) throws Exception { if (plan == null) { throw new IllegalArgumentException(“The plan may not be null.”); } synchronized (this.lock) { // check if we start a session dedicated for this execution final boolean shutDownAtEnd; if (jobExecutorService == null) { shutDownAtEnd = true; // configure the number of local slots equal to the parallelism of the local plan if (this.taskManagerNumSlots == DEFAULT_TASK_MANAGER_NUM_SLOTS) { int maxParallelism = plan.getMaximumParallelism(); if (maxParallelism > 0) { this.taskManagerNumSlots = maxParallelism; } } // start the cluster for us start(); } else { // we use the existing session shutDownAtEnd = false; } try { // TODO: Set job’s default parallelism to max number of slots final int slotsPerTaskManager = jobExecutorServiceConfiguration.getInteger(TaskManagerOptions.NUM_TASK_SLOTS, taskManagerNumSlots); final int numTaskManagers = jobExecutorServiceConfiguration.getInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1); plan.setDefaultParallelism(slotsPerTaskManager * numTaskManagers); Optimizer pc = new Optimizer(new DataStatistics(), jobExecutorServiceConfiguration); OptimizedPlan op = pc.compile(plan); JobGraphGenerator jgg = new JobGraphGenerator(jobExecutorServiceConfiguration); JobGraph jobGraph = jgg.compileJobGraph(op, plan.getJobId()); return jobExecutorService.executeJobBlocking(jobGraph); } finally { if (shutDownAtEnd) { stop(); } } } } //……}LocalExecutor的executePlan方法还会根据slotsPerTaskManager及numTaskManagers对plan设置defaultParallelismRemoteEnvironmentflink-java-1.7.1-sources.jar!/org/apache/flink/api/java/RemoteEnvironment.java@Publicpublic class RemoteEnvironment extends ExecutionEnvironment { //…… public JobExecutionResult execute(String jobName) throws Exception { PlanExecutor executor = getExecutor(); Plan p = createProgramPlan(jobName); // Session management is disabled, revert this commit to enable //p.setJobId(jobID); //p.setSessionTimeout(sessionTimeout); JobExecutionResult result = executor.executePlan(p); this.lastJobExecutionResult = result; return result; } //……}RemoteEnvironment的execute调用的是RemoteExecutor的executePlanRemoteExecutorflink-clients_2.11-1.7.1-sources.jar!/org/apache/flink/client/RemoteExecutor.javapublic class RemoteExecutor extends PlanExecutor { private final Object lock = new Object(); private final List<URL> jarFiles; private final List<URL> globalClasspaths; private final Configuration clientConfiguration; private ClusterClient<?> client; //…… @Override public JobExecutionResult executePlan(Plan plan) throws Exception { if (plan == null) { throw new IllegalArgumentException(“The plan may not be null.”); } JobWithJars p = new JobWithJars(plan, this.jarFiles, this.globalClasspaths); return executePlanWithJars(p); } public JobExecutionResult executePlanWithJars(JobWithJars program) throws Exception { if (program == null) { throw new IllegalArgumentException(“The job may not be null.”); } synchronized (this.lock) { // check if we start a session dedicated for this execution final boolean shutDownAtEnd; if (client == null) { shutDownAtEnd = true; // start the executor for us start(); } else { // we use the existing session shutDownAtEnd = false; } try { return client.run(program, defaultParallelism).getJobExecutionResult(); } finally { if (shutDownAtEnd) { 
stop(); } } } } //……}RemoteExecutor的executePlan调用了executePlanWithJars方法,而后者则调用了ClusterClient的run,并在参数中指定了defaultParallelismClusterClientflink-clients_2.11-1.7.1-sources.jar!/org/apache/flink/client/program/ClusterClient.javapublic abstract class ClusterClient<T> { //…… public JobSubmissionResult run(JobWithJars program, int parallelism) throws ProgramInvocationException { return run(program, parallelism, SavepointRestoreSettings.none()); } public JobSubmissionResult run(JobWithJars jobWithJars, int parallelism, SavepointRestoreSettings savepointSettings) throws CompilerException, ProgramInvocationException { ClassLoader classLoader = jobWithJars.getUserCodeClassLoader(); if (classLoader == null) { throw new IllegalArgumentException(“The given JobWithJars does not provide a usercode class loader.”); } OptimizedPlan optPlan = getOptimizedPlan(compiler, jobWithJars, parallelism); return run(optPlan, jobWithJars.getJarFiles(), jobWithJars.getClasspaths(), classLoader, savepointSettings); } private static OptimizedPlan getOptimizedPlan(Optimizer compiler, JobWithJars prog, int parallelism) throws CompilerException, ProgramInvocationException { return getOptimizedPlan(compiler, prog.getPlan(), parallelism); } public static OptimizedPlan getOptimizedPlan(Optimizer compiler, Plan p, int parallelism) throws CompilerException { Logger log = LoggerFactory.getLogger(ClusterClient.class); if (parallelism > 0 && p.getDefaultParallelism() <= 0) { log.debug(“Changing plan default parallelism from {} to {}”, p.getDefaultParallelism(), parallelism); p.setDefaultParallelism(parallelism); } log.debug(“Set parallelism {}, plan default parallelism {}”, parallelism, p.getDefaultParallelism()); return compiler.compile(p); } //……}ClusterClient的run方法中的parallelism在parallelism > 0以及p.getDefaultParallelism() <= 0的时候会作用到Plan中DataStreamSourceflink-streaming-java_2.11-1.7.1-sources.jar!/org/apache/flink/streaming/api/datastream/DataStreamSource.java@Publicpublic class DataStreamSource<T> extends SingleOutputStreamOperator<T> { boolean isParallel; public DataStreamSource(StreamExecutionEnvironment environment, TypeInformation<T> outTypeInfo, StreamSource<T, ?> operator, boolean isParallel, String sourceName) { super(environment, new SourceTransformation<>(sourceName, operator, outTypeInfo, environment.getParallelism())); this.isParallel = isParallel; if (!isParallel) { setParallelism(1); } } public DataStreamSource(SingleOutputStreamOperator<T> operator) { super(operator.environment, operator.getTransformation()); this.isParallel = true; } @Override public DataStreamSource<T> setParallelism(int parallelism) { if (parallelism != 1 && !isParallel) { throw new IllegalArgumentException(“Source: " + transformation.getId() + " is not a parallel source”); } else { super.setParallelism(parallelism); return this; } }}DataStreamSource继承了SingleOutputStreamOperator,它提供了setParallelism方法,最终调用的是父类SingleOutputStreamOperator的setParallelismSingleOutputStreamOperatorflink-streaming-java_2.11-1.7.1-sources.jar!/org/apache/flink/streaming/api/datastream/SingleOutputStreamOperator.java@Publicpublic class SingleOutputStreamOperator<T> extends DataStream<T> { //…… /* * Sets the parallelism for this operator. * * @param parallelism * The parallelism for this operator. * @return The operator with set parallelism. 
/ public SingleOutputStreamOperator<T> setParallelism(int parallelism) { Preconditions.checkArgument(canBeParallel() || parallelism == 1, “The parallelism of non parallel operator must be 1.”); transformation.setParallelism(parallelism); return this; } //……}SingleOutputStreamOperator的setParallelism最后是作用到StreamTransformationDataStreamSinkflink-streaming-java_2.11-1.7.1-sources.jar!/org/apache/flink/streaming/api/datastream/DataStreamSink.java@Publicpublic class DataStreamSink<T> { private final SinkTransformation<T> transformation; //…… /* * Sets the parallelism for this sink. The degree must be higher than zero. * * @param parallelism The parallelism for this sink. * @return The sink with set parallelism. */ public DataStreamSink<T> setParallelism(int parallelism) { transformation.setParallelism(parallelism); return this; } //……}DataStreamSink提供了setParallelism方法,最后是作用于SinkTransformation小结flink可以设置好几个level的parallelism,其中包括Operator Level、Execution Environment Level、Client Level、System Level在flink-conf.yaml中通过parallelism.default配置项给所有execution environments指定系统级的默认parallelism;在ExecutionEnvironment里头可以通过setParallelism来给operators、data sources、data sinks设置默认的parallelism;如果operators、data sources、data sinks自己有设置parallelism则会覆盖ExecutionEnvironment设置的parallelismExecutionEnvironment提供的setParallelism方法用于给ExecutionConfig指定parallelism(如果使用CLI client,可以在命令行调用是用-p来指定,或者Java/Scala调用时在Client.run的参数中指定parallelism;LocalEnvironment及RemoteEnvironment设置的parallelism最后都是设置到Plan中);DataStreamSource继承了SingleOutputStreamOperator,它提供了setParallelism方法,最终调用的是父类SingleOutputStreamOperator的setParallelism;SingleOutputStreamOperator的setParallelism最后是作用到StreamTransformation;DataStreamSink提供了setParallelism方法,最后是作用于SinkTransformationdocParallel Execution ...
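To make the levels concrete, here is a minimal sketch (class name and values are illustrative) that combines an Execution Environment Level default with an Operator Level override; the Client Level (-p on flink run) and System Level (parallelism.default in flink-conf.yaml) settings discussed above only apply where no explicit parallelism has been set:

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class ParallelismDemo {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Execution Environment Level: default for all operators of this job
        env.setParallelism(3);

        env.fromElements("to", "be", "or", "not", "to", "be")   // non-parallel source, stays at parallelism 1
            .map(new MapFunction<String, String>() {
                @Override
                public String map(String value) {
                    return value.toUpperCase();
                }
            })
            // Operator Level: overrides the environment default for this operator only
            .setParallelism(5)
            .print()
            // sinks can be overridden the same way via DataStreamSink.setParallelism
            .setParallelism(1);

        env.execute("parallelism demo");
    }
}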

February 12, 2019 · 7 min · jiezi

A look at Flink's RestartStrategies

序本文主要研究一下flink的RestartStrategiesRestartStrategiesflink-core-1.7.1-sources.jar!/org/apache/flink/api/common/restartstrategy/RestartStrategies.java@PublicEvolvingpublic class RestartStrategies { /** * Generates NoRestartStrategyConfiguration. * * @return NoRestartStrategyConfiguration / public static RestartStrategyConfiguration noRestart() { return new NoRestartStrategyConfiguration(); } public static RestartStrategyConfiguration fallBackRestart() { return new FallbackRestartStrategyConfiguration(); } /* * Generates a FixedDelayRestartStrategyConfiguration. * * @param restartAttempts Number of restart attempts for the FixedDelayRestartStrategy * @param delayBetweenAttempts Delay in-between restart attempts for the FixedDelayRestartStrategy * @return FixedDelayRestartStrategy / public static RestartStrategyConfiguration fixedDelayRestart(int restartAttempts, long delayBetweenAttempts) { return fixedDelayRestart(restartAttempts, Time.of(delayBetweenAttempts, TimeUnit.MILLISECONDS)); } /* * Generates a FixedDelayRestartStrategyConfiguration. * * @param restartAttempts Number of restart attempts for the FixedDelayRestartStrategy * @param delayInterval Delay in-between restart attempts for the FixedDelayRestartStrategy * @return FixedDelayRestartStrategy / public static RestartStrategyConfiguration fixedDelayRestart(int restartAttempts, Time delayInterval) { return new FixedDelayRestartStrategyConfiguration(restartAttempts, delayInterval); } /* * Generates a FailureRateRestartStrategyConfiguration. * * @param failureRate Maximum number of restarts in given interval {@code failureInterval} before failing a job * @param failureInterval Time interval for failures * @param delayInterval Delay in-between restart attempts / public static FailureRateRestartStrategyConfiguration failureRateRestart( int failureRate, Time failureInterval, Time delayInterval) { return new FailureRateRestartStrategyConfiguration(failureRate, failureInterval, delayInterval); } //……}RestartStrategies提供了noRestart、fallBackRestart、fixedDelayRestart、failureRateRestart静态方法用于构建RestartStrategyConfigurationRestartStrategyConfigurationflink-core-1.7.1-sources.jar!/org/apache/flink/api/common/restartstrategy/RestartStrategies.java public abstract static class RestartStrategyConfiguration implements Serializable { private static final long serialVersionUID = 6285853591578313960L; private RestartStrategyConfiguration() {} /* * Returns a description which is shown in the web interface. 
* * @return Description of the restart strategy / public abstract String getDescription(); }RestartStrategyConfiguration是个抽象类,它定义了getDescription抽象方法,它有NoRestartStrategyConfiguration、FixedDelayRestartStrategyConfiguration、FailureRateRestartStrategyConfiguration、FallbackRestartStrategyConfiguration这几个子类NoRestartStrategyConfigurationflink-core-1.7.1-sources.jar!/org/apache/flink/api/common/restartstrategy/RestartStrategies.java public static final class NoRestartStrategyConfiguration extends RestartStrategyConfiguration { private static final long serialVersionUID = -5894362702943349962L; @Override public String getDescription() { return “Restart deactivated.”; } @Override public boolean equals(Object o) { if (this == o) { return true; } return o instanceof NoRestartStrategyConfiguration; } @Override public int hashCode() { return Objects.hash(); } }NoRestartStrategyConfiguration继承了RestartStrategyConfiguration,它代表no restart strategyFixedDelayRestartStrategyConfigurationflink-core-1.7.1-sources.jar!/org/apache/flink/api/common/restartstrategy/RestartStrategies.java public static final class FixedDelayRestartStrategyConfiguration extends RestartStrategyConfiguration { private static final long serialVersionUID = 4149870149673363190L; private final int restartAttempts; private final Time delayBetweenAttemptsInterval; FixedDelayRestartStrategyConfiguration(int restartAttempts, Time delayBetweenAttemptsInterval) { this.restartAttempts = restartAttempts; this.delayBetweenAttemptsInterval = delayBetweenAttemptsInterval; } public int getRestartAttempts() { return restartAttempts; } public Time getDelayBetweenAttemptsInterval() { return delayBetweenAttemptsInterval; } @Override public int hashCode() { int result = restartAttempts; result = 31 * result + (delayBetweenAttemptsInterval != null ? delayBetweenAttemptsInterval.hashCode() : 0); return result; } @Override public boolean equals(Object obj) { if (obj instanceof FixedDelayRestartStrategyConfiguration) { FixedDelayRestartStrategyConfiguration other = (FixedDelayRestartStrategyConfiguration) obj; return restartAttempts == other.restartAttempts && delayBetweenAttemptsInterval.equals(other.delayBetweenAttemptsInterval); } else { return false; } } @Override public String getDescription() { return “Restart with fixed delay (” + delayBetweenAttemptsInterval + “). 
#” + restartAttempts + " restart attempts."; } }FixedDelayRestartStrategyConfiguration继承了RestartStrategyConfiguration,它代表fixed delay restart strategy,它有restartAttempts及delayBetweenAttemptsInterval两个属性FailureRateRestartStrategyConfigurationflink-core-1.7.1-sources.jar!/org/apache/flink/api/common/restartstrategy/RestartStrategies.java public static final class FailureRateRestartStrategyConfiguration extends RestartStrategyConfiguration { private static final long serialVersionUID = 1195028697539661739L; private final int maxFailureRate; private final Time failureInterval; private final Time delayBetweenAttemptsInterval; public FailureRateRestartStrategyConfiguration(int maxFailureRate, Time failureInterval, Time delayBetweenAttemptsInterval) { this.maxFailureRate = maxFailureRate; this.failureInterval = failureInterval; this.delayBetweenAttemptsInterval = delayBetweenAttemptsInterval; } public int getMaxFailureRate() { return maxFailureRate; } public Time getFailureInterval() { return failureInterval; } public Time getDelayBetweenAttemptsInterval() { return delayBetweenAttemptsInterval; } @Override public String getDescription() { return “Failure rate restart with maximum of " + maxFailureRate + " failures within interval " + failureInterval.toString() + " and fixed delay " + delayBetweenAttemptsInterval.toString(); } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } FailureRateRestartStrategyConfiguration that = (FailureRateRestartStrategyConfiguration) o; return maxFailureRate == that.maxFailureRate && Objects.equals(failureInterval, that.failureInterval) && Objects.equals(delayBetweenAttemptsInterval, that.delayBetweenAttemptsInterval); } @Override public int hashCode() { return Objects.hash(maxFailureRate, failureInterval, delayBetweenAttemptsInterval); } }FailureRateRestartStrategyConfiguration继承了RestartStrategyConfiguration,它代表failure rate restart strategy,它有maxFailureRate、failureInterval、delayBetweenAttemptsInterval三个属性FallbackRestartStrategyConfigurationflink-core-1.7.1-sources.jar!/org/apache/flink/api/common/restartstrategy/RestartStrategies.java public static final class FallbackRestartStrategyConfiguration extends RestartStrategyConfiguration { private static final long serialVersionUID = -4441787204284085544L; @Override public String getDescription() { return “Cluster level default restart strategy”; } @Override public boolean equals(Object o) { if (this == o) { return true; } return o instanceof FallbackRestartStrategyConfiguration; } @Override public int hashCode() { return Objects.hash(); } }FallbackRestartStrategyConfiguration继承了RestartStrategyConfiguration,它代表Cluster level default restart strategyRestartStrategyResolvingflink-runtime_2.11-1.7.1-sources.jar!/org/apache/flink/runtime/executiongraph/restart/RestartStrategyResolving.javapublic final class RestartStrategyResolving { /* * Resolves which {@link RestartStrategy} to use. It should be used only on the server side. 
* The resolving strategy is as follows: * <ol> * <li>Strategy set within job graph.</li> * <li>Strategy set flink-conf.yaml on the server set, unless is set to {@link NoRestartStrategy} and checkpointing * is enabled.</li> * <li>If no strategy was set on client and server side and checkpointing was enabled then * {@link FixedDelayRestartStrategy} is used</li> * </ol> * * @param clientConfiguration restart configuration given within the job graph * @param serverStrategyFactory default server side strategy factory * @param isCheckpointingEnabled if checkpointing was enabled for the job * @return resolved strategy / public static RestartStrategy resolve( RestartStrategies.RestartStrategyConfiguration clientConfiguration, RestartStrategyFactory serverStrategyFactory, boolean isCheckpointingEnabled) { final RestartStrategy clientSideRestartStrategy = RestartStrategyFactory.createRestartStrategy(clientConfiguration); if (clientSideRestartStrategy != null) { return clientSideRestartStrategy; } else { if (serverStrategyFactory instanceof NoOrFixedIfCheckpointingEnabledRestartStrategyFactory) { return ((NoOrFixedIfCheckpointingEnabledRestartStrategyFactory) serverStrategyFactory) .createRestartStrategy(isCheckpointingEnabled); } else { return serverStrategyFactory.createRestartStrategy(); } } } private RestartStrategyResolving() { }}RestartStrategyResolving提供了一个静态方法resolve,用于解析RestartStrategies.RestartStrategyConfiguration,然后使用RestartStrategyFactory创建RestartStrategyRestartStrategyflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/executiongraph/restart/RestartStrategy.javapublic interface RestartStrategy { /* * True if the restart strategy can be applied to restart the {@link ExecutionGraph}. * * @return true if restart is possible, otherwise false / boolean canRestart(); /* * Called by the ExecutionGraph to eventually trigger a full recovery. * The recovery must be triggered on the given callback object, and may be delayed * with the help of the given scheduled executor. * * <p>The thread that calls this method is not supposed to block/sleep. * * @param restarter The hook to restart the ExecutionGraph * @param executor An scheduled executor to delay the restart / void restart(RestartCallback restarter, ScheduledExecutor executor);}RestartStrategy定义了canRestart及restart两个方法,它有NoRestartStrategy、FixedDelayRestartStrategy、FailureRateRestartStrategy这几个子类NoRestartStrategyflink-runtime_2.11-1.7.1-sources.jar!/org/apache/flink/runtime/executiongraph/restart/NoRestartStrategy.javapublic class NoRestartStrategy implements RestartStrategy { @Override public boolean canRestart() { return false; } @Override public void restart(RestartCallback restarter, ScheduledExecutor executor) { throw new UnsupportedOperationException(“NoRestartStrategy does not support restart.”); } /* * Creates a NoRestartStrategyFactory instance. 
* * @param configuration Configuration object which is ignored * @return NoRestartStrategyFactory instance / public static NoRestartStrategyFactory createFactory(Configuration configuration) { return new NoRestartStrategyFactory(); } @Override public String toString() { return “NoRestartStrategy”; } public static class NoRestartStrategyFactory extends RestartStrategyFactory { private static final long serialVersionUID = -1809462525812787862L; @Override public RestartStrategy createRestartStrategy() { return new NoRestartStrategy(); } }}NoRestartStrategy实现了RestartStrategy接口,它的canRestart方法返回false,restart方法抛出UnsupportedOperationExceptionFixedDelayRestartStrategyflink-runtime_2.11-1.7.1-sources.jar!/org/apache/flink/runtime/executiongraph/restart/FixedDelayRestartStrategy.javapublic class FixedDelayRestartStrategy implements RestartStrategy { private final int maxNumberRestartAttempts; private final long delayBetweenRestartAttempts; private int currentRestartAttempt; public FixedDelayRestartStrategy( int maxNumberRestartAttempts, long delayBetweenRestartAttempts) { Preconditions.checkArgument(maxNumberRestartAttempts >= 0, “Maximum number of restart attempts must be positive.”); Preconditions.checkArgument(delayBetweenRestartAttempts >= 0, “Delay between restart attempts must be positive”); this.maxNumberRestartAttempts = maxNumberRestartAttempts; this.delayBetweenRestartAttempts = delayBetweenRestartAttempts; currentRestartAttempt = 0; } public int getCurrentRestartAttempt() { return currentRestartAttempt; } @Override public boolean canRestart() { return currentRestartAttempt < maxNumberRestartAttempts; } @Override public void restart(final RestartCallback restarter, ScheduledExecutor executor) { currentRestartAttempt++; executor.schedule(new Runnable() { @Override public void run() { restarter.triggerFullRecovery(); } }, delayBetweenRestartAttempts, TimeUnit.MILLISECONDS); } /* * Creates a FixedDelayRestartStrategy from the given Configuration. * * @param configuration Configuration containing the parameter values for the restart strategy * @return Initialized instance of FixedDelayRestartStrategy * @throws Exception */ public static FixedDelayRestartStrategyFactory createFactory(Configuration configuration) throws Exception { int maxAttempts = configuration.getInteger(ConfigConstants.RESTART_STRATEGY_FIXED_DELAY_ATTEMPTS, 1); String delayString = configuration.getString(ConfigConstants.RESTART_STRATEGY_FIXED_DELAY_DELAY); long delay; try { delay = Duration.apply(delayString).toMillis(); } catch (NumberFormatException nfe) { throw new Exception(“Invalid config value for " + ConfigConstants.RESTART_STRATEGY_FIXED_DELAY_DELAY + “: " + delayString + “. 
Value must be a valid duration (such as ‘100 milli’ or ‘10 s’)”); } return new FixedDelayRestartStrategyFactory(maxAttempts, delay); } @Override public String toString() { return “FixedDelayRestartStrategy(” + “maxNumberRestartAttempts=” + maxNumberRestartAttempts + “, delayBetweenRestartAttempts=” + delayBetweenRestartAttempts + ‘)’; } public static class FixedDelayRestartStrategyFactory extends RestartStrategyFactory { private static final long serialVersionUID = 6642934067762271950L; private final int maxAttempts; private final long delay; public FixedDelayRestartStrategyFactory(int maxAttempts, long delay) { this.maxAttempts = maxAttempts; this.delay = delay; } @Override public RestartStrategy createRestartStrategy() { return new FixedDelayRestartStrategy(maxAttempts, delay); } }}FixedDelayRestartStrategy实现了RestartStrategy接口,它的canRestart方法依据currentRestartAttempt及maxNumberRestartAttempts来判断;restart方法则直接调用ScheduledExecutor.schedule方法,延时delayBetweenRestartAttempts毫秒执行RestartCallback.triggerFullRecovery()FailureRateRestartStrategyflink-runtime_2.11-1.7.1-sources.jar!/org/apache/flink/runtime/executiongraph/restart/FailureRateRestartStrategy.javapublic class FailureRateRestartStrategy implements RestartStrategy { private final Time failuresInterval; private final Time delayInterval; private final int maxFailuresPerInterval; private final ArrayDeque<Long> restartTimestampsDeque; public FailureRateRestartStrategy(int maxFailuresPerInterval, Time failuresInterval, Time delayInterval) { Preconditions.checkNotNull(failuresInterval, “Failures interval cannot be null.”); Preconditions.checkNotNull(delayInterval, “Delay interval cannot be null.”); Preconditions.checkArgument(maxFailuresPerInterval > 0, “Maximum number of restart attempts per time unit must be greater than 0.”); Preconditions.checkArgument(failuresInterval.getSize() > 0, “Failures interval must be greater than 0 ms.”); Preconditions.checkArgument(delayInterval.getSize() >= 0, “Delay interval must be at least 0 ms.”); this.failuresInterval = failuresInterval; this.delayInterval = delayInterval; this.maxFailuresPerInterval = maxFailuresPerInterval; this.restartTimestampsDeque = new ArrayDeque<>(maxFailuresPerInterval); } @Override public boolean canRestart() { if (isRestartTimestampsQueueFull()) { Long now = System.currentTimeMillis(); Long earliestFailure = restartTimestampsDeque.peek(); return (now - earliestFailure) > failuresInterval.toMilliseconds(); } else { return true; } } @Override public void restart(final RestartCallback restarter, ScheduledExecutor executor) { if (isRestartTimestampsQueueFull()) { restartTimestampsDeque.remove(); } restartTimestampsDeque.add(System.currentTimeMillis()); executor.schedule(new Runnable() { @Override public void run() { restarter.triggerFullRecovery(); } }, delayInterval.getSize(), delayInterval.getUnit()); } private boolean isRestartTimestampsQueueFull() { return restartTimestampsDeque.size() >= maxFailuresPerInterval; } @Override public String toString() { return “FailureRateRestartStrategy(” + “failuresInterval=” + failuresInterval + “delayInterval=” + delayInterval + “maxFailuresPerInterval=” + maxFailuresPerInterval + “)”; } public static FailureRateRestartStrategyFactory createFactory(Configuration configuration) throws Exception { int maxFailuresPerInterval = configuration.getInteger(ConfigConstants.RESTART_STRATEGY_FAILURE_RATE_MAX_FAILURES_PER_INTERVAL, 1); String failuresIntervalString = configuration.getString( ConfigConstants.RESTART_STRATEGY_FAILURE_RATE_FAILURE_RATE_INTERVAL, 
Duration.apply(1, TimeUnit.MINUTES).toString() ); String timeoutString = configuration.getString(AkkaOptions.WATCH_HEARTBEAT_INTERVAL); String delayString = configuration.getString(ConfigConstants.RESTART_STRATEGY_FAILURE_RATE_DELAY, timeoutString); Duration failuresInterval = Duration.apply(failuresIntervalString); Duration delay = Duration.apply(delayString); return new FailureRateRestartStrategyFactory(maxFailuresPerInterval, Time.milliseconds(failuresInterval.toMillis()), Time.milliseconds(delay.toMillis())); } public static class FailureRateRestartStrategyFactory extends RestartStrategyFactory { private static final long serialVersionUID = -373724639430960480L; private final int maxFailuresPerInterval; private final Time failuresInterval; private final Time delayInterval; public FailureRateRestartStrategyFactory(int maxFailuresPerInterval, Time failuresInterval, Time delayInterval) { this.maxFailuresPerInterval = maxFailuresPerInterval; this.failuresInterval = Preconditions.checkNotNull(failuresInterval); this.delayInterval = Preconditions.checkNotNull(delayInterval); } @Override public RestartStrategy createRestartStrategy() { return new FailureRateRestartStrategy(maxFailuresPerInterval, failuresInterval, delayInterval); } }}FailureRateRestartStrategy实现了RestartStrategy接口,它的canRestart方法在restartTimestampsDeque队列大小小于maxFailuresPerInterval时返回true,大于等于maxFailuresPerInterval时则判断当前时间距离earliestFailure是否大于failuresInterval;restart方法则往restartTimestampsDeque添加当前时间,然后调用ScheduledExecutor.schedule方法,延时delayInterval执行RestartCallback.triggerFullRecovery()小结RestartStrategies提供了noRestart、fallBackRestart、fixedDelayRestart、failureRateRestart静态方法用于构建RestartStrategyConfigurationRestartStrategyConfiguration是个抽象类,它定义了getDescription抽象方法,它有NoRestartStrategyConfiguration、FixedDelayRestartStrategyConfiguration、FailureRateRestartStrategyConfiguration、FallbackRestartStrategyConfiguration这几个子类RestartStrategyResolving提供了一个静态方法resolve,用于解析RestartStrategies.RestartStrategyConfiguration,然后使用RestartStrategyFactory创建RestartStrategy;RestartStrategy定义了canRestart及restart两个方法,它有NoRestartStrategy、FixedDelayRestartStrategy、FailureRateRestartStrategy这几个子类docRestart Strategies ...
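As a usage note: the RestartStrategyConfiguration variants described above are normally created on the client through the RestartStrategies factory methods and handed to the job via the execution environment. A minimal Java sketch, where the attempt count and delay values are arbitrary illustration values:

import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.util.concurrent.TimeUnit;

public class RestartStrategyExample {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // fixed-delay strategy: at most 3 restart attempts, 10 seconds between attempts
        env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, Time.of(10, TimeUnit.SECONDS)));

        // alternatively, a failure-rate strategy:
        // env.setRestartStrategy(RestartStrategies.failureRateRestart(
        //         3, Time.of(5, TimeUnit.MINUTES), Time.of(10, TimeUnit.SECONDS)));

        // trivial pipeline so the sketch actually runs
        env.fromElements(1, 2, 3).print();
        env.execute("restart-strategy-demo");
    }
}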

February 11, 2019

A look at Flink's CsvTableSource

序本文主要研究一下flink的CsvTableSourceTableSourceflink-table_2.11-1.7.1-sources.jar!/org/apache/flink/table/sources/TableSource.scalatrait TableSource[T] { /** Returns the [[TypeInformation]] for the return type of the [[TableSource]]. * The fields of the return type are mapped to the table schema based on their name. * * @return The type of the returned [[DataSet]] or [[DataStream]]. / def getReturnType: TypeInformation[T] /* * Returns the schema of the produced table. * * @return The [[TableSchema]] of the produced table. / def getTableSchema: TableSchema /* * Describes the table source. * * @return A String explaining the [[TableSource]]. / def explainSource(): String = TableConnectorUtil.generateRuntimeName(getClass, getTableSchema.getFieldNames)}TableSource定义了三个方法,分别是getReturnType、getTableSchema、explainSourceBatchTableSourceflink-table_2.11-1.7.1-sources.jar!/org/apache/flink/table/sources/BatchTableSource.scalatrait BatchTableSource[T] extends TableSource[T] { /* * Returns the data of the table as a [[DataSet]]. * * NOTE: This method is for internal use only for defining a [[TableSource]]. * Do not use it in Table API programs. / def getDataSet(execEnv: ExecutionEnvironment): DataSet[T]}BatchTableSource继承了TableSource,它定义了getDataSet方法StreamTableSourceflink-table_2.11-1.7.1-sources.jar!/org/apache/flink/table/sources/StreamTableSource.scalatrait StreamTableSource[T] extends TableSource[T] { /* * Returns the data of the table as a [[DataStream]]. * * NOTE: This method is for internal use only for defining a [[TableSource]]. * Do not use it in Table API programs. / def getDataStream(execEnv: StreamExecutionEnvironment): DataStream[T]}StreamTableSource继承了TableSource,它定义了getDataStream方法CsvTableSourceflink-table_2.11-1.7.1-sources.jar!/org/apache/flink/table/sources/CsvTableSource.scalaclass CsvTableSource private ( private val path: String, private val fieldNames: Array[String], private val fieldTypes: Array[TypeInformation[]], private val selectedFields: Array[Int], private val fieldDelim: String, private val rowDelim: String, private val quoteCharacter: Character, private val ignoreFirstLine: Boolean, private val ignoreComments: String, private val lenient: Boolean) extends BatchTableSource[Row] with StreamTableSource[Row] with ProjectableTableSource[Row] { def this( path: String, fieldNames: Array[String], fieldTypes: Array[TypeInformation[]], fieldDelim: String = CsvInputFormat.DEFAULT_FIELD_DELIMITER, rowDelim: String = CsvInputFormat.DEFAULT_LINE_DELIMITER, quoteCharacter: Character = null, ignoreFirstLine: Boolean = false, ignoreComments: String = null, lenient: Boolean = false) = { this( path, fieldNames, fieldTypes, fieldTypes.indices.toArray, // initially, all fields are returned fieldDelim, rowDelim, quoteCharacter, ignoreFirstLine, ignoreComments, lenient) } def this(path: String, fieldNames: Array[String], fieldTypes: Array[TypeInformation[]]) = { this(path, fieldNames, fieldTypes, CsvInputFormat.DEFAULT_FIELD_DELIMITER, CsvInputFormat.DEFAULT_LINE_DELIMITER, null, false, null, false) } if (fieldNames.length != fieldTypes.length) { throw new TableException(“Number of field names and field types must be equal.”) } private val selectedFieldTypes = selectedFields.map(fieldTypes()) private val selectedFieldNames = selectedFields.map(fieldNames(_)) private val returnType: RowTypeInfo = new RowTypeInfo(selectedFieldTypes, selectedFieldNames) override def getDataSet(execEnv: ExecutionEnvironment): DataSet[Row] = { execEnv.createInput(createCsvInput(), returnType).name(explainSource()) } /* 
Returns the [[RowTypeInfo]] for the return type of the [[CsvTableSource]]. / override def getReturnType: RowTypeInfo = returnType override def getDataStream(streamExecEnv: StreamExecutionEnvironment): DataStream[Row] = { streamExecEnv.createInput(createCsvInput(), returnType).name(explainSource()) } /* Returns the schema of the produced table. / override def getTableSchema = new TableSchema(fieldNames, fieldTypes) /* Returns a copy of [[TableSource]] with ability to project fields */ override def projectFields(fields: Array[Int]): CsvTableSource = { val selectedFields = if (fields.isEmpty) Array(0) else fields new CsvTableSource( path, fieldNames, fieldTypes, selectedFields, fieldDelim, rowDelim, quoteCharacter, ignoreFirstLine, ignoreComments, lenient) } private def createCsvInput(): RowCsvInputFormat = { val inputFormat = new RowCsvInputFormat( new Path(path), selectedFieldTypes, rowDelim, fieldDelim, selectedFields) inputFormat.setSkipFirstLineAsHeader(ignoreFirstLine) inputFormat.setLenient(lenient) if (quoteCharacter != null) { inputFormat.enableQuotedStringParsing(quoteCharacter) } if (ignoreComments != null) { inputFormat.setCommentPrefix(ignoreComments) } inputFormat } override def equals(other: Any): Boolean = other match { case that: CsvTableSource => returnType == that.returnType && path == that.path && fieldDelim == that.fieldDelim && rowDelim == that.rowDelim && quoteCharacter == that.quoteCharacter && ignoreFirstLine == that.ignoreFirstLine && ignoreComments == that.ignoreComments && lenient == that.lenient case _ => false } override def hashCode(): Int = { returnType.hashCode() } override def explainSource(): String = { s"CsvTableSource(" + s"read fields: ${getReturnType.getFieldNames.mkString(", “)})” }}CsvTableSource同时实现了BatchTableSource及StreamTableSource接口;getDataSet方法使用ExecutionEnvironment.createInput创建DataSet;getDataStream方法使用StreamExecutionEnvironment.createInput创建DataStreamExecutionEnvironment.createInput及StreamExecutionEnvironment.createInput接收的InputFormat为RowCsvInputFormat,通过createCsvInput创建而来getTableSchema方法返回的TableSchema通过fieldNames及fieldTypes创建;getReturnType方法返回的RowTypeInfo通过selectedFieldTypes及selectedFieldNames创建;explainSource方法这里返回的是CsvTableSource开头的字符串小结TableSource定义了三个方法,分别是getReturnType、getTableSchema、explainSource;BatchTableSource继承了TableSource,它定义了getDataSet方法;StreamTableSource继承了TableSource,它定义了getDataStream方法CsvTableSource同时实现了BatchTableSource及StreamTableSource接口;getDataSet方法使用ExecutionEnvironment.createInput创建DataSet;getDataStream方法使用StreamExecutionEnvironment.createInput创建DataStreamExecutionEnvironment.createInput及StreamExecutionEnvironment.createInput接收的InputFormat为RowCsvInputFormat,通过createCsvInput创建而来;getTableSchema方法返回的TableSchema通过fieldNames及fieldTypes创建;getReturnType方法返回的RowTypeInfo通过selectedFieldTypes及selectedFieldNames创建;explainSource方法这里返回的是CsvTableSource开头的字符串docDefine a TableSource ...
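As a usage sketch, the three-argument constructor analysed above is enough to register a CsvTableSource and query it; the file path, table name and field layout below are placeholders:

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.table.sources.CsvTableSource;
import org.apache.flink.types.Row;

public class CsvTableSourceExample {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);

        // uses the (path, fieldNames, fieldTypes) constructor shown above
        CsvTableSource csvSource = new CsvTableSource(
                "/tmp/books.csv",
                new String[]{"id", "title", "price"},
                new TypeInformation<?>[]{Types.INT, Types.STRING, Types.DOUBLE});

        tableEnv.registerTableSource("books", csvSource);

        Table result = tableEnv.scan("books").select("id, title");
        tableEnv.toAppendStream(result, Row.class).print();
        env.execute("csv-table-source-demo");
    }
}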

February 5, 2019

A look at Flink's Table Formats

序本文主要研究一下flink的Table Formats实例CSV Format.withFormat( new Csv() .field(“field1”, Types.STRING) // required: ordered format fields .field(“field2”, Types.TIMESTAMP) .fieldDelimiter(",") // optional: string delimiter “,” by default .lineDelimiter("\n") // optional: string delimiter “\n” by default .quoteCharacter(’"’) // optional: single character for string values, empty by default .commentPrefix(’#’) // optional: string to indicate comments, empty by default .ignoreFirstLine() // optional: ignore the first line, by default it is not skipped .ignoreParseErrors() // optional: skip records with parse error instead of failing by default)flink内置支持csv format,无需添加额外依赖JSON Format.withFormat( new Json() .failOnMissingField(true) // optional: flag whether to fail if a field is missing or not, false by default // required: define the schema either by using type information which parses numbers to corresponding types .schema(Type.ROW(…)) // or by using a JSON schema which parses to DECIMAL and TIMESTAMP .jsonSchema( “{” + " type: ‘object’," + " properties: {" + " lon: {" + " type: ’number’" + " }," + " rideTime: {" + " type: ‘string’," + " format: ‘date-time’" + " }" + " }" + “}” ) // or use the table’s schema .deriveSchema())可以使用schema或者jsonSchema或者deriveSchema来定义json format,需要额外添加flink-json依赖Apache Avro Format.withFormat( new Avro() // required: define the schema either by using an Avro specific record class .recordClass(User.class) // or by using an Avro schema .avroSchema( “{” + " "type": "record"," + " "name": "test"," + " "fields" : [" + " {"name": "a", "type": "long"}," + " {"name": "b", "type": "string"}" + " ]" + “}” ))可以使用recordClass或者avroSchema来定义Avro schema,需要添加flink-avro依赖ConnectTableDescriptorflink-table_2.11-1.7.1-sources.jar!/org/apache/flink/table/descriptors/ConnectTableDescriptor.scalaabstract class ConnectTableDescriptor[D <: ConnectTableDescriptor[D]]( private val tableEnv: TableEnvironment, private val connectorDescriptor: ConnectorDescriptor) extends TableDescriptor with SchematicDescriptor[D] with RegistrableDescriptor { this: D => private var formatDescriptor: Option[FormatDescriptor] = None private var schemaDescriptor: Option[Schema] = None //…… override def withFormat(format: FormatDescriptor): D = { formatDescriptor = Some(format) this } //……}StreamTableEnvironment的connect方法创建StreamTableDescriptor;StreamTableDescriptor继承了ConnectTableDescriptor;ConnectTableDescriptor提供了withFormat方法,返回FormatDescriptorFormatDescriptorflink-table-common-1.7.1-sources.jar!/org/apache/flink/table/descriptors/FormatDescriptor.java@PublicEvolvingpublic abstract class FormatDescriptor extends DescriptorBase implements Descriptor { private String type; private int version; /** * Constructs a {@link FormatDescriptor}. * * @param type string that identifies this format * @param version property version for backwards compatibility / public FormatDescriptor(String type, int version) { this.type = type; this.version = version; } @Override public final Map<String, String> toProperties() { final DescriptorProperties properties = new DescriptorProperties(); properties.putString(FormatDescriptorValidator.FORMAT_TYPE, type); properties.putInt(FormatDescriptorValidator.FORMAT_PROPERTY_VERSION, version); properties.putProperties(toFormatProperties()); return properties.asMap(); } /* * Converts this descriptor into a set of format properties. Usually prefixed with * {@link FormatDescriptorValidator#FORMAT}. 
*/ protected abstract Map<String, String> toFormatProperties();}FormatDescriptor是个抽象类,Csv、Json、Avro都是它的子类Csvflink-table_2.11-1.7.1-sources.jar!/org/apache/flink/table/descriptors/Csv.scalaclass Csv extends FormatDescriptor(FORMAT_TYPE_VALUE, 1) { private var fieldDelim: Option[String] = None private var lineDelim: Option[String] = None private val schema: mutable.LinkedHashMap[String, String] = mutable.LinkedHashMapString, String private var quoteCharacter: Option[Character] = None private var commentPrefix: Option[String] = None private var isIgnoreFirstLine: Option[Boolean] = None private var lenient: Option[Boolean] = None def fieldDelimiter(delim: String): Csv = { this.fieldDelim = Some(delim) this } def lineDelimiter(delim: String): Csv = { this.lineDelim = Some(delim) this } def schema(schema: TableSchema): Csv = { this.schema.clear() schema.getFieldNames.zip(schema.getFieldTypes).foreach { case (n, t) => field(n, t) } this } def field(fieldName: String, fieldType: TypeInformation[_]): Csv = { field(fieldName, TypeStringUtils.writeTypeInfo(fieldType)) this } def field(fieldName: String, fieldType: String): Csv = { if (schema.contains(fieldName)) { throw new ValidationException(s"Duplicate field name $fieldName.") } schema += (fieldName -> fieldType) this } def quoteCharacter(quote: Character): Csv = { this.quoteCharacter = Option(quote) this } def commentPrefix(prefix: String): Csv = { this.commentPrefix = Option(prefix) this } def ignoreFirstLine(): Csv = { this.isIgnoreFirstLine = Some(true) this } def ignoreParseErrors(): Csv = { this.lenient = Some(true) this } override protected def toFormatProperties: util.Map[String, String] = { val properties = new DescriptorProperties() fieldDelim.foreach(properties.putString(FORMAT_FIELD_DELIMITER, _)) lineDelim.foreach(properties.putString(FORMAT_LINE_DELIMITER, _)) val subKeys = util.Arrays.asList( DescriptorProperties.TABLE_SCHEMA_NAME, DescriptorProperties.TABLE_SCHEMA_TYPE) val subValues = schema.map(e => util.Arrays.asList(e._1, e._2)).toList.asJava properties.putIndexedFixedProperties( FORMAT_FIELDS, subKeys, subValues) quoteCharacter.foreach(properties.putCharacter(FORMAT_QUOTE_CHARACTER, _)) commentPrefix.foreach(properties.putString(FORMAT_COMMENT_PREFIX, _)) isIgnoreFirstLine.foreach(properties.putBoolean(FORMAT_IGNORE_FIRST_LINE, _)) lenient.foreach(properties.putBoolean(FORMAT_IGNORE_PARSE_ERRORS, _)) properties.asMap() }}Csv提供了field、fieldDelimiter、lineDelimiter、quoteCharacter、commentPrefix、ignoreFirstLine、ignoreParseErrors等方法Jsonflink-json-1.7.1-sources.jar!/org/apache/flink/table/descriptors/Json.javapublic class Json extends FormatDescriptor { private Boolean failOnMissingField; private Boolean deriveSchema; private String jsonSchema; private String schema; public Json() { super(FORMAT_TYPE_VALUE, 1); } public Json failOnMissingField(boolean failOnMissingField) { this.failOnMissingField = failOnMissingField; return this; } public Json jsonSchema(String jsonSchema) { Preconditions.checkNotNull(jsonSchema); this.jsonSchema = jsonSchema; this.schema = null; this.deriveSchema = null; return this; } public Json schema(TypeInformation<Row> schemaType) { Preconditions.checkNotNull(schemaType); this.schema = TypeStringUtils.writeTypeInfo(schemaType); this.jsonSchema = null; this.deriveSchema = null; return this; } public Json deriveSchema() { this.deriveSchema = true; this.schema = null; this.jsonSchema = null; return this; } @Override protected Map<String, String> toFormatProperties() { final DescriptorProperties properties = new 
DescriptorProperties(); if (deriveSchema != null) { properties.putBoolean(FORMAT_DERIVE_SCHEMA, deriveSchema); } if (jsonSchema != null) { properties.putString(FORMAT_JSON_SCHEMA, jsonSchema); } if (schema != null) { properties.putString(FORMAT_SCHEMA, schema); } if (failOnMissingField != null) { properties.putBoolean(FORMAT_FAIL_ON_MISSING_FIELD, failOnMissingField); } return properties.asMap(); }}Json提供了schema、jsonSchema、deriveSchema三种方式来定义json formatAvroflink-avro-1.7.1-sources.jar!/org/apache/flink/table/descriptors/Avro.javapublic class Avro extends FormatDescriptor { private Class<? extends SpecificRecord> recordClass; private String avroSchema; public Avro() { super(AvroValidator.FORMAT_TYPE_VALUE, 1); } public Avro recordClass(Class<? extends SpecificRecord> recordClass) { Preconditions.checkNotNull(recordClass); this.recordClass = recordClass; return this; } public Avro avroSchema(String avroSchema) { Preconditions.checkNotNull(avroSchema); this.avroSchema = avroSchema; return this; } @Override protected Map<String, String> toFormatProperties() { final DescriptorProperties properties = new DescriptorProperties(); if (null != recordClass) { properties.putClass(AvroValidator.FORMAT_RECORD_CLASS, recordClass); } if (null != avroSchema) { properties.putString(AvroValidator.FORMAT_AVRO_SCHEMA, avroSchema); } return properties.asMap(); }}Avro提供了recordClass、avroSchema两种方式来定义avro format小结StreamTableEnvironment的connect方法创建StreamTableDescriptor;StreamTableDescriptor继承了ConnectTableDescriptorConnectTableDescriptor提供了withFormat方法,返回FormatDescriptor;FormatDescriptor是个抽象类,Csv、Json、Avro都是它的子类Csv提供了field、fieldDelimiter、lineDelimiter、quoteCharacter、commentPrefix、ignoreFirstLine、ignoreParseErrors等方法;Json提供了schema、jsonSchema、deriveSchema三种方式来定义json format;Avro提供了recordClass、avroSchema两种方式来定义avro formatdocTable Formats ...
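To connect a format descriptor to an actual source, the connect()/withFormat()/withSchema() chain is used. The sketch below assumes the built-in filesystem connector and the flink-table CSV format factory are on the classpath; the path, table name and fields are placeholders:

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.table.descriptors.Csv;
import org.apache.flink.table.descriptors.FileSystem;
import org.apache.flink.table.descriptors.Schema;
import org.apache.flink.types.Row;

public class TableFormatExample {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);

        tableEnv.connect(new FileSystem().path("/tmp/orders.csv")) // connector descriptor
                .withFormat(new Csv()                              // format descriptor
                        .field("id", Types.INT)
                        .field("product", Types.STRING)
                        .fieldDelimiter(",")
                        .ignoreFirstLine())
                .withSchema(new Schema()                           // resulting table schema
                        .field("id", Types.INT)
                        .field("product", Types.STRING))
                .inAppendMode()
                .registerTableSource("Orders");

        Table orders = tableEnv.scan("Orders");
        tableEnv.toAppendStream(orders, Row.class).print();
        env.execute("table-format-demo");
    }
}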

February 4, 2019

A look at defining a Flink Table Schema

序本文主要研究一下flink Table Schema的定义实例定义字段及类型.withSchema( new Schema() .field(“MyField1”, Types.INT) // required: specify the fields of the table (in this order) .field(“MyField2”, Types.STRING) .field(“MyField3”, Types.BOOLEAN))通过field定义字段名及字段类型定义字段属性.withSchema( new Schema() .field(“MyField1”, Types.SQL_TIMESTAMP) .proctime() // optional: declares this field as a processing-time attribute .field(“MyField2”, Types.SQL_TIMESTAMP) .rowtime(…) // optional: declares this field as a event-time attribute .field(“MyField3”, Types.BOOLEAN) .from(“mf3”) // optional: original field in the input that is referenced/aliased by this field)通过proctime定义processing-time,通过rowtime定义event-time,通过from定义引用或别名定义Rowtime属性// Converts an existing LONG or SQL_TIMESTAMP field in the input into the rowtime attribute..rowtime( new Rowtime() .timestampsFromField(“ts_field”) // required: original field name in the input)// Converts the assigned timestamps from a DataStream API record into the rowtime attribute// and thus preserves the assigned timestamps from the source.// This requires a source that assigns timestamps (e.g., Kafka 0.10+)..rowtime( new Rowtime() .timestampsFromSource())// Sets a custom timestamp extractor to be used for the rowtime attribute.// The extractor must extend org.apache.flink.table.sources.tsextractors.TimestampExtractor..rowtime( new Rowtime() .timestampsFromExtractor(…))通过timestampsFromField、timestampsFromSource、timestampsFromExtractor定义rowtime定义watermark strategies// Sets a watermark strategy for ascending rowtime attributes. Emits a watermark of the maximum// observed timestamp so far minus 1. Rows that have a timestamp equal to the max timestamp// are not late..rowtime( new Rowtime() .watermarksPeriodicAscending())// Sets a built-in watermark strategy for rowtime attributes which are out-of-order by a bounded time interval.// Emits watermarks which are the maximum observed timestamp minus the specified delay..rowtime( new Rowtime() .watermarksPeriodicBounded(2000) // delay in milliseconds)// Sets a built-in watermark strategy which indicates the watermarks should be preserved from the// underlying DataStream API and thus preserves the assigned watermarks from the source..rowtime( new Rowtime() .watermarksFromSource())通过watermarksPeriodicAscending、watermarksPeriodicBounded、watermarksFromSource定义watermark strategiesStreamTableEnvironment.connectflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/api/StreamTableEnvironment.scalaabstract class StreamTableEnvironment( private[flink] val execEnv: StreamExecutionEnvironment, config: TableConfig) extends TableEnvironment(config) { //…… def connect(connectorDescriptor: ConnectorDescriptor): StreamTableDescriptor = { new StreamTableDescriptor(this, connectorDescriptor) } //……}StreamTableEnvironment的connect方法创建StreamTableDescriptorStreamTableDescriptorflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/descriptors/StreamTableDescriptor.scalaclass StreamTableDescriptor( tableEnv: StreamTableEnvironment, connectorDescriptor: ConnectorDescriptor) extends ConnectTableDescriptor[StreamTableDescriptor]( tableEnv, connectorDescriptor) with StreamableDescriptor[StreamTableDescriptor] { //……}StreamTableDescriptor继承了ConnectTableDescriptorConnectTableDescriptorflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/descriptors/ConnectTableDescriptor.scalaabstract class ConnectTableDescriptor[D <: ConnectTableDescriptor[D]]( private val tableEnv: TableEnvironment, private val connectorDescriptor: ConnectorDescriptor) extends TableDescriptor with 
SchematicDescriptor[D] with RegistrableDescriptor { this: D => private var formatDescriptor: Option[FormatDescriptor] = None private var schemaDescriptor: Option[Schema] = None /** * Searches for the specified table source, configures it accordingly, and registers it as * a table under the given name. * * @param name table name to be registered in the table environment / override def registerTableSource(name: String): Unit = { val tableSource = TableFactoryUtil.findAndCreateTableSource(tableEnv, this) tableEnv.registerTableSource(name, tableSource) } /* * Searches for the specified table sink, configures it accordingly, and registers it as * a table under the given name. * * @param name table name to be registered in the table environment / override def registerTableSink(name: String): Unit = { val tableSink = TableFactoryUtil.findAndCreateTableSink(tableEnv, this) tableEnv.registerTableSink(name, tableSink) } /* * Searches for the specified table source and sink, configures them accordingly, and registers * them as a table under the given name. * * @param name table name to be registered in the table environment / override def registerTableSourceAndSink(name: String): Unit = { registerTableSource(name) registerTableSink(name) } /* * Specifies the format that defines how to read data from a connector. / override def withFormat(format: FormatDescriptor): D = { formatDescriptor = Some(format) this } /* * Specifies the resulting table schema. / override def withSchema(schema: Schema): D = { schemaDescriptor = Some(schema) this } // ———————————————————————————————- /* * Converts this descriptor into a set of properties. */ override def toProperties: util.Map[String, String] = { val properties = new DescriptorProperties() // this performs only basic validation // more validation can only happen within a factory if (connectorDescriptor.isFormatNeeded && formatDescriptor.isEmpty) { throw new ValidationException( s"The connector ‘$connectorDescriptor’ requires a format description.") } else if (!connectorDescriptor.isFormatNeeded && formatDescriptor.isDefined) { throw new ValidationException( s"The connector ‘$connectorDescriptor’ does not require a format description " + s"but ‘${formatDescriptor.get}’ found.") } properties.putProperties(connectorDescriptor.toProperties) formatDescriptor.foreach(d => properties.putProperties(d.toProperties)) schemaDescriptor.foreach(d => properties.putProperties(d.toProperties)) properties.asMap() }}ConnectTableDescriptor提供了withSchema方法,返回SchemaSchemaflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/descriptors/Schema.scalaclass Schema extends Descriptor { // maps a field name to a list of properties that describe type, origin, and the time attribute private val tableSchema = mutable.LinkedHashMapString, mutable.LinkedHashMap[String, String] private var lastField: Option[String] = None def schema(schema: TableSchema): Schema = { tableSchema.clear() lastField = None schema.getFieldNames.zip(schema.getFieldTypes).foreach { case (n, t) => field(n, t) } this } def field(fieldName: String, fieldType: TypeInformation[]): Schema = { field(fieldName, TypeStringUtils.writeTypeInfo(fieldType)) this } def field(fieldName: String, fieldType: String): Schema = { if (tableSchema.contains(fieldName)) { throw new ValidationException(s"Duplicate field name $fieldName.") } val fieldProperties = mutable.LinkedHashMapString, String fieldProperties += (SCHEMA_TYPE -> fieldType) tableSchema += (fieldName -> fieldProperties) lastField = Some(fieldName) this } def 
from(originFieldName: String): Schema = { lastField match { case None => throw new ValidationException(“No field previously defined. Use field() before.”) case Some(f) => tableSchema(f) += (SCHEMA_FROM -> originFieldName) lastField = None } this } def proctime(): Schema = { lastField match { case None => throw new ValidationException(“No field defined previously. Use field() before.”) case Some(f) => tableSchema(f) += (SCHEMA_PROCTIME -> “true”) lastField = None } this } def rowtime(rowtime: Rowtime): Schema = { lastField match { case None => throw new ValidationException(“No field defined previously. Use field() before.”) case Some(f) => tableSchema(f) ++= rowtime.toProperties.asScala lastField = None } this } final override def toProperties: util.Map[String, String] = { val properties = new DescriptorProperties() properties.putIndexedVariableProperties( SCHEMA, tableSchema.toSeq.map { case (name, props) => (Map(SCHEMA_NAME -> name) ++ props).asJava }.asJava ) properties.asMap() }}Schem提供了field、from、proctime、rowtime方法用于定义Schema的相关属性Rowtimeflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/descriptors/Rowtime.scalaclass Rowtime extends Descriptor { private var timestampExtractor: Option[TimestampExtractor] = None private var watermarkStrategy: Option[WatermarkStrategy] = None def timestampsFromField(fieldName: String): Rowtime = { timestampExtractor = Some(new ExistingField(fieldName)) this } def timestampsFromSource(): Rowtime = { timestampExtractor = Some(new StreamRecordTimestamp) this } def timestampsFromExtractor(extractor: TimestampExtractor): Rowtime = { timestampExtractor = Some(extractor) this } def watermarksPeriodicAscending(): Rowtime = { watermarkStrategy = Some(new AscendingTimestamps) this } def watermarksPeriodicBounded(delay: Long): Rowtime = { watermarkStrategy = Some(new BoundedOutOfOrderTimestamps(delay)) this } def watermarksFromSource(): Rowtime = { watermarkStrategy = Some(PreserveWatermarks.INSTANCE) this } def watermarksFromStrategy(strategy: WatermarkStrategy): Rowtime = { watermarkStrategy = Some(strategy) this } final override def toProperties: java.util.Map[String, String] = { val properties = new DescriptorProperties() timestampExtractor.foreach(normalizeTimestampExtractor() .foreach(e => properties.putString(e._1, e.2))) watermarkStrategy.foreach(normalizeWatermarkStrategy() .foreach(e => properties.putString(e._1, e._2))) properties.asMap() }}Rowtime提供了timestampsFromField、timestampsFromSource、timestampsFromExtractor方法用于定义timestamps;提供了watermarksPeriodicAscending、watermarksPeriodicBounded、watermarksFromSource、watermarksFromStrategy方法用于定义watermark strategies小结StreamTableEnvironment的connect方法创建StreamTableDescriptor;StreamTableDescriptor继承了ConnectTableDescriptor;ConnectTableDescriptor提供了withSchema方法,返回SchemaSchem提供了field、from、proctime、rowtime方法用于定义Schema的相关属性;通过proctime定义processing-time,通过rowtime定义event-time,通过from定义引用或别名Rowtime提供了timestampsFromField、timestampsFromSource、timestampsFromExtractor方法用于定义timestamps;提供了watermarksPeriodicAscending、watermarksPeriodicBounded、watermarksFromSource、watermarksFromStrategy方法用于定义watermark strategiesdocTable Schema ...
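Putting Schema and Rowtime together: the sketch below declares an event-time attribute derived from an existing field with a 60-second bounded-out-of-orderness watermark, plus a processing-time attribute; the field names (user, eventTime, etc.) are placeholders:

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.table.descriptors.Rowtime;
import org.apache.flink.table.descriptors.Schema;

public class SchemaDescriptorExample {
    public static Schema buildSchema() {
        return new Schema()
                .field("user", Types.STRING)
                // event-time attribute backed by the source field "eventTime"
                .field("rowtime", Types.SQL_TIMESTAMP)
                    .rowtime(new Rowtime()
                            .timestampsFromField("eventTime")
                            .watermarksPeriodicBounded(60000)) // delay in milliseconds
                // processing-time attribute appended to the schema
                .field("proctime", Types.SQL_TIMESTAMP)
                    .proctime();
    }
}

The returned Schema would then be passed to withSchema() in a connect() chain as shown above.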

February 3, 2019

A look at Flink's JDBCAppendTableSink

序本文主要研究一下flink的JDBCAppendTableSink实例JDBCAppendTableSink sink = JDBCAppendTableSink.builder() .setDrivername(“org.apache.derby.jdbc.EmbeddedDriver”) .setDBUrl(“jdbc:derby:memory:ebookshop”) .setQuery(“INSERT INTO books (id) VALUES (?)”) .setParameterTypes(INT_TYPE_INFO) .build();tableEnv.registerTableSink( “jdbcOutputTable”, // specify table schema new String[]{“id”}, new TypeInformation[]{Types.INT}, sink);Table table = …table.insertInto(“jdbcOutputTable”);这里使用tableEnv.registerTableSink注册JDBCAppendTableSink,之后利用table.insertInto往该sink写数据JDBCAppendTableSinkflink-jdbc_2.11-1.7.0-sources.jar!/org/apache/flink/api/java/io/jdbc/JDBCAppendTableSink.javapublic class JDBCAppendTableSink implements AppendStreamTableSink<Row>, BatchTableSink<Row> { private final JDBCOutputFormat outputFormat; private String[] fieldNames; private TypeInformation[] fieldTypes; JDBCAppendTableSink(JDBCOutputFormat outputFormat) { this.outputFormat = outputFormat; } public static JDBCAppendTableSinkBuilder builder() { return new JDBCAppendTableSinkBuilder(); } @Override public void emitDataStream(DataStream<Row> dataStream) { dataStream .addSink(new JDBCSinkFunction(outputFormat)) .name(TableConnectorUtil.generateRuntimeName(this.getClass(), fieldNames)); } @Override public void emitDataSet(DataSet<Row> dataSet) { dataSet.output(outputFormat); } @Override public TypeInformation<Row> getOutputType() { return new RowTypeInfo(fieldTypes, fieldNames); } @Override public String[] getFieldNames() { return fieldNames; } @Override public TypeInformation<?>[] getFieldTypes() { return fieldTypes; } @Override public TableSink<Row> configure(String[] fieldNames, TypeInformation<?>[] fieldTypes) { int[] types = outputFormat.getTypesArray(); String sinkSchema = String.join(", “, IntStream.of(types).mapToObj(JDBCTypeUtil::getTypeName).collect(Collectors.toList())); String tableSchema = String.join(”, “, Stream.of(fieldTypes).map(JDBCTypeUtil::getTypeName).collect(Collectors.toList())); String msg = String.format(“Schema of output table is incompatible with JDBCAppendTableSink schema. " + “Table schema: [%s], sink schema: [%s]”, tableSchema, sinkSchema); Preconditions.checkArgument(fieldTypes.length == types.length, msg); for (int i = 0; i < types.length; ++i) { Preconditions.checkArgument( JDBCTypeUtil.typeInformationToSqlType(fieldTypes[i]) == types[i], msg); } JDBCAppendTableSink copy; try { copy = new JDBCAppendTableSink(InstantiationUtil.clone(outputFormat)); } catch (IOException | ClassNotFoundException e) { throw new RuntimeException(e); } copy.fieldNames = fieldNames; copy.fieldTypes = fieldTypes; return copy; } @VisibleForTesting JDBCOutputFormat getOutputFormat() { return outputFormat; }}JDBCAppendTableSink实现了AppendStreamTableSink接口的emitDataStream方法以及BatchTableSink接口的emitDataSet方法;AppendStreamTableSink接口及BatchTableSink接口都继承自TableSink接口,该接口定义了getOutputType、getFieldNames、getFieldTypes、configure方法emitDataStream方法通过JDBCOutputFormat创建JDBCSinkFunction,然后输出到dataStream;emitDataSet方法则直接通过dataSet的output方法采用JDBCOutputFormat输出JDBCAppendTableSink提供了builder静态方法用于创建JDBCAppendTableSinkBuilder,可以用来构建JDBCAppendTableSinkJDBCAppendTableSinkBuilderflink-jdbc_2.11-1.7.0-sources.jar!/org/apache/flink/api/java/io/jdbc/JDBCAppendTableSinkBuilder.javapublic class JDBCAppendTableSinkBuilder { private String username; private String password; private String driverName; private String dbURL; private String query; private int batchSize = DEFAULT_BATCH_INTERVAL; private int[] parameterTypes; /** * Specify the username of the JDBC connection. 
* @param username the username of the JDBC connection. / public JDBCAppendTableSinkBuilder setUsername(String username) { this.username = username; return this; } /* * Specify the password of the JDBC connection. * @param password the password of the JDBC connection. / public JDBCAppendTableSinkBuilder setPassword(String password) { this.password = password; return this; } /* * Specify the name of the JDBC driver that will be used. * @param drivername the name of the JDBC driver. / public JDBCAppendTableSinkBuilder setDrivername(String drivername) { this.driverName = drivername; return this; } /* * Specify the URL of the JDBC database. * @param dbURL the URL of the database, whose format is specified by the * corresponding JDBC driver. / public JDBCAppendTableSinkBuilder setDBUrl(String dbURL) { this.dbURL = dbURL; return this; } /* * Specify the query that the sink will execute. Usually user can specify * INSERT, REPLACE or UPDATE to push the data to the database. * @param query The query to be executed by the sink. * @see org.apache.flink.api.java.io.jdbc.JDBCOutputFormat.JDBCOutputFormatBuilder#setQuery(String) / public JDBCAppendTableSinkBuilder setQuery(String query) { this.query = query; return this; } /* * Specify the size of the batch. By default the sink will batch the query * to improve the performance * @param batchSize the size of batch / public JDBCAppendTableSinkBuilder setBatchSize(int batchSize) { this.batchSize = batchSize; return this; } /* * Specify the type of the rows that the sink will be accepting. * @param types the type of each field / public JDBCAppendTableSinkBuilder setParameterTypes(TypeInformation<?>… types) { int[] ty = new int[types.length]; for (int i = 0; i < types.length; ++i) { ty[i] = JDBCTypeUtil.typeInformationToSqlType(types[i]); } this.parameterTypes = ty; return this; } /* * Specify the type of the rows that the sink will be accepting. * @param types the type of each field defined by {@see java.sql.Types}. / public JDBCAppendTableSinkBuilder setParameterTypes(int… types) { this.parameterTypes = types; return this; } /* * Finalizes the configuration and checks validity. 
* * @return Configured JDBCOutputFormat */ public JDBCAppendTableSink build() { Preconditions.checkNotNull(parameterTypes, “Types of the query parameters are not specified.” + " Please specify types using the setParameterTypes() method.”); JDBCOutputFormat format = JDBCOutputFormat.buildJDBCOutputFormat() .setUsername(username) .setPassword(password) .setDBUrl(dbURL) .setQuery(query) .setDrivername(driverName) .setBatchInterval(batchSize) .setSqlTypes(parameterTypes) .finish(); return new JDBCAppendTableSink(format); }}JDBCAppendTableSinkBuilder提供了setUsername、setPassword、setDrivername、setDBUrl、setQuery、setBatchSize、setParameterTypes方法用于设置构建JDBCOutputFormat的对应属性,最后build方法使用JDBCOutputFormat创建了JDBCAppendTableSink小结JDBCAppendTableSink在开启checkpoint的情况下,它实现的是at-least-once的语义,如果要实现exactly-once的语义,则需要使用类似REPLACE或者INSERT OVERWRITE这类幂等的操作;JDBCAppendTableSink实现了AppendStreamTableSink接口的emitDataStream方法以及BatchTableSink接口的emitDataSet方法AppendStreamTableSink接口及BatchTableSink接口都继承自TableSink接口,该接口定义了getOutputType、getFieldNames、getFieldTypes、configure方法;emitDataStream方法通过JDBCOutputFormat创建JDBCSinkFunction,然后输出到dataStream;emitDataSet方法则直接通过dataSet的output方法采用JDBCOutputFormat输出;JDBCAppendTableSink提供了builder静态方法用于创建JDBCAppendTableSinkBuilder,可以用来构建JDBCAppendTableSinkJDBCAppendTableSinkBuilder提供了setUsername、setPassword、setDrivername、setDBUrl、setQuery、setBatchSize、setParameterTypes方法用于设置构建JDBCOutputFormat的对应属性,最后build方法使用JDBCOutputFormat创建了JDBCAppendTableSinkdocJDBCAppendTableSinkJDBCOutputFormat聊聊flink的JDBCOutputFormat ...
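Following the at-least-once note above, one common way to keep checkpoint replays from duplicating rows is to make the statement itself idempotent, for example with MySQL's REPLACE. A sketch where the driver class, URL, credentials and table are placeholders:

import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.java.io.jdbc.JDBCAppendTableSink;

public class IdempotentJdbcSink {
    public static JDBCAppendTableSink build() {
        return JDBCAppendTableSink.builder()
                .setDrivername("com.mysql.jdbc.Driver")
                .setDBUrl("jdbc:mysql://localhost:3306/flink")
                .setUsername("flink")
                .setPassword("secret")
                // REPLACE makes retried batches overwrite rows instead of duplicating them
                .setQuery("REPLACE INTO books (id, title) VALUES (?, ?)")
                .setBatchSize(100)
                .setParameterTypes(BasicTypeInfo.INT_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO)
                .build();
    }
}

The sink is then registered with tableEnv.registerTableSink exactly as in the opening example.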

February 2, 2019

A look at Time Attributes in the Flink Table API

序本文主要研究一下flink Table的Time AttributesProcessing time通过fromDataStream定义DataStream<Tuple2<String, String>> stream = …;// declare an additional logical field as a processing time attributeTable table = tEnv.fromDataStream(stream, “Username, Data, UserActionTime.proctime”);WindowedTable windowedTable = table.window(Tumble.over(“10.minutes”).on(“UserActionTime”).as(“userActionWindow”));从DataStream创建Table的话,可以在fromDataStream里头进行定义Processing time通过TableSource定义// define a table source with a processing attributepublic class UserActionSource implements StreamTableSource<Row>, DefinedProctimeAttribute { @Override public TypeInformation<Row> getReturnType() { String[] names = new String[] {“Username” , “Data”}; TypeInformation[] types = new TypeInformation[] {Types.STRING(), Types.STRING()}; return Types.ROW(names, types); } @Override public DataStream<Row> getDataStream(StreamExecutionEnvironment execEnv) { // create stream DataStream<Row> stream = …; return stream; } @Override public String getProctimeAttribute() { // field with this name will be appended as a third field return “UserActionTime”; }}// register table sourcetEnv.registerTableSource(“UserActions”, new UserActionSource());WindowedTable windowedTable = tEnv .scan(“UserActions”) .window(Tumble.over(“10.minutes”).on(“UserActionTime”).as(“userActionWindow”));通过TableSource创建Table的话,可以通过实现DefinedProctimeAttribute接口来定义Processing timeEvent time通过fromDataStream定义// Option 1:// extract timestamp and assign watermarks based on knowledge of the streamDataStream<Tuple2<String, String>> stream = inputStream.assignTimestampsAndWatermarks(…);// declare an additional logical field as an event time attributeTable table = tEnv.fromDataStream(stream, “Username, Data, UserActionTime.rowtime”);// Option 2:// extract timestamp from first field, and assign watermarks based on knowledge of the streamDataStream<Tuple3<Long, String, String>> stream = inputStream.assignTimestampsAndWatermarks(…);// the first field has been used for timestamp extraction, and is no longer necessary// replace first field with a logical event time attributeTable table = tEnv.fromDataStream(stream, “UserActionTime.rowtime, Username, Data”);// Usage:WindowedTable windowedTable = table.window(Tumble.over(“10.minutes”).on(“UserActionTime”).as(“userActionWindow”));从DataStream创建Table的话,可以在fromDataStream里头进行定义Event time;具体有两种方式,一种是额外定义一个字段,一种是覆盖原有的字段通过TableSource定义// define a table source with a rowtime attributepublic class UserActionSource implements StreamTableSource<Row>, DefinedRowtimeAttributes { @Override public TypeInformation<Row> getReturnType() { String[] names = new String[] {“Username”, “Data”, “UserActionTime”}; TypeInformation[] types = new TypeInformation[] {Types.STRING(), Types.STRING(), Types.LONG()}; return Types.ROW(names, types); } @Override public DataStream<Row> getDataStream(StreamExecutionEnvironment execEnv) { // create stream // … // assign watermarks based on the “UserActionTime” attribute DataStream<Row> stream = inputStream.assignTimestampsAndWatermarks(…); return stream; } @Override public List<RowtimeAttributeDescriptor> getRowtimeAttributeDescriptors() { // Mark the “UserActionTime” attribute as event-time attribute. // We create one attribute descriptor of “UserActionTime”. 
RowtimeAttributeDescriptor rowtimeAttrDescr = new RowtimeAttributeDescriptor( “UserActionTime”, new ExistingField(“UserActionTime”), new AscendingTimestamps()); List<RowtimeAttributeDescriptor> listRowtimeAttrDescr = Collections.singletonList(rowtimeAttrDescr); return listRowtimeAttrDescr; }}// register the table sourcetEnv.registerTableSource(“UserActions”, new UserActionSource());WindowedTable windowedTable = tEnv .scan(“UserActions”) .window(Tumble.over(“10.minutes”).on(“UserActionTime”).as(“userActionWindow”));通过TableSource创建Table的话,可以通过实现DefinedRowtimeAttributes接口来定义Event timedefinedTimeAttributesflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/sources/definedTimeAttributes.scala/** * Extends a [[TableSource]] to specify a processing time attribute. /trait DefinedProctimeAttribute { /* * Returns the name of a processing time attribute or null if no processing time attribute is * present. * * The referenced attribute must be present in the [[TableSchema]] of the [[TableSource]] and of * type [[Types.SQL_TIMESTAMP]]. / @Nullable def getProctimeAttribute: String}/* * Extends a [[TableSource]] to specify rowtime attributes via a * [[RowtimeAttributeDescriptor]]. /trait DefinedRowtimeAttributes { /* * Returns a list of [[RowtimeAttributeDescriptor]] for all rowtime attributes of the table. * * All referenced attributes must be present in the [[TableSchema]] of the [[TableSource]] and of * type [[Types.SQL_TIMESTAMP]]. * * @return A list of [[RowtimeAttributeDescriptor]]. / def getRowtimeAttributeDescriptors: util.List[RowtimeAttributeDescriptor]}/* * Describes a rowtime attribute of a [[TableSource]]. * * @param attributeName The name of the rowtime attribute. * @param timestampExtractor The timestamp extractor to derive the values of the attribute. * @param watermarkStrategy The watermark strategy associated with the attribute. /class RowtimeAttributeDescriptor( val attributeName: String, val timestampExtractor: TimestampExtractor, val watermarkStrategy: WatermarkStrategy) { /* Returns the name of the rowtime attribute. / def getAttributeName: String = attributeName /* Returns the [[TimestampExtractor]] for the attribute. / def getTimestampExtractor: TimestampExtractor = timestampExtractor /* Returns the [[WatermarkStrategy]] for the attribute. 
*/ def getWatermarkStrategy: WatermarkStrategy = watermarkStrategy override def equals(other: Any): Boolean = other match { case that: RowtimeAttributeDescriptor => Objects.equals(attributeName, that.attributeName) && Objects.equals(timestampExtractor, that.timestampExtractor) && Objects.equals(watermarkStrategy, that.watermarkStrategy) case _ => false } override def hashCode(): Int = { Objects.hash(attributeName, timestampExtractor, watermarkStrategy) }}DefinedProctimeAttribute定义了getProctimeAttribute方法,返回String,用于定义Process time的字段名;DefinedRowtimeAttributes定义了getRowtimeAttributeDescriptors方法,返回的是RowtimeAttributeDescriptor的List,RowtimeAttributeDescriptor有3个属性,分别是attributeName、timestampExtractor及watermarkStrategy小结在从DataStream或者TableSource创建Table时可以指定Time Attributes,指定了之后就可以作为field来使用或者参与time-based的操作针对Processing time,如果从DataStream创建Table的话,可以在fromDataStream里头进行定义;通过TableSource创建Table的话,可以通过实现DefinedProctimeAttribute接口来定义Processing time;DefinedProctimeAttribute定义了getProctimeAttribute方法,返回String,用于定义Process time的字段名针对Event time,如果从DataStream创建Table的话,可以在fromDataStream里头进行定义;具体有两种方式,一种是额外定义一个字段,一种是覆盖原有的字段;通过TableSource创建Table的话,可以通过实现DefinedRowtimeAttributes接口来定义Event time;DefinedRowtimeAttributes定义了getRowtimeAttributeDescriptors方法,返回的是RowtimeAttributeDescriptor的List,RowtimeAttributeDescriptor有3个属性,分别是attributeName、timestampExtractor及watermarkStrategydocTime Attributes ...
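Once a time attribute such as UserActionTime has been declared, it can be used like any other field in time-based operations. A sketch of a tumbling event-time window over the UserActions table registered above; the aggregation fields are illustrative:

import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.Tumble;
import org.apache.flink.table.api.java.StreamTableEnvironment;

public class RowtimeWindowExample {
    public static Table countPerUser(StreamTableEnvironment tEnv) {
        return tEnv.scan("UserActions")
                // 10-minute tumbling window on the declared rowtime attribute
                .window(Tumble.over("10.minutes").on("UserActionTime").as("w"))
                .groupBy("w, Username")
                .select("Username, w.end as windowEnd, Data.count as actionCount");
    }
}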

February 1, 2019

A look at OrderBy and Limit in the Flink Table API

序本文主要研究一下flink Table的OrderBy及Limit实例Table in = tableEnv.fromDataSet(ds, “a, b, c”);Table result = in.orderBy(“a.asc”);Table in = tableEnv.fromDataSet(ds, “a, b, c”);// returns the first 5 records from the sorted resultTable result1 = in.orderBy(“a.asc”).fetch(5); // skips the first 3 records and returns all following records from the sorted resultTable result2 = in.orderBy(“a.asc”).offset(3);// skips the first 10 records and returns the next 5 records from the sorted resultTable result3 = in.orderBy(“a.asc”).offset(10).fetch(5);orderBy方法类似sql的order by;limit则由offset及fetch两个方法构成,类似sql的offset及fetchTableflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/api/table.scalaclass Table( private[flink] val tableEnv: TableEnvironment, private[flink] val logicalPlan: LogicalNode) { //…… def orderBy(fields: String): Table = { val parsedFields = ExpressionParser.parseExpressionList(fields) orderBy(parsedFields: ) } def orderBy(fields: Expression): Table = { val order: Seq[Ordering] = fields.map { case o: Ordering => o case e => Asc(e) } new Table(tableEnv, Sort(order, logicalPlan).validate(tableEnv)) } def offset(offset: Int): Table = { new Table(tableEnv, Limit(offset, -1, logicalPlan).validate(tableEnv)) } def fetch(fetch: Int): Table = { if (fetch < 0) { throw new ValidationException(“FETCH count must be equal or larger than 0.”) } this.logicalPlan match { case Limit(o, -1, c) => // replace LIMIT without FETCH by LIMIT with FETCH new Table(tableEnv, Limit(o, fetch, c).validate(tableEnv)) case Limit(, , ) => throw new ValidationException(“FETCH is already defined.”) case _ => new Table(tableEnv, Limit(0, fetch, logicalPlan).validate(tableEnv)) } } //……}Table的orderBy方法,支持String或Expression类型的参数,其中String类型最终是转为Expression类型;orderBy方法最后使用Sort重新创建了Table;offset及fetch方法,使用Limit重新创建了Table(offset方法创建的Limit其fetch为-1;fetch方法如果之前没有指定offset则创建的Limit的offset为0)Sortflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/plan/logical/operators.scalacase class Sort(order: Seq[Ordering], child: LogicalNode) extends UnaryNode { override def output: Seq[Attribute] = child.output override protected[logical] def construct(relBuilder: RelBuilder): RelBuilder = { child.construct(relBuilder) relBuilder.sort(order.map(.toRexNode(relBuilder)).asJava) } override def validate(tableEnv: TableEnvironment): LogicalNode = { if (tableEnv.isInstanceOf[StreamTableEnvironment]) { failValidation(s"Sort on stream tables is currently not supported.") } super.validate(tableEnv) }}Sort继承了UnaryNode,它的构造器接收Set类型的Ordering,其construct方法使用relBuilder.sort来构建sort条件Orderingflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/expressions/ordering.scalaabstract class Ordering extends UnaryExpression { override private[flink] def validateInput(): ValidationResult = { if (!child.isInstanceOf[NamedExpression]) { ValidationFailure(s"Sort should only based on field reference") } else { ValidationSuccess } }}case class Asc(child: Expression) extends Ordering { override def toString: String = s"($child).asc" override private[flink] def toRexNode(implicit relBuilder: RelBuilder): RexNode = { child.toRexNode } override private[flink] def resultType: TypeInformation[] = child.resultType}case class Desc(child: Expression) extends Ordering { override def toString: String = s"($child).desc" override private[flink] def toRexNode(implicit relBuilder: RelBuilder): RexNode = { relBuilder.desc(child.toRexNode) } override private[flink] def resultType: TypeInformation[_] = 
child.resultType}Ordering是一个抽象类,它有Asc及Desc两个子类Limitflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/plan/logical/operators.scalacase class Limit(offset: Int, fetch: Int = -1, child: LogicalNode) extends UnaryNode { override def output: Seq[Attribute] = child.output override protected[logical] def construct(relBuilder: RelBuilder): RelBuilder = { child.construct(relBuilder) relBuilder.limit(offset, fetch) } override def validate(tableEnv: TableEnvironment): LogicalNode = { if (tableEnv.isInstanceOf[StreamTableEnvironment]) { failValidation(s"Limit on stream tables is currently not supported.") } if (!child.isInstanceOf[Sort]) { failValidation(s"Limit operator must be preceded by an OrderBy operator.") } if (offset < 0) { failValidation(s"Offset should be greater than or equal to zero.") } super.validate(tableEnv) }}Limit继承了UnaryNode,它的构造器接收offset及fetch参数,它的construct方法通过relBuilder.limit来设置offset及fetch小结Table的orderBy方法类似sql的order by;limit则由offset及fetch两个方法构成,类似sql的offset及fetchTable的orderBy方法,支持String或Expression类型的参数,其中String类型最终是转为Expression类型;orderBy方法最后使用Sort重新创建了Table;offset及fetch方法,使用Limit重新创建了Table(offset方法创建的Limit其fetch为-1;fetch方法如果之前没有指定offset则创建的Limit的offset为0)Sort继承了UnaryNode,它的构造器接收Set类型的Ordering,其construct方法使用relBuilder.sort来构建sort条件;Ordering是一个抽象类,它有Asc及Desc两个子类;Limit继承了UnaryNode,它的构造器接收offset及fetch参数,它的construct方法通过relBuilder.limit来设置offset及fetchdocOrderBy, Offset & Fetch ...
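An end-to-end Java sketch of the sort-then-page pattern on a batch table (the sample data is made up); note that, as Limit.validate enforces, offset/fetch must be preceded by orderBy:

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.BatchTableEnvironment;
import org.apache.flink.types.Row;

public class OrderByLimitExample {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        BatchTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);

        DataSet<Tuple2<Integer, String>> ds = env.fromElements(
                Tuple2.of(3, "c"), Tuple2.of(1, "a"), Tuple2.of(2, "b"));
        Table in = tableEnv.fromDataSet(ds, "id, name");

        // sort first, then skip one row and take one row
        Table page = in.orderBy("id.asc").offset(1).fetch(1);

        tableEnv.toDataSet(page, Row.class).print(); // prints the row with id=2
    }
}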

January 31, 2019

A look at Set Operations in the Flink Table API

序本文主要研究一下flink Table的Set Operations实例UnionTable left = tableEnv.fromDataSet(ds1, “a, b, c”);Table right = tableEnv.fromDataSet(ds2, “a, b, c”);Table result = left.union(right);union方法类似sql的unionUnionAllTable left = tableEnv.fromDataSet(ds1, “a, b, c”);Table right = tableEnv.fromDataSet(ds2, “a, b, c”);Table result = left.unionAll(right);unionAll方法类似sql的union allIntersectTable left = tableEnv.fromDataSet(ds1, “a, b, c”);Table right = tableEnv.fromDataSet(ds2, “d, e, f”);Table result = left.intersect(right);intersect方法类似sql的intersectIntersectAllTable left = tableEnv.fromDataSet(ds1, “a, b, c”);Table right = tableEnv.fromDataSet(ds2, “d, e, f”);Table result = left.intersectAll(right);intersectAll方法类似sql的intersect allMinusTable left = tableEnv.fromDataSet(ds1, “a, b, c”);Table right = tableEnv.fromDataSet(ds2, “a, b, c”);Table result = left.minus(right);minus方法类似sql的exceptMinusAllTable left = tableEnv.fromDataSet(ds1, “a, b, c”);Table right = tableEnv.fromDataSet(ds2, “a, b, c”);Table result = left.minusAll(right);minusAll方法类似sql的except allInTable left = ds1.toTable(tableEnv, “a, b, c”);Table right = ds2.toTable(tableEnv, “a”);// using implicit registrationTable result = left.select(“a, b, c”).where(“a.in(” + right + “)”);// using explicit registrationtableEnv.registerTable(“RightTable”, right);Table result = left.select(“a, b, c”).where(“a.in(RightTable)”);in方法类似sql的inTableflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/api/table.scalaclass Table( private[flink] val tableEnv: TableEnvironment, private[flink] val logicalPlan: LogicalNode) { //…… def union(right: Table): Table = { // check that right table belongs to the same TableEnvironment if (right.tableEnv != this.tableEnv) { throw new ValidationException(“Only tables from the same TableEnvironment can be unioned.”) } new Table(tableEnv, Union(logicalPlan, right.logicalPlan, all = false).validate(tableEnv)) } def unionAll(right: Table): Table = { // check that right table belongs to the same TableEnvironment if (right.tableEnv != this.tableEnv) { throw new ValidationException(“Only tables from the same TableEnvironment can be unioned.”) } new Table(tableEnv, Union(logicalPlan, right.logicalPlan, all = true).validate(tableEnv)) } def intersect(right: Table): Table = { // check that right table belongs to the same TableEnvironment if (right.tableEnv != this.tableEnv) { throw new ValidationException( “Only tables from the same TableEnvironment can be intersected.”) } new Table(tableEnv, Intersect(logicalPlan, right.logicalPlan, all = false).validate(tableEnv)) } def intersectAll(right: Table): Table = { // check that right table belongs to the same TableEnvironment if (right.tableEnv != this.tableEnv) { throw new ValidationException( “Only tables from the same TableEnvironment can be intersected.”) } new Table(tableEnv, Intersect(logicalPlan, right.logicalPlan, all = true).validate(tableEnv)) } def minus(right: Table): Table = { // check that right table belongs to the same TableEnvironment if (right.tableEnv != this.tableEnv) { throw new ValidationException(“Only tables from the same TableEnvironment can be " + “subtracted.”) } new Table(tableEnv, Minus(logicalPlan, right.logicalPlan, all = false) .validate(tableEnv)) } def minusAll(right: Table): Table = { // check that right table belongs to the same TableEnvironment if (right.tableEnv != this.tableEnv) { throw new ValidationException(“Only tables from the same TableEnvironment can be " + “subtracted.”) } new Table(tableEnv, Minus(logicalPlan, right.logicalPlan, all = true) 
.validate(tableEnv)) } //……}union及unionAll使用的是Union,intersect及intersectAll使用的是Intersect,minus及minusAll使用的是MinusUnionflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/plan/logical/operators.scalacase class Union(left: LogicalNode, right: LogicalNode, all: Boolean) extends BinaryNode { override def output: Seq[Attribute] = left.output override protected[logical] def construct(relBuilder: RelBuilder): RelBuilder = { left.construct(relBuilder) right.construct(relBuilder) relBuilder.union(all) } override def validate(tableEnv: TableEnvironment): LogicalNode = { if (tableEnv.isInstanceOf[StreamTableEnvironment] && !all) { failValidation(s"Union on stream tables is currently not supported.”) } val resolvedUnion = super.validate(tableEnv).asInstanceOf[Union] if (left.output.length != right.output.length) { failValidation(s"Union two tables of different column sizes:” + s" ${left.output.size} and ${right.output.size}") } val sameSchema = left.output.zip(right.output).forall { case (l, r) => l.resultType == r.resultType } if (!sameSchema) { failValidation(s"Union two tables of different schema:" + s" [${left.output.map(a => (a.name, a.resultType)).mkString(", “)}] and” + s" [${right.output.map(a => (a.name, a.resultType)).mkString(", “)}]”) } resolvedUnion }}Union继承了BinaryNode,其construct方法通过relBuilder.union来构建union操作Intersectflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/plan/logical/operators.scalacase class Intersect(left: LogicalNode, right: LogicalNode, all: Boolean) extends BinaryNode { override def output: Seq[Attribute] = left.output override protected[logical] def construct(relBuilder: RelBuilder): RelBuilder = { left.construct(relBuilder) right.construct(relBuilder) relBuilder.intersect(all) } override def validate(tableEnv: TableEnvironment): LogicalNode = { if (tableEnv.isInstanceOf[StreamTableEnvironment]) { failValidation(s"Intersect on stream tables is currently not supported.") } val resolvedIntersect = super.validate(tableEnv).asInstanceOf[Intersect] if (left.output.length != right.output.length) { failValidation(s"Intersect two tables of different column sizes:" + s" ${left.output.size} and ${right.output.size}") } // allow different column names between tables val sameSchema = left.output.zip(right.output).forall { case (l, r) => l.resultType == r.resultType } if (!sameSchema) { failValidation(s"Intersect two tables of different schema:" + s" [${left.output.map(a => (a.name, a.resultType)).mkString(", “)}] and” + s" [${right.output.map(a => (a.name, a.resultType)).mkString(", “)}]”) } resolvedIntersect }}Intersect继承了BinaryNode,其construct方法通过relBuilder.intersect来构建intersect操作Minusflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/plan/logical/operators.scalacase class Minus(left: LogicalNode, right: LogicalNode, all: Boolean) extends BinaryNode { override def output: Seq[Attribute] = left.output override protected[logical] def construct(relBuilder: RelBuilder): RelBuilder = { left.construct(relBuilder) right.construct(relBuilder) relBuilder.minus(all) } override def validate(tableEnv: TableEnvironment): LogicalNode = { if (tableEnv.isInstanceOf[StreamTableEnvironment]) { failValidation(s"Minus on stream tables is currently not supported.") } val resolvedMinus = super.validate(tableEnv).asInstanceOf[Minus] if (left.output.length != right.output.length) { failValidation(s"Minus two table of different column sizes:" + s" ${left.output.size} and ${right.output.size}") } val sameSchema = left.output.zip(right.output).forall { case (l, r) => l.resultType == 
r.resultType } if (!sameSchema) { failValidation(s"Minus two table of different schema:" + s" [${left.output.map(a => (a.name, a.resultType)).mkString(", “)}] and” + s" [${right.output.map(a => (a.name, a.resultType)).mkString(", “)}]”) } resolvedMinus }}Minus继承了BinaryNode,其construct方法通过relBuilder.minus来构建minus操作小结Table对Set提供了union、unionAll、intersect、intersectAll、minus、minusAll、in(in在where子句中)操作union及unionAll使用的是Union,intersect及intersectAll使用的是Intersect,minus及minusAll使用的是MinusUnion继承了BinaryNode,其construct方法通过relBuilder.union来构建union操作;Intersect继承了BinaryNode,其construct方法通过relBuilder.intersect来构建intersect操作;Minus继承了BinaryNode,其construct方法通过relBuilder.minus来构建minus操作docSet Operations ...
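To make the snippets above concrete, here is a minimal, self-contained sketch of the set operations on the batch Table API, assuming Flink 1.7's Java API (the class name SetOperationsExample and the sample data are illustrative only):

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.BatchTableEnvironment;
import org.apache.flink.types.Row;

public class SetOperationsExample {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        BatchTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);

        DataSet<Tuple3<Integer, String, Integer>> ds1 = env.fromElements(
                Tuple3.of(1, "x", 10), Tuple3.of(2, "y", 20), Tuple3.of(2, "y", 20));
        DataSet<Tuple3<Integer, String, Integer>> ds2 = env.fromElements(
                Tuple3.of(2, "y", 20), Tuple3.of(3, "z", 30));

        Table left = tableEnv.fromDataSet(ds1, "a, b, c");
        Table right = tableEnv.fromDataSet(ds2, "a, b, c");

        // union removes duplicates (SQL UNION), unionAll keeps them (SQL UNION ALL),
        // minus behaves like SQL EXCEPT; both inputs must share the same schema
        tableEnv.toDataSet(left.union(right), Row.class).print();
        tableEnv.toDataSet(left.unionAll(right), Row.class).print();
        tableEnv.toDataSet(left.minus(right), Row.class).print();
    }
}

As the validate methods quoted above show, combining tables from different TableEnvironments or with different schemas fails with a ValidationException.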

January 30, 2019 · 4 min · jiezi

A look at flink Table's Joins

序本文主要研究一下flink Table的Joins实例Inner JoinTable left = tableEnv.fromDataSet(ds1, “a, b, c”);Table right = tableEnv.fromDataSet(ds2, “d, e, f”);Table result = left.join(right).where(“a = d”).select(“a, b, e”);join方法即inner joinOuter JoinTable left = tableEnv.fromDataSet(ds1, “a, b, c”);Table right = tableEnv.fromDataSet(ds2, “d, e, f”);Table leftOuterResult = left.leftOuterJoin(right, “a = d”).select(“a, b, e”);Table rightOuterResult = left.rightOuterJoin(right, “a = d”).select(“a, b, e”);Table fullOuterResult = left.fullOuterJoin(right, “a = d”).select(“a, b, e”);outer join分为leftOuterJoin、rightOuterJoin、fullOuterJoin三种Time-windowed JoinTable left = tableEnv.fromDataSet(ds1, “a, b, c, ltime.rowtime”);Table right = tableEnv.fromDataSet(ds2, “d, e, f, rtime.rowtime”);Table result = left.join(right) .where(“a = d && ltime >= rtime - 5.minutes && ltime < rtime + 10.minutes”) .select(“a, b, e, ltime”);time-windowed join需要至少一个等值条件,然后还需要一个与两边时间相关的条件(可以使用<, <=, >=, >)Inner Join with Table Function// register User-Defined Table FunctionTableFunction<String> split = new MySplitUDTF();tableEnv.registerFunction(“split”, split);// joinTable orders = tableEnv.scan(“Orders”);Table result = orders .join(new Table(tableEnv, “split(c)”).as(“s”, “t”, “v”)) .select(“a, b, s, t, v”);Table也可以跟table function进行inner join,如果table function返回空,则table的记录被丢弃Left Outer Join with Table Function// register User-Defined Table FunctionTableFunction<String> split = new MySplitUDTF();tableEnv.registerFunction(“split”, split);// joinTable orders = tableEnv.scan(“Orders”);Table result = orders .leftOuterJoin(new Table(tableEnv, “split(c)”).as(“s”, “t”, “v”)) .select(“a, b, s, t, v”);Table也可以跟table function进行left outer join,如果table function返回空,则table的记录保留,空的部分为null值Join with Temporal TableTable ratesHistory = tableEnv.scan(“RatesHistory”);// register temporal table function with a time attribute and primary keyTemporalTableFunction rates = ratesHistory.createTemporalTableFunction( “r_proctime”, “r_currency”);tableEnv.registerFunction(“rates”, rates);// join with “Orders” based on the time attribute and keyTable orders = tableEnv.scan(“Orders”);Table result = orders .join(new Table(tEnv, “rates(o_proctime)”), “o_currency = r_currency”)Table也可以跟Temporal tables进行join,Temporal tables通过Table的createTemporalTableFunction而来,目前仅仅支持inner join的方式Tableflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/api/table.scalaclass Table( private[flink] val tableEnv: TableEnvironment, private[flink] val logicalPlan: LogicalNode) { //…… def join(right: Table): Table = { join(right, None, JoinType.INNER) } def join(right: Table, joinPredicate: String): Table = { join(right, joinPredicate, JoinType.INNER) } def join(right: Table, joinPredicate: Expression): Table = { join(right, Some(joinPredicate), JoinType.INNER) } def leftOuterJoin(right: Table): Table = { join(right, None, JoinType.LEFT_OUTER) } def leftOuterJoin(right: Table, joinPredicate: String): Table = { join(right, joinPredicate, JoinType.LEFT_OUTER) } def leftOuterJoin(right: Table, joinPredicate: Expression): Table = { join(right, Some(joinPredicate), JoinType.LEFT_OUTER) } def rightOuterJoin(right: Table, joinPredicate: String): Table = { join(right, joinPredicate, JoinType.RIGHT_OUTER) } def rightOuterJoin(right: Table, joinPredicate: Expression): Table = { join(right, Some(joinPredicate), JoinType.RIGHT_OUTER) } def fullOuterJoin(right: Table, joinPredicate: String): Table = { join(right, joinPredicate, JoinType.FULL_OUTER) } def fullOuterJoin(right: Table, joinPredicate: Expression): 
Table = { join(right, Some(joinPredicate), JoinType.FULL_OUTER) } private def join(right: Table, joinPredicate: String, joinType: JoinType): Table = { val joinPredicateExpr = ExpressionParser.parseExpression(joinPredicate) join(right, Some(joinPredicateExpr), joinType) } private def join(right: Table, joinPredicate: Option[Expression], joinType: JoinType): Table = { // check if we join with a table or a table function if (!containsUnboundedUDTFCall(right.logicalPlan)) { // regular table-table join // check that the TableEnvironment of right table is not null // and right table belongs to the same TableEnvironment if (right.tableEnv != this.tableEnv) { throw new ValidationException(“Only tables from the same TableEnvironment can be joined.”) } new Table( tableEnv, Join(this.logicalPlan, right.logicalPlan, joinType, joinPredicate, correlated = false) .validate(tableEnv)) } else { // join with a table function // check join type if (joinType != JoinType.INNER && joinType != JoinType.LEFT_OUTER) { throw new ValidationException( “TableFunctions are currently supported for join and leftOuterJoin.”) } val udtf = right.logicalPlan.asInstanceOf[LogicalTableFunctionCall] val udtfCall = LogicalTableFunctionCall( udtf.functionName, udtf.tableFunction, udtf.parameters, udtf.resultType, udtf.fieldNames, this.logicalPlan ).validate(tableEnv) new Table( tableEnv, Join(this.logicalPlan, udtfCall, joinType, joinPredicate, correlated = true) .validate(tableEnv)) } } //……}Table定义了join、leftOuterJoin、rightOuterJoin、fullOuterJoin方法,其最后都是调用的私有的join方法,其中JoinType用于表达join类型,分别有INNER, LEFT_OUTER, RIGHT_OUTER, FULL_OUTER这几种;另外接收String类型或者Expression的条件表达式,其中String类型最后是被解析为Expression类型;join方法最后是使用Join创建了新的TableJoinflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/plan/logical/operators.scalacase class Join( left: LogicalNode, right: LogicalNode, joinType: JoinType, condition: Option[Expression], correlated: Boolean) extends BinaryNode { override def output: Seq[Attribute] = { left.output ++ right.output } private case class JoinFieldReference( name: String, resultType: TypeInformation[], left: LogicalNode, right: LogicalNode) extends Attribute { val isFromLeftInput: Boolean = left.output.map(.name).contains(name) val (indexInInput, indexInJoin) = if (isFromLeftInput) { val indexInLeft = left.output.map(.name).indexOf(name) (indexInLeft, indexInLeft) } else { val indexInRight = right.output.map(.name).indexOf(name) (indexInRight, indexInRight + left.output.length) } override def toString = s"’$name" override def toRexNode(implicit relBuilder: RelBuilder): RexNode = { // look up type of field val fieldType = relBuilder.field(2, if (isFromLeftInput) 0 else 1, name).getType // create a new RexInputRef with index offset new RexInputRef(indexInJoin, fieldType) } override def withName(newName: String): Attribute = { if (newName == name) { this } else { JoinFieldReference(newName, resultType, left, right) } } } override def resolveExpressions(tableEnv: TableEnvironment): LogicalNode = { val node = super.resolveExpressions(tableEnv).asInstanceOf[Join] val partialFunction: PartialFunction[Expression, Expression] = { case field: ResolvedFieldReference => JoinFieldReference( field.name, field.resultType, left, right) } val resolvedCondition = node.condition.map(.postOrderTransform(partialFunction)) Join(node.left, node.right, node.joinType, resolvedCondition, correlated) } override protected[logical] def construct(relBuilder: RelBuilder): RelBuilder = { left.construct(relBuilder) right.construct(relBuilder) val corSet = 
mutable.SetCorrelationId if (correlated) { corSet += relBuilder.peek().getCluster.createCorrel() } relBuilder.join( convertJoinType(joinType), condition.map(.toRexNode(relBuilder)).getOrElse(relBuilder.literal(true)), corSet.asJava) } private def convertJoinType(joinType: JoinType) = joinType match { case JoinType.INNER => JoinRelType.INNER case JoinType.LEFT_OUTER => JoinRelType.LEFT case JoinType.RIGHT_OUTER => JoinRelType.RIGHT case JoinType.FULL_OUTER => JoinRelType.FULL } private def ambiguousName: Set[String] = left.output.map(.name).toSet.intersect(right.output.map(.name).toSet) override def validate(tableEnv: TableEnvironment): LogicalNode = { val resolvedJoin = super.validate(tableEnv).asInstanceOf[Join] if (!resolvedJoin.condition.forall(.resultType == BOOLEAN_TYPE_INFO)) { failValidation(s"Filter operator requires a boolean expression as input, " + s"but ${resolvedJoin.condition} is of type ${resolvedJoin.joinType}") } else if (ambiguousName.nonEmpty) { failValidation(s"join relations with ambiguous names: ${ambiguousName.mkString(", “)}”) } resolvedJoin.condition.foreach(testJoinCondition) resolvedJoin } private def testJoinCondition(expression: Expression): Unit = { def checkIfJoinCondition(exp: BinaryComparison) = exp.children match { case (x: JoinFieldReference) :: (y: JoinFieldReference) :: Nil if x.isFromLeftInput != y.isFromLeftInput => true case _ => false } def checkIfFilterCondition(exp: BinaryComparison) = exp.children match { case (x: JoinFieldReference) :: (y: JoinFieldReference) :: Nil => false case (x: JoinFieldReference) :: () :: Nil => true case () :: (y: JoinFieldReference) :: Nil => true case _ => false } var equiJoinPredicateFound = false // Whether the predicate is literal true. val alwaysTrue = expression match { case x: Literal if x.value.equals(true) => true case _ => false } def validateConditions(exp: Expression, isAndBranch: Boolean): Unit = exp match { case x: And => x.children.foreach(validateConditions(, isAndBranch)) case x: Or => x.children.foreach(validateConditions(_, isAndBranch = false)) case x: EqualTo => if (isAndBranch && checkIfJoinCondition(x)) { equiJoinPredicateFound = true } case x: BinaryComparison => // The boolean literal should be a valid condition type. case x: Literal if x.resultType == Types.BOOLEAN => case x => failValidation( s"Unsupported condition type: ${x.getClass.getSimpleName}. Condition: $x") } validateConditions(expression, isAndBranch = true) // Due to a bug in Apache Calcite (see CALCITE-2004 and FLINK-7865) we cannot accept join // predicates except literal true for TableFunction left outer join. if (correlated && right.isInstanceOf[LogicalTableFunctionCall] && joinType != JoinType.INNER ) { if (!alwaysTrue) failValidation(“TableFunction left outer join predicate can only be " + “empty or literal true.”) } else { if (!equiJoinPredicateFound) { failValidation( s"Invalid join condition: $expression. 
At least one equi-join predicate is " + s"required.”) } } }}Join继承了BinaryNode,它内部将flink的JoinType转为calcite的JoinRelType类型,construct方法通过relBuilder.join来构建join关系小结Table支持多种形式的join,其中包括Inner Join、Outer Join、Time-windowed Join、Inner Join with Table Function、Left Outer Join with Table Function、Join with Temporal TableTable定义了join、leftOuterJoin、rightOuterJoin、fullOuterJoin方法,其最后都是调用的私有的join方法,其中JoinType用于表达join类型,分别有INNER, LEFT_OUTER, RIGHT_OUTER, FULL_OUTER这几种;另外接收String类型或者Expression的条件表达式,其中String类型最后是被解析为Expression类型;join方法最后是使用Join创建了新的TableJoin继承了BinaryNode,它内部将flink的JoinType转为calcite的JoinRelType类型,construct方法通过relBuilder.join来构建join关系docJoins ...
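The MySplitUDTF used in the table-function join snippets above is not shown in the post. A minimal sketch of such a user-defined table function plus an inner join against it, assuming Flink 1.7's Java streaming Table API (the Split class, field names and sample data are illustrative, not the original MySplitUDTF), might look like this:

import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.table.functions.TableFunction;
import org.apache.flink.types.Row;

public class TableFunctionJoinExample {

    // emits one row per whitespace-separated token of the input string
    public static class Split extends TableFunction<String> {
        public void eval(String str) {
            for (String token : str.split(" ")) {
                collect(token);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);
        tableEnv.registerFunction("split", new Split());

        DataStream<Tuple3<Long, String, String>> stream = env.fromElements(
                Tuple3.of(1L, "order-1", "apple banana"), Tuple3.of(2L, "order-2", "cherry"));
        Table orders = tableEnv.fromDataStream(stream, "a, b, c");

        // inner join: rows for which split(c) emits nothing are dropped;
        // leftOuterJoin would keep them with a null token instead
        Table result = orders
                .join(new Table(tableEnv, "split(c)").as("token"))
                .select("a, b, token");

        tableEnv.toAppendStream(result, Row.class).print();
        env.execute("table function join example");
    }
}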

January 29, 2019 · 5 min · jiezi

A look at flink Table's Over Windows

序本文主要研究一下flink Table的Over Windows实例Table table = input .window([OverWindow w].as(“w”)) // define over window with alias w .select(“a, b.sum over w, c.min over w”); // aggregate over the over window wOver Windows类似SQL的over子句,它可以基于event-time、processing-time或者row-count;具体可以通过Over类来构造,其中必须设置orderBy、preceding及as方法;它有Unbounded及Bounded两大类Unbounded Over Windows实例// Unbounded Event-time over window (assuming an event-time attribute “rowtime”).window(Over.partitionBy(“a”).orderBy(“rowtime”).preceding(“unbounded_range”).as(“w”));// Unbounded Processing-time over window (assuming a processing-time attribute “proctime”).window(Over.partitionBy(“a”).orderBy(“proctime”).preceding(“unbounded_range”).as(“w”));// Unbounded Event-time Row-count over window (assuming an event-time attribute “rowtime”).window(Over.partitionBy(“a”).orderBy(“rowtime”).preceding(“unbounded_row”).as(“w”)); // Unbounded Processing-time Row-count over window (assuming a processing-time attribute “proctime”).window(Over.partitionBy(“a”).orderBy(“proctime”).preceding(“unbounded_row”).as(“w”));对于event-time及processing-time使用unbounded_range来表示Unbounded,对于row-count使用unbounded_row来表示UnboundedBounded Over Windows实例// Bounded Event-time over window (assuming an event-time attribute “rowtime”).window(Over.partitionBy(“a”).orderBy(“rowtime”).preceding(“1.minutes”).as(“w”))// Bounded Processing-time over window (assuming a processing-time attribute “proctime”).window(Over.partitionBy(“a”).orderBy(“proctime”).preceding(“1.minutes”).as(“w”))// Bounded Event-time Row-count over window (assuming an event-time attribute “rowtime”).window(Over.partitionBy(“a”).orderBy(“rowtime”).preceding(“10.rows”).as(“w”)) // Bounded Processing-time Row-count over window (assuming a processing-time attribute “proctime”).window(Over.partitionBy(“a”).orderBy(“proctime”).preceding(“10.rows”).as(“w”))对于event-time及processing-time使用诸如1.minutes来表示Bounded,对于row-count使用诸如10.rows来表示BoundedTable.windowflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/api/table.scalaclass Table( private[flink] val tableEnv: TableEnvironment, private[flink] val logicalPlan: LogicalNode) { //…… @varargs def window(overWindows: OverWindow*): OverWindowedTable = { if (tableEnv.isInstanceOf[BatchTableEnvironment]) { throw new TableException(“Over-windows for batch tables are currently not supported.”) } if (overWindows.size != 1) { throw new TableException(“Over-Windows are currently only supported single window.”) } new OverWindowedTable(this, overWindows.toArray) } //……} Table提供了OverWindow参数的window方法,用来进行Over Windows操作,它创建的是OverWindowedTableOverWindowflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/api/windows.scala/** * Over window is similar to the traditional OVER SQL. /case class OverWindow( private[flink] val alias: Expression, private[flink] val partitionBy: Seq[Expression], private[flink] val orderBy: Expression, private[flink] val preceding: Expression, private[flink] val following: Expression)OverWindow定义了alias、partitionBy、orderBy、preceding、following属性Overflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/api/java/windows.scalaobject Over { /* * Specifies the time attribute on which rows are grouped. * * For streaming tables call [[orderBy ‘rowtime or orderBy ‘proctime]] to specify time mode. * * For batch tables, refer to a timestamp or long attribute. 
/ def orderBy(orderBy: String): OverWindowWithOrderBy = { val orderByExpr = ExpressionParser.parseExpression(orderBy) new OverWindowWithOrderBy(ArrayExpression, orderByExpr) } /* * Partitions the elements on some partition keys. * * @param partitionBy some partition keys. * @return A partitionedOver instance that only contains the orderBy method. / def partitionBy(partitionBy: String): PartitionedOver = { val partitionByExpr = ExpressionParser.parseExpressionList(partitionBy).toArray new PartitionedOver(partitionByExpr) }}class OverWindowWithOrderBy( private val partitionByExpr: Array[Expression], private val orderByExpr: Expression) { /* * Set the preceding offset (based on time or row-count intervals) for over window. * * @param preceding preceding offset relative to the current row. * @return this over window / def preceding(preceding: String): OverWindowWithPreceding = { val precedingExpr = ExpressionParser.parseExpression(preceding) new OverWindowWithPreceding(partitionByExpr, orderByExpr, precedingExpr) }}class PartitionedOver(private val partitionByExpr: Array[Expression]) { /* * Specifies the time attribute on which rows are grouped. * * For streaming tables call [[orderBy ‘rowtime or orderBy ‘proctime]] to specify time mode. * * For batch tables, refer to a timestamp or long attribute. / def orderBy(orderBy: String): OverWindowWithOrderBy = { val orderByExpr = ExpressionParser.parseExpression(orderBy) new OverWindowWithOrderBy(partitionByExpr, orderByExpr) }}class OverWindowWithPreceding( private val partitionBy: Seq[Expression], private val orderBy: Expression, private val preceding: Expression) { private[flink] var following: Expression = _ /* * Assigns an alias for this window that the following select() clause can refer to. * * @param alias alias for this over window * @return over window / def as(alias: String): OverWindow = as(ExpressionParser.parseExpression(alias)) /* * Assigns an alias for this window that the following select() clause can refer to. * * @param alias alias for this over window * @return over window / def as(alias: Expression): OverWindow = { // set following to CURRENT_ROW / CURRENT_RANGE if not defined if (null == following) { if (preceding.resultType.isInstanceOf[RowIntervalTypeInfo]) { following = CURRENT_ROW } else { following = CURRENT_RANGE } } OverWindow(alias, partitionBy, orderBy, preceding, following) } /* * Set the following offset (based on time or row-count intervals) for over window. * * @param following following offset that relative to the current row. * @return this over window / def following(following: String): OverWindowWithPreceding = { this.following(ExpressionParser.parseExpression(following)) } /* * Set the following offset (based on time or row-count intervals) for over window. * * @param following following offset that relative to the current row. 
* @return this over window / def following(following: Expression): OverWindowWithPreceding = { this.following = following this }}Over类是创建over window的帮助类,它提供了orderBy及partitionBy两个方法,分别创建的是OverWindowWithOrderBy及PartitionedOverPartitionedOver提供了orderBy方法,创建的是OverWindowWithOrderBy;OverWindowWithOrderBy提供了preceding方法,创建的是OverWindowWithPrecedingOverWindowWithPreceding则包含了partitionBy、orderBy、preceding属性,它提供了as方法创建OverWindow,另外还提供了following方法用于设置following offsetOverWindowedTableflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/api/table.scalaclass OverWindowedTable( private[flink] val table: Table, private[flink] val overWindows: Array[OverWindow]) { def select(fields: Expression): Table = { val expandedFields = expandProjectList( fields, table.logicalPlan, table.tableEnv) if(fields.exists(.isInstanceOf[WindowProperty])){ throw new ValidationException( “Window start and end properties are not available for Over windows.”) } val expandedOverFields = resolveOverWindows(expandedFields, overWindows, table.tableEnv) new Table( table.tableEnv, Project( expandedOverFields.map(UnresolvedAlias), table.logicalPlan, // required for proper projection push down explicitAlias = true) .validate(table.tableEnv) ) } def select(fields: String): Table = { val fieldExprs = ExpressionParser.parseExpressionList(fields) //get the correct expression for AggFunctionCall val withResolvedAggFunctionCall = fieldExprs.map(replaceAggFunctionCall(, table.tableEnv)) select(withResolvedAggFunctionCall: _*) }}OverWindowedTable构造器需要overWindows参数;它只提供select操作,其中select可以接收String类型的参数,也可以接收Expression类型的参数;String类型的参数会被转换为Expression类型,最后调用的是Expression类型参数的select方法;select方法创建了新的Table,其Project的projectList为expandedOverFields.map(UnresolvedAlias),而expandedOverFields则通过resolveOverWindows(expandedFields, overWindows, table.tableEnv)得到小结Over Windows类似SQL的over子句,它可以基于event-time、processing-time或者row-count;具体可以通过Over类来构造,其中必须设置orderBy、preceding及as方法;它有Unbounded及Bounded两大类(对于event-time及processing-time使用unbounded_range来表示Unbounded,对于row-count使用unbounded_row来表示Unbounded;对于event-time及processing-time使用诸如1.minutes来表示Bounded,对于row-count使用诸如10.rows来表示Bounded)Table提供了OverWindow参数的window方法,用来进行Over Windows操作,它创建的是OverWindowedTable;OverWindow定义了alias、partitionBy、orderBy、preceding、following属性;Over类是创建over window的帮助类,它提供了orderBy及partitionBy两个方法,分别创建的是OverWindowWithOrderBy及PartitionedOver,而PartitionedOver提供了orderBy方法,创建的是OverWindowWithOrderBy;OverWindowWithOrderBy提供了preceding方法,创建的是OverWindowWithPreceding;OverWindowWithPreceding则包含了partitionBy、orderBy、preceding属性,它提供了as方法创建OverWindow,另外还提供了following方法用于设置following offsetOverWindowedTable构造器需要overWindows参数;它只提供select操作,其中select可以接收String类型的参数,也可以接收Expression类型的参数;String类型的参数会被转换为Expression类型,最后调用的是Expression类型参数的select方法;select方法创建了新的Table,其Project的projectList为expandedOverFields.map(UnresolvedAlias),而expandedOverFields则通过resolveOverWindows(expandedFields, overWindows, table.tableEnv)得到docOver Windows ...
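As a concrete illustration of the builder chain described above (Over.partitionBy, then PartitionedOver.orderBy, then OverWindowWithOrderBy.preceding, then as), here is a minimal sketch of a bounded row-count over window on a processing-time attribute, assuming Flink 1.7's string-based Java API (field names and sample data are illustrative):

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.Over;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

public class OverWindowExample {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);

        DataStream<Tuple2<String, Integer>> stream = env.fromElements(
                Tuple2.of("a", 1), Tuple2.of("a", 2), Tuple2.of("b", 3));

        // append a processing-time attribute so the over window can order by it
        Table input = tableEnv.fromDataStream(stream, "key, amount, proctime.proctime");

        // bounded row-count over window: per key, sum over the current row
        // and the 10 preceding rows (following defaults to CURRENT_ROW)
        Table result = input
                .window(Over.partitionBy("key").orderBy("proctime").preceding("10.rows").as("w"))
                .select("key, amount.sum over w");

        tableEnv.toAppendStream(result, Row.class).print();
        env.execute("over window example");
    }
}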

January 27, 2019 · 4 min · jiezi

A look at flink Table's Group Windows

序本文主要研究一下flink Table的Group Windows实例Table table = input .window([Window w].as(“w”)) // define window with alias w .groupBy(“w”) // group the table by window w .select(“b.sum”); // aggregateTable table = input .window([Window w].as(“w”)) // define window with alias w .groupBy(“w, a”) // group the table by attribute a and window w .select(“a, b.sum”); // aggregateTable table = input .window([Window w].as(“w”)) // define window with alias w .groupBy(“w, a”) // group the table by attribute a and window w .select(“a, w.start, w.end, w.rowtime, b.count”); // aggregate and add window start, end, and rowtime timestampswindow操作可以对Window进行别名,然后可以在groupBy及select中引用,window有start、end、rowtime属性可以用,其中start及rowtime是inclusive的,而end为exclusiveTumbling Windows实例// Tumbling Event-time Window.window(Tumble.over(“10.minutes”).on(“rowtime”).as(“w”));// Tumbling Processing-time Window (assuming a processing-time attribute “proctime”).window(Tumble.over(“10.minutes”).on(“proctime”).as(“w”));// Tumbling Row-count Window (assuming a processing-time attribute “proctime”).window(Tumble.over(“10.rows”).on(“proctime”).as(“w”));Tumbling Windows按固定窗口大小来移动,因而窗口不重叠;over方法用于指定窗口大小;窗口大小可以基于event-time、processing-time、row-count来定义Sliding Windows实例// Sliding Event-time Window.window(Slide.over(“10.minutes”).every(“5.minutes”).on(“rowtime”).as(“w”));// Sliding Processing-time window (assuming a processing-time attribute “proctime”).window(Slide.over(“10.minutes”).every(“5.minutes”).on(“proctime”).as(“w”));// Sliding Row-count window (assuming a processing-time attribute “proctime”).window(Slide.over(“10.rows”).every(“5.rows”).on(“proctime”).as(“w”));Sliding Windows在slide interval小于window size的时候,窗口会有重叠,因而rows可能归属多个窗口;over方法用于指定窗口大小,窗口大小可以基于event-time、processing-time、row-count来定义;every方法用于指定slide intervalSession Windows实例// Session Event-time Window.window(Session.withGap(“10.minutes”).on(“rowtime”).as(“w”));// Session Processing-time Window (assuming a processing-time attribute “proctime”).window(Session.withGap(“10.minutes”).on(“proctime”).as(“w”));Session Windows没有固定的窗口大小,它基于inactivity的程度来关闭窗口,withGap方法用于指定两个窗口的gap,作为time interval;Session Windows只能使用event-time或者processing-timeTable.windowflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/api/table.scalaclass Table( private[flink] val tableEnv: TableEnvironment, private[flink] val logicalPlan: LogicalNode) { //…… def window(window: Window): WindowedTable = { new WindowedTable(this, window) } //……}Table提供了window操作,接收Window参数,创建的是WindowedTableWindowedTableflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/api/table.scalaclass WindowedTable( private[flink] val table: Table, private[flink] val window: Window) { def groupBy(fields: Expression*): WindowGroupedTable = { val fieldsWithoutWindow = fields.filterNot(window.alias.equals(_)) if (fields.size != fieldsWithoutWindow.size + 1) { throw new ValidationException(“GroupBy must contain exactly one window alias.”) } new WindowGroupedTable(table, fieldsWithoutWindow, window) } def groupBy(fields: String): WindowGroupedTable = { val fieldsExpr = ExpressionParser.parseExpressionList(fields) groupBy(fieldsExpr: _) }}WindowedTable只提供groupBy操作,其中groupBy可以接收String类型的参数,也可以接收Expression类型的参数;String类型的参数会被转换为Expression类型,最后调用的是Expression类型参数的groupBy方法;如果groupBy除了window没有其他属性,则其parallelism为1,只会在单一task上执行;groupBy方法创建的是WindowGroupedTableWindowGroupedTableflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/api/table.scalaclass WindowGroupedTable( private[flink] val table: Table, private[flink] val groupKeys: Seq[Expression], 
private[flink] val window: Window) { def select(fields: Expression): Table = { val expandedFields = expandProjectList(fields, table.logicalPlan, table.tableEnv) val (aggNames, propNames) = extractAggregationsAndProperties(expandedFields, table.tableEnv) val projectsOnAgg = replaceAggregationsAndProperties( expandedFields, table.tableEnv, aggNames, propNames) val projectFields = extractFieldReferences(expandedFields ++ groupKeys :+ window.timeField) new Table(table.tableEnv, Project( projectsOnAgg, WindowAggregate( groupKeys, window.toLogicalWindow, propNames.map(a => Alias(a._1, a._2)).toSeq, aggNames.map(a => Alias(a._1, a.2)).toSeq, Project(projectFields, table.logicalPlan).validate(table.tableEnv) ).validate(table.tableEnv), // required for proper resolution of the time attribute in multi-windows explicitAlias = true ).validate(table.tableEnv)) } def select(fields: String): Table = { val fieldExprs = ExpressionParser.parseExpressionList(fields) //get the correct expression for AggFunctionCall val withResolvedAggFunctionCall = fieldExprs.map(replaceAggFunctionCall(, table.tableEnv)) select(withResolvedAggFunctionCall: _*) }}WindowGroupedTable只提供select操作,其中select可以接收String类型的参数,也可以接收Expression类型的参数;String类型的参数会被转换为Expression类型,最后调用的是Expression类型参数的select方法;select方法创建了新的Table,其Project的child为WindowAggregate小结window操作可以对Window进行别名,然后可以在groupBy及select中引用,window有start、end、rowtime属性可以用,其中start及rowtime是inclusive的,而end为exclusiveTumbling Windows按固定窗口大小来移动,因而窗口不重叠;over方法用于指定窗口大小;窗口大小可以基于event-time、processing-time、row-count来定义;Sliding Windows在slide interval小于window size的时候,窗口会有重叠,因而rows可能归属多个窗口;over方法用于指定窗口大小,窗口大小可以基于event-time、processing-time、row-count来定义;every方法用于指定slide interval;Session Windows没有固定的窗口大小,它基于inactivity的程度来关闭窗口,withGap方法用于指定两个窗口的gap,作为time interval;Session Windows只能使用event-time或者processing-timeTable提供了window操作,接收Window参数,创建的是WindowedTable;WindowedTable只提供groupBy操作,其中groupBy可以接收String类型的参数,也可以接收Expression类型的参数;String类型的参数会被转换为Expression类型,最后调用的是Expression类型参数的groupBy方法;如果groupBy除了window没有其他属性,则其parallelism为1,只会在单一task上执行;groupBy方法创建的是WindowGroupedTable;WindowGroupedTable只提供select操作,其中select可以接收String类型的参数,也可以接收Expression类型的参数;String类型的参数会被转换为Expression类型,最后调用的是Expression类型参数的select方法;select方法创建了新的Table,其Project的child为WindowAggregatedocGroup Windows ...
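For reference, here is a minimal sketch of the window, groupBy, select chain described above, using a tumbling processing-time window; it assumes Flink 1.7's string-based Java API and that the Tumble helper lives in org.apache.flink.table.api.java (field names and sample data are illustrative):

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.table.api.java.Tumble;
import org.apache.flink.types.Row;

public class TumbleWindowExample {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);

        DataStream<Tuple2<String, Integer>> stream = env.fromElements(
                Tuple2.of("a", 1), Tuple2.of("a", 2), Tuple2.of("b", 3));

        // append a processing-time attribute that the window is defined on
        Table input = tableEnv.fromDataStream(stream, "key, amount, proctime.proctime");

        // tumbling 10-second window; groupBy must reference the window alias "w"
        Table result = input
                .window(Tumble.over("10.seconds").on("proctime").as("w"))
                .groupBy("w, key")
                .select("key, amount.sum, w.start, w.end");

        tableEnv.toAppendStream(result, Row.class).print();
        env.execute("tumbling window example");
    }
}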

January 26, 2019 · 2 min · jiezi

A look at flink Table's groupBy operation

序本文主要研究一下flink Table的groupBy操作Table.groupByflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/api/table.scalaclass Table( private[flink] val tableEnv: TableEnvironment, private[flink] val logicalPlan: LogicalNode) { //…… def groupBy(fields: String): GroupedTable = { val fieldsExpr = ExpressionParser.parseExpressionList(fields) groupBy(fieldsExpr: ) } def groupBy(fields: Expression): GroupedTable = { new GroupedTable(this, fields) } //……}Table的groupBy操作支持两种参数,一种是String类型,一种是Expression类型;String参数的方法是将String转换为Expression,最后调用的Expression参数的groupBy方法,该方法创建了GroupedTableGroupedTableflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/api/table.scalaclass GroupedTable( private[flink] val table: Table, private[flink] val groupKey: Seq[Expression]) { def select(fields: Expression*): Table = { val expandedFields = expandProjectList(fields, table.logicalPlan, table.tableEnv) val (aggNames, propNames) = extractAggregationsAndProperties(expandedFields, table.tableEnv) if (propNames.nonEmpty) { throw new ValidationException(“Window properties can only be used on windowed tables.”) } val projectsOnAgg = replaceAggregationsAndProperties( expandedFields, table.tableEnv, aggNames, propNames) val projectFields = extractFieldReferences(expandedFields ++ groupKey) new Table(table.tableEnv, Project(projectsOnAgg, Aggregate(groupKey, aggNames.map(a => Alias(a.1, a.2)).toSeq, Project(projectFields, table.logicalPlan).validate(table.tableEnv) ).validate(table.tableEnv) ).validate(table.tableEnv)) } def select(fields: String): Table = { val fieldExprs = ExpressionParser.parseExpressionList(fields) //get the correct expression for AggFunctionCall val withResolvedAggFunctionCall = fieldExprs.map(replaceAggFunctionCall(, table.tableEnv)) select(withResolvedAggFunctionCall: *) }}GroupedTable有两个属性,一个是原始的Table,一个是Seq[Expression]类型的groupKeyGroupedTable提供两个select方法,参数类型分别为String、Expression,String类型的参数最后也是转为Expression类型select方法使用Project创建新的Table,而Project则是通过Aggregate来创建Aggregateflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/plan/logical/operators.scalacase class Aggregate( groupingExpressions: Seq[Expression], aggregateExpressions: Seq[NamedExpression], child: LogicalNode) extends UnaryNode { override def output: Seq[Attribute] = { (groupingExpressions ++ aggregateExpressions) map { case ne: NamedExpression => ne.toAttribute case e => Alias(e, e.toString).toAttribute } } override protected[logical] def construct(relBuilder: RelBuilder): RelBuilder = { child.construct(relBuilder) relBuilder.aggregate( relBuilder.groupKey(groupingExpressions.map(.toRexNode(relBuilder)).asJava), aggregateExpressions.map { case Alias(agg: Aggregation, name, ) => agg.toAggCall(name)(relBuilder) case _ => throw new RuntimeException(“This should never happen.”) }.asJava) } override def validate(tableEnv: TableEnvironment): LogicalNode = { implicit val relBuilder: RelBuilder = tableEnv.getRelBuilder val resolvedAggregate = super.validate(tableEnv).asInstanceOf[Aggregate] val groupingExprs = resolvedAggregate.groupingExpressions val aggregateExprs = resolvedAggregate.aggregateExpressions aggregateExprs.foreach(validateAggregateExpression) groupingExprs.foreach(validateGroupingExpression) def validateAggregateExpression(expr: Expression): Unit = expr match { case distinctExpr: DistinctAgg => distinctExpr.child match { case : DistinctAgg => failValidation( “Chained distinct operators are not supported!”) case aggExpr: Aggregation => validateAggregateExpression(aggExpr) case _ => failValidation( “Distinct operator can only be 
applied to aggregation expressions!”) } // check aggregate function case aggExpr: Aggregation if aggExpr.getSqlAggFunction.requiresOver => failValidation(s"OVER clause is necessary for window functions: [${aggExpr.getClass}].") // check no nested aggregation exists. case aggExpr: Aggregation => aggExpr.children.foreach { child => child.preOrderVisit { case agg: Aggregation => failValidation( “It’s not allowed to use an aggregate function as " + “input of another aggregate function”) case _ => // OK } } case a: Attribute if !groupingExprs.exists(.checkEquals(a)) => failValidation( s"expression ‘$a’ is invalid because it is neither” + " present in group by nor an aggregate function") case e if groupingExprs.exists(.checkEquals(e)) => // OK case e => e.children.foreach(validateAggregateExpression) } def validateGroupingExpression(expr: Expression): Unit = { if (!expr.resultType.isKeyType) { failValidation( s"expression $expr cannot be used as a grouping expression " + “because it’s not a valid key type which must be hashable and comparable”) } } resolvedAggregate }}Aggregate继承了UnaryNode,它接收三个参数,一个是Seq[Expression]类型的groupingExpressions,一个是Seq[NamedExpression]类型的aggregateExpressions,一个是LogicalNode类型的child;construct方法调用了relBuilder.aggregate,传入的RelBuilder.GroupKey参数是通过relBuilder.groupKey构建,而传入的RelBuilder.AggCall参数则是通过aggregateExpressions.map构造而来RelBuilder.groupKeycalcite-core-1.18.0-sources.jar!/org/apache/calcite/tools/RelBuilder.javapublic class RelBuilder { protected final RelOptCluster cluster; protected final RelOptSchema relOptSchema; private final RelFactories.FilterFactory filterFactory; private final RelFactories.ProjectFactory projectFactory; private final RelFactories.AggregateFactory aggregateFactory; private final RelFactories.SortFactory sortFactory; private final RelFactories.ExchangeFactory exchangeFactory; private final RelFactories.SortExchangeFactory sortExchangeFactory; private final RelFactories.SetOpFactory setOpFactory; private final RelFactories.JoinFactory joinFactory; private final RelFactories.SemiJoinFactory semiJoinFactory; private final RelFactories.CorrelateFactory correlateFactory; private final RelFactories.ValuesFactory valuesFactory; private final RelFactories.TableScanFactory scanFactory; private final RelFactories.MatchFactory matchFactory; private final Deque<Frame> stack = new ArrayDeque<>(); private final boolean simplify; private final RexSimplify simplifier; //…… /** Creates an empty group key. / public GroupKey groupKey() { return groupKey(ImmutableList.of()); } /* Creates a group key. / public GroupKey groupKey(RexNode… nodes) { return groupKey(ImmutableList.copyOf(nodes)); } /* Creates a group key. / public GroupKey groupKey(Iterable<? extends RexNode> nodes) { return new GroupKeyImpl(ImmutableList.copyOf(nodes), false, null, null); } /* Creates a group key with grouping sets. */ public GroupKey groupKey(Iterable<? extends RexNode> nodes, Iterable<? extends Iterable<? extends RexNode>> nodeLists) { return groupKey(nodes, false, nodeLists); } /** Creates a group key of fields identified by ordinal. / public GroupKey groupKey(int… fieldOrdinals) { return groupKey(fields(ImmutableIntList.of(fieldOrdinals))); } /* Creates a group key of fields identified by name. 
*/ public GroupKey groupKey(String… fieldNames) { return groupKey(fields(ImmutableList.copyOf(fieldNames))); } public GroupKey groupKey(@Nonnull ImmutableBitSet groupSet) { return groupKey(groupSet, ImmutableList.of(groupSet)); } public GroupKey groupKey(ImmutableBitSet groupSet, @Nonnull Iterable<? extends ImmutableBitSet> groupSets) { return groupKey(groupSet, false, ImmutableList.copyOf(groupSets)); } private GroupKey groupKey_(ImmutableBitSet groupSet, boolean indicator, @Nonnull ImmutableList<ImmutableBitSet> groupSets) { if (groupSet.length() > peek().getRowType().getFieldCount()) { throw new IllegalArgumentException(“out of bounds: " + groupSet); } Objects.requireNonNull(groupSets); final ImmutableList<RexNode> nodes = fields(ImmutableIntList.of(groupSet.toArray())); final List<ImmutableList<RexNode>> nodeLists = Util.transform(groupSets, bitSet -> fields(ImmutableIntList.of(bitSet.toArray()))); return groupKey_(nodes, indicator, nodeLists); } private GroupKey groupKey_(Iterable<? extends RexNode> nodes, boolean indicator, Iterable<? extends Iterable<? extends RexNode>> nodeLists) { final ImmutableList.Builder<ImmutableList<RexNode>> builder = ImmutableList.builder(); for (Iterable<? extends RexNode> nodeList : nodeLists) { builder.add(ImmutableList.copyOf(nodeList)); } return new GroupKeyImpl(ImmutableList.copyOf(nodes), indicator, builder.build(), null); } //……}RelBuilder提供了诸多groupKey方法用于创建GroupKey,其最后调用的是私有方法groupKey_,该方法创建了GroupKeyImplGroupKeycalcite-core-1.18.0-sources.jar!/org/apache/calcite/tools/RelBuilder.java public interface GroupKey { /** Assigns an alias to this group key. * * <p>Used to assign field names in the {@code group} operation. / GroupKey alias(String alias); } /* Implementation of {@link GroupKey}. / protected static class GroupKeyImpl implements GroupKey { final ImmutableList<RexNode> nodes; final boolean indicator; final ImmutableList<ImmutableList<RexNode>> nodeLists; final String alias; GroupKeyImpl(ImmutableList<RexNode> nodes, boolean indicator, ImmutableList<ImmutableList<RexNode>> nodeLists, String alias) { this.nodes = Objects.requireNonNull(nodes); assert !indicator; this.indicator = indicator; this.nodeLists = nodeLists; this.alias = alias; } @Override public String toString() { return alias == null ? nodes.toString() : nodes + " as " + alias; } public GroupKey alias(String alias) { return Objects.equals(this.alias, alias) ? 
this : new GroupKeyImpl(nodes, indicator, nodeLists, alias); } }GroupKey接口定义了alias方法,用于给group操作的字段别名;GroupKeyImpl是GroupKey接口的实现类,其alias返回的是GroupKeyImplRelBuilder.aggregatecalcite-core-1.18.0-sources.jar!/org/apache/calcite/tools/RelBuilder.javapublic class RelBuilder { protected final RelOptCluster cluster; protected final RelOptSchema relOptSchema; private final RelFactories.FilterFactory filterFactory; private final RelFactories.ProjectFactory projectFactory; private final RelFactories.AggregateFactory aggregateFactory; private final RelFactories.SortFactory sortFactory; private final RelFactories.ExchangeFactory exchangeFactory; private final RelFactories.SortExchangeFactory sortExchangeFactory; private final RelFactories.SetOpFactory setOpFactory; private final RelFactories.JoinFactory joinFactory; private final RelFactories.SemiJoinFactory semiJoinFactory; private final RelFactories.CorrelateFactory correlateFactory; private final RelFactories.ValuesFactory valuesFactory; private final RelFactories.TableScanFactory scanFactory; private final RelFactories.MatchFactory matchFactory; private final Deque<Frame> stack = new ArrayDeque<>(); private final boolean simplify; private final RexSimplify simplifier; //…… /* Creates an {@link Aggregate} with an array of * calls. / public RelBuilder aggregate(GroupKey groupKey, AggCall… aggCalls) { return aggregate(groupKey, ImmutableList.copyOf(aggCalls)); } public RelBuilder aggregate(GroupKey groupKey, List<AggregateCall> aggregateCalls) { return aggregate(groupKey, Lists.transform(aggregateCalls, AggCallImpl2::new)); } /* Creates an {@link Aggregate} with a list of * calls. / public RelBuilder aggregate(GroupKey groupKey, Iterable<AggCall> aggCalls) { final Registrar registrar = new Registrar(); registrar.extraNodes.addAll(fields()); registrar.names.addAll(peek().getRowType().getFieldNames()); final GroupKeyImpl groupKey_ = (GroupKeyImpl) groupKey; final ImmutableBitSet groupSet = ImmutableBitSet.of(registrar.registerExpressions(groupKey_.nodes)); label: if (Iterables.isEmpty(aggCalls) && !groupKey_.indicator) { final RelMetadataQuery mq = peek().getCluster().getMetadataQuery(); if (groupSet.isEmpty()) { final Double minRowCount = mq.getMinRowCount(peek()); if (minRowCount == null || minRowCount < 1D) { // We can’t remove “GROUP BY ()” if there’s a chance the rel could be // empty. break label; } } if (registrar.extraNodes.size() == fields().size()) { final Boolean unique = mq.areColumnsUnique(peek(), groupSet); if (unique != null && unique) { // Rel is already unique. return project(fields(groupSet.asList())); } } final Double maxRowCount = mq.getMaxRowCount(peek()); if (maxRowCount != null && maxRowCount <= 1D) { // If there is at most one row, rel is already unique. 
return this; } } final ImmutableList<ImmutableBitSet> groupSets; if (groupKey_.nodeLists != null) { final int sizeBefore = registrar.extraNodes.size(); final SortedSet<ImmutableBitSet> groupSetSet = new TreeSet<>(ImmutableBitSet.ORDERING); for (ImmutableList<RexNode> nodeList : groupKey_.nodeLists) { final ImmutableBitSet groupSet2 = ImmutableBitSet.of(registrar.registerExpressions(nodeList)); if (!groupSet.contains(groupSet2)) { throw new IllegalArgumentException(“group set element " + nodeList + " must be a subset of group key”); } groupSetSet.add(groupSet2); } groupSets = ImmutableList.copyOf(groupSetSet); if (registrar.extraNodes.size() > sizeBefore) { throw new IllegalArgumentException( “group sets contained expressions not in group key: " + registrar.extraNodes.subList(sizeBefore, registrar.extraNodes.size())); } } else { groupSets = ImmutableList.of(groupSet); } for (AggCall aggCall : aggCalls) { if (aggCall instanceof AggCallImpl) { final AggCallImpl aggCall1 = (AggCallImpl) aggCall; registrar.registerExpressions(aggCall1.operands); if (aggCall1.filter != null) { registrar.registerExpression(aggCall1.filter); } } } project(registrar.extraNodes); rename(registrar.names); final Frame frame = stack.pop(); final RelNode r = frame.rel; final List<AggregateCall> aggregateCalls = new ArrayList<>(); for (AggCall aggCall : aggCalls) { final AggregateCall aggregateCall; if (aggCall instanceof AggCallImpl) { final AggCallImpl aggCall1 = (AggCallImpl) aggCall; final List<Integer> args = registrar.registerExpressions(aggCall1.operands); final int filterArg = aggCall1.filter == null ? -1 : registrar.registerExpression(aggCall1.filter); if (aggCall1.distinct && !aggCall1.aggFunction.isQuantifierAllowed()) { throw new IllegalArgumentException(“DISTINCT not allowed”); } if (aggCall1.filter != null && !aggCall1.aggFunction.allowsFilter()) { throw new IllegalArgumentException(“FILTER not allowed”); } RelCollation collation = RelCollations.of(aggCall1.orderKeys .stream() .map(orderKey -> collation(orderKey, RelFieldCollation.Direction.ASCENDING, null, Collections.emptyList())) .collect(Collectors.toList())); aggregateCall = AggregateCall.create(aggCall1.aggFunction, aggCall1.distinct, aggCall1.approximate, args, filterArg, collation, groupSet.cardinality(), r, null, aggCall1.alias); } else { aggregateCall = ((AggCallImpl2) aggCall).aggregateCall; } aggregateCalls.add(aggregateCall); } assert ImmutableBitSet.ORDERING.isStrictlyOrdered(groupSets) : groupSets; for (ImmutableBitSet set : groupSets) { assert groupSet.contains(set); } RelNode aggregate = aggregateFactory.createAggregate(r, groupKey_.indicator, groupSet, groupSets, aggregateCalls); // build field list final ImmutableList.Builder<Field> fields = ImmutableList.builder(); final List<RelDataTypeField> aggregateFields = aggregate.getRowType().getFieldList(); int i = 0; // first, group fields for (Integer groupField : groupSet.asList()) { RexNode node = registrar.extraNodes.get(groupField); final SqlKind kind = node.getKind(); switch (kind) { case INPUT_REF: fields.add(frame.fields.get(((RexInputRef) node).getIndex())); break; default: String name = aggregateFields.get(i).getName(); RelDataTypeField fieldType = new RelDataTypeFieldImpl(name, i, node.getType()); fields.add(new Field(ImmutableSet.of(), fieldType)); break; } i++; } // second, indicator fields (copy from aggregate rel type) if (groupKey_.indicator) { for (int j = 0; j < groupSet.cardinality(); ++j) { final RelDataTypeField field = aggregateFields.get(i); final RelDataTypeField 
fieldType = new RelDataTypeFieldImpl(field.getName(), i, field.getType()); fields.add(new Field(ImmutableSet.of(), fieldType)); i++; } } // third, aggregate fields. retain `i’ as field index for (int j = 0; j < aggregateCalls.size(); ++j) { final AggregateCall call = aggregateCalls.get(j); final RelDataTypeField fieldType = new RelDataTypeFieldImpl(aggregateFields.get(i + j).getName(), i + j, call.getType()); fields.add(new Field(ImmutableSet.of(), fieldType)); } stack.push(new Frame(aggregate, fields.build())); return this; } //……}RelBuilder的aggregate操作接收两个参数,一个是GroupKey,一个是集合类型的AggCall;其中AggCall最后是转换为AggregateCall,然后通过aggregateFactory.createAggregate方法取出stack队首的Frame,创建新的RelNode,构造新的Frame,然后重新放入stack的队首RelFactories.AggregateFactory.createAggregatecalcite-core-1.18.0-sources.jar!/org/apache/calcite/rel/core/RelFactories.javapublic class RelFactories { //…… public static final AggregateFactory DEFAULT_AGGREGATE_FACTORY = new AggregateFactoryImpl(); public interface AggregateFactory { /* Creates an aggregate. */ RelNode createAggregate(RelNode input, boolean indicator, ImmutableBitSet groupSet, ImmutableList<ImmutableBitSet> groupSets, List<AggregateCall> aggCalls); } private static class AggregateFactoryImpl implements AggregateFactory { @SuppressWarnings(“deprecation”) public RelNode createAggregate(RelNode input, boolean indicator, ImmutableBitSet groupSet, ImmutableList<ImmutableBitSet> groupSets, List<AggregateCall> aggCalls) { return LogicalAggregate.create(input, indicator, groupSet, groupSets, aggCalls); } } //……}RelFactories定义了AggregateFactory接口,该接口定义了createAggregate方法,用于将一系列的AggregateCall操作转为新的RelNode;AggregateFactoryImpl是AggregateFactory接口的实现类,它的createAggregate方法调用的是LogicalAggregate.create方法LogicalAggregate.createcalcite-core-1.18.0-sources.jar!/org/apache/calcite/rel/logical/LogicalAggregate.javapublic final class LogicalAggregate extends Aggregate { //…… public static LogicalAggregate create(final RelNode input, ImmutableBitSet groupSet, List<ImmutableBitSet> groupSets, List<AggregateCall> aggCalls) { return create_(input, false, groupSet, groupSets, aggCalls); } @Deprecated // to be removed before 2.0 public static LogicalAggregate create(final RelNode input, boolean indicator, ImmutableBitSet groupSet, List<ImmutableBitSet> groupSets, List<AggregateCall> aggCalls) { return create_(input, indicator, groupSet, groupSets, aggCalls); } private static LogicalAggregate create_(final RelNode input, boolean indicator, ImmutableBitSet groupSet, List<ImmutableBitSet> groupSets, List<AggregateCall> aggCalls) { final RelOptCluster cluster = input.getCluster(); final RelTraitSet traitSet = cluster.traitSetOf(Convention.NONE); return new LogicalAggregate(cluster, traitSet, input, indicator, groupSet, groupSets, aggCalls); } 
//……}LogicalAggregate的create方法创建的是LogicalAggregate小结Table的groupBy操作支持两种参数,一种是String类型,一种是Expression类型;String参数的方法是将String转换为Expression,最后调用的Expression参数的groupBy方法,该方法创建了GroupedTableGroupedTable有两个属性,一个是原始的Table,一个是Seq[Expression]类型的groupKey;它提供两个select方法,参数类型分别为String、Expression,String类型的参数最后也是转为Expression类型;select方法使用Project创建新的Table,而Project则是通过Aggregate来创建Aggregate继承了UnaryNode,它接收三个参数,一个是Seq[Expression]类型的groupingExpressions,一个是Seq[NamedExpression]类型的aggregateExpressions,一个是LogicalNode类型的child;construct方法调用了relBuilder.aggregate,传入的RelBuilder.GroupKey参数是通过relBuilder.groupKey构建,而传入的RelBuilder.AggCall参数则是通过aggregateExpressions.map构造而来RelBuilder的aggregate操作接收两个参数,一个是GroupKey(GroupKey接口定义了alias方法,用于给group操作的字段别名;GroupKeyImpl是GroupKey接口的实现类,其alias返回的是GroupKeyImpl),一个是集合类型的AggCall;其中AggCall最后是转换为AggregateCall,然后通过aggregateFactory.createAggregate方法取出stack队首的Frame,创建新的RelNode,构造新的Frame,然后重新放入stack的队首RelFactories定义了AggregateFactory接口,该接口定义了createAggregate方法,用于将一系列的AggregateCall操作转为新的RelNode;AggregateFactoryImpl是AggregateFactory接口的实现类,它的createAggregate方法调用的是LogicalAggregate.create方法,创建的是LogicalAggregatedocAggregations ...
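To tie the walkthrough above back to user code, here is a minimal sketch of a groupBy followed by select on the batch Table API, assuming Flink 1.7's Java API (field names and sample data are illustrative):

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.BatchTableEnvironment;
import org.apache.flink.types.Row;

public class GroupByExample {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        BatchTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);

        DataSet<Tuple2<String, Integer>> ds = env.fromElements(
                Tuple2.of("a", 1), Tuple2.of("a", 2), Tuple2.of("b", 3));

        Table input = tableEnv.fromDataSet(ds, "key, amount");

        // groupBy("key") builds a GroupedTable; select(...) then wraps
        // an Aggregate node in a Project, as described above
        Table result = input
                .groupBy("key")
                .select("key, amount.sum as total, amount.count as cnt");

        tableEnv.toDataSet(result, Row.class).print();
    }
}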

January 25, 2019 · 8 min · jiezi

A look at flink Table's where and filter operations

序本文主要研究一下flink Table的where及filter操作Tableflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/api/table.scalaclass Table( private[flink] val tableEnv: TableEnvironment, private[flink] val logicalPlan: LogicalNode) { //…… def where(predicate: String): Table = { filter(predicate) } def where(predicate: Expression): Table = { filter(predicate) } def filter(predicate: String): Table = { val predicateExpr = ExpressionParser.parseExpression(predicate) filter(predicateExpr) } def filter(predicate: Expression): Table = { new Table(tableEnv, Filter(predicate, logicalPlan).validate(tableEnv)) } //……}Table的where及filter操作均有两中方法,一种是String参数,一种是Expression参数;而where方法内部是调用filter方法;filter方法使用Filter(predicate, logicalPlan).validate(tableEnv)创建了新的Table;String参数最后是通过ExpressionParser.parseExpression方法转换为Expression类型Filterflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/plan/logical/operators.scalacase class Filter(condition: Expression, child: LogicalNode) extends UnaryNode { override def output: Seq[Attribute] = child.output override protected[logical] def construct(relBuilder: RelBuilder): RelBuilder = { child.construct(relBuilder) relBuilder.filter(condition.toRexNode(relBuilder)) } override def validate(tableEnv: TableEnvironment): LogicalNode = { val resolvedFilter = super.validate(tableEnv).asInstanceOf[Filter] if (resolvedFilter.condition.resultType != BOOLEAN_TYPE_INFO) { failValidation(s"Filter operator requires a boolean expression as input," + s" but ${resolvedFilter.condition} is of type ${resolvedFilter.condition.resultType}") } resolvedFilter }}Filter对象继承了UnaryNode,它覆盖了output、construct、validate等方法;construct方法先通过Expression.toRexNode将flink的Expression转换为Apache Calcite的RexNode,然后再执行Apache Calcite的RelBuilder的filter方法RexNodecalcite-core-1.18.0-sources.jar!/org/apache/calcite/rex/RexNode.javapublic abstract class RexNode { //~ Instance fields ——————————————————– // Effectively final. Set in each sub-class constructor, and never re-set. 
protected String digest; //~ Methods —————————————————————- public abstract RelDataType getType(); public boolean isAlwaysTrue() { return false; } public boolean isAlwaysFalse() { return false; } public boolean isA(SqlKind kind) { return getKind() == kind; } public boolean isA(Collection<SqlKind> kinds) { return getKind().belongsTo(kinds); } public SqlKind getKind() { return SqlKind.OTHER; } public String toString() { return digest; } public abstract <R> R accept(RexVisitor<R> visitor); public abstract <R, P> R accept(RexBiVisitor<R, P> visitor, P arg); @Override public abstract boolean equals(Object obj); @Override public abstract int hashCode();}RexNode是Row expression,可以通过RexBuilder来创建;它有很多子类,比如RexCall、RexVariable、RexFieldAccess等RelBuilder.filtercalcite-core-1.18.0-sources.jar!/org/apache/calcite/tools/RelBuilder.javapublic class RelBuilder { protected final RelOptCluster cluster; protected final RelOptSchema relOptSchema; private final RelFactories.FilterFactory filterFactory; private final RelFactories.ProjectFactory projectFactory; private final RelFactories.AggregateFactory aggregateFactory; private final RelFactories.SortFactory sortFactory; private final RelFactories.ExchangeFactory exchangeFactory; private final RelFactories.SortExchangeFactory sortExchangeFactory; private final RelFactories.SetOpFactory setOpFactory; private final RelFactories.JoinFactory joinFactory; private final RelFactories.SemiJoinFactory semiJoinFactory; private final RelFactories.CorrelateFactory correlateFactory; private final RelFactories.ValuesFactory valuesFactory; private final RelFactories.TableScanFactory scanFactory; private final RelFactories.MatchFactory matchFactory; private final Deque<Frame> stack = new ArrayDeque<>(); private final boolean simplify; private final RexSimplify simplifier; protected RelBuilder(Context context, RelOptCluster cluster, RelOptSchema relOptSchema) { this.cluster = cluster; this.relOptSchema = relOptSchema; if (context == null) { context = Contexts.EMPTY_CONTEXT; } this.simplify = Hook.REL_BUILDER_SIMPLIFY.get(true); this.aggregateFactory = Util.first(context.unwrap(RelFactories.AggregateFactory.class), RelFactories.DEFAULT_AGGREGATE_FACTORY); this.filterFactory = Util.first(context.unwrap(RelFactories.FilterFactory.class), RelFactories.DEFAULT_FILTER_FACTORY); this.projectFactory = Util.first(context.unwrap(RelFactories.ProjectFactory.class), RelFactories.DEFAULT_PROJECT_FACTORY); this.sortFactory = Util.first(context.unwrap(RelFactories.SortFactory.class), RelFactories.DEFAULT_SORT_FACTORY); this.exchangeFactory = Util.first(context.unwrap(RelFactories.ExchangeFactory.class), RelFactories.DEFAULT_EXCHANGE_FACTORY); this.sortExchangeFactory = Util.first(context.unwrap(RelFactories.SortExchangeFactory.class), RelFactories.DEFAULT_SORT_EXCHANGE_FACTORY); this.setOpFactory = Util.first(context.unwrap(RelFactories.SetOpFactory.class), RelFactories.DEFAULT_SET_OP_FACTORY); this.joinFactory = Util.first(context.unwrap(RelFactories.JoinFactory.class), RelFactories.DEFAULT_JOIN_FACTORY); this.semiJoinFactory = Util.first(context.unwrap(RelFactories.SemiJoinFactory.class), RelFactories.DEFAULT_SEMI_JOIN_FACTORY); this.correlateFactory = Util.first(context.unwrap(RelFactories.CorrelateFactory.class), RelFactories.DEFAULT_CORRELATE_FACTORY); this.valuesFactory = Util.first(context.unwrap(RelFactories.ValuesFactory.class), RelFactories.DEFAULT_VALUES_FACTORY); this.scanFactory = Util.first(context.unwrap(RelFactories.TableScanFactory.class), 
RelFactories.DEFAULT_TABLE_SCAN_FACTORY); this.matchFactory = Util.first(context.unwrap(RelFactories.MatchFactory.class), RelFactories.DEFAULT_MATCH_FACTORY); final RexExecutor executor = Util.first(context.unwrap(RexExecutor.class), Util.first(cluster.getPlanner().getExecutor(), RexUtil.EXECUTOR)); final RelOptPredicateList predicates = RelOptPredicateList.EMPTY; this.simplifier = new RexSimplify(cluster.getRexBuilder(), predicates, executor); } public RelBuilder filter(RexNode… predicates) { return filter(ImmutableList.copyOf(predicates)); } public RelBuilder filter(Iterable<? extends RexNode> predicates) { final RexNode simplifiedPredicates = simplifier.simplifyFilterPredicates(predicates); if (simplifiedPredicates == null) { return empty(); } if (!simplifiedPredicates.isAlwaysTrue()) { final Frame frame = stack.pop(); final RelNode filter = filterFactory.createFilter(frame.rel, simplifiedPredicates); stack.push(new Frame(filter, frame.fields)); } return this; } //……}RelBuilder在构造器里头创建了RelFactories.FilterFactory,它提供了两个filter方法,一个是RexNode变长数组参数,一个是RexNode类型的Iterable参数;filter方法首先使用simplifier.simplifyFilterPredicates将RexNode类型的Iterable转为simplifiedPredicates(RexNode),之后只要simplifiedPredicates.isAlwaysTrue()为false,则取出deque中队首的Frame(LIFO (Last-In-First-Out) stacks),调用filterFactory.createFilter创建RelNode构造新的Frame,然后重新放入deque的队首Framecalcite-core-1.18.0-sources.jar!/org/apache/calcite/tools/RelBuilder.java private static class Frame { final RelNode rel; final ImmutableList<Field> fields; private Frame(RelNode rel, ImmutableList<Field> fields) { this.rel = rel; this.fields = fields; } private Frame(RelNode rel) { String tableAlias = deriveAlias(rel); ImmutableList.Builder<Field> builder = ImmutableList.builder(); ImmutableSet<String> aliases = tableAlias == null ? ImmutableSet.of() : ImmutableSet.of(tableAlias); for (RelDataTypeField field : rel.getRowType().getFieldList()) { builder.add(new Field(aliases, field)); } this.rel = rel; this.fields = builder.build(); } private static String deriveAlias(RelNode rel) { if (rel instanceof TableScan) { final List<String> names = rel.getTable().getQualifiedName(); if (!names.isEmpty()) { return Util.last(names); } } return null; } List<RelDataTypeField> fields() { return Pair.right(fields); } }Frame被存放于ArrayDeque中,实际是用于描述上一个操作的关系表达式以及table的别名怎么映射到row type中RelFactories.FilterFactory.createFiltercalcite-core-1.18.0-sources.jar!/org/apache/calcite/rel/core/RelFactories.java public interface FilterFactory { /** Creates a filter. / RelNode createFilter(RelNode input, RexNode condition); } private static class FilterFactoryImpl implements FilterFactory { public RelNode createFilter(RelNode input, RexNode condition) { return LogicalFilter.create(input, condition); } }FilterFactoryImpl实现了FilterFactory接口,createFilter方法执行的是LogicalFilter.create(input, condition),这里input是RelNode类型(RelNode取的是Frame的rel),condition是RexNode类型LogicalFiltercalcite-core-1.18.0-sources.jar!/org/apache/calcite/rel/logical/LogicalFilter.javapublic final class LogicalFilter extends Filter { private final ImmutableSet<CorrelationId> variablesSet; /* Creates a LogicalFilter. / public static LogicalFilter create(final RelNode input, RexNode condition) { return create(input, condition, ImmutableSet.of()); } /* Creates a LogicalFilter. 
*/ public static LogicalFilter create(final RelNode input, RexNode condition, ImmutableSet<CorrelationId> variablesSet) { final RelOptCluster cluster = input.getCluster(); final RelMetadataQuery mq = cluster.getMetadataQuery(); final RelTraitSet traitSet = cluster.traitSetOf(Convention.NONE) .replaceIfs(RelCollationTraitDef.INSTANCE, () -> RelMdCollation.filter(mq, input)) .replaceIf(RelDistributionTraitDef.INSTANCE, () -> RelMdDistribution.filter(mq, input)); return new LogicalFilter(cluster, traitSet, input, condition, variablesSet); } //……}LogicalFilter继承了抽象类Filter,Filter继承了SingleRel,SingleRel继承了AbstractRelNode,AbstractRelNode实现了RelNode接口小结Table的where及filter操作均有两种方法,一种是String参数,一种是Expression参数;而where方法内部是调用filter方法;filter方法使用Filter(predicate, logicalPlan).validate(tableEnv)创建了新的Table;String参数最后是通过ExpressionParser.parseExpression方法转换为Expression类型;Filter对象继承了UnaryNode,它覆盖了output、construct、validate等方法;construct方法先通过Expression.toRexNode将flink的Expression转换为Apache Calcite的RexNode(RexNode是Row expression,可以通过RexBuilder来创建;它有很多子类,比如RexCall、RexVariable、RexFieldAccess等),然后再执行Apache Calcite的RelBuilder的filter方法;RelBuilder在构造器里头创建了RelFactories.FilterFactory,它提供了两个filter方法,一个是RexNode变长数组参数,一个是RexNode类型的Iterable参数;filter方法首先使用simplifier.simplifyFilterPredicates将RexNode类型的Iterable转为simplifiedPredicates(RexNode),之后只要simplifiedPredicates.isAlwaysTrue()为false,则取出deque中队首的Frame(该deque按LIFO(Last-In-First-Out)方式使用;Frame被存放于ArrayDeque中,用于描述上一个操作的关系表达式以及table的别名如何映射到row type),调用filterFactory.createFilter创建RelNode构造新的Frame,然后重新放入deque的队首;FilterFactoryImpl实现了FilterFactory接口,createFilter方法执行的是LogicalFilter.create(input, condition),这里input是RelNode类型(即Frame的rel),condition是RexNode类型;LogicalFilter继承了抽象类Filter,Filter继承了SingleRel,SingleRel继承了AbstractRelNode,AbstractRelNode实现了RelNode接口docOperations ...
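下面补充一个简化的Calcite调用示意(非原文代码;FrameworkConfig、表EMP及字段DEPTNO均为假设,需事先在schema中注册),用来串联上文的流程:scan先向RelBuilder内部的deque压入Frame,filter再取出该Frame、化简谓词,并经由FilterFactory.createFilter(即LogicalFilter.create)构造新的RelNode后重新入栈:

import org.apache.calcite.rel.RelNode;
import org.apache.calcite.rex.RexNode;
import org.apache.calcite.sql.fun.SqlStdOperatorTable;
import org.apache.calcite.tools.FrameworkConfig;
import org.apache.calcite.tools.RelBuilder;

public class RelBuilderFilterSketch {

    // builds LogicalFilter(LogicalTableScan(EMP)) with predicate DEPTNO > 10
    public static RelNode filteredScan(FrameworkConfig config) {
        RelBuilder builder = RelBuilder.create(config);
        // scan pushes a Frame for table EMP onto the internal ArrayDeque
        builder.scan("EMP");
        // the predicate is a RexNode built via the RexBuilder behind RelBuilder
        RexNode predicate = builder.call(
                SqlStdOperatorTable.GREATER_THAN,
                builder.field("DEPTNO"),
                builder.literal(10));
        // filter simplifies the predicate, pops the Frame, wraps frame.rel in a
        // LogicalFilter created by FilterFactory.createFilter, and pushes a new Frame
        return builder.filter(predicate).build();
    }
}

如需验证,可以用RelOptUtil.toString(relNode)打印计划,预期会看到LogicalFilter套在LogicalTableScan之上。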

January 24, 2019 · 4 min · jiezi

聊聊flink Table的select操作

序本文主要研究一下flink Table的select操作Table.selectflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/api/table.scalaclass Table( private[flink] val tableEnv: TableEnvironment, private[flink] val logicalPlan: LogicalNode) { //…… def select(fields: String): Table = { val fieldExprs = ExpressionParser.parseExpressionList(fields) //get the correct expression for AggFunctionCall val withResolvedAggFunctionCall = fieldExprs.map(replaceAggFunctionCall(, tableEnv)) select(withResolvedAggFunctionCall: ) } def replaceAggFunctionCall(field: Expression, tableEnv: TableEnvironment): Expression = { field match { case l: LeafExpression => l case u: UnaryExpression => val c = replaceAggFunctionCall(u.child, tableEnv) u.makeCopy(Array(c)) case b: BinaryExpression => val l = replaceAggFunctionCall(b.left, tableEnv) val r = replaceAggFunctionCall(b.right, tableEnv) b.makeCopy(Array(l, r)) // Functions calls case c @ Call(name, args) => val function = tableEnv.getFunctionCatalog.lookupFunction(name, args) function match { case a: AggFunctionCall => a case a: Aggregation => a case p: AbstractWindowProperty => p case _ => val newArgs = args.map( (exp: Expression) => replaceAggFunctionCall(exp, tableEnv)) c.makeCopy(Array(name, newArgs)) } // Scala functions case sfc @ ScalarFunctionCall(clazz, args) => val newArgs: Seq[Expression] = args.map( (exp: Expression) => replaceAggFunctionCall(exp, tableEnv)) sfc.makeCopy(Array(clazz, newArgs)) // Array constructor case c @ ArrayConstructor(args) => val newArgs = c.elements .map((exp: Expression) => replaceAggFunctionCall(exp, tableEnv)) c.makeCopy(Array(newArgs)) // Other expressions case e: Expression => e } } def select(fields: Expression): Table = { val expandedFields = expandProjectList(fields, logicalPlan, tableEnv) val (aggNames, propNames) = extractAggregationsAndProperties(expandedFields, tableEnv) if (propNames.nonEmpty) { throw new ValidationException(“Window properties can only be used on windowed tables.”) } if (aggNames.nonEmpty) { val projectsOnAgg = replaceAggregationsAndProperties( expandedFields, tableEnv, aggNames, propNames) val projectFields = extractFieldReferences(expandedFields) new Table(tableEnv, Project(projectsOnAgg, Aggregate(Nil, aggNames.map(a => Alias(a.1, a.2)).toSeq, Project(projectFields, logicalPlan).validate(tableEnv) ).validate(tableEnv) ).validate(tableEnv) ) } else { new Table(tableEnv, Project(expandedFields.map(UnresolvedAlias), logicalPlan).validate(tableEnv)) } } //……}Table提供了两个select方法,一个接收String参数,一个接收Expression参数String参数的select内部先调用ExpressionParser.parseExpressionList解析String,之后再通过replaceAggFunctionCall替换UDAGG function,最后再调用Expression参数的select方法Expression参数的select方法会使用Project重新创建Table,如果有aggregate的话,会创建Aggregate,然后在通过Project包装Expressionflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/expressions/Expression.scalaabstract class Expression extends TreeNode[Expression] { /** * Returns the [[TypeInformation]] for evaluating this expression. * It is sometimes not available until the expression is valid. */ private[flink] def resultType: TypeInformation[] /** * One pass validation of the expression tree in post order. */ private[flink] lazy val valid: Boolean = childrenValid && validateInput().isSuccess private[flink] def childrenValid: Boolean = children.forall(.valid) /** * Check input data types, inputs number or other properties specified by this expression. * Return ValidationSuccess if it pass the check, * or ValidationFailure with supplement message explaining the error. 
* Note: we should only call this method until childrenValid == true / private[flink] def validateInput(): ValidationResult = ValidationSuccess /* * Convert Expression to its counterpart in Calcite, i.e. RexNode */ private[flink] def toRexNode(implicit relBuilder: RelBuilder): RexNode = throw new UnsupportedOperationException( s"${this.getClass.getName} cannot be transformed to RexNode" ) private[flink] def checkEquals(other: Expression): Boolean = { if (this.getClass != other.getClass) { false } else { def checkEquality(elements1: Seq[Any], elements2: Seq[Any]): Boolean = { elements1.length == elements2.length && elements1.zip(elements2).forall { case (e1: Expression, e2: Expression) => e1.checkEquals(e2) case (t1: Seq[], t2: Seq[]) => checkEquality(t1, t2) case (i1, i2) => i1 == i2 } } val elements1 = this.productIterator.toSeq val elements2 = other.productIterator.toSeq checkEquality(elements1, elements2) } }}abstract class BinaryExpression extends Expression { private[flink] def left: Expression private[flink] def right: Expression private[flink] def children = Seq(left, right)}abstract class UnaryExpression extends Expression { private[flink] def child: Expression private[flink] def children = Seq(child)}abstract class LeafExpression extends Expression { private[flink] val children = Nil}Expression继承了TreeNode,它有三个抽象子类分别是BinaryExpression、UnaryExpression、LeafExpressionProjectflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/plan/logical/operators.scalacase class Project( projectList: Seq[NamedExpression], child: LogicalNode, explicitAlias: Boolean = false) extends UnaryNode { override def output: Seq[Attribute] = projectList.map(.toAttribute) override def resolveExpressions(tableEnv: TableEnvironment): LogicalNode = { val afterResolve = super.resolveExpressions(tableEnv).asInstanceOf[Project] val newProjectList = afterResolve.projectList.zipWithIndex.map { case (e, i) => e match { case u @ UnresolvedAlias(c) => c match { case ne: NamedExpression => ne case expr if !expr.valid => u case c @ Cast(ne: NamedExpression, tp) => Alias(c, s"${ne.name}-$tp") case gcf: GetCompositeField => Alias(gcf, gcf.aliasName().getOrElse(s"c$i")) case other => Alias(other, s"c$i") } case _ => throw new RuntimeException(“This should never be called and probably points to a bug.”) } } Project(newProjectList, child, explicitAlias) } override def validate(tableEnv: TableEnvironment): LogicalNode = { val resolvedProject = super.validate(tableEnv).asInstanceOf[Project] val names: mutable.Set[String] = mutable.Set() def checkName(name: String): Unit = { if (names.contains(name)) { failValidation(s"Duplicate field name $name.") } else { names.add(name) } } resolvedProject.projectList.foreach { case n: Alias => // explicit name checkName(n.name) case r: ResolvedFieldReference => // simple field forwarding checkName(r.name) case _ => // Do nothing } resolvedProject } override protected[logical] def construct(relBuilder: RelBuilder): RelBuilder = { child.construct(relBuilder) val exprs = if (explicitAlias) { projectList } else { // remove AS expressions, according to Calcite they should not be in a final RexNode projectList.map { case Alias(e: Expression, , ) => e case e: Expression => e } } relBuilder.project( exprs.map(.toRexNode(relBuilder)).asJava, projectList.map(.name).asJava, true) }}Project继承了UnaryNode,它构造器接收Seq[NamedExpression]、LogicalNode、explicitAlias三个参数,其中explicitAlias可选,默认为falseAggregateflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/plan/logical/operators.scalacase class Aggregate( 
groupingExpressions: Seq[Expression], aggregateExpressions: Seq[NamedExpression], child: LogicalNode) extends UnaryNode { override def output: Seq[Attribute] = { (groupingExpressions ++ aggregateExpressions) map { case ne: NamedExpression => ne.toAttribute case e => Alias(e, e.toString).toAttribute } } override protected[logical] def construct(relBuilder: RelBuilder): RelBuilder = { child.construct(relBuilder) relBuilder.aggregate( relBuilder.groupKey(groupingExpressions.map(.toRexNode(relBuilder)).asJava), aggregateExpressions.map { case Alias(agg: Aggregation, name, ) => agg.toAggCall(name)(relBuilder) case _ => throw new RuntimeException(“This should never happen.”) }.asJava) } override def validate(tableEnv: TableEnvironment): LogicalNode = { implicit val relBuilder: RelBuilder = tableEnv.getRelBuilder val resolvedAggregate = super.validate(tableEnv).asInstanceOf[Aggregate] val groupingExprs = resolvedAggregate.groupingExpressions val aggregateExprs = resolvedAggregate.aggregateExpressions aggregateExprs.foreach(validateAggregateExpression) groupingExprs.foreach(validateGroupingExpression) def validateAggregateExpression(expr: Expression): Unit = expr match { case distinctExpr: DistinctAgg => distinctExpr.child match { case : DistinctAgg => failValidation( “Chained distinct operators are not supported!”) case aggExpr: Aggregation => validateAggregateExpression(aggExpr) case _ => failValidation( “Distinct operator can only be applied to aggregation expressions!”) } // check aggregate function case aggExpr: Aggregation if aggExpr.getSqlAggFunction.requiresOver => failValidation(s"OVER clause is necessary for window functions: [${aggExpr.getClass}].") // check no nested aggregation exists. case aggExpr: Aggregation => aggExpr.children.foreach { child => child.preOrderVisit { case agg: Aggregation => failValidation( “It’s not allowed to use an aggregate function as " + “input of another aggregate function”) case _ => // OK } } case a: Attribute if !groupingExprs.exists(.checkEquals(a)) => failValidation( s"expression ‘$a’ is invalid because it is neither” + " present in group by nor an aggregate function") case e if groupingExprs.exists(.checkEquals(e)) => // OK case e => e.children.foreach(validateAggregateExpression) } def validateGroupingExpression(expr: Expression): Unit = { if (!expr.resultType.isKeyType) { failValidation( s"expression $expr cannot be used as a grouping expression " + “because it’s not a valid key type which must be hashable and comparable”) } } resolvedAggregate }}Aggregate继承了UnaryNode,它构造器接收Seq[Expression]、Seq[NamedExpression]、LogicalNode三个参数LogicalNodeflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/plan/logical/LogicalNode.scalaabstract class LogicalNode extends TreeNode[LogicalNode] { def output: Seq[Attribute] def resolveExpressions(tableEnv: TableEnvironment): LogicalNode = { // resolve references and function calls val exprResolved = expressionPostOrderTransform { case u @ UnresolvedFieldReference(name) => // try resolve a field resolveReference(tableEnv, name).getOrElse(u) case c @ Call(name, children) if c.childrenValid => tableEnv.getFunctionCatalog.lookupFunction(name, children) } exprResolved.expressionPostOrderTransform { case ips: InputTypeSpec if ips.childrenValid => var changed: Boolean = false val newChildren = ips.expectedTypes.zip(ips.children).map { case (tpe, child) => val childType = child.resultType if (childType != tpe && TypeCoercion.canSafelyCast(childType, tpe)) { changed = true Cast(child, tpe) } else { child } }.toArray[AnyRef] 
if (changed) ips.makeCopy(newChildren) else ips } } final def toRelNode(relBuilder: RelBuilder): RelNode = construct(relBuilder).build() protected[logical] def construct(relBuilder: RelBuilder): RelBuilder def validate(tableEnv: TableEnvironment): LogicalNode = { val resolvedNode = resolveExpressions(tableEnv) resolvedNode.expressionPostOrderTransform { case a: Attribute if !a.valid => val from = children.flatMap(.output).map(.name).mkString(", “) // give helpful error message for null literals if (a.name == “null”) { failValidation(s"Cannot resolve field [${a.name}] given input [$from]. If you want to " + s"express a null literal, use ‘Null(TYPE)’ for typed null expressions. " + s"For example: Null(INT)”) } else { failValidation(s"Cannot resolve field [${a.name}] given input [$from].") } case e: Expression if e.validateInput().isFailure => failValidation(s"Expression $e failed on input check: " + s"${e.validateInput().asInstanceOf[ValidationFailure].message}") } } /** * Resolves the given strings to a [[NamedExpression]] using the input from all child * nodes of this LogicalPlan. / def resolveReference(tableEnv: TableEnvironment, name: String): Option[NamedExpression] = { // try to resolve a field val childrenOutput = children.flatMap(.output) val fieldCandidates = childrenOutput.filter(.name.equalsIgnoreCase(name)) if (fieldCandidates.length > 1) { failValidation(s"Reference $name is ambiguous.") } else if (fieldCandidates.nonEmpty) { return Some(fieldCandidates.head.withName(name)) } // try to resolve a table tableEnv.scanInternal(Array(name)) match { case Some(table) => Some(TableReference(name, table)) case None => None } } /* * Runs [[postOrderTransform]] with rule on all expressions present in this logical node. * * @param rule the rule to be applied to every expression in this logical node. 
*/ def expressionPostOrderTransform(rule: PartialFunction[Expression, Expression]): LogicalNode = { var changed = false def expressionPostOrderTransform(e: Expression): Expression = { val newExpr = e.postOrderTransform(rule) if (newExpr.fastEquals(e)) { e } else { changed = true newExpr } } val newArgs = productIterator.map { case e: Expression => expressionPostOrderTransform(e) case Some(e: Expression) => Some(expressionPostOrderTransform(e)) case seq: Traversable[_] => seq.map { case e: Expression => expressionPostOrderTransform(e) case other => other } case r: Resolvable[_] => r.resolveExpressions(e => expressionPostOrderTransform(e)) case other: AnyRef => other }.toArray if (changed) makeCopy(newArgs) else this } protected def failValidation(msg: String): Nothing = { throw new ValidationException(msg) }}abstract class LeafNode extends LogicalNode { override def children: Seq[LogicalNode] = Nil}abstract class UnaryNode extends LogicalNode { def child: LogicalNode override def children: Seq[LogicalNode] = child :: Nil}abstract class BinaryNode extends LogicalNode { def left: LogicalNode def right: LogicalNode override def children: Seq[LogicalNode] = left :: right :: Nil}LogicalNode跟Expression一样,也继承了TreeNode,LogicalNode有三个抽象子类,分别是BinaryNode、UnaryNode、LeafNode小结Table提供了两个select方法,一个接收String参数,一个接收Expression参数;String参数的select内部先调用ExpressionParser.parseExpressionList解析String,之后再通过replaceAggFunctionCall替换UDAGG function,最后再调用Expression参数的select方法;Expression参数的select方法会使用Project重新创建Table,如果有aggregate的话,会创建Aggregate,然后再通过Project包装;Project及Aggregate都是case class,它们都继承了UnaryNode,UnaryNode是LogicalNode的子类;LogicalNode跟Expression一样,也继承了TreeNode;Expression有三个抽象子类分别是BinaryExpression、UnaryExpression、LeafExpression;LogicalNode也有三个抽象子类,分别是BinaryNode、UnaryNode、LeafNodedocOperations ...
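下面给出一个Java Table API的简单用法示意(非原文代码;Orders表及cID、cName、revenue字段均为假设),对应上文小结的两条路径:纯投影只生成Project,带聚合且无groupBy的select则会生成Project(Aggregate(Nil, ..., Project(...)))的逻辑计划:

import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.java.StreamTableEnvironment;

public class SelectSketch {

    public static void run(StreamTableEnvironment tableEnv) {
        // assumes a table named "Orders" has been registered elsewhere
        Table orders = tableEnv.scan("Orders");

        // pure projection: only a Project node is created on top of the logical plan
        Table projected = orders.select("cID, cName");

        // select with an aggregate and no groupBy: aggNames is non-empty, so the plan
        // becomes Project(Aggregate(Nil, ..., Project(fieldReferences, logicalPlan)))
        Table aggregated = orders.select("revenue.sum as revSum");
    }
}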

January 23, 2019 · 7 min · jiezi

聊聊flink TableEnvironment的scan操作

序本文主要研究一下flink TableEnvironment的scan操作实例//Scanning a directly registered tableval tab: Table = tableEnv.scan(“tableName”)//Scanning a table from a registered catalogval tab: Table = tableEnv.scan(“catalogName”, “dbName”, “tableName”)scan操作用于从schema读取指定的table,也可以传入catalogName及dbName从指定的catalog及db读取TableEnvironment.scanflink-table_2.11-1.7.0-sources.jar!/org/apache/flink/table/api/TableEnvironment.scalaabstract class TableEnvironment(val config: TableConfig) { private val internalSchema: CalciteSchema = CalciteSchema.createRootSchema(false, false) private val rootSchema: SchemaPlus = internalSchema.plus() //…… @throws[TableException] @varargs def scan(tablePath: String*): Table = { scanInternal(tablePath.toArray) match { case Some(table) => table case None => throw new TableException(s"Table ‘${tablePath.mkString(".")}’ was not found.") } } private[flink] def scanInternal(tablePath: Array[String]): Option[Table] = { require(tablePath != null && !tablePath.isEmpty, “tablePath must not be null or empty.”) val schemaPaths = tablePath.slice(0, tablePath.length - 1) val schema = getSchema(schemaPaths) if (schema != null) { val tableName = tablePath(tablePath.length - 1) val table = schema.getTable(tableName) if (table != null) { return Some(new Table(this, CatalogNode(tablePath, table.getRowType(typeFactory)))) } } None } private def getSchema(schemaPath: Array[String]): SchemaPlus = { var schema = rootSchema for (schemaName <- schemaPath) { schema = schema.getSubSchema(schemaName) if (schema == null) { return schema } } schema } //……}scan方法内部调用的是scanInternal,scanInternal首先读取catalog及db信息,然后调用getSchema方法来获取schemagetSchema是使用SchemaPlus的getSubSchema来按层次获取SchemaPlus,如果没有指定catalog及db,那么这里返回的是rootSchema获取到schema之后,就可以从tablePath数组获取tableName(数组最后一个元素),调用SchemaPlus的getTable方法查找Table小结TableEnvironment的scan操作就是从Schema中查找Table,可以使用tableName,或者额外指定catalog及db来查找getSchema是使用SchemaPlus的getSubSchema来按层次获取SchemaPlus,如果没有指定catalog及db,那么这里返回的是rootSchema获取到schema之后,就可以从tablePath数组获取tableName(数组最后一个元素),调用SchemaPlus的getTable方法查找TabledocTable API ...
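下面是一个Java侧的调用示意(非原文代码;Orders、extCat、db1均为假设,且假设外部catalog已通过registerExternalCatalog注册),演示scan按tablePath逐层getSubSchema查找、找不到表时抛出TableException的行为:

import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableException;
import org.apache.flink.table.api.java.StreamTableEnvironment;

public class ScanSketch {

    public static void run(StreamTableEnvironment tableEnv) {
        // tablePath only contains the table name, so getSchema returns rootSchema
        Table orders = tableEnv.scan("Orders");

        // qualified path: getSubSchema("extCat") -> getSubSchema("db1") -> getTable("Orders")
        Table catalogOrders = tableEnv.scan("extCat", "db1", "Orders");

        try {
            tableEnv.scan("doesNotExist");
        } catch (TableException e) {
            // scanInternal returns None, so scan() throws "Table 'doesNotExist' was not found."
        }
    }
}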

January 22, 2019 · 1 min · jiezi

聊聊flink的Table API及SQL Programs

序本文主要研究一下flink的Table API及SQL Programs实例// for batch programs use ExecutionEnvironment instead of StreamExecutionEnvironmentStreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();// create a TableEnvironment// for batch programs use BatchTableEnvironment instead of StreamTableEnvironmentStreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);// register a TabletableEnv.registerTable(“table1”, …) // ortableEnv.registerTableSource(“table2”, …); // ortableEnv.registerExternalCatalog(“extCat”, …);// register an output TabletableEnv.registerTableSink(“outputTable”, …);// create a Table from a Table API queryTable tapiResult = tableEnv.scan(“table1”).select(…);// create a Table from a SQL queryTable sqlResult = tableEnv.sqlQuery(“SELECT … FROM table2 … “);// emit a Table API result Table to a TableSink, same for SQL resulttapiResult.insertInto(“outputTable”);// executeenv.execute();本实例展示了flink的Table API及SQL Programs的基本用法Table API实例// get a StreamTableEnvironment, works for BatchTableEnvironment equivalentlyStreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);// register Orders table// scan registered Orders tableTable orders = tableEnv.scan(“Orders”);// compute revenue for all customers from FranceTable revenue = orders .filter(“cCountry === ‘FRANCE’”) .groupBy(“cID, cName”) .select(“cID, cName, revenue.sum AS revSum”);// emit or convert Table// execute query通过tableEnv.scan方法来创建Table,之后使用Table的各种查询apiSQL实例// get a StreamTableEnvironment, works for BatchTableEnvironment equivalentlyStreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);// register Orders table// compute revenue for all customers from FranceTable revenue = tableEnv.sqlQuery( “SELECT cID, cName, SUM(revenue) AS revSum " + “FROM Orders " + “WHERE cCountry = ‘FRANCE’ " + “GROUP BY cID, cName” );// emit or convert Table// execute querysqlQuery内部是使用Apache Calcite来实现的TableSink实例// get a StreamTableEnvironment, works for BatchTableEnvironment equivalentlyStreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);// register “Orders” table// register “RevenueFrance” output table// compute revenue for all customers from France and emit to “RevenueFrance"tableEnv.sqlUpdate( “INSERT INTO RevenueFrance " + “SELECT cID, cName, SUM(revenue) AS revSum " + “FROM Orders " + “WHERE cCountry = ‘FRANCE’ " + “GROUP BY cID, cName” );// execute query这里使用TableSink注册output table之后,就可以使用TableEnvironment的sqlUpdate或者Table的insertInto输出到table小结flink的Table API及SQL Programs的基本用法首先是创建TableEnvironment(BatchTableEnvironment或者StreamTableEnvironment),之后就是创建Table或者TableSource并注册到catalog(默认使用的catalog是internal的,也可以自己选择注册external catalog),然后就进行table的query,之后就是一些转换操作关于Table的创建可以从DataSet、DataStream转换过来;关于Table的查询可以使用api query(scan方法),也可以使用sql query(sqlQuery方法),或者是混合使用也可以将查询的Table转换为DataSet或者DataStream进行其他处理;如果输出也是输出到table的话,可以注册TableSink,然后输出到TableSinkdocTable API & SQL Concepts & Common API ...
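下面补充一个稍完整的示意(非原文代码;字段cID、cName为假设),演示小结提到的流程:从DataStream注册Table、混用Table API与SQL查询、再把结果转换回DataStream输出:

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.types.Row;

public class TableProgramSketch {

    public static void run(StreamExecutionEnvironment env,
                           DataStream<Tuple2<Long, String>> input) throws Exception {
        StreamTableEnvironment tableEnv = TableEnvironment.getTableEnvironment(env);

        // register the DataStream as a table with explicit field names
        tableEnv.registerDataStream("Orders", input, "cID, cName");

        // Table API query and SQL query over the same registered table
        Table apiResult = tableEnv.scan("Orders").filter("cID > 10").select("cID, cName");
        Table sqlResult = tableEnv.sqlQuery("SELECT cID, cName FROM Orders WHERE cID > 10");

        // convert the result Table back to a DataStream of Row and trigger execution
        DataStream<Row> out = tableEnv.toAppendStream(sqlResult, Row.class);
        out.print();
        env.execute();
    }
}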

January 21, 2019 · 2 min · jiezi

聊聊flink的AsyncWaitOperator

序本文主要研究一下flink的AsyncWaitOperatorAsyncWaitOperatorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/operators/async/AsyncWaitOperator.java@Internalpublic class AsyncWaitOperator<IN, OUT> extends AbstractUdfStreamOperator<OUT, AsyncFunction<IN, OUT>> implements OneInputStreamOperator<IN, OUT>, OperatorActions { private static final long serialVersionUID = 1L; private static final String STATE_NAME = “async_wait_operator_state”; /** Capacity of the stream element queue. / private final int capacity; /* Output mode for this operator. / private final AsyncDataStream.OutputMode outputMode; /* Timeout for the async collectors. / private final long timeout; protected transient Object checkpointingLock; /* {@link TypeSerializer} for inputs while making snapshots. / private transient StreamElementSerializer<IN> inStreamElementSerializer; /* Recovered input stream elements. / private transient ListState<StreamElement> recoveredStreamElements; /* Queue to store the currently in-flight stream elements into. / private transient StreamElementQueue queue; /* Pending stream element which could not yet added to the queue. / private transient StreamElementQueueEntry<?> pendingStreamElementQueueEntry; private transient ExecutorService executor; /* Emitter for the completed stream element queue entries. / private transient Emitter<OUT> emitter; /* Thread running the emitter. / private transient Thread emitterThread; public AsyncWaitOperator( AsyncFunction<IN, OUT> asyncFunction, long timeout, int capacity, AsyncDataStream.OutputMode outputMode) { super(asyncFunction); chainingStrategy = ChainingStrategy.ALWAYS; Preconditions.checkArgument(capacity > 0, “The number of concurrent async operation should be greater than 0.”); this.capacity = capacity; this.outputMode = Preconditions.checkNotNull(outputMode, “outputMode”); this.timeout = timeout; } @Override public void setup(StreamTask<?, ?> containingTask, StreamConfig config, Output<StreamRecord<OUT>> output) { super.setup(containingTask, config, output); this.checkpointingLock = getContainingTask().getCheckpointLock(); this.inStreamElementSerializer = new StreamElementSerializer<>( getOperatorConfig().<IN>getTypeSerializerIn1(getUserCodeClassloader())); // create the operators executor for the complete operations of the queue entries this.executor = Executors.newSingleThreadExecutor(); switch (outputMode) { case ORDERED: queue = new OrderedStreamElementQueue( capacity, executor, this); break; case UNORDERED: queue = new UnorderedStreamElementQueue( capacity, executor, this); break; default: throw new IllegalStateException(“Unknown async mode: " + outputMode + ‘.’); } } @Override public void open() throws Exception { super.open(); // create the emitter this.emitter = new Emitter<>(checkpointingLock, output, queue, this); // start the emitter thread this.emitterThread = new Thread(emitter, “AsyncIO-Emitter-Thread (” + getOperatorName() + ‘)’); emitterThread.setDaemon(true); emitterThread.start(); // process stream elements from state, since the Emit thread will start as soon as all // elements from previous state are in the StreamElementQueue, we have to make sure that the // order to open all operators in the operator chain proceeds from the tail operator to the // head operator. 
if (recoveredStreamElements != null) { for (StreamElement element : recoveredStreamElements.get()) { if (element.isRecord()) { processElement(element.<IN>asRecord()); } else if (element.isWatermark()) { processWatermark(element.asWatermark()); } else if (element.isLatencyMarker()) { processLatencyMarker(element.asLatencyMarker()); } else { throw new IllegalStateException(“Unknown record type " + element.getClass() + " encountered while opening the operator.”); } } recoveredStreamElements = null; } } @Override public void processElement(StreamRecord<IN> element) throws Exception { final StreamRecordQueueEntry<OUT> streamRecordBufferEntry = new StreamRecordQueueEntry<>(element); if (timeout > 0L) { // register a timeout for this AsyncStreamRecordBufferEntry long timeoutTimestamp = timeout + getProcessingTimeService().getCurrentProcessingTime(); final ScheduledFuture<?> timerFuture = getProcessingTimeService().registerTimer( timeoutTimestamp, new ProcessingTimeCallback() { @Override public void onProcessingTime(long timestamp) throws Exception { userFunction.timeout(element.getValue(), streamRecordBufferEntry); } }); // Cancel the timer once we’ve completed the stream record buffer entry. This will remove // the register trigger task streamRecordBufferEntry.onComplete( (StreamElementQueueEntry<Collection<OUT>> value) -> { timerFuture.cancel(true); }, executor); } addAsyncBufferEntry(streamRecordBufferEntry); userFunction.asyncInvoke(element.getValue(), streamRecordBufferEntry); } @Override public void processWatermark(Watermark mark) throws Exception { WatermarkQueueEntry watermarkBufferEntry = new WatermarkQueueEntry(mark); addAsyncBufferEntry(watermarkBufferEntry); } @Override public void snapshotState(StateSnapshotContext context) throws Exception { super.snapshotState(context); ListState<StreamElement> partitionableState = getOperatorStateBackend().getListState(new ListStateDescriptor<>(STATE_NAME, inStreamElementSerializer)); partitionableState.clear(); Collection<StreamElementQueueEntry<?>> values = queue.values(); try { for (StreamElementQueueEntry<?> value : values) { partitionableState.add(value.getStreamElement()); } // add the pending stream element queue entry if the stream element queue is currently full if (pendingStreamElementQueueEntry != null) { partitionableState.add(pendingStreamElementQueueEntry.getStreamElement()); } } catch (Exception e) { partitionableState.clear(); throw new Exception(“Could not add stream element queue entries to operator state " + “backend of operator " + getOperatorName() + ‘.’, e); } } @Override public void initializeState(StateInitializationContext context) throws Exception { super.initializeState(context); recoveredStreamElements = context .getOperatorStateStore() .getListState(new ListStateDescriptor<>(STATE_NAME, inStreamElementSerializer)); } @Override public void close() throws Exception { try { assert(Thread.holdsLock(checkpointingLock)); while (!queue.isEmpty()) { // wait for the emitter thread to output the remaining elements // for that he needs the checkpointing lock and thus we have to free it checkpointingLock.wait(); } } finally { Exception exception = null; try { super.close(); } catch (InterruptedException interrupted) { exception = interrupted; Thread.currentThread().interrupt(); } catch (Exception e) { exception = e; } try { // terminate the emitter, the emitter thread and the executor stopResources(true); } catch (InterruptedException interrupted) { exception = ExceptionUtils.firstOrSuppressed(interrupted, exception); 
Thread.currentThread().interrupt(); } catch (Exception e) { exception = ExceptionUtils.firstOrSuppressed(e, exception); } if (exception != null) { LOG.warn(“Errors occurred while closing the AsyncWaitOperator.”, exception); } } } @Override public void dispose() throws Exception { Exception exception = null; try { super.dispose(); } catch (InterruptedException interrupted) { exception = interrupted; Thread.currentThread().interrupt(); } catch (Exception e) { exception = e; } try { stopResources(false); } catch (InterruptedException interrupted) { exception = ExceptionUtils.firstOrSuppressed(interrupted, exception); Thread.currentThread().interrupt(); } catch (Exception e) { exception = ExceptionUtils.firstOrSuppressed(e, exception); } if (exception != null) { throw exception; } } private void stopResources(boolean waitForShutdown) throws InterruptedException { emitter.stop(); emitterThread.interrupt(); executor.shutdown(); if (waitForShutdown) { try { if (!executor.awaitTermination(365L, TimeUnit.DAYS)) { executor.shutdownNow(); } } catch (InterruptedException e) { executor.shutdownNow(); Thread.currentThread().interrupt(); } / * FLINK-5638: If we have the checkpoint lock we might have to free it for a while so * that the emitter thread can complete/react to the interrupt signal. / if (Thread.holdsLock(checkpointingLock)) { while (emitterThread.isAlive()) { checkpointingLock.wait(100L); } } emitterThread.join(); } else { executor.shutdownNow(); } } private <T> void addAsyncBufferEntry(StreamElementQueueEntry<T> streamElementQueueEntry) throws InterruptedException { assert(Thread.holdsLock(checkpointingLock)); pendingStreamElementQueueEntry = streamElementQueueEntry; while (!queue.tryPut(streamElementQueueEntry)) { // we wait for the emitter to notify us if the queue has space left again checkpointingLock.wait(); } pendingStreamElementQueueEntry = null; } @Override public void failOperator(Throwable throwable) { getContainingTask().getEnvironment().failExternally(throwable); }}AsyncWaitOperator继承了AbstractUdfStreamOperator,覆盖了AbstractUdfStreamOperator的setup、open、initializeState、close、dispose方法;实现了OneInputStreamOperator接口定义的processElement、processWatermark、processLatencyMarker方法;实现了OperatorActions定义的failOperator方法setup方法使用Executors.newSingleThreadExecutor()创建了ExecutorService,之后根据不同的outputMode创建不同的StreamElementQueue(OrderedStreamElementQueue或者UnorderedStreamElementQueue);open方法使用Emitter创建并启动AsyncIO-Emitter-Thread,另外就是处理recoveredStreamElements,根据不同的类型分别调用processElement、processWatermark、processLatencyMarker方法processElement方法首先根据timeout注册一个timer,在ProcessingTimeCallback的onProcessingTime方法里头执行userFunction.timeout,之后将StreamRecordQueueEntry添加到StreamElementQueue中,最后触发userFunction.asyncInvoke;close和dispose方法会调用stopResources方法来关闭资源,不同的是waitForShutdown参数传值不同,close方法传true,而dispose方法传falseEmitterflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/operators/async/Emitter.java@Internalpublic class Emitter<OUT> implements Runnable { private static final Logger LOG = LoggerFactory.getLogger(Emitter.class); /* Lock to hold before outputting. / private final Object checkpointLock; /* Output for the watermark elements. / private final Output<StreamRecord<OUT>> output; /* Queue to consume the async results from. / private final StreamElementQueue streamElementQueue; private final OperatorActions operatorActions; /* Output for stream records. 
/ private final TimestampedCollector<OUT> timestampedCollector; private volatile boolean running; public Emitter( final Object checkpointLock, final Output<StreamRecord<OUT>> output, final StreamElementQueue streamElementQueue, final OperatorActions operatorActions) { this.checkpointLock = Preconditions.checkNotNull(checkpointLock, “checkpointLock”); this.output = Preconditions.checkNotNull(output, “output”); this.streamElementQueue = Preconditions.checkNotNull(streamElementQueue, “streamElementQueue”); this.operatorActions = Preconditions.checkNotNull(operatorActions, “operatorActions”); this.timestampedCollector = new TimestampedCollector<>(this.output); this.running = true; } @Override public void run() { try { while (running) { LOG.debug(“Wait for next completed async stream element result.”); AsyncResult streamElementEntry = streamElementQueue.peekBlockingly(); output(streamElementEntry); } } catch (InterruptedException e) { if (running) { operatorActions.failOperator(e); } else { // Thread got interrupted which means that it should shut down LOG.debug(“Emitter thread got interrupted, shutting down.”); } } catch (Throwable t) { operatorActions.failOperator(new Exception(“AsyncWaitOperator’s emitter caught an " + “unexpected throwable.”, t)); } } private void output(AsyncResult asyncResult) throws InterruptedException { if (asyncResult.isWatermark()) { synchronized (checkpointLock) { AsyncWatermarkResult asyncWatermarkResult = asyncResult.asWatermark(); LOG.debug(“Output async watermark.”); output.emitWatermark(asyncWatermarkResult.getWatermark()); // remove the peeked element from the async collector buffer so that it is no longer // checkpointed streamElementQueue.poll(); // notify the main thread that there is again space left in the async collector // buffer checkpointLock.notifyAll(); } } else { AsyncCollectionResult<OUT> streamRecordResult = asyncResult.asResultCollection(); if (streamRecordResult.hasTimestamp()) { timestampedCollector.setAbsoluteTimestamp(streamRecordResult.getTimestamp()); } else { timestampedCollector.eraseTimestamp(); } synchronized (checkpointLock) { LOG.debug(“Output async stream element collection result.”); try { Collection<OUT> resultCollection = streamRecordResult.get(); if (resultCollection != null) { for (OUT result : resultCollection) { timestampedCollector.collect(result); } } } catch (Exception e) { operatorActions.failOperator( new Exception(“An async function call terminated with an exception. 
" + “Failing the AsyncWaitOperator.”, e)); } // remove the peeked element from the async collector buffer so that it is no longer // checkpointed streamElementQueue.poll(); // notify the main thread that there is again space left in the async collector // buffer checkpointLock.notifyAll(); } } } public void stop() { running = false; }}Emitter实现了Runnable接口,它主要负责从StreamElementQueue取出element,然后输出到TimestampedCollectorEmitter的run方法就是不断循环调用streamElementQueue.peekBlockingly()阻塞获取AsyncResult,获取到之后就调用output方法将result输出出去Emitter的output方法根据asyncResult是否是watermark做不同处理,不是watermark的话,就会将result通过timestampedCollector.collect输出,如果出现异常则调用operatorActions.failOperator传递异常,最后调用streamElementQueue.poll()来移除队首的元素StreamElementQueueflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/operators/async/queue/StreamElementQueue.java@Internalpublic interface StreamElementQueue { <T> void put(StreamElementQueueEntry<T> streamElementQueueEntry) throws InterruptedException; <T> boolean tryPut(StreamElementQueueEntry<T> streamElementQueueEntry) throws InterruptedException; AsyncResult peekBlockingly() throws InterruptedException; AsyncResult poll() throws InterruptedException; Collection<StreamElementQueueEntry<?>> values() throws InterruptedException; boolean isEmpty(); int size();}StreamElementQueue接口主要定义了AsyncWaitOperator所要用的blocking stream element queue的接口;它定义了put、tryPut、peekBlockingly、poll、values、isEmpty、size方法;StreamElementQueue接口有两个子类分别是UnorderedStreamElementQueue及OrderedStreamElementQueue;队列元素类型为StreamElementQueueEntryUnorderedStreamElementQueueflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/operators/async/queue/UnorderedStreamElementQueue.java@Internalpublic class UnorderedStreamElementQueue implements StreamElementQueue { private static final Logger LOG = LoggerFactory.getLogger(UnorderedStreamElementQueue.class); /* Capacity of this queue. / private final int capacity; /* Executor to run the onComplete callbacks. / private final Executor executor; /* OperatorActions to signal the owning operator a failure. / private final OperatorActions operatorActions; /* Queue of uncompleted stream element queue entries segmented by watermarks. / private final ArrayDeque<Set<StreamElementQueueEntry<?>>> uncompletedQueue; /* Queue of completed stream element queue entries. / private final ArrayDeque<StreamElementQueueEntry<?>> completedQueue; /* First (chronologically oldest) uncompleted set of stream element queue entries. / private Set<StreamElementQueueEntry<?>> firstSet; // Last (chronologically youngest) uncompleted set of stream element queue entries. New // stream element queue entries are inserted into this set. private Set<StreamElementQueueEntry<?>> lastSet; private volatile int numberEntries; /* Locks and conditions for the blocking queue. 
/ private final ReentrantLock lock; private final Condition notFull; private final Condition hasCompletedEntries; public UnorderedStreamElementQueue( int capacity, Executor executor, OperatorActions operatorActions) { Preconditions.checkArgument(capacity > 0, “The capacity must be larger than 0.”); this.capacity = capacity; this.executor = Preconditions.checkNotNull(executor, “executor”); this.operatorActions = Preconditions.checkNotNull(operatorActions, “operatorActions”); this.uncompletedQueue = new ArrayDeque<>(capacity); this.completedQueue = new ArrayDeque<>(capacity); this.firstSet = new HashSet<>(capacity); this.lastSet = firstSet; this.numberEntries = 0; this.lock = new ReentrantLock(); this.notFull = lock.newCondition(); this.hasCompletedEntries = lock.newCondition(); } @Override public <T> void put(StreamElementQueueEntry<T> streamElementQueueEntry) throws InterruptedException { lock.lockInterruptibly(); try { while (numberEntries >= capacity) { notFull.await(); } addEntry(streamElementQueueEntry); } finally { lock.unlock(); } } @Override public <T> boolean tryPut(StreamElementQueueEntry<T> streamElementQueueEntry) throws InterruptedException { lock.lockInterruptibly(); try { if (numberEntries < capacity) { addEntry(streamElementQueueEntry); LOG.debug(“Put element into unordered stream element queue. New filling degree " + “({}/{}).”, numberEntries, capacity); return true; } else { LOG.debug(“Failed to put element into unordered stream element queue because it " + “was full ({}/{}).”, numberEntries, capacity); return false; } } finally { lock.unlock(); } } @Override public AsyncResult peekBlockingly() throws InterruptedException { lock.lockInterruptibly(); try { while (completedQueue.isEmpty()) { hasCompletedEntries.await(); } LOG.debug(“Peeked head element from unordered stream element queue with filling degree " + “({}/{}).”, numberEntries, capacity); return completedQueue.peek(); } finally { lock.unlock(); } } @Override public AsyncResult poll() throws InterruptedException { lock.lockInterruptibly(); try { while (completedQueue.isEmpty()) { hasCompletedEntries.await(); } numberEntries–; notFull.signalAll(); LOG.debug(“Polled element from unordered stream element queue. 
New filling degree " + “({}/{}).”, numberEntries, capacity); return completedQueue.poll(); } finally { lock.unlock(); } } @Override public Collection<StreamElementQueueEntry<?>> values() throws InterruptedException { lock.lockInterruptibly(); try { StreamElementQueueEntry<?>[] array = new StreamElementQueueEntry[numberEntries]; array = completedQueue.toArray(array); int counter = completedQueue.size(); for (StreamElementQueueEntry<?> entry: firstSet) { array[counter] = entry; counter++; } for (Set<StreamElementQueueEntry<?>> asyncBufferEntries : uncompletedQueue) { for (StreamElementQueueEntry<?> streamElementQueueEntry : asyncBufferEntries) { array[counter] = streamElementQueueEntry; counter++; } } return Arrays.asList(array); } finally { lock.unlock(); } } @Override public boolean isEmpty() { return numberEntries == 0; } @Override public int size() { return numberEntries; } public void onCompleteHandler(StreamElementQueueEntry<?> streamElementQueueEntry) throws InterruptedException { lock.lockInterruptibly(); try { if (firstSet.remove(streamElementQueueEntry)) { completedQueue.offer(streamElementQueueEntry); while (firstSet.isEmpty() && firstSet != lastSet) { firstSet = uncompletedQueue.poll(); Iterator<StreamElementQueueEntry<?>> it = firstSet.iterator(); while (it.hasNext()) { StreamElementQueueEntry<?> bufferEntry = it.next(); if (bufferEntry.isDone()) { completedQueue.offer(bufferEntry); it.remove(); } } } LOG.debug(“Signal unordered stream element queue has completed entries.”); hasCompletedEntries.signalAll(); } } finally { lock.unlock(); } } private <T> void addEntry(StreamElementQueueEntry<T> streamElementQueueEntry) { assert(lock.isHeldByCurrentThread()); if (streamElementQueueEntry.isWatermark()) { lastSet = new HashSet<>(capacity); if (firstSet.isEmpty()) { firstSet.add(streamElementQueueEntry); } else { Set<StreamElementQueueEntry<?>> watermarkSet = new HashSet<>(1); watermarkSet.add(streamElementQueueEntry); uncompletedQueue.offer(watermarkSet); } uncompletedQueue.offer(lastSet); } else { lastSet.add(streamElementQueueEntry); } streamElementQueueEntry.onComplete( (StreamElementQueueEntry<T> value) -> { try { onCompleteHandler(value); } catch (InterruptedException e) { // The accept executor thread got interrupted. This is probably cause by // the shutdown of the executor. LOG.debug(“AsyncBufferEntry could not be properly completed because the " + “executor thread has been interrupted.”, e); } catch (Throwable t) { operatorActions.failOperator(new Exception(“Could not complete the " + “stream element queue entry: " + value + ‘.’, t)); } }, executor); numberEntries++; }}UnorderedStreamElementQueue实现了StreamElementQueue接口,它emit结果的顺序是无序的,其内部使用了两个ArrayDeque,一个是uncompletedQueue,一个是completedQueuepeekBlockingly方法首先判断completedQueue是否有元素,没有的话则执行hasCompletedEntries.await(),有则执行completedQueue.peek();put及tryPut都会调用addEntry方法,该方法会往uncompletedQueue队列新增元素,然后同时给每个streamElementQueueEntry的onComplete方法注册一个onCompleteHandleronCompleteHandler方法会将执行完成的streamElementQueueEntry从uncompletedQueue移除,然后添加到completedQueueOrderedStreamElementQueueflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/operators/async/queue/OrderedStreamElementQueue.java@Internalpublic class OrderedStreamElementQueue implements StreamElementQueue { private static final Logger LOG = LoggerFactory.getLogger(OrderedStreamElementQueue.class); /* Capacity of this queue. / private final int capacity; /* Executor to run the onCompletion callback. 
/ private final Executor executor; /* Operator actions to signal a failure to the operator. / private final OperatorActions operatorActions; /* Lock and conditions for the blocking queue. / private final ReentrantLock lock; private final Condition notFull; private final Condition headIsCompleted; /* Queue for the inserted StreamElementQueueEntries. / private final ArrayDeque<StreamElementQueueEntry<?>> queue; public OrderedStreamElementQueue( int capacity, Executor executor, OperatorActions operatorActions) { Preconditions.checkArgument(capacity > 0, “The capacity must be larger than 0.”); this.capacity = capacity; this.executor = Preconditions.checkNotNull(executor, “executor”); this.operatorActions = Preconditions.checkNotNull(operatorActions, “operatorActions”); this.lock = new ReentrantLock(false); this.headIsCompleted = lock.newCondition(); this.notFull = lock.newCondition(); this.queue = new ArrayDeque<>(capacity); } @Override public AsyncResult peekBlockingly() throws InterruptedException { lock.lockInterruptibly(); try { while (queue.isEmpty() || !queue.peek().isDone()) { headIsCompleted.await(); } LOG.debug(“Peeked head element from ordered stream element queue with filling degree " + “({}/{}).”, queue.size(), capacity); return queue.peek(); } finally { lock.unlock(); } } @Override public AsyncResult poll() throws InterruptedException { lock.lockInterruptibly(); try { while (queue.isEmpty() || !queue.peek().isDone()) { headIsCompleted.await(); } notFull.signalAll(); LOG.debug(“Polled head element from ordered stream element queue. New filling degree " + “({}/{}).”, queue.size() - 1, capacity); return queue.poll(); } finally { lock.unlock(); } } @Override public Collection<StreamElementQueueEntry<?>> values() throws InterruptedException { lock.lockInterruptibly(); try { StreamElementQueueEntry<?>[] array = new StreamElementQueueEntry[queue.size()]; array = queue.toArray(array); return Arrays.asList(array); } finally { lock.unlock(); } } @Override public boolean isEmpty() { return queue.isEmpty(); } @Override public int size() { return queue.size(); } @Override public <T> void put(StreamElementQueueEntry<T> streamElementQueueEntry) throws InterruptedException { lock.lockInterruptibly(); try { while (queue.size() >= capacity) { notFull.await(); } addEntry(streamElementQueueEntry); } finally { lock.unlock(); } } @Override public <T> boolean tryPut(StreamElementQueueEntry<T> streamElementQueueEntry) throws InterruptedException { lock.lockInterruptibly(); try { if (queue.size() < capacity) { addEntry(streamElementQueueEntry); LOG.debug(“Put element into ordered stream element queue. New filling degree " + “({}/{}).”, queue.size(), capacity); return true; } else { LOG.debug(“Failed to put element into ordered stream element queue because it " + “was full ({}/{}).”, queue.size(), capacity); return false; } } finally { lock.unlock(); } } private <T> void addEntry(StreamElementQueueEntry<T> streamElementQueueEntry) { assert(lock.isHeldByCurrentThread()); queue.addLast(streamElementQueueEntry); streamElementQueueEntry.onComplete( (StreamElementQueueEntry<T> value) -> { try { onCompleteHandler(value); } catch (InterruptedException e) { // we got interrupted. 
This indicates a shutdown of the executor LOG.debug(“AsyncBufferEntry could not be properly completed because the " + “executor thread has been interrupted.”, e); } catch (Throwable t) { operatorActions.failOperator(new Exception(“Could not complete the " + “stream element queue entry: " + value + ‘.’, t)); } }, executor); } private void onCompleteHandler(StreamElementQueueEntry<?> streamElementQueueEntry) throws InterruptedException { lock.lockInterruptibly(); try { if (!queue.isEmpty() && queue.peek().isDone()) { LOG.debug(“Signal ordered stream element queue has completed head element.”); headIsCompleted.signalAll(); } } finally { lock.unlock(); } }}OrderedStreamElementQueue实现了StreamElementQueue接口,它有序地emit结果,它内部有一个ArrayDeque类型的queuepeekBlockingly方法首先判断queue是否有元素而且是执行完成的,没有就执行headIsCompleted.await(),有则执行queue.peek();put及tryPut都会调用addEntry方法,该方法会执行queue.addLast(streamElementQueueEntry),然后同时给每个streamElementQueueEntry的onComplete方法注册一个onCompleteHandleronCompleteHandler方法会检测执行完成的元素是否是队列的第一个元素,如果是则执行headIsCompleted.signalAll()AsyncResultflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/operators/async/queue/AsyncResult.java@Internalpublic interface AsyncResult { boolean isWatermark(); boolean isResultCollection(); AsyncWatermarkResult asWatermark(); <T> AsyncCollectionResult<T> asResultCollection();}AsyncResult接口定义了StreamElementQueue的元素异步返回的结果要实现的方法,该async result可能是watermark,可能是真正的结果StreamElementQueueEntryflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/operators/async/queue/StreamElementQueueEntry.java@Internalpublic abstract class StreamElementQueueEntry<T> implements AsyncResult { private final StreamElement streamElement; public StreamElementQueueEntry(StreamElement streamElement) { this.streamElement = Preconditions.checkNotNull(streamElement); } public StreamElement getStreamElement() { return streamElement; } public boolean isDone() { return getFuture().isDone(); } public void onComplete( final Consumer<StreamElementQueueEntry<T>> completeFunction, Executor executor) { final StreamElementQueueEntry<T> thisReference = this; getFuture().whenCompleteAsync( // call the complete function for normal completion as well as exceptional completion // see FLINK-6435 (value, throwable) -> completeFunction.accept(thisReference), executor); } protected abstract CompletableFuture<T> getFuture(); @Override public final boolean isWatermark() { return AsyncWatermarkResult.class.isAssignableFrom(getClass()); } @Override public final boolean isResultCollection() { return AsyncCollectionResult.class.isAssignableFrom(getClass()); } @Override public final AsyncWatermarkResult asWatermark() { return (AsyncWatermarkResult) this; } @Override public final <T> AsyncCollectionResult<T> asResultCollection() { return (AsyncCollectionResult<T>) this; }}StreamElementQueueEntry实现了AsyncResult接口,它定义了onComplete方法用于结果完成时的回调处理,同时它还定义了抽象方法getFuture供子类实现;它有两个子类,分别是WatermarkQueueEntry及StreamRecordQueueEntryWatermarkQueueEntryflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/operators/async/queue/WatermarkQueueEntry.java@Internalpublic class WatermarkQueueEntry extends StreamElementQueueEntry<Watermark> implements AsyncWatermarkResult { private final CompletableFuture<Watermark> future; public WatermarkQueueEntry(Watermark watermark) { super(watermark); this.future = CompletableFuture.completedFuture(watermark); } @Override public Watermark getWatermark() { return (Watermark) getStreamElement(); } @Override protected CompletableFuture<Watermark> 
getFuture() { return future; }}WatermarkQueueEntry继承了StreamElementQueueEntry,其元素类型为Watermark,同时实现了AsyncWatermarkResult接口StreamRecordQueueEntryflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/operators/async/queue/StreamRecordQueueEntry.java@Internalpublic class StreamRecordQueueEntry<OUT> extends StreamElementQueueEntry<Collection<OUT>> implements AsyncCollectionResult<OUT>, ResultFuture<OUT> { /* Timestamp information. / private final boolean hasTimestamp; private final long timestamp; /* Future containing the collection result. */ private final CompletableFuture<Collection<OUT>> resultFuture; public StreamRecordQueueEntry(StreamRecord<?> streamRecord) { super(streamRecord); hasTimestamp = streamRecord.hasTimestamp(); timestamp = streamRecord.getTimestamp(); resultFuture = new CompletableFuture<>(); } @Override public boolean hasTimestamp() { return hasTimestamp; } @Override public long getTimestamp() { return timestamp; } @Override public Collection<OUT> get() throws Exception { return resultFuture.get(); } @Override protected CompletableFuture<Collection<OUT>> getFuture() { return resultFuture; } @Override public void complete(Collection<OUT> result) { resultFuture.complete(result); } @Override public void completeExceptionally(Throwable error) { resultFuture.completeExceptionally(error); }}StreamRecordQueueEntry继承了StreamElementQueueEntry,同时实现了AsyncCollectionResult、ResultFuture接口小结AsyncWaitOperator继承了AbstractUdfStreamOperator,覆盖了AbstractUdfStreamOperator的setup、open、initializeState、close、dispose方法;实现了OneInputStreamOperator接口定义的processElement、processWatermark、processLatencyMarker方法;实现了OperatorActions定义的failOperator方法;open方法使用Emitter创建并启动AsyncIO-Emitter-ThreadEmitter实现了Runnable接口,它主要负责从StreamElementQueue取出element,然后输出到TimestampedCollector;其run方法就是不断循环调用streamElementQueue.peekBlockingly()阻塞获取AsyncResult,获取到之后就调用output方法将result输出出去StreamElementQueue接口主要定义了AsyncWaitOperator所要用的blocking stream element queue的接口;它定义了put、tryPut、peekBlockingly、poll、values、isEmpty、size方法;StreamElementQueue接口有两个子类分别是UnorderedStreamElementQueue及OrderedStreamElementQueue;队列元素类型为StreamElementQueueEntry,StreamElementQueueEntry实现了AsyncResult接口,它定义了onComplete方法用于结果完成时的回调处理,同时它还定义了抽象方法getFuture供子类实现;它有两个子类,分别是WatermarkQueueEntry及StreamRecordQueueEntrydocAsyncWaitOperator ...
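下面给出一个简化的使用示意(非原文代码;UpperCaseAsync是演示用的假设函数),说明orderedWait与unorderedWait分别对应上文的OrderedStreamElementQueue与UnorderedStreamElementQueue(由AsyncWaitOperator#setup根据OutputMode选择),timeout与capacity参数则对应processElement里注册的超时timer与队列容量:

import java.util.Collections;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;

import org.apache.flink.streaming.api.datastream.AsyncDataStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;

public class AsyncWaitSketch {

    // toy async function: completes the ResultFuture from another thread
    public static class UpperCaseAsync extends RichAsyncFunction<String, String> {
        @Override
        public void asyncInvoke(String input, ResultFuture<String> resultFuture) {
            CompletableFuture
                    .supplyAsync(input::toUpperCase)
                    .thenAccept(v -> resultFuture.complete(Collections.singleton(v)));
        }
    }

    public static DataStream<String> apply(DataStream<String> stream) {
        // emit order follows completion order, backed by UnorderedStreamElementQueue
        DataStream<String> unordered = AsyncDataStream.unorderedWait(
                stream, new UpperCaseAsync(), 1000, TimeUnit.MILLISECONDS, 100);

        // emit order follows input order, backed by OrderedStreamElementQueue
        return AsyncDataStream.orderedWait(
                unordered, new UpperCaseAsync(), 1000, TimeUnit.MILLISECONDS, 100);
    }
}

生产环境下asyncInvoke里应使用支持异步回调的客户端或独立线程池,而不是示例中的common pool。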

January 20, 2019 · 13 min · jiezi

聊聊flink的Async I/O

序本文主要研究一下flink的Async I/O实例// This example implements the asynchronous request and callback with Futures that have the// interface of Java 8’s futures (which is the same one followed by Flink’s Future)/** * An implementation of the ‘AsyncFunction’ that sends requests and sets the callback. /class AsyncDatabaseRequest extends RichAsyncFunction<String, Tuple2<String, String>> { /* The database specific client that can issue concurrent requests with callbacks / private transient DatabaseClient client; @Override public void open(Configuration parameters) throws Exception { client = new DatabaseClient(host, post, credentials); } @Override public void close() throws Exception { client.close(); } @Override public void asyncInvoke(String key, final ResultFuture<Tuple2<String, String>> resultFuture) throws Exception { // issue the asynchronous request, receive a future for result final Future<String> result = client.query(key); // set the callback to be executed once the request by the client is complete // the callback simply forwards the result to the result future CompletableFuture.supplyAsync(new Supplier<String>() { @Override public String get() { try { return result.get(); } catch (InterruptedException | ExecutionException e) { // Normally handled explicitly. return null; } } }).thenAccept( (String dbResult) -> { resultFuture.complete(Collections.singleton(new Tuple2<>(key, dbResult))); }); }}// create the original streamDataStream<String> stream = …;// apply the async I/O transformationDataStream<Tuple2<String, String>> resultStream = AsyncDataStream.unorderedWait(stream, new AsyncDatabaseRequest(), 1000, TimeUnit.MILLISECONDS, 100);本实例展示了flink Async I/O的基本用法,首先是实现AsyncFunction接口,用于编写异步请求逻辑及将结果或异常设置到resultFuture,然后就是使用AsyncDataStream的unorderedWait或orderedWait方法将AsyncFunction作用到DataStream作为transformation;AsyncDataStream的unorderedWait或orderedWait有两个关于async operation的参数,一个是timeout参数用于设置async的超时时间,一个是capacity参数用于指定同一时刻最大允许多少个(并发)async request在执行AsyncFunctionflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/functions/async/AsyncFunction.java/* * A function to trigger Async I/O operation. * * <p>For each #asyncInvoke, an async io operation can be triggered, and once it has been done, * the result can be collected by calling {@link ResultFuture#complete}. For each async * operation, its context is stored in the operator immediately after invoking * #asyncInvoke, avoiding blocking for each stream input as long as the internal buffer is not full. * * <p>{@link ResultFuture} can be passed into callbacks or futures to collect the result data. * An error can also be propagate to the async IO operator by * {@link ResultFuture#completeExceptionally(Throwable)}. 
* * <p>Callback example usage: * * <pre>{@code * public class HBaseAsyncFunc implements AsyncFunction<String, String> { * * public void asyncInvoke(String row, ResultFuture<String> result) throws Exception { * HBaseCallback cb = new HBaseCallback(result); * Get get = new Get(Bytes.toBytes(row)); * hbase.asyncGet(get, cb); * } * } * }</pre> * * <p>Future example usage: * * <pre>{@code * public class HBaseAsyncFunc implements AsyncFunction<String, String> { * * public void asyncInvoke(String row, final ResultFuture<String> result) throws Exception { * Get get = new Get(Bytes.toBytes(row)); * ListenableFuture<Result> future = hbase.asyncGet(get); * Futures.addCallback(future, new FutureCallback<Result>() { * public void onSuccess(Result result) { * List<String> ret = process(result); * result.complete(ret); * } * public void onFailure(Throwable thrown) { * result.completeExceptionally(thrown); * } * }); * } * } * }</pre> * * @param <IN> The type of the input elements. * @param <OUT> The type of the returned elements. /@PublicEvolvingpublic interface AsyncFunction<IN, OUT> extends Function, Serializable { /* * Trigger async operation for each stream input. * * @param input element coming from an upstream task * @param resultFuture to be completed with the result data * @exception Exception in case of a user code error. An exception will make the task fail and * trigger fail-over process. / void asyncInvoke(IN input, ResultFuture<OUT> resultFuture) throws Exception; /* * {@link AsyncFunction#asyncInvoke} timeout occurred. * By default, the result future is exceptionally completed with a timeout exception. * * @param input element coming from an upstream task * @param resultFuture to be completed with the result data / default void timeout(IN input, ResultFuture<OUT> resultFuture) throws Exception { resultFuture.completeExceptionally( new TimeoutException(“Async function call has timed out.”)); }}AsyncFunction接口继承了Function,它定义了asyncInvoke方法以及一个default的timeout方法;asyncInvoke方法执行异步逻辑,然后通过ResultFuture.complete将结果设置到ResultFuture,如果异常则通过ResultFuture.completeExceptionally(Throwable)来传递到ResultFutureRichAsyncFunctionflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/functions/async/RichAsyncFunction.java@PublicEvolvingpublic abstract class RichAsyncFunction<IN, OUT> extends AbstractRichFunction implements AsyncFunction<IN, OUT> { private static final long serialVersionUID = 3858030061138121840L; @Override public void setRuntimeContext(RuntimeContext runtimeContext) { Preconditions.checkNotNull(runtimeContext); if (runtimeContext instanceof IterationRuntimeContext) { super.setRuntimeContext( new RichAsyncFunctionIterationRuntimeContext( (IterationRuntimeContext) runtimeContext)); } else { super.setRuntimeContext(new RichAsyncFunctionRuntimeContext(runtimeContext)); } } @Override public abstract void asyncInvoke(IN input, ResultFuture<OUT> resultFuture) throws Exception; //……}RichAsyncFunction继承了AbstractRichFunction,同时声明实现AsyncFunction接口,它不没有实现asyncInvoke,交由子类实现;它覆盖了setRuntimeContext方法,这里使用RichAsyncFunctionRuntimeContext或者RichAsyncFunctionIterationRuntimeContext进行包装RichAsyncFunctionRuntimeContextflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/functions/async/RichAsyncFunction.java /* * A wrapper class for async function’s {@link RuntimeContext}. The async function runtime * context only supports basic operations which are thread safe. Consequently, state access, * accumulators, broadcast variables and the distributed cache are disabled. 
/ private static class RichAsyncFunctionRuntimeContext implements RuntimeContext { private final RuntimeContext runtimeContext; RichAsyncFunctionRuntimeContext(RuntimeContext context) { runtimeContext = Preconditions.checkNotNull(context); } @Override public String getTaskName() { return runtimeContext.getTaskName(); } @Override public MetricGroup getMetricGroup() { return runtimeContext.getMetricGroup(); } @Override public int getNumberOfParallelSubtasks() { return runtimeContext.getNumberOfParallelSubtasks(); } @Override public int getMaxNumberOfParallelSubtasks() { return runtimeContext.getMaxNumberOfParallelSubtasks(); } @Override public int getIndexOfThisSubtask() { return runtimeContext.getIndexOfThisSubtask(); } @Override public int getAttemptNumber() { return runtimeContext.getAttemptNumber(); } @Override public String getTaskNameWithSubtasks() { return runtimeContext.getTaskNameWithSubtasks(); } @Override public ExecutionConfig getExecutionConfig() { return runtimeContext.getExecutionConfig(); } @Override public ClassLoader getUserCodeClassLoader() { return runtimeContext.getUserCodeClassLoader(); } // ———————————————————————————– // Unsupported operations // ———————————————————————————– @Override public DistributedCache getDistributedCache() { throw new UnsupportedOperationException(“Distributed cache is not supported in rich async functions.”); } @Override public <T> ValueState<T> getState(ValueStateDescriptor<T> stateProperties) { throw new UnsupportedOperationException(“State is not supported in rich async functions.”); } @Override public <T> ListState<T> getListState(ListStateDescriptor<T> stateProperties) { throw new UnsupportedOperationException(“State is not supported in rich async functions.”); } @Override public <T> ReducingState<T> getReducingState(ReducingStateDescriptor<T> stateProperties) { throw new UnsupportedOperationException(“State is not supported in rich async functions.”); } @Override public <IN, ACC, OUT> AggregatingState<IN, OUT> getAggregatingState(AggregatingStateDescriptor<IN, ACC, OUT> stateProperties) { throw new UnsupportedOperationException(“State is not supported in rich async functions.”); } @Override public <T, ACC> FoldingState<T, ACC> getFoldingState(FoldingStateDescriptor<T, ACC> stateProperties) { throw new UnsupportedOperationException(“State is not supported in rich async functions.”); } @Override public <UK, UV> MapState<UK, UV> getMapState(MapStateDescriptor<UK, UV> stateProperties) { throw new UnsupportedOperationException(“State is not supported in rich async functions.”); } @Override public <V, A extends Serializable> void addAccumulator(String name, Accumulator<V, A> accumulator) { throw new UnsupportedOperationException(“Accumulators are not supported in rich async functions.”); } @Override public <V, A extends Serializable> Accumulator<V, A> getAccumulator(String name) { throw new UnsupportedOperationException(“Accumulators are not supported in rich async functions.”); } @Override public Map<String, Accumulator<?, ?>> getAllAccumulators() { throw new UnsupportedOperationException(“Accumulators are not supported in rich async functions.”); } @Override public IntCounter getIntCounter(String name) { throw new UnsupportedOperationException(“Int counters are not supported in rich async functions.”); } @Override public LongCounter getLongCounter(String name) { throw new UnsupportedOperationException(“Long counters are not supported in rich async functions.”); } @Override public DoubleCounter getDoubleCounter(String name) { throw new 
UnsupportedOperationException(“Long counters are not supported in rich async functions.”); } @Override public Histogram getHistogram(String name) { throw new UnsupportedOperationException(“Histograms are not supported in rich async functions.”); } @Override public boolean hasBroadcastVariable(String name) { throw new UnsupportedOperationException(“Broadcast variables are not supported in rich async functions.”); } @Override public <RT> List<RT> getBroadcastVariable(String name) { throw new UnsupportedOperationException(“Broadcast variables are not supported in rich async functions.”); } @Override public <T, C> C getBroadcastVariableWithInitializer(String name, BroadcastVariableInitializer<T, C> initializer) { throw new UnsupportedOperationException(“Broadcast variables are not supported in rich async functions.”); } }RichAsyncFunctionRuntimeContext实现了RuntimeContext接口,它将一些方法代理给RuntimeContext,其余的Unsupported的方法都覆盖抛出UnsupportedOperationExceptionRichAsyncFunctionIterationRuntimeContextflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/functions/async/RichAsyncFunction.java private static class RichAsyncFunctionIterationRuntimeContext extends RichAsyncFunctionRuntimeContext implements IterationRuntimeContext { private final IterationRuntimeContext iterationRuntimeContext; RichAsyncFunctionIterationRuntimeContext(IterationRuntimeContext iterationRuntimeContext) { super(iterationRuntimeContext); this.iterationRuntimeContext = Preconditions.checkNotNull(iterationRuntimeContext); } @Override public int getSuperstepNumber() { return iterationRuntimeContext.getSuperstepNumber(); } // ———————————————————————————– // Unsupported operations // ———————————————————————————– @Override public <T extends Aggregator<?>> T getIterationAggregator(String name) { throw new UnsupportedOperationException(“Iteration aggregators are not supported in rich async functions.”); } @Override public <T extends Value> T getPreviousIterationAggregate(String name) { throw new UnsupportedOperationException(“Iteration aggregators are not supported in rich async functions.”); } }RichAsyncFunctionIterationRuntimeContext继承了RichAsyncFunctionRuntimeContext,实现了IterationRuntimeContext接口,它将getSuperstepNumber方法交由IterationRuntimeContext处理,然后覆盖getIterationAggregator、getPreviousIterationAggregate方法抛出UnsupportedOperationExceptionAsyncDataStreamflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/AsyncDataStream.java@PublicEvolvingpublic class AsyncDataStream { /* * Output mode for asynchronous operations. 
*/ public enum OutputMode { ORDERED, UNORDERED } private static final int DEFAULT_QUEUE_CAPACITY = 100; private static <IN, OUT> SingleOutputStreamOperator<OUT> addOperator( DataStream<IN> in, AsyncFunction<IN, OUT> func, long timeout, int bufSize, OutputMode mode) { TypeInformation<OUT> outTypeInfo = TypeExtractor.getUnaryOperatorReturnType( func, AsyncFunction.class, 0, 1, new int[]{1, 0}, in.getType(), Utils.getCallLocationName(), true); // create transform AsyncWaitOperator<IN, OUT> operator = new AsyncWaitOperator<>( in.getExecutionEnvironment().clean(func), timeout, bufSize, mode); return in.transform(“async wait operator”, outTypeInfo, operator); } public static <IN, OUT> SingleOutputStreamOperator<OUT> unorderedWait( DataStream<IN> in, AsyncFunction<IN, OUT> func, long timeout, TimeUnit timeUnit, int capacity) { return addOperator(in, func, timeUnit.toMillis(timeout), capacity, OutputMode.UNORDERED); } public static <IN, OUT> SingleOutputStreamOperator<OUT> unorderedWait( DataStream<IN> in, AsyncFunction<IN, OUT> func, long timeout, TimeUnit timeUnit) { return addOperator( in, func, timeUnit.toMillis(timeout), DEFAULT_QUEUE_CAPACITY, OutputMode.UNORDERED); } public static <IN, OUT> SingleOutputStreamOperator<OUT> orderedWait( DataStream<IN> in, AsyncFunction<IN, OUT> func, long timeout, TimeUnit timeUnit, int capacity) { return addOperator(in, func, timeUnit.toMillis(timeout), capacity, OutputMode.ORDERED); } public static <IN, OUT> SingleOutputStreamOperator<OUT> orderedWait( DataStream<IN> in, AsyncFunction<IN, OUT> func, long timeout, TimeUnit timeUnit) { return addOperator( in, func, timeUnit.toMillis(timeout), DEFAULT_QUEUE_CAPACITY, OutputMode.ORDERED); }}AsyncDataStream提供了unorderedWait、orderedWait两类方法来将AsyncFunction作用于DataStreamunorderedWait、orderedWait方法有带capacity参数的也有不带capacity参数的,不带capacity参数即默认使用DEFAULT_QUEUE_CAPACITY,即100;这些方法最后都是调用addOperator私有方法来实现,它使用的是AsyncWaitOperator;unorderedWait、orderedWait方法都带了timeout参数,用于指定等待async操作完成的超时时间AsyncDataStream提供了两种OutputMode,其中UNORDERED是无序的,即一旦async操作完成就emit结果,当使用TimeCharacteristic.ProcessingTime的时候这种模式延迟最低、负载最低;ORDERED是有序的,即按element的输入顺序emit结果,为了保证有序operator需要缓冲数据,因而会造成一定的延迟及负载小结flink给外部数据访问提供了Asynchronous I/O的API,用于提升streaming的吞吐量,其基本使用就是定义一个实现AsyncFunction接口的function,然后使用AsyncDataStream的unorderedWait或orderedWait方法将AsyncFunction作用到DataStream作为transformationAsyncFunction接口继承了Function,它定义了asyncInvoke方法以及一个default的timeout方法;asyncInvoke方法执行异步逻辑,然后通过ResultFuture.complete将结果或异常设置到ResultFuture,如果异常则通过ResultFuture.completeExceptionally(Throwable)来传递到ResultFuture;RichAsyncFunction继承了AbstractRichFunction,同时声明实现AsyncFunction接口,它不没有实现asyncInvoke,交由子类实现;它覆盖了setRuntimeContext方法,这里使用RichAsyncFunctionRuntimeContext或者RichAsyncFunctionIterationRuntimeContext进行包装AsyncDataStream的unorderedWait或orderedWait有两个关于async operation的参数,一个是timeout参数用于设置async的超时时间,一个是capacity参数用于指定同一时刻最大允许多少个(并发)async request在执行;AsyncDataStream提供了两种OutputMode,其中UNORDERED是无序的,即一旦async操作完成就emit结果,当使用TimeCharacteristic.ProcessingTime的时候这种模式延迟最低、负载最低;ORDERED是有序的,即按element的输入顺序emit结果,为了保证有序operator需要缓冲数据,因而会造成一定的延迟及负载docAsynchronous I/O for External Data Access ...
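下面补充一个可以直接运行的最小示意(示例为自拟,并非 Flink 官方代码):其中 MockAsyncLookup、"value-of-" 前缀等名字均为假设,用 CompletableFuture.supplyAsync 模拟外部系统的异步返回,真实场景应替换成支持异步请求的客户端;同时覆盖了默认的 timeout 方法,让超时返回占位结果而不是令作业失败

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.AsyncDataStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;

import java.util.Collections;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;

public class AsyncIoDemo {

    // 模拟的异步查询函数:真实场景中这里应持有一个支持异步调用的客户端(类似上文的 DatabaseClient)
    public static class MockAsyncLookup extends RichAsyncFunction<String, Tuple2<String, String>> {

        @Override
        public void asyncInvoke(String key, ResultFuture<Tuple2<String, String>> resultFuture) {
            // 用 CompletableFuture 模拟外部系统的异步响应
            CompletableFuture
                .supplyAsync(() -> "value-of-" + key)
                .whenComplete((value, error) -> {
                    if (error != null) {
                        // 异常通过 completeExceptionally 传回 async operator
                        resultFuture.completeExceptionally(error);
                    } else {
                        resultFuture.complete(Collections.singleton(Tuple2.of(key, value)));
                    }
                });
        }

        @Override
        public void timeout(String key, ResultFuture<Tuple2<String, String>> resultFuture) {
            // 覆盖默认行为:超时时返回占位结果,而不是以 TimeoutException 结束作业
            resultFuture.complete(Collections.singleton(Tuple2.of(key, "timeout")));
        }
    }

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStream<String> keys = env.fromElements("a", "b", "c");

        // timeout = 1000ms;capacity = 100,即同一时刻最多允许 100 个并发的 async request
        DataStream<Tuple2<String, String>> result =
            AsyncDataStream.unorderedWait(keys, new MockAsyncLookup(), 1000, TimeUnit.MILLISECONDS, 100);

        result.print();
        env.execute("async io demo");
    }
}

如果需要保证输出顺序与输入一致,把 unorderedWait 换成 orderedWait 即可,代价是 operator 需要额外缓冲,延迟会相应增大。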


聊聊flink的InternalTimeServiceManager

序本文主要研究一下flink的InternalTimeServiceManagerInternalTimeServiceManagerflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/operators/InternalTimeServiceManager.java@Internalpublic class InternalTimeServiceManager<K> { @VisibleForTesting static final String TIMER_STATE_PREFIX = “timer_state”; @VisibleForTesting static final String PROCESSING_TIMER_PREFIX = TIMER_STATE_PREFIX + “/processing”; @VisibleForTesting static final String EVENT_TIMER_PREFIX = TIMER_STATE_PREFIX + “/event_”; private final KeyGroupRange localKeyGroupRange; private final KeyContext keyContext; private final PriorityQueueSetFactory priorityQueueSetFactory; private final ProcessingTimeService processingTimeService; private final Map<String, InternalTimerServiceImpl<K, ?>> timerServices; private final boolean useLegacySynchronousSnapshots; InternalTimeServiceManager( KeyGroupRange localKeyGroupRange, KeyContext keyContext, PriorityQueueSetFactory priorityQueueSetFactory, ProcessingTimeService processingTimeService, boolean useLegacySynchronousSnapshots) { this.localKeyGroupRange = Preconditions.checkNotNull(localKeyGroupRange); this.priorityQueueSetFactory = Preconditions.checkNotNull(priorityQueueSetFactory); this.keyContext = Preconditions.checkNotNull(keyContext); this.processingTimeService = Preconditions.checkNotNull(processingTimeService); this.useLegacySynchronousSnapshots = useLegacySynchronousSnapshots; this.timerServices = new HashMap<>(); } @SuppressWarnings(“unchecked”) public <N> InternalTimerService<N> getInternalTimerService( String name, TimerSerializer<K, N> timerSerializer, Triggerable<K, N> triggerable) { InternalTimerServiceImpl<K, N> timerService = registerOrGetTimerService(name, timerSerializer); timerService.startTimerService( timerSerializer.getKeySerializer(), timerSerializer.getNamespaceSerializer(), triggerable); return timerService; } @SuppressWarnings(“unchecked”) <N> InternalTimerServiceImpl<K, N> registerOrGetTimerService(String name, TimerSerializer<K, N> timerSerializer) { InternalTimerServiceImpl<K, N> timerService = (InternalTimerServiceImpl<K, N>) timerServices.get(name); if (timerService == null) { timerService = new InternalTimerServiceImpl<>( localKeyGroupRange, keyContext, processingTimeService, createTimerPriorityQueue(PROCESSING_TIMER_PREFIX + name, timerSerializer), createTimerPriorityQueue(EVENT_TIMER_PREFIX + name, timerSerializer)); timerServices.put(name, timerService); } return timerService; } Map<String, InternalTimerServiceImpl<K, ?>> getRegisteredTimerServices() { return Collections.unmodifiableMap(timerServices); } private <N> KeyGroupedInternalPriorityQueue<TimerHeapInternalTimer<K, N>> createTimerPriorityQueue( String name, TimerSerializer<K, N> timerSerializer) { return priorityQueueSetFactory.create( name, timerSerializer); } public void advanceWatermark(Watermark watermark) throws Exception { for (InternalTimerServiceImpl<?, ?> service : timerServices.values()) { service.advanceWatermark(watermark.getTimestamp()); } } ////////////////// Fault Tolerance Methods /////////////////// public void snapshotStateForKeyGroup(DataOutputView stream, int keyGroupIdx) throws IOException { Preconditions.checkState(useLegacySynchronousSnapshots); InternalTimerServiceSerializationProxy<K> serializationProxy = new InternalTimerServiceSerializationProxy<>(this, keyGroupIdx); serializationProxy.write(stream); } public void restoreStateForKeyGroup( InputStream stream, int keyGroupIdx, ClassLoader userCodeClassLoader) throws IOException { 
InternalTimerServiceSerializationProxy<K> serializationProxy = new InternalTimerServiceSerializationProxy<>( this, userCodeClassLoader, keyGroupIdx); serializationProxy.read(stream); } //////////////////// Methods used ONLY IN TESTS //////////////////// @VisibleForTesting public int numProcessingTimeTimers() { int count = 0; for (InternalTimerServiceImpl<?, ?> timerService : timerServices.values()) { count += timerService.numProcessingTimeTimers(); } return count; } @VisibleForTesting public int numEventTimeTimers() { int count = 0; for (InternalTimerServiceImpl<?, ?> timerService : timerServices.values()) { count += timerService.numEventTimeTimers(); } return count; }}InternalTimeServiceManager用于管理所有keyed operators要使用的timerService,它在内存使用map维护了timerService的名称与InternalTimerServiceImpl的映射getInternalTimerService方法首先调用registerOrGetTimerService方法获取或创建指定name的InternalTimerServiceImpl,之后调用timerService.startTimerService进行初始化然后返回registerOrGetTimerService方法先从名为timerServices的map中查找指定name的InternalTimerServiceImpl,没有就创建一个,然后放入到名为timerServices的map中;创建InternalTimerServiceImpl的时候,这里使用createTimerPriorityQueue来创建KeyGroupedInternalPriorityQueue类型的processingTimeTimersQueue及eventTimeTimersQueue;createTimerPriorityQueue是通过priorityQueueSetFactory来创建的PriorityQueueSetFactoryflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/PriorityQueueSetFactory.javapublic interface PriorityQueueSetFactory { @Nonnull <T extends HeapPriorityQueueElement & PriorityComparable & Keyed> KeyGroupedInternalPriorityQueue<T> create( @Nonnull String stateName, @Nonnull TypeSerializer<T> byteOrderedElementSerializer);}PriorityQueueSetFactory定义了create方法,创建的是KeyGroupedInternalPriorityQueue,其中T的泛型要求是同时继承或实现HeapPriorityQueueElement、PriorityComparable、Keyed这三个接口HeapPriorityQueueElementflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/heap/HeapPriorityQueueElement.java@Internalpublic interface HeapPriorityQueueElement { /** * The index that indicates that a {@link HeapPriorityQueueElement} object is not contained in and managed by any * {@link HeapPriorityQueue}. We do not strictly enforce that internal indexes must be reset to this value when * elements are removed from a {@link HeapPriorityQueue}. / int NOT_CONTAINED = Integer.MIN_VALUE; /* * Returns the current index of this object in the internal array of {@link HeapPriorityQueue}. / int getInternalIndex(); /* * Sets the current index of this object in the {@link HeapPriorityQueue} and should only be called by the owning * {@link HeapPriorityQueue}. * * @param newIndex the new index in the timer heap. / void setInternalIndex(int newIndex);}HeapPriorityQueueElement接口定义了HeapPriorityQueue所要求的元素类型,它定义了getInternalIndex、setInternalIndex方法PriorityComparableflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/PriorityComparable.javapublic interface PriorityComparable<T> { int comparePriorityTo(@Nonnull T other);}PriorityComparable定义了comparePriorityTo方法,用于根据priority来进行比对Keyedflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/Keyed.javapublic interface Keyed<K> { K getKey();}Keyed接口定义了getKey方法,用于返回该对象的keyInternalTimerflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/operators/InternalTimer.java@Internalpublic interface InternalTimer<K, N> extends PriorityComparable<InternalTimer<?, ?>>, Keyed<K> { /* Function to extract the key from a {@link InternalTimer}. / KeyExtractorFunction<InternalTimer<?, ?>> KEY_EXTRACTOR_FUNCTION = InternalTimer::getKey; /* Function to compare instances of {@link InternalTimer}. 
/ PriorityComparator<InternalTimer<?, ?>> TIMER_COMPARATOR = (left, right) -> Long.compare(left.getTimestamp(), right.getTimestamp()); /* * Returns the timestamp of the timer. This value determines the point in time when the timer will fire. / long getTimestamp(); /* * Returns the key that is bound to this timer. / @Nonnull @Override K getKey(); /* * Returns the namespace that is bound to this timer. / @Nonnull N getNamespace();}InternalTimer继承了PriorityComparable、Keyed接口,它定义了getTimestamp、getKey、getNamespace方法,同时内置了KEY_EXTRACTOR_FUNCTION、TIMER_COMPARATORTimerHeapInternalTimerflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/operators/TimerHeapInternalTimer.java@Internalpublic final class TimerHeapInternalTimer<K, N> implements InternalTimer<K, N>, HeapPriorityQueueElement { /* The key for which the timer is scoped. / @Nonnull private final K key; /* The namespace for which the timer is scoped. / @Nonnull private final N namespace; /* The expiration timestamp. */ private final long timestamp; private transient int timerHeapIndex; TimerHeapInternalTimer(long timestamp, @Nonnull K key, @Nonnull N namespace) { this.timestamp = timestamp; this.key = key; this.namespace = namespace; this.timerHeapIndex = NOT_CONTAINED; } @Override public long getTimestamp() { return timestamp; } @Nonnull @Override public K getKey() { return key; } @Nonnull @Override public N getNamespace() { return namespace; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o instanceof InternalTimer) { InternalTimer<?, ?> timer = (InternalTimer<?, ?>) o; return timestamp == timer.getTimestamp() && key.equals(timer.getKey()) && namespace.equals(timer.getNamespace()); } return false; } @Override public int getInternalIndex() { return timerHeapIndex; } @Override public void setInternalIndex(int newIndex) { this.timerHeapIndex = newIndex; } void removedFromTimerQueue() { setInternalIndex(NOT_CONTAINED); } @Override public int hashCode() { int result = (int) (timestamp ^ (timestamp >>> 32)); result = 31 * result + key.hashCode(); result = 31 * result + namespace.hashCode(); return result; } @Override public String toString() { return “Timer{” + “timestamp=” + timestamp + “, key=” + key + “, namespace=” + namespace + ‘}’; } @Override public int comparePriorityTo(@Nonnull InternalTimer<?, ?> other) { return Long.compare(timestamp, other.getTimestamp()); }}TimerHeapInternalTimer实现了InternalTimer及HeapPriorityQueueElement接口;这里removedFromTimerQueue接口是调用setInternalIndex(NOT_CONTAINED),即改动其index为NOT_CONTAINED,逻辑删除HeapPriorityQueueSetFactoryflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/heap/HeapPriorityQueueSetFactory.javapublic class HeapPriorityQueueSetFactory implements PriorityQueueSetFactory { @Nonnull private final KeyGroupRange keyGroupRange; @Nonnegative private final int totalKeyGroups; @Nonnegative private final int minimumCapacity; public HeapPriorityQueueSetFactory( @Nonnull KeyGroupRange keyGroupRange, @Nonnegative int totalKeyGroups, @Nonnegative int minimumCapacity) { this.keyGroupRange = keyGroupRange; this.totalKeyGroups = totalKeyGroups; this.minimumCapacity = minimumCapacity; } @Nonnull @Override public <T extends HeapPriorityQueueElement & PriorityComparable & Keyed> HeapPriorityQueueSet<T> create( @Nonnull String stateName, @Nonnull TypeSerializer<T> byteOrderedElementSerializer) { return new HeapPriorityQueueSet<>( PriorityComparator.forPriorityComparableObjects(), KeyExtractorFunction.forKeyedObjects(), minimumCapacity, keyGroupRange, 
totalKeyGroups); }}

HeapPriorityQueueSetFactory实现了PriorityQueueSetFactory接口,其create方法创建的是HeapPriorityQueueSet

小结

InternalTimeServiceManager用于管理所有keyed operators要使用的timerService,它在内存使用map维护了timerService的名称与InternalTimerServiceImpl的映射;getInternalTimerService方法首先调用registerOrGetTimerService方法获取或创建指定name的InternalTimerServiceImpl,之后调用timerService.startTimerService进行初始化然后返回

registerOrGetTimerService方法先从名为timerServices的map中查找指定name的InternalTimerServiceImpl,没有就创建一个,然后放入到名为timerServices的map中;创建InternalTimerServiceImpl的时候,这里使用createTimerPriorityQueue来创建KeyGroupedInternalPriorityQueue类型的processingTimeTimersQueue及eventTimeTimersQueue;createTimerPriorityQueue是通过priorityQueueSetFactory来创建的

PriorityQueueSetFactory定义了create方法,创建的是KeyGroupedInternalPriorityQueue,其中T的泛型要求是同时继承或实现HeapPriorityQueueElement、PriorityComparable、Keyed这三个接口(InternalTimer继承了PriorityComparable、Keyed接口,TimerHeapInternalTimer实现了InternalTimer及HeapPriorityQueueElement接口);HeapPriorityQueueSetFactory实现了PriorityQueueSetFactory接口,其create方法创建的是HeapPriorityQueueSet

doc

InternalTimeServiceManager ...
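为了直观理解 timer 按 timestamp 在优先级队列中排序、以及 advanceWatermark 时 peek/poll 的处理方式,下面给出一个只用 JDK PriorityQueue 的简化示意(SimpleTimer 等命名为假设,并非 Flink 源码;真实的 HeapPriorityQueueSet 还会按 key group 分组,并支持去重和按元素删除):

import java.util.Comparator;
import java.util.PriorityQueue;

public class TimerQueueSketch {

    // 简化版的 timer:只保留 key 和 timestamp
    static class SimpleTimer {
        final String key;
        final long timestamp;

        SimpleTimer(String key, long timestamp) {
            this.key = key;
            this.timestamp = timestamp;
        }
    }

    public static void main(String[] args) {
        // 与 InternalTimer.TIMER_COMPARATOR 的思路类似:按 timestamp 升序,队首永远是最早到期的 timer
        PriorityQueue<SimpleTimer> eventTimeTimers =
            new PriorityQueue<>(Comparator.comparingLong((SimpleTimer t) -> t.timestamp));

        eventTimeTimers.add(new SimpleTimer("a", 3000L));
        eventTimeTimers.add(new SimpleTimer("b", 1000L));
        eventTimeTimers.add(new SimpleTimer("a", 2000L));

        long watermark = 2500L;

        // 模拟 advanceWatermark:不断 peek 队首,凡 timestamp <= watermark 的 timer 依次出队并触发
        SimpleTimer timer;
        while ((timer = eventTimeTimers.peek()) != null && timer.timestamp <= watermark) {
            eventTimeTimers.poll();
            System.out.println("fire timer, key=" + timer.key + ", timestamp=" + timer.timestamp);
        }
        // timestamp = 3000 的 timer 留在队列中,等待下一次 watermark 推进
    }
}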


《从0到1学习Flink》—— Flink 写入数据到 ElasticSearch

前言前面 FLink 的文章中我们已经介绍了说 Flink 已经有很多自带的 Connector。1、[《从0到1学习Flink》—— Data Source 介绍](http://www.54tianzhisheng.cn/...2、《从0到1学习Flink》—— Data Sink 介绍其中包括了 Source 和 Sink 的,后面我也讲了下如何自定义自己的 Source 和 Sink。那么今天要做的事情是啥呢?就是介绍一下 Flink 自带的 ElasticSearch Connector,我们今天就用他来做 Sink,将 Kafka 中的数据经过 Flink 处理后然后存储到 ElasticSearch。准备安装 ElasticSearch,这里就忽略,自己找我以前的文章,建议安装 ElasticSearch 6.0 版本以上的,毕竟要跟上时代的节奏。下面就讲解一下生产环境中如何使用 Elasticsearch Sink 以及一些注意点,及其内部实现机制。Elasticsearch Sink添加依赖<dependency> <groupId>org.apache.flink</groupId> <artifactId>flink-connector-elasticsearch6_${scala.binary.version}</artifactId> <version>${flink.version}</version></dependency>上面这依赖版本号请自己根据使用的版本对应改变下。下面所有的代码都没有把 import 引入到这里来,如果需要查看更详细的代码,请查看我的 GitHub 仓库地址:https://github.com/zhisheng17/flink-learning/tree/master/flink-learning-connectors/flink-learning-connectors-es6这个 module 含有本文的所有代码实现,当然越写到后面自己可能会做一些抽象,所以如果有代码改变很正常,请直接查看全部项目代码。ElasticSearchSinkUtil 工具类这个工具类是自己封装的,getEsAddresses 方法将传入的配置文件 es 地址解析出来,可以是域名方式,也可以是 ip + port 形式。addSink 方法是利用了 Flink 自带的 ElasticsearchSink 来封装了一层,传入了一些必要的调优参数和 es 配置参数,下面文章还会再讲些其他的配置。ElasticSearchSinkUtil.javapublic class ElasticSearchSinkUtil { /** * es sink * * @param hosts es hosts * @param bulkFlushMaxActions bulk flush size * @param parallelism 并行数 * @param data 数据 * @param func * @param <T> / public static <T> void addSink(List<HttpHost> hosts, int bulkFlushMaxActions, int parallelism, SingleOutputStreamOperator<T> data, ElasticsearchSinkFunction<T> func) { ElasticsearchSink.Builder<T> esSinkBuilder = new ElasticsearchSink.Builder<>(hosts, func); esSinkBuilder.setBulkFlushMaxActions(bulkFlushMaxActions); data.addSink(esSinkBuilder.build()).setParallelism(parallelism); } /* * 解析配置文件的 es hosts * * @param hosts * @return * @throws MalformedURLException */ public static List<HttpHost> getEsAddresses(String hosts) throws MalformedURLException { String[] hostList = hosts.split(","); List<HttpHost> addresses = new ArrayList<>(); for (String host : hostList) { if (host.startsWith(“http”)) { URL url = new URL(host); addresses.add(new HttpHost(url.getHost(), url.getPort())); } else { String[] parts = host.split(":", 2); if (parts.length > 1) { addresses.add(new HttpHost(parts[0], Integer.parseInt(parts[1]))); } else { throw new MalformedURLException(“invalid elasticsearch hosts format”); } } } return addresses; }}Main 启动类Main.javapublic class Main { public static void main(String[] args) throws Exception { //获取所有参数 final ParameterTool parameterTool = ExecutionEnvUtil.createParameterTool(args); //准备好环境 StreamExecutionEnvironment env = ExecutionEnvUtil.prepare(parameterTool); //从kafka读取数据 DataStreamSource<Metrics> data = KafkaConfigUtil.buildSource(env); //从配置文件中读取 es 的地址 List<HttpHost> esAddresses = ElasticSearchSinkUtil.getEsAddresses(parameterTool.get(ELASTICSEARCH_HOSTS)); //从配置文件中读取 bulk flush size,代表一次批处理的数量,这个可是性能调优参数,特别提醒 int bulkSize = parameterTool.getInt(ELASTICSEARCH_BULK_FLUSH_MAX_ACTIONS, 40); //从配置文件中读取并行 sink 数,这个也是性能调优参数,特别提醒,这样才能够更快的消费,防止 kafka 数据堆积 int sinkParallelism = parameterTool.getInt(STREAM_SINK_PARALLELISM, 5); //自己再自带的 es sink 上一层封装了下 ElasticSearchSinkUtil.addSink(esAddresses, bulkSize, sinkParallelism, data, (Metrics metric, RuntimeContext runtimeContext, RequestIndexer requestIndexer) -> { requestIndexer.add(Requests.indexRequest() .index(ZHISHENG + “_” + metric.getName()) //es 索引名 .type(ZHISHENG) //es type .source(GsonUtil.toJSONBytes(metric), XContentType.JSON)); }); env.execute(“flink learning connectors es6”); }}配置文件配置都支持集群模式填写,注意用 , 
分隔!kafka.brokers=localhost:9092kafka.group.id=zhisheng-metrics-group-testkafka.zookeeper.connect=localhost:2181metrics.topic=zhisheng-metricsstream.parallelism=5stream.checkpoint.interval=1000stream.checkpoint.enable=falseelasticsearch.hosts=localhost:9200elasticsearch.bulk.flush.max.actions=40stream.sink.parallelism=5运行结果执行 Main 类的 main 方法,我们的程序是只打印 flink 的日志,没有打印存入的日志(因为我们这里没有打日志):所以看起来不知道我们的 sink 是否有用,数据是否从 kafka 读取出来后存入到 es 了。你可以查看下本地起的 es 终端或者服务器的 es 日志就可以看到效果了。es 日志如下:上图是我本地 Mac 电脑终端的 es 日志,可以看到我们的索引了。如果还不放心,你也可以在你的电脑装个 kibana,然后更加的直观查看下 es 的索引情况(或者直接敲 es 的命令)我们用 kibana 查看存入 es 的索引如下:程序执行了一会,存入 es 的数据量就很大了。扩展配置上面代码已经可以实现你的大部分场景了,但是如果你的业务场景需要保证数据的完整性(不能出现丢数据的情况),那么就需要添加一些重试策略,因为在我们的生产环境中,很有可能会因为某些组件不稳定性导致各种问题,所以这里我们就要在数据存入失败的时候做重试操作,这里 flink 自带的 es sink 就支持了,常用的失败重试配置有:1、bulk.flush.backoff.enable 用来表示是否开启重试机制2、bulk.flush.backoff.type 重试策略,有两种:EXPONENTIAL 指数型(表示多次重试之间的时间间隔按照指数方式进行增长)、CONSTANT 常数型(表示多次重试之间的时间间隔为固定常数)3、bulk.flush.backoff.delay 进行重试的时间间隔4、bulk.flush.backoff.retries 失败重试的次数5、bulk.flush.max.actions: 批量写入时的最大写入条数6、bulk.flush.max.size.mb: 批量写入时的最大数据量7、bulk.flush.interval.ms: 批量写入的时间间隔,配置后则会按照该时间间隔严格执行,无视上面的两个批量写入配置看下啦,就是如下这些配置了,如果你需要的话,可以在这个地方配置扩充了。FailureHandler 失败处理器写入 ES 的时候会有这些情况会导致写入 ES 失败:1、ES 集群队列满了,报如下错误12:08:07.326 [I/O dispatcher 13] ERROR o.a.f.s.c.e.ElasticsearchSinkBase - Failed Elasticsearch item request: ElasticsearchException[Elasticsearch exception [type=es_rejected_execution_exception, reason=rejected execution of org.elasticsearch.transport.TransportService$7@566c9379 on EsThreadPoolExecutor[name = node-1/write, queue capacity = 200, org.elasticsearch.common.util.concurrent.EsThreadPoolExecutor@f00b373[Running, pool size = 4, active threads = 4, queued tasks = 200, completed tasks = 6277]]]]是这样的,我电脑安装的 es 队列容量默认应该是 200,我没有修改过。我这里如果配置的 bulk flush size * 并发 sink 数量 这个值如果大于这个 queue capacity ,那么就很容易导致出现这种因为 es 队列满了而写入失败。当然这里你也可以通过调大点 es 的队列。参考:https://www.elastic.co/guide/…2、ES 集群某个节点挂了这个就不用说了,肯定写入失败的。跟过源码可以发现 RestClient 类里的 performRequestAsync 方法一开始会随机的从集群中的某个节点进行写入数据,如果这台机器掉线,会进行重试在其他的机器上写入,那么当时写入的这台机器的请求就需要进行失败重试,否则就会把数据丢失!3、ES 集群某个节点的磁盘满了这里说的磁盘满了,并不是磁盘真的就没有一点剩余空间的,是 es 会在写入的时候检查磁盘的使用情况,在 85% 的时候会打印日志警告。这里我看了下源码如下图:如果你想继续让 es 写入的话就需要去重新配一下 es 让它继续写入,或者你也可以清空些不必要的数据腾出磁盘空间来。解决方法DataStream<String> input = …;input.addSink(new ElasticsearchSink<>( config, transportAddresses, new ElasticsearchSinkFunction<String>() {…}, new ActionRequestFailureHandler() { @Override void onFailure(ActionRequest action, Throwable failure, int restStatusCode, RequestIndexer indexer) throw Throwable { if (ExceptionUtils.containsThrowable(failure, EsRejectedExecutionException.class)) { // full queue; re-add document for indexing indexer.add(action); } else if (ExceptionUtils.containsThrowable(failure, ElasticsearchParseException.class)) { // malformed document; simply drop request without failing sink } else { // for all other failures, fail the sink // here the failure is simply rethrown, but users can also choose to throw custom exceptions throw failure; } }}));如果仅仅只是想做失败重试,也可以直接使用官方提供的默认的 RetryRejectedExecutionFailureHandler ,该处理器会对 EsRejectedExecutionException 导致到失败写入做重试处理。如果你没有设置失败处理器(failure handler),那么就会使用默认的 NoOpFailureHandler 来简单处理所有的异常。总结本文写了 Flink connector es,将 Kafka 中的数据读取并存储到 ElasticSearch 中,文中讲了如何封装自带的 sink,然后一些扩展配置以及 FailureHandler 情况下要怎么处理。(这个问题可是线上很容易遇到的)关注我转载请务必注明原创地址为:http://www.54tianzhisheng.cn/2018/12/30/Flink-ElasticSearch-Sink/微信公众号:zhisheng另外我自己整理了些 Flink 的学习资料,目前已经全部放到微信公众号了。你可以加我的微信:zhisheng_tian,然后回复关键字:Flink 即可无条件获取到。Github 
代码仓库

https://github.com/zhisheng17/flink-learning/

以后这个项目的所有代码都将放在这个仓库里,包含了自己学习 flink 的一些 demo 和博客

相关文章

1、《从0到1学习Flink》—— Apache Flink 介绍
2、《从0到1学习Flink》—— Mac 上搭建 Flink 1.6.0 环境并构建运行简单程序入门
3、《从0到1学习Flink》—— Flink 配置文件详解
4、《从0到1学习Flink》—— Data Source 介绍
5、《从0到1学习Flink》—— 如何自定义 Data Source ?
6、《从0到1学习Flink》—— Data Sink 介绍
7、《从0到1学习Flink》—— 如何自定义 Data Sink ?
8、《从0到1学习Flink》—— Flink Data transformation(转换)
9、《从0到1学习Flink》—— 介绍Flink中的Stream Windows
10、《从0到1学习Flink》—— Flink 中的几种 Time 详解
11、《从0到1学习Flink》—— Flink 写入数据到 ElasticSearch
12、《从0到1学习Flink》—— Flink 项目如何运行?
13、《从0到1学习Flink》—— Flink 写入数据到 Kafka ...
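回到上文的 Elasticsearch Sink:批量 flush 与失败重试的参数,除了以配置 key 的形式设置,也可以直接通过 ElasticsearchSink.Builder 的 setter 来配置。下面是一个示意性质的写法(索引名 zhisheng-demo、localhost:9200 等均为假设,各 setter 以实际使用的 connector 版本为准):

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkBase;
import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction;
import org.apache.flink.streaming.connectors.elasticsearch.util.RetryRejectedExecutionFailureHandler;
import org.apache.flink.streaming.connectors.elasticsearch6.ElasticsearchSink;
import org.apache.http.HttpHost;
import org.elasticsearch.client.Requests;

import java.util.Collections;
import java.util.List;

public class EsSinkWithRetryDemo {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStream<String> data = env.fromElements("a", "b", "c");

        List<HttpHost> hosts = Collections.singletonList(new HttpHost("localhost", 9200, "http"));

        ElasticsearchSink.Builder<String> builder = new ElasticsearchSink.Builder<>(
            hosts,
            (ElasticsearchSinkFunction<String>) (element, ctx, indexer) ->
                indexer.add(Requests.indexRequest()
                    .index("zhisheng-demo")   // 示例索引名
                    .type("demo")
                    .source(Collections.singletonMap("data", element))));

        // 批量写入参数:每 40 条 flush 一次,或每 5 秒强制 flush 一次
        builder.setBulkFlushMaxActions(40);
        builder.setBulkFlushInterval(5000);

        // 失败重试参数:指数退避,最多重试 3 次,初始间隔 1 秒
        builder.setBulkFlushBackoff(true);
        builder.setBulkFlushBackoffType(ElasticsearchSinkBase.FlushBackoffType.EXPONENTIAL);
        builder.setBulkFlushBackoffRetries(3);
        builder.setBulkFlushBackoffDelay(1000);

        // 只对队列满导致的 EsRejectedExecutionException 做重试,其他异常仍然会让 sink 失败
        builder.setFailureHandler(new RetryRejectedExecutionFailureHandler());

        data.addSink(builder.build()).setParallelism(1);
        env.execute("es sink with retry");
    }
}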


聊聊flink的TimerService

序本文主要研究一下flink的TimerServiceTimerServiceflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/TimerService.java@PublicEvolvingpublic interface TimerService { String UNSUPPORTED_REGISTER_TIMER_MSG = “Setting timers is only supported on a keyed streams.”; String UNSUPPORTED_DELETE_TIMER_MSG = “Deleting timers is only supported on a keyed streams.”; long currentProcessingTime(); long currentWatermark(); void registerProcessingTimeTimer(long time); void registerEventTimeTimer(long time); void deleteProcessingTimeTimer(long time); void deleteEventTimeTimer(long time);}TimerService接口定义了currentProcessingTime、currentWatermark、registerProcessingTimeTimer、registerEventTimeTimer、deleteProcessingTimeTimer、deleteEventTimeTimer接口SimpleTimerServiceflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/SimpleTimerService.java@Internalpublic class SimpleTimerService implements TimerService { private final InternalTimerService<VoidNamespace> internalTimerService; public SimpleTimerService(InternalTimerService<VoidNamespace> internalTimerService) { this.internalTimerService = internalTimerService; } @Override public long currentProcessingTime() { return internalTimerService.currentProcessingTime(); } @Override public long currentWatermark() { return internalTimerService.currentWatermark(); } @Override public void registerProcessingTimeTimer(long time) { internalTimerService.registerProcessingTimeTimer(VoidNamespace.INSTANCE, time); } @Override public void registerEventTimeTimer(long time) { internalTimerService.registerEventTimeTimer(VoidNamespace.INSTANCE, time); } @Override public void deleteProcessingTimeTimer(long time) { internalTimerService.deleteProcessingTimeTimer(VoidNamespace.INSTANCE, time); } @Override public void deleteEventTimeTimer(long time) { internalTimerService.deleteEventTimeTimer(VoidNamespace.INSTANCE, time); }}SimpleTimerService实现了TimerService,它是委托InternalTimerService来实现InternalTimerServiceflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/operators/InternalTimerService.java@Internalpublic interface InternalTimerService<N> { long currentProcessingTime(); long currentWatermark(); void registerProcessingTimeTimer(N namespace, long time); void deleteProcessingTimeTimer(N namespace, long time); void registerEventTimeTimer(N namespace, long time); void deleteEventTimeTimer(N namespace, long time);}InternalTimerService是TimerService的internal版本的接口,比起TimerService它定义了namespace,在registerProcessingTimeTimer、deleteProcessingTimeTimer、registerEventTimeTimer、deleteEventTimeTimer的方法中均多了一个namesapce的参数InternalTimerServiceImplflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/operators/InternalTimerServiceImpl.javapublic class InternalTimerServiceImpl<K, N> implements InternalTimerService<N>, ProcessingTimeCallback { private final ProcessingTimeService processingTimeService; private final KeyContext keyContext; private final KeyGroupedInternalPriorityQueue<TimerHeapInternalTimer<K, N>> processingTimeTimersQueue; private final KeyGroupedInternalPriorityQueue<TimerHeapInternalTimer<K, N>> eventTimeTimersQueue; private final KeyGroupRange localKeyGroupRange; private final int localKeyGroupRangeStartIdx; private long currentWatermark = Long.MIN_VALUE; private ScheduledFuture<?> nextTimer; // Variables to be set when the service is started. 
private TypeSerializer<K> keySerializer; private TypeSerializer<N> namespaceSerializer; private Triggerable<K, N> triggerTarget; private volatile boolean isInitialized; private TypeSerializer<K> keyDeserializer; private TypeSerializer<N> namespaceDeserializer; private InternalTimersSnapshot<K, N> restoredTimersSnapshot; InternalTimerServiceImpl( KeyGroupRange localKeyGroupRange, KeyContext keyContext, ProcessingTimeService processingTimeService, KeyGroupedInternalPriorityQueue<TimerHeapInternalTimer<K, N>> processingTimeTimersQueue, KeyGroupedInternalPriorityQueue<TimerHeapInternalTimer<K, N>> eventTimeTimersQueue) { this.keyContext = checkNotNull(keyContext); this.processingTimeService = checkNotNull(processingTimeService); this.localKeyGroupRange = checkNotNull(localKeyGroupRange); this.processingTimeTimersQueue = checkNotNull(processingTimeTimersQueue); this.eventTimeTimersQueue = checkNotNull(eventTimeTimersQueue); // find the starting index of the local key-group range int startIdx = Integer.MAX_VALUE; for (Integer keyGroupIdx : localKeyGroupRange) { startIdx = Math.min(keyGroupIdx, startIdx); } this.localKeyGroupRangeStartIdx = startIdx; } public void startTimerService( TypeSerializer<K> keySerializer, TypeSerializer<N> namespaceSerializer, Triggerable<K, N> triggerTarget) { if (!isInitialized) { if (keySerializer == null || namespaceSerializer == null) { throw new IllegalArgumentException(“The TimersService serializers cannot be null.”); } if (this.keySerializer != null || this.namespaceSerializer != null || this.triggerTarget != null) { throw new IllegalStateException(“The TimerService has already been initialized.”); } // the following is the case where we restore if (restoredTimersSnapshot != null) { CompatibilityResult<K> keySerializerCompatibility = CompatibilityUtil.resolveCompatibilityResult( this.keyDeserializer, null, restoredTimersSnapshot.getKeySerializerConfigSnapshot(), keySerializer); CompatibilityResult<N> namespaceSerializerCompatibility = CompatibilityUtil.resolveCompatibilityResult( this.namespaceDeserializer, null, restoredTimersSnapshot.getNamespaceSerializerConfigSnapshot(), namespaceSerializer); if (keySerializerCompatibility.isRequiresMigration() || namespaceSerializerCompatibility.isRequiresMigration()) { throw new IllegalStateException(“Tried to initialize restored TimerService " + “with incompatible serializers than those used to snapshot its state.”); } } this.keySerializer = keySerializer; this.namespaceSerializer = namespaceSerializer; this.keyDeserializer = null; this.namespaceDeserializer = null; this.triggerTarget = Preconditions.checkNotNull(triggerTarget); // re-register the restored timers (if any) final InternalTimer<K, N> headTimer = processingTimeTimersQueue.peek(); if (headTimer != null) { nextTimer = processingTimeService.registerTimer(headTimer.getTimestamp(), this); } this.isInitialized = true; } else { if (!(this.keySerializer.equals(keySerializer) && this.namespaceSerializer.equals(namespaceSerializer))) { throw new IllegalArgumentException(“Already initialized Timer Service " + “tried to be initialized with different key and namespace serializers.”); } } } @Override public long currentProcessingTime() { return processingTimeService.getCurrentProcessingTime(); } @Override public long currentWatermark() { return currentWatermark; } @Override public void registerProcessingTimeTimer(N namespace, long time) { InternalTimer<K, N> oldHead = processingTimeTimersQueue.peek(); if (processingTimeTimersQueue.add(new TimerHeapInternalTimer<>(time, (K) 
keyContext.getCurrentKey(), namespace))) { long nextTriggerTime = oldHead != null ? oldHead.getTimestamp() : Long.MAX_VALUE; // check if we need to re-schedule our timer to earlier if (time < nextTriggerTime) { if (nextTimer != null) { nextTimer.cancel(false); } nextTimer = processingTimeService.registerTimer(time, this); } } } @Override public void registerEventTimeTimer(N namespace, long time) { eventTimeTimersQueue.add(new TimerHeapInternalTimer<>(time, (K) keyContext.getCurrentKey(), namespace)); } @Override public void deleteProcessingTimeTimer(N namespace, long time) { processingTimeTimersQueue.remove(new TimerHeapInternalTimer<>(time, (K) keyContext.getCurrentKey(), namespace)); } @Override public void deleteEventTimeTimer(N namespace, long time) { eventTimeTimersQueue.remove(new TimerHeapInternalTimer<>(time, (K) keyContext.getCurrentKey(), namespace)); } @Override public void onProcessingTime(long time) throws Exception { // null out the timer in case the Triggerable calls registerProcessingTimeTimer() // inside the callback. nextTimer = null; InternalTimer<K, N> timer; while ((timer = processingTimeTimersQueue.peek()) != null && timer.getTimestamp() <= time) { processingTimeTimersQueue.poll(); keyContext.setCurrentKey(timer.getKey()); triggerTarget.onProcessingTime(timer); } if (timer != null && nextTimer == null) { nextTimer = processingTimeService.registerTimer(timer.getTimestamp(), this); } } public void advanceWatermark(long time) throws Exception { currentWatermark = time; InternalTimer<K, N> timer; while ((timer = eventTimeTimersQueue.peek()) != null && timer.getTimestamp() <= time) { eventTimeTimersQueue.poll(); keyContext.setCurrentKey(timer.getKey()); triggerTarget.onEventTime(timer); } } public InternalTimersSnapshot<K, N> snapshotTimersForKeyGroup(int keyGroupIdx) { return new InternalTimersSnapshot<>( keySerializer, keySerializer.snapshotConfiguration(), namespaceSerializer, namespaceSerializer.snapshotConfiguration(), eventTimeTimersQueue.getSubsetForKeyGroup(keyGroupIdx), processingTimeTimersQueue.getSubsetForKeyGroup(keyGroupIdx)); } @SuppressWarnings(“unchecked”) public void restoreTimersForKeyGroup(InternalTimersSnapshot<?, ?> restoredSnapshot, int keyGroupIdx) { this.restoredTimersSnapshot = (InternalTimersSnapshot<K, N>) restoredSnapshot; if (areSnapshotSerializersIncompatible(restoredSnapshot)) { throw new IllegalArgumentException(“Tried to restore timers " + “for the same service with different serializers.”); } this.keyDeserializer = restoredTimersSnapshot.getKeySerializer(); this.namespaceDeserializer = restoredTimersSnapshot.getNamespaceSerializer(); checkArgument(localKeyGroupRange.contains(keyGroupIdx), “Key Group " + keyGroupIdx + " does not belong to the local range.”); // restore the event time timers eventTimeTimersQueue.addAll(restoredTimersSnapshot.getEventTimeTimers()); // restore the processing time timers processingTimeTimersQueue.addAll(restoredTimersSnapshot.getProcessingTimeTimers()); } @VisibleForTesting public int numProcessingTimeTimers() { return this.processingTimeTimersQueue.size(); } @VisibleForTesting public int numEventTimeTimers() { return this.eventTimeTimersQueue.size(); } @VisibleForTesting public int numProcessingTimeTimers(N namespace) { return countTimersInNamespaceInternal(namespace, processingTimeTimersQueue); } @VisibleForTesting public int numEventTimeTimers(N namespace) { return countTimersInNamespaceInternal(namespace, eventTimeTimersQueue); } private int countTimersInNamespaceInternal(N namespace, 
InternalPriorityQueue<TimerHeapInternalTimer<K, N>> queue) { int count = 0; try (final CloseableIterator<TimerHeapInternalTimer<K, N>> iterator = queue.iterator()) { while (iterator.hasNext()) { final TimerHeapInternalTimer<K, N> timer = iterator.next(); if (timer.getNamespace().equals(namespace)) { count++; } } } catch (Exception e) { throw new FlinkRuntimeException(“Exception when closing iterator.”, e); } return count; } @VisibleForTesting int getLocalKeyGroupRangeStartIdx() { return this.localKeyGroupRangeStartIdx; } @VisibleForTesting List<Set<TimerHeapInternalTimer<K, N>>> getEventTimeTimersPerKeyGroup() { return partitionElementsByKeyGroup(eventTimeTimersQueue); } @VisibleForTesting List<Set<TimerHeapInternalTimer<K, N>>> getProcessingTimeTimersPerKeyGroup() { return partitionElementsByKeyGroup(processingTimeTimersQueue); } private <T> List<Set<T>> partitionElementsByKeyGroup(KeyGroupedInternalPriorityQueue<T> keyGroupedQueue) { List<Set<T>> result = new ArrayList<>(localKeyGroupRange.getNumberOfKeyGroups()); for (int keyGroup : localKeyGroupRange) { result.add(Collections.unmodifiableSet(keyGroupedQueue.getSubsetForKeyGroup(keyGroup))); } return result; } private boolean areSnapshotSerializersIncompatible(InternalTimersSnapshot<?, ?> restoredSnapshot) { return (this.keyDeserializer != null && !this.keyDeserializer.equals(restoredSnapshot.getKeySerializer())) || (this.namespaceDeserializer != null && !this.namespaceDeserializer.equals(restoredSnapshot.getNamespaceSerializer())); }}InternalTimerServiceImpl实现了InternalTimerService及ProcessingTimeCallback(定义了onProcessingTime方法)接口startTimerService方法主要是初始化keySerializer、namespaceSerializer、triggerTarget属性;registerEventTimeTimer及deleteEventTimeTimer方法使用的是eventTimeTimersQueue;registerProcessingTimeTimer及deleteProcessingTimeTimer方法使用的是processingTimeTimersQueue(eventTimeTimersQueue及processingTimeTimersQueue的类型为KeyGroupedInternalPriorityQueue,queue的元素类型为TimerHeapInternalTimer)eventTimerTimer的触发主要是在advanceWatermark方法中(AbstractStreamOperator的processWatermark方法会调用InternalTimeServiceManager的advanceWatermark方法,而该方法调用的是InternalTimerServiceImpl的advanceWatermark方法),它会移除timestamp小于等于指定time的eventTimerTimer,然后回调triggerTarget.onEventTime方法;而processingTimeTimer的触发则是在onProcessingTime方法中(SystemProcessingTimeService的TriggerTask及RepeatedTriggerTask的定时任务会回调ProcessingTimeCallback的onProcessingTime方法),它会移除timestamp小于等于指定time的processingTimeTimer,然后回调triggerTarget.onProcessingTime方法Triggerableflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/operators/Triggerable.java@Internalpublic interface Triggerable<K, N> { /** * Invoked when an event-time timer fires. / void onEventTime(InternalTimer<K, N> timer) throws Exception; /* * Invoked when a processing-time timer fires. 
*/ void onProcessingTime(InternalTimer<K, N> timer) throws Exception;}Triggerable接口定义了InternalTimerService会调用的onEventTime及onProcessingTime方法;WindowOperator、IntervalJoinOperator、KeyedProcessOperator、KeyedCoProcessOperator等operator均实现了Triggerable接口,可以响应timer的onEventTime或onProcessingTime的回调小结TimerService接口定义了currentProcessingTime、currentWatermark、registerProcessingTimeTimer、registerEventTimeTimer、deleteProcessingTimeTimer、deleteEventTimeTimer接口;它有一个实现类为SimpleTimerService,而SimpleTimerService主要是委托给InternalTimerService来实现InternalTimerService是TimerService的internal版本的接口,比起TimerService它定义了namespace,在registerProcessingTimeTimer、deleteProcessingTimeTimer、registerEventTimeTimer、deleteEventTimeTimer的方法中均多了一个namesapce的参数;它的实现类为InternalTimerServiceImpl;InternalTimerServiceImpl实现了InternalTimerService及ProcessingTimeCallback(定义了onProcessingTime方法)接口,其registerEventTimeTimer及deleteEventTimeTimer方法使用的是eventTimeTimersQueue;registerProcessingTimeTimer及deleteProcessingTimeTimer方法使用的是processingTimeTimersQueue(eventTimeTimersQueue及processingTimeTimersQueue的类型为KeyGroupedInternalPriorityQueue,queue的元素类型为TimerHeapInternalTimer)InternalTimerServiceImpl的eventTimerTimer的触发主要是在advanceWatermark方法中(AbstractStreamOperator的processWatermark方法会调用InternalTimeServiceManager的advanceWatermark方法,而该方法调用的是InternalTimerServiceImpl的advanceWatermark方法),它会移除timestamp小于等于指定time的eventTimerTimer,然后回调triggerTarget.onEventTime方法InternalTimerServiceImpl的processingTimeTimer的触发则是在onProcessingTime方法中(SystemProcessingTimeService的TriggerTask及RepeatedTriggerTask的定时任务会回调ProcessingTimeCallback的onProcessingTime方法),它会移除timestamp小于等于指定time的processingTimeTimer,然后回调triggerTarget.onProcessingTime方法Triggerable接口定义了InternalTimerService会调用的onEventTime及onProcessingTime方法;WindowOperator、IntervalJoinOperator、KeyedProcessOperator、KeyedCoProcessOperator等operator均实现了Triggerable接口,可以响应timer的onEventTime或onProcessingTime的回调docTimerService ...
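作为 TimerService 的一个使用示意,下面这个去抖(debounce)例子同时用到了 registerProcessingTimeTimer 和 deleteProcessingTimeTimer(DebounceFunction、10 秒间隔等均为自拟的假设):同一个 key 的元素若持续到来,就不断删掉旧 timer 并注册新 timer,只有安静满 10 秒后 onTimer 才会输出;由于注册/删除 timer 只支持 keyed stream,该 function 需要作用在 keyBy 之后的流上:

import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;

public class DebounceFunction extends ProcessFunction<Tuple2<String, String>, String> {

    // 记录当前 key 最近一次注册的 timer 时间戳,便于后续删除
    private ValueState<Long> registeredTimer;

    @Override
    public void open(Configuration parameters) {
        registeredTimer = getRuntimeContext().getState(
            new ValueStateDescriptor<>("registered-timer", Long.class));
    }

    @Override
    public void processElement(Tuple2<String, String> value, Context ctx, Collector<String> out) throws Exception {
        Long previous = registeredTimer.value();
        if (previous != null) {
            // 删除上一次注册的 processing-time timer
            ctx.timerService().deleteProcessingTimeTimer(previous);
        }
        // 以当前 processing time 为基准,重新注册 10 秒后的 timer
        long newTimer = ctx.timerService().currentProcessingTime() + 10_000L;
        ctx.timerService().registerProcessingTimeTimer(newTimer);
        registeredTimer.update(newTimer);
    }

    @Override
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
        registeredTimer.clear();
        out.collect("key 已经安静了 10 秒,触发时间戳:" + timestamp);
    }
}

使用方式类似 stream.keyBy(0).process(new DebounceFunction());若改用 event time,把 register/delete 换成对应的 EventTimeTimer 方法并结合 ctx.timestamp() 即可。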


《从0到1学习Flink》—— Flink 写入数据到 Kafka

前言之前文章 《从0到1学习Flink》—— Flink 写入数据到 ElasticSearch 写了如何将 Kafka 中的数据存储到 ElasticSearch 中,里面其实就已经用到了 Flink 自带的 Kafka source connector(FlinkKafkaConsumer)。存入到 ES 只是其中一种情况,那么如果我们有多个地方需要这份通过 Flink 转换后的数据,是不是又要我们继续写个 sink 的插件呢?确实,所以 Flink 里面就默认支持了不少 sink,比如也支持 Kafka sink connector(FlinkKafkaProducer),那么这篇文章我们就讲讲如何将数据写入到 Kafka。准备添加依赖Flink 里面支持 Kafka 0.8、0.9、0.10、0.11 ,以后有时间可以分析下源码的实现。这里我们需要安装下 Kafka,请对应添加对应的 Flink Kafka connector 依赖的版本,这里我们使用的是 0.11 版本:<dependency> <groupId>org.apache.flink</groupId> <artifactId>flink-connector-kafka-0.11_2.11</artifactId> <version>${flink.version}</version></dependency>Kafka 安装这里就不写这块内容了,可以参考我以前的文章 Kafka 安装及快速入门。这里我们演示把其他 Kafka 集群中 topic 数据原样写入到自己本地起的 Kafka 中去。配置文件kafka.brokers=xxx:9092,xxx:9092,xxx:9092kafka.group.id=metrics-group-testkafka.zookeeper.connect=xxx:2181metrics.topic=xxxstream.parallelism=5kafka.sink.brokers=localhost:9092kafka.sink.topic=metric-teststream.checkpoint.interval=1000stream.checkpoint.enable=falsestream.sink.parallelism=5目前我们先看下本地 Kafka 是否有这个 metric-test topic 呢?需要执行下这个命令:bin/kafka-topics.sh –list –zookeeper localhost:2181可以看到本地的 Kafka 是没有任何 topic 的,如果等下我们的程序运行起来后,再次执行这个命令出现 metric-test topic,那么证明我的程序确实起作用了,已经将其他集群的 Kafka 数据写入到本地 Kafka 了。程序代码Main.javapublic class Main { public static void main(String[] args) throws Exception{ final ParameterTool parameterTool = ExecutionEnvUtil.createParameterTool(args); StreamExecutionEnvironment env = ExecutionEnvUtil.prepare(parameterTool); DataStreamSource<Metrics> data = KafkaConfigUtil.buildSource(env); data.addSink(new FlinkKafkaProducer011<Metrics>( parameterTool.get(“kafka.sink.brokers”), parameterTool.get(“kafka.sink.topic”), new MetricSchema() )).name(“flink-connectors-kafka”) .setParallelism(parameterTool.getInt(“stream.sink.parallelism”)); env.execute(“flink learning connectors kafka”); }}运行结果启动程序,查看运行结果,不段执行上面命令,查看是否有新的 topic 出来:执行命令可以查看该 topic 的信息:bin/kafka-topics.sh –describe –zookeeper localhost:2181 –topic metric-test分析上面代码我们使用 Flink Kafka Producer 只传了三个参数:brokerList、topicId、serializationSchema(序列化)其实也可以传入多个参数进去,现在有的参数用的是默认参数,因为这个内容比较多,后面可以抽出一篇文章单独来讲。总结本篇文章写了 Flink 读取其他 Kafka 集群的数据,然后写入到本地的 Kafka 上。我在 Flink 这层没做什么数据转换,只是原样的将数据转发了下,如果你们有什么其他的需求,是可以在 Flink 这层将数据进行各种转换操作,比如这篇文章中的一些转换:《从0到1学习Flink》—— Flink Data transformation(转换),然后将转换后的数据发到 Kafka 上去。本文原创地址是: http://www.54tianzhisheng.cn/2019/01/06/Flink-Kafka-sink/ , 未经允许禁止转载。关注我微信公众号:zhisheng另外我自己整理了些 Flink 的学习资料,目前已经全部放到微信公众号了。你可以加我的微信:zhisheng_tian,然后回复关键字:Flink 即可无条件获取到。Github 代码仓库https://github.com/zhisheng17/flink-learning/以后这个项目的所有代码都将放在这个仓库里,包含了自己学习 flink 的一些 demo 和博客相关文章1、《从0到1学习Flink》—— Apache Flink 介绍2、《从0到1学习Flink》—— Mac 上搭建 Flink 1.6.0 环境并构建运行简单程序入门3、《从0到1学习Flink》—— Flink 配置文件详解4、《从0到1学习Flink》—— Data Source 介绍5、《从0到1学习Flink》—— 如何自定义 Data Source ?6、《从0到1学习Flink》—— Data Sink 介绍7、《从0到1学习Flink》—— 如何自定义 Data Sink ?8、《从0到1学习Flink》—— Flink Data transformation(转换)9、《从0到1学习Flink》—— 介绍Flink中的Stream Windows10、《从0到1学习Flink》—— Flink 中的几种 Time 详解11、《从0到1学习Flink》—— Flink 写入数据到 ElasticSearch12、《从0到1学习Flink》—— Flink 项目如何运行?13、《从0到1学习Flink》—— Flink 写入数据到 Kafka ...
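关于上面提到的『其实也可以传入多个参数进去』,下面给出一个带 Properties 和 Semantic 语义参数的示意写法(broker 地址、topic 沿用上文例子,其余为假设;若选择 EXACTLY_ONCE,还需要开启 checkpoint 并关注 Kafka 事务超时等配置):

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer011;
import org.apache.flink.streaming.util.serialization.KeyedSerializationSchemaWrapper;

import java.util.Properties;

public class KafkaSinkDemo {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStream<String> data = env.fromElements("metric-1", "metric-2");

        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "localhost:9092");
        // Kafka producer 的原生参数(retries、batch.size 等)也可以一并放进来

        FlinkKafkaProducer011<String> producer = new FlinkKafkaProducer011<>(
            "metric-test",                                                   // 默认写入的 topic
            new KeyedSerializationSchemaWrapper<>(new SimpleStringSchema()), // 序列化
            props,
            FlinkKafkaProducer011.Semantic.AT_LEAST_ONCE);                   // 语义,也可选 EXACTLY_ONCE

        // 可选:把 Flink 记录上的时间戳一并写入 Kafka
        producer.setWriteTimestampToKafka(true);

        data.addSink(producer).name("kafka-sink").setParallelism(1);
        env.execute("kafka sink demo");
    }
}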


聊聊flink的ProcessFunction

序本文主要研究一下flink的ProcessFunction实例import org.apache.flink.api.common.state.ValueState;import org.apache.flink.api.common.state.ValueStateDescriptor;import org.apache.flink.api.java.tuple.Tuple2;import org.apache.flink.configuration.Configuration;import org.apache.flink.streaming.api.functions.ProcessFunction;import org.apache.flink.streaming.api.functions.ProcessFunction.Context;import org.apache.flink.streaming.api.functions.ProcessFunction.OnTimerContext;import org.apache.flink.util.Collector;// the source data streamDataStream<Tuple2<String, String>> stream = …;// apply the process function onto a keyed streamDataStream<Tuple2<String, Long>> result = stream .keyBy(0) .process(new CountWithTimeoutFunction());/** * The data type stored in the state /public class CountWithTimestamp { public String key; public long count; public long lastModified;}/* * The implementation of the ProcessFunction that maintains the count and timeouts /public class CountWithTimeoutFunction extends ProcessFunction<Tuple2<String, String>, Tuple2<String, Long>> { /* The state that is maintained by this process function */ private ValueState<CountWithTimestamp> state; @Override public void open(Configuration parameters) throws Exception { state = getRuntimeContext().getState(new ValueStateDescriptor<>(“myState”, CountWithTimestamp.class)); } @Override public void processElement(Tuple2<String, String> value, Context ctx, Collector<Tuple2<String, Long>> out) throws Exception { // retrieve the current count CountWithTimestamp current = state.value(); if (current == null) { current = new CountWithTimestamp(); current.key = value.f0; } // update the state’s count current.count++; // set the state’s timestamp to the record’s assigned event time timestamp current.lastModified = ctx.timestamp(); // write the state back state.update(current); // schedule the next timer 60 seconds from the current event time ctx.timerService().registerEventTimeTimer(current.lastModified + 60000); } @Override public void onTimer(long timestamp, OnTimerContext ctx, Collector<Tuple2<String, Long>> out) throws Exception { // get the state for the key that scheduled the timer CountWithTimestamp result = state.value(); // check if this is an outdated timer or the latest timer if (timestamp == result.lastModified + 60000) { // emit the state on timeout out.collect(new Tuple2<String, Long>(result.key, result.count)); } }}本实例展示了如何在ProcessFunction里头使用keyed state以及timer;process方法使用的ProcessFunction是CountWithTimeoutFunctionCountWithTimeoutFunction的open方法创建了CountWithTimestamp类型的ValueState;processElement方法里头会更新该ValueState,用于记录每个key的element个数以及最后访问时间,然后注册一个EventTimeTimer,在当前eventTime时间的60秒后到达onTimer用于响应timer,它会判断如果该key在60秒内没有被update,则emit相关数据ProcessFunctionflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/functions/ProcessFunction.java@PublicEvolvingpublic abstract class ProcessFunction<I, O> extends AbstractRichFunction { private static final long serialVersionUID = 1L; public abstract void processElement(I value, Context ctx, Collector<O> out) throws Exception; public void onTimer(long timestamp, OnTimerContext ctx, Collector<O> out) throws Exception {} public abstract class Context { public abstract Long timestamp(); public abstract TimerService timerService(); public abstract <X> void output(OutputTag<X> outputTag, X value); } public abstract class OnTimerContext extends Context { public abstract TimeDomain timeDomain(); }}ProcessFunction继承了AbstractRichFunction(可以通过RuntimeContext获取keyed 
state),它定义了抽象方法processElement以及抽象类Context、OnTimerContext

Context里头有三个抽象方法,分别是timestamp、timerService、output;OnTimerContext继承了Context,它定义了timeDomain方法

ProcessFunction还定义了onTimer方法,用于响应TimerService触发的timer

小结

ProcessFunction是low-level的stream处理操作,它相当于可以访问keyed state及timer的FlatMapFunction,当要使用keyed state或者timer的时候,可以使用ProcessFunction

ProcessFunction继承了AbstractRichFunction(可以通过RuntimeContext获取keyed state),它定义了抽象方法processElement以及抽象类Context、OnTimerContext

Context里头有三个抽象方法,分别是timestamp、timerService、output;OnTimerContext继承了Context,它定义了timeDomain方法;ProcessFunction还定义了onTimer方法,用于响应TimerService触发的timer

doc

Process Function (Low-level Operations) ...
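在上面 ProcessFunction 的基础上,这里再补充一个使用 Context.output 做侧输出(side output)的最小示意(SideOutputDemo、REJECTED 等命名为假设):processElement 里把符合条件的数据发往主流,不符合条件的数据发往 OutputTag 对应的侧输出流,最后通过 getSideOutput 取回:

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;

public class SideOutputDemo {

    // OutputTag 需要用匿名子类的方式创建,以保留泛型信息
    private static final OutputTag<String> REJECTED = new OutputTag<String>("rejected") {};

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStream<Tuple2<String, Integer>> input =
            env.fromElements(Tuple2.of("a", 1), Tuple2.of("b", -2), Tuple2.of("c", 3));

        SingleOutputStreamOperator<Tuple2<String, Integer>> mainStream = input
            .process(new ProcessFunction<Tuple2<String, Integer>, Tuple2<String, Integer>>() {
                @Override
                public void processElement(Tuple2<String, Integer> value, Context ctx,
                                           Collector<Tuple2<String, Integer>> out) {
                    if (value.f1 >= 0) {
                        out.collect(value);                         // 主流
                    } else {
                        ctx.output(REJECTED, "negative: " + value); // 侧输出
                    }
                }
            });

        mainStream.print();                          // 主流结果
        mainStream.getSideOutput(REJECTED).print();  // 侧输出结果
        env.execute("side output demo");
    }
}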


《从0到1学习Flink》—— Flink 项目如何运行?

前言之前写了不少 Flink 文章了,也有不少 demo,但是文章写的时候都是在本地直接运行 Main 类的 main 方法,其实 Flink 是支持在 UI 上上传 Flink Job 的 jar 包,然后运行得。最开始在第一篇 《从0到1学习Flink》—— Mac 上搭建 Flink 1.6.0 环境并构建运行简单程序入门 中其实提到过了 Flink 自带的 UI 界面,今天我们就来看看如何将我们的项目打包在这里发布运行。准备编译打包项目代码就拿我之前的文章 《从0到1学习Flink》—— Flink 写入数据到 ElasticSearch 吧,代码地址是在 GitHub 仓库地址:https://github.com/zhisheng17/flink-learning/tree/master/flink-learning-connectors/flink-learning-connectors-es6 ,如果感兴趣的可以直接拿来打包试试水。我们在整个项目 (flink-learning)pom.xml 所在文件夹执行以下命令打包:mvn clean install然后你会发现在 flink-learning-connectors-es6 的 target 目录下有 flink-learning-connectors-es6-1.0-SNAPSHOT.jar 。启动 ES注意你的 Kafka 数据源和 ES 都已经启动好了, 清空了下 ES 目录下的 data 数据,为了就是查看是不是真的有数据存入进来了。提交 jar 包将此文件提交到 Flinkserver 上,如下图:点击下图红框中的"Upload"按钮:如下图,选中刚刚上传的文件,填写类名,再点击"Submit"按钮即可启动 Job:查看运行结果如下图,在 Overview 页面可见正在运行的任务:你可以看到 Task Manager 中关于任务的 metric 数据、日志信息以及 Stdout 信息。查看 Kibana ,此时 ES 中已经有数据了:我们可以在 flink ui 界面上的 overview cancel 这个 job,那么可以看到 job 的日志:总结本篇文章写了下如何将我们的 job 编译打包并提交到 Flink 自带到 Server UI 上面去运行,也算是对前面文章的一个补充,当然了,Flink job 不仅支持这种模式的运行,它还可以运行在 K8s,Mesos,等上面,等以后我接触到再写写。本文原创地址是: http://www.54tianzhisheng.cn/2019/01/05/Flink-run/ , 未经允许禁止转载。关注我微信公众号:zhisheng另外我自己整理了些 Flink 的学习资料,目前已经全部放到微信公众号了。你可以加我的微信:zhisheng_tian,然后回复关键字:Flink 即可无条件获取到。Github 代码仓库https://github.com/zhisheng17/flink-learning/以后这个项目的所有代码都将放在这个仓库里,包含了自己学习 flink 的一些 demo 和博客相关文章1、《从0到1学习Flink》—— Apache Flink 介绍2、《从0到1学习Flink》—— Mac 上搭建 Flink 1.6.0 环境并构建运行简单程序入门3、《从0到1学习Flink》—— Flink 配置文件详解4、《从0到1学习Flink》—— Data Source 介绍5、《从0到1学习Flink》—— 如何自定义 Data Source ?6、《从0到1学习Flink》—— Data Sink 介绍7、《从0到1学习Flink》—— 如何自定义 Data Sink ?8、《从0到1学习Flink》—— Flink Data transformation(转换)9、《从0到1学习Flink》—— 介绍Flink中的Stream Windows10、《从0到1学习Flink》—— Flink 中的几种 Time 详解11、《从0到1学习Flink》—— Flink 写入数据到 ElasticSearch12、《从0到1学习Flink》—— Flink 项目如何运行?13、《从0到1学习Flink》—— Flink 写入数据到 Kafka ...


聊聊flink DataStream的iterate操作

序本文主要研究一下flink DataStream的iterate操作实例IterativeStream<Long> iteration = initialStream.iterate();DataStream<Long> iterationBody = iteration.map (/do something/);DataStream<Long> feedback = iterationBody.filter(new FilterFunction<Long>(){ @Override public boolean filter(Long value) throws Exception { return value > 0; }});iteration.closeWith(feedback);DataStream<Long> output = iterationBody.filter(new FilterFunction<Long>(){ @Override public boolean filter(Long value) throws Exception { return value <= 0; }});本实例展示了IterativeStream的一些基本用法,使用iterate创建IterativeStream,使用IterativeStream的closeWith方法来关闭feedbackStreamDataStream.iterateflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/DataStream.java@Publicpublic class DataStream<T> { //…… @PublicEvolving public IterativeStream<T> iterate() { return new IterativeStream<>(this, 0); } @PublicEvolving public IterativeStream<T> iterate(long maxWaitTimeMillis) { return new IterativeStream<>(this, maxWaitTimeMillis); } //……}DataStream提供了两个iterate方法,它们创建并返回IterativeStream,无参的iterate方法其maxWaitTimeMillis为0IterativeStreamflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/IterativeStream.java@PublicEvolvingpublic class IterativeStream<T> extends SingleOutputStreamOperator<T> { // We store these so that we can create a co-iteration if we need to private DataStream<T> originalInput; private long maxWaitTime; protected IterativeStream(DataStream<T> dataStream, long maxWaitTime) { super(dataStream.getExecutionEnvironment(), new FeedbackTransformation<>(dataStream.getTransformation(), maxWaitTime)); this.originalInput = dataStream; this.maxWaitTime = maxWaitTime; setBufferTimeout(dataStream.environment.getBufferTimeout()); } @SuppressWarnings({ “unchecked”, “rawtypes” }) public DataStream<T> closeWith(DataStream<T> feedbackStream) { Collection<StreamTransformation<?>> predecessors = feedbackStream.getTransformation().getTransitivePredecessors(); if (!predecessors.contains(this.transformation)) { throw new UnsupportedOperationException( “Cannot close an iteration with a feedback DataStream that does not originate from said iteration.”); } ((FeedbackTransformation) getTransformation()).addFeedbackEdge(feedbackStream.getTransformation()); return feedbackStream; } public <F> ConnectedIterativeStreams<T, F> withFeedbackType(Class<F> feedbackTypeClass) { return withFeedbackType(TypeInformation.of(feedbackTypeClass)); } public <F> ConnectedIterativeStreams<T, F> withFeedbackType(TypeHint<F> feedbackTypeHint) { return withFeedbackType(TypeInformation.of(feedbackTypeHint)); } public <F> ConnectedIterativeStreams<T, F> withFeedbackType(TypeInformation<F> feedbackType) { return new ConnectedIterativeStreams<>(originalInput, feedbackType, maxWaitTime); } @Public public static class ConnectedIterativeStreams<I, F> extends ConnectedStreams<I, F> { private CoFeedbackTransformation<F> coFeedbackTransformation; public ConnectedIterativeStreams(DataStream<I> input, TypeInformation<F> feedbackType, long waitTime) { super(input.getExecutionEnvironment(), input, new DataStream<>(input.getExecutionEnvironment(), new CoFeedbackTransformation<>(input.getParallelism(), feedbackType, waitTime))); this.coFeedbackTransformation = (CoFeedbackTransformation<F>) getSecondInput().getTransformation(); } public DataStream<F> closeWith(DataStream<F> feedbackStream) { Collection<StreamTransformation<?>> predecessors = feedbackStream.getTransformation().getTransitivePredecessors(); if 
(!predecessors.contains(this.coFeedbackTransformation)) { throw new UnsupportedOperationException( “Cannot close an iteration with a feedback DataStream that does not originate from said iteration.”); } coFeedbackTransformation.addFeedbackEdge(feedbackStream.getTransformation()); return feedbackStream; } private UnsupportedOperationException groupingException = new UnsupportedOperationException(“Cannot change the input partitioning of an” + “iteration head directly. Apply the partitioning on the input and” + “feedback streams instead.”); @Override public ConnectedStreams<I, F> keyBy(int[] keyPositions1, int[] keyPositions2) { throw groupingException; } @Override public ConnectedStreams<I, F> keyBy(String field1, String field2) { throw groupingException; } @Override public ConnectedStreams<I, F> keyBy(String[] fields1, String[] fields2) { throw groupingException; } @Override public ConnectedStreams<I, F> keyBy(KeySelector<I, ?> keySelector1, KeySelector<F, ?> keySelector2) { throw groupingException; } @Override public <KEY> ConnectedStreams<I, F> keyBy(KeySelector<I, KEY> keySelector1, KeySelector<F, KEY> keySelector2, TypeInformation<KEY> keyType) { throw groupingException; } }}IterativeStream继承了SingleOutputStreamOperator,它的构造器接收两个参数,一个是originalInput,一个是maxWaitTime;它根据dataStream.getTransformation()及maxWaitTime创建FeedbackTransformation;构造器同时会根据dataStream.environment.getBufferTimeout()参数来设置transformation的bufferTimeoutIterativeStream主要提供了两个方法,一个是closeWith方法,用于close iteration,它主要用于定义要被feedback到iteration头部的这部分iteration(可以理解为回流,或者类似递归的操作,filter控制的是递归的条件,通过filter的elements会重新进入IterativeStream的头部继续参与后面的运算操作);withFeedbackType方法创建了ConnectedIterativeStreamsConnectedIterativeStreams继承了ConnectedStreams,它允许要被feedback的iteration的类型与originalInput的类型不一样,它也定义了closeWith方法,但是它覆盖了ConnectedStreams的keyBy方法,抛出UnsupportedOperationException异常FeedbackTransformationflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/transformations/FeedbackTransformation.java@Internalpublic class FeedbackTransformation<T> extends StreamTransformation<T> { private final StreamTransformation<T> input; private final List<StreamTransformation<T>> feedbackEdges; private final Long waitTime; public FeedbackTransformation(StreamTransformation<T> input, Long waitTime) { super(“Feedback”, input.getOutputType(), input.getParallelism()); this.input = input; this.waitTime = waitTime; this.feedbackEdges = Lists.newArrayList(); } public StreamTransformation<T> getInput() { return input; } public void addFeedbackEdge(StreamTransformation<T> transform) { if (transform.getParallelism() != this.getParallelism()) { throw new UnsupportedOperationException( “Parallelism of the feedback stream must match the parallelism of the original” + " stream. Parallelism of original stream: " + this.getParallelism() + “; parallelism of feedback stream: " + transform.getParallelism() + “. 
Parallelism can be modified using DataStream#setParallelism() method”); } feedbackEdges.add(transform); } public List<StreamTransformation<T>> getFeedbackEdges() { return feedbackEdges; } public Long getWaitTime() { return waitTime; } @Override public final void setChainingStrategy(ChainingStrategy strategy) { throw new UnsupportedOperationException(“Cannot set chaining strategy on Split Transformation.”); } @Override public Collection<StreamTransformation<?>> getTransitivePredecessors() { List<StreamTransformation<?>> result = Lists.newArrayList(); result.add(this); result.addAll(input.getTransitivePredecessors()); return result; }}FeedbackTransformation继承了StreamTransformation,它有feedbackEdges、waitTime等属性addFeedbackEdge方法用于添加一个a feedback edge,IterativeStream的closeWith方法会调用addFeedbackEdge来添加一个StreamTransformationwaitTime指定的是feedback operator等待feedback elements的时间,一旦过了waitTime则operation会关闭,不再接受新的feedback elements小结DataStream提供了两个iterate方法,它们创建并返回IterativeStream,无参的iterate方法其maxWaitTimeMillis为0IterativeStream的构造器接收两个参数,一个是originalInput,一个是maxWaitTime;它根据dataStream.getTransformation()及maxWaitTime创建FeedbackTransformation;构造器同时会根据dataStream.environment.getBufferTimeout()参数来设置transformation的bufferTimeout;FeedbackTransformation继承了StreamTransformation,它有feedbackEdges、waitTime等属性,waitTime指定的是feedback operator等待feedback elements的时间,一旦过了waitTime则operation会关闭,不再接受新的feedback elementsIterativeStream继承了SingleOutputStreamOperator,它主要提供了两个方法,一个是closeWith方法,用于close iteration,它主要用于定义要被feedback到iteration头部的这部分iteration;withFeedbackType方法创建了ConnectedIterativeStreams,ConnectedIterativeStreams继承了ConnectedStreams,它允许要被feedback的iteration的类型与originalInput的类型不一样,它也定义了closeWith方法,但是它覆盖了ConnectedStreams的keyBy方法,抛出UnsupportedOperationException异常docDataStream Transformations ...
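补充一个可以直接在本地运行的完整示意例子(示意代码,类名 IterateDemo、具体数值均为演示假设,基于上文分析的 flink 1.7 DataStream API):每个元素在 iteration body 中不断减 1,大于 0 的元素通过 closeWith 指定的 feedback 流回到迭代头部继续计算,小于等于 0 的元素退出迭代被输出;这里给 iterate 传入 5000ms 的 maxWaitTime,使有限流在 feedback 不再产生新数据后能够正常结束。

import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.IterativeStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class IterateDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // 初始输入流:0 到 5 的 Long 序列
        DataStream<Long> initialStream = env.generateSequence(0, 5);

        // 创建 IterativeStream,maxWaitTime 设为 5000ms,避免 feedback 流长时间无数据时作业一直等待
        IterativeStream<Long> iteration = initialStream.iterate(5000);

        // iteration body:每次迭代减 1
        DataStream<Long> iterationBody = iteration.map(new MapFunction<Long, Long>() {
            @Override
            public Long map(Long value) {
                return value - 1;
            }
        });

        // 大于 0 的元素作为 feedback,回流到 iteration 头部继续迭代
        DataStream<Long> feedback = iterationBody.filter(new FilterFunction<Long>() {
            @Override
            public boolean filter(Long value) {
                return value > 0;
            }
        });
        iteration.closeWith(feedback);

        // 小于等于 0 的元素退出迭代,作为最终输出
        iterationBody.filter(new FilterFunction<Long>() {
            @Override
            public boolean filter(Long value) {
                return value <= 0;
            }
        }).print();

        env.execute("iterate demo");
    }
}

注意上文 FeedbackTransformation.addFeedbackEdge 的检查:feedback 流的并行度必须与迭代头(即原始输入)的并行度一致,否则会抛出 UnsupportedOperationException。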

January 15, 2019 · 3 min · jiezi

《从0到1学习Flink》—— 介绍Flink中的Stream Windows

前言目前有许多数据分析的场景从批处理到流处理的演变, 虽然可以将批处理作为流处理的特殊情况来处理,但是分析无穷集的流数据通常需要思维方式的转变并且具有其自己的术语(例如,“windowing(窗口化)”、“at-least-once(至少一次)”、“exactly-once(只有一次)” )。对于刚刚接触流处理的人来说,这种转变和新术语可能会非常混乱。 Apache Flink 是一个为生产环境而生的流处理器,具有易于使用的 API,可以用于定义高级流分析程序。Flink 的 API 在数据流上具有非常灵活的窗口定义,使其在其他开源流处理框架中脱颖而出。在这篇文章中,我们将讨论用于流处理的窗口的概念,介绍 Flink 的内置窗口,并解释它对自定义窗口语义的支持。什么是 Windows?下面我们结合一个现实的例子来说明。就拿交通传感器的示例:统计经过某红绿灯的汽车数量之和? 假设在一个红绿灯处,我们每隔 15 秒统计一次通过此红绿灯的汽车数量,如下图:可以把汽车的经过看成一个流,无穷的流,不断有汽车经过此红绿灯,因此无法统计总共的汽车数量。但是,我们可以换一种思路,每隔 15 秒,我们都将与上一次的结果进行 sum 操作(滑动聚合),如下:这个结果似乎还是无法回答我们的问题,根本原因在于流是无界的,我们不能限制流,但可以在有一个有界的范围内处理无界的流数据。因此,我们需要换一个问题的提法:每分钟经过某红绿灯的汽车数量之和? 这个问题,就相当于一个定义了一个 Window(窗口),window 的界限是1分钟,且每分钟内的数据互不干扰,因此也可以称为翻滚(不重合)窗口,如下图: 第一分钟的数量为8,第二分钟是22,第三分钟是27。。。这样,1个小时内会有60个window。再考虑一种情况,每30秒统计一次过去1分钟的汽车数量之和: 此时,window 出现了重合。这样,1个小时内会有120个 window。扩展一下,我们可以在某个地区,收集每一个红绿灯处汽车经过的数量,然后每个红绿灯处都做一次基于1分钟的window统计,即并行处理: 它有什么作用?通常来讲,Window 就是用来对一个无限的流设置一个有限的集合,在有界的数据集上进行操作的一种机制。window 又可以分为基于时间(Time-based)的 window 以及基于数量(Count-based)的 window。Flink 自带的 windowFlink DataStream API 提供了 Time 和 Count 的 window,同时增加了基于 Session 的 window。同时,由于某些特殊的需要,DataStream API 也提供了定制化的 window 操作,供用户自定义 window。下面,主要介绍 Time-Based window 以及 Count-Based window,以及自定义的 window 操作,Session-Based Window 操作将会在后续的文章中讲到。Time Windows正如命名那样,Time Windows 根据时间来聚合流数据。例如:一分钟的 tumbling time window 收集一分钟的元素,并在一分钟过后对窗口中的所有元素应用于一个函数。在 Flink 中定义 tumbling time windows(翻滚时间窗口) 和 sliding time windows(滑动时间窗口) 非常简单:tumbling time windows(翻滚时间窗口)输入一个时间参数data.keyBy(1) .timeWindow(Time.minutes(1)) //tumbling time window 每分钟统计一次数量和 .sum(1);sliding time windows(滑动时间窗口)输入两个时间参数data.keyBy(1) .timeWindow(Time.minutes(1), Time.seconds(30)) //sliding time window 每隔 30s 统计过去一分钟的数量和 .sum(1);有一点我们还没有讨论,即“收集一分钟的元素”的确切含义,它可以归结为一个问题,“流处理器如何解释时间?”Apache Flink 具有三个不同的时间概念,即 processing time, event time 和 ingestion time。这里可以参考我下一篇文章:《从0到1学习Flink》—— 介绍Flink中的Event Time、Processing Time和Ingestion TimeCount WindowsApache Flink 还提供计数窗口功能。如果计数窗口设置的为 100 ,那么将会在窗口中收集 100 个事件,并在添加第 100 个元素时计算窗口的值。在 Flink 的 DataStream API 中,tumbling count window 和 sliding count window 的定义如下:tumbling count window输入一个时间参数data.keyBy(1) .countWindow(100) //统计每 100 个元素的数量之和 .sum(1);sliding count window输入两个时间参数data.keyBy(1) .countWindow(100, 10) //每 10 个元素统计过去 100 个元素的数量之和 .sum(1);解剖 Flink 的窗口机制Flink 的内置 time window 和 count window 已经覆盖了大多数应用场景,但是有时候也需要定制窗口逻辑,此时 Flink 的内置的 window 无法解决这些问题。为了还支持自定义 window 实现不同的逻辑,DataStream API 为其窗口机制提供了接口。下图描述了 Flink 的窗口机制,并介绍了所涉及的组件:到达窗口操作符的元素被传递给 WindowAssigner。WindowAssigner 将元素分配给一个或多个窗口,可能会创建新的窗口。窗口本身只是元素列表的标识符,它可能提供一些可选的元信息,例如 TimeWindow 中的开始和结束时间。注意,元素可以被添加到多个窗口,这也意味着一个元素可以同时在多个窗口存在。每个窗口都拥有一个 Trigger(触发器),该 Trigger(触发器) 决定何时计算和清除窗口。当先前注册的计时器超时时,将为插入窗口的每个元素调用触发器。在每个事件上,触发器都可以决定触发(即、清除(删除窗口并丢弃其内容),或者启动并清除窗口。一个窗口可以被求值多次,并且在被清除之前一直存在。注意,在清除窗口之前,窗口将一直消耗内存。当 Trigger(触发器) 触发时,可以将窗口元素列表提供给可选的 Evictor,Evictor 可以遍历窗口元素列表,并可以决定从列表的开头删除首先进入窗口的一些元素。然后其余的元素被赋给一个计算函数,如果没有定义 Evictor,触发器直接将所有窗口元素交给计算函数。计算函数接收 Evictor 过滤后的窗口元素,并计算窗口的一个或多个元素的结果。 DataStream API 接受不同类型的计算函数,包括预定义的聚合函数,如 sum(),min(),max(),以及 ReduceFunction,FoldFunction 或 WindowFunction。这些是构成 Flink 窗口机制的组件。 接下来我们逐步演示如何使用 DataStream API 实现自定义窗口逻辑。 我们从 DataStream [IN] 类型的流开始,并使用 key 选择器函数对其分组,该函数将 key 相同类型的数据分组在一块。SingleOutputStreamOperator<xxx> data = env.addSource(…);data.keyBy()如何自定义 Window?1、Window Assigner负责将元素分配到不同的 window。Window API 提供了自定义的 WindowAssigner 接口,我们可以实现 WindowAssigner 的public abstract Collection<W> assignWindows(T element, long timestamp)方法。同时,对于基于 Count 的 window 而言,默认采用了 GlobalWindow 的 window 
assigner,例如:keyBy.window(GlobalWindows.create())2、Trigger Trigger 即触发器,定义何时或什么情况下移除 window我们可以指定触发器来覆盖 WindowAssigner 提供的默认触发器。 请注意,指定的触发器不会添加其他触发条件,但会替换当前触发器。3、Evictor(可选)驱逐者,即保留上一 window 留下的某些元素4、通过 apply WindowFunction 来返回 DataStream 类型数据。利用 Flink 的内部窗口机制和 DataStream API 可以实现自定义的窗口逻辑,例如 session window。结论对于现代流处理器来说,支持连续数据流上的各种类型的窗口是必不可少的。 Apache Flink 是一个具有强大功能集的流处理器,包括一个非常灵活的机制,可以在连续数据流上构建窗口。 Flink 为常见场景提供内置的窗口运算符,以及允许用户自定义窗口逻辑。参考1、https://flink.apache.org/news…2、https://blog.csdn.net/lmalds/…关注我转载请务必注明原创地址为:http://www.54tianzhisheng.cn/2018/12/08/Flink-Stream-Windows/微信公众号:zhisheng另外我自己整理了些 Flink 的学习资料,目前已经全部放到微信公众号了。你可以加我的微信:zhisheng_tian,然后回复关键字:Flink 即可无条件获取到。Github 代码仓库https://github.com/zhisheng17/flink-learning/以后这个项目的所有代码都将放在这个仓库里,包含了自己学习 flink 的一些 demo 和博客相关文章1、《从0到1学习Flink》—— Apache Flink 介绍2、《从0到1学习Flink》—— Mac 上搭建 Flink 1.6.0 环境并构建运行简单程序入门3、《从0到1学习Flink》—— Flink 配置文件详解4、《从0到1学习Flink》—— Data Source 介绍5、《从0到1学习Flink》—— 如何自定义 Data Source ?6、《从0到1学习Flink》—— Data Sink 介绍7、《从0到1学习Flink》—— 如何自定义 Data Sink ?8、《从0到1学习Flink》—— Flink Data transformation(转换)9、《从0到1学习Flink》—— 介绍Flink中的Stream Windows10、《从0到1学习Flink》—— Flink 中的几种 Time 详解11、《从0到1学习Flink》—— Flink 写入数据到 ElasticSearch12、《从0到1学习Flink》—— Flink 项目如何运行?13、《从0到1学习Flink》—— Flink 写入数据到 Kafka ...
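针对上面《介绍Flink中的Stream Windows》一文中 timeWindow / countWindow 的用法,补充一个可本地运行的 count window 示意例子(示意代码,类名 CountWindowDemo、数据均为演示假设):按路口 id 分组,每收到 2 个元素就对车辆数求和;把 countWindow(2) 换成 timeWindow(Time.minutes(1)) 即为文中的翻滚时间窗口写法。

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class CountWindowDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // 模拟各个红绿灯上报的 (路口id, 车辆数) 数据
        env.fromElements(
                Tuple2.of("light-1", 3), Tuple2.of("light-1", 5),
                Tuple2.of("light-2", 2), Tuple2.of("light-2", 7),
                Tuple2.of("light-1", 4), Tuple2.of("light-1", 6))
            // 按路口 id(第 0 个字段)分组
            .keyBy(0)
            // tumbling count window:每收到 2 个元素计算一次该路口的车辆数之和
            .countWindow(2)
            .sum(1)
            .print();

        env.execute("count window demo");
    }
}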

January 15, 2019 · 2 min · jiezi

聊聊flink DataStream的split操作

序本文主要研究一下flink DataStream的split操作实例SplitStream<Integer> split = someDataStream.split(new OutputSelector<Integer>() { @Override public Iterable<String> select(Integer value) { List<String> output = new ArrayList<String>(); if (value % 2 == 0) { output.add(“even”); } else { output.add(“odd”); } return output; }});本实例将dataStream split为两个dataStream,一个outputName为even,另一个outputName为oddDataStream.splitflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/DataStream.java@Publicpublic class DataStream<T> { //…… public SplitStream<T> split(OutputSelector<T> outputSelector) { return new SplitStream<>(this, clean(outputSelector)); } //……}DataStream的split操作接收OutputSelector参数,然后创建并返回SplitStreamOutputSelectorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/collector/selector/OutputSelector.java@PublicEvolvingpublic interface OutputSelector<OUT> extends Serializable { Iterable<String> select(OUT value);}OutputSelector定义了select方法用于给element打上outputNamesSplitStreamflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/SplitStream.java@PublicEvolvingpublic class SplitStream<OUT> extends DataStream<OUT> { protected SplitStream(DataStream<OUT> dataStream, OutputSelector<OUT> outputSelector) { super(dataStream.getExecutionEnvironment(), new SplitTransformation<OUT>(dataStream.getTransformation(), outputSelector)); } public DataStream<OUT> select(String… outputNames) { return selectOutput(outputNames); } private DataStream<OUT> selectOutput(String[] outputNames) { for (String outName : outputNames) { if (outName == null) { throw new RuntimeException(“Selected names must not be null”); } } SelectTransformation<OUT> selectTransform = new SelectTransformation<OUT>(this.getTransformation(), Lists.newArrayList(outputNames)); return new DataStream<OUT>(this.getExecutionEnvironment(), selectTransform); }}SplitStream继承了DataStream,它定义了select方法,可以用来根据outputNames选择split出来的dataStream;select方法创建了SelectTransformationStreamGraphGeneratorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/graph/StreamGraphGenerator.java@Internalpublic class StreamGraphGenerator { //…… private Collection<Integer> transform(StreamTransformation<?> transform) { if (alreadyTransformed.containsKey(transform)) { return alreadyTransformed.get(transform); } LOG.debug(“Transforming " + transform); if (transform.getMaxParallelism() <= 0) { // if the max parallelism hasn’t been set, then first use the job wide max parallelism // from theExecutionConfig. 
int globalMaxParallelismFromConfig = env.getConfig().getMaxParallelism(); if (globalMaxParallelismFromConfig > 0) { transform.setMaxParallelism(globalMaxParallelismFromConfig); } } // call at least once to trigger exceptions about MissingTypeInfo transform.getOutputType(); Collection<Integer> transformedIds; if (transform instanceof OneInputTransformation<?, ?>) { transformedIds = transformOneInputTransform((OneInputTransformation<?, ?>) transform); } else if (transform instanceof TwoInputTransformation<?, ?, ?>) { transformedIds = transformTwoInputTransform((TwoInputTransformation<?, ?, ?>) transform); } else if (transform instanceof SourceTransformation<?>) { transformedIds = transformSource((SourceTransformation<?>) transform); } else if (transform instanceof SinkTransformation<?>) { transformedIds = transformSink((SinkTransformation<?>) transform); } else if (transform instanceof UnionTransformation<?>) { transformedIds = transformUnion((UnionTransformation<?>) transform); } else if (transform instanceof SplitTransformation<?>) { transformedIds = transformSplit((SplitTransformation<?>) transform); } else if (transform instanceof SelectTransformation<?>) { transformedIds = transformSelect((SelectTransformation<?>) transform); } else if (transform instanceof FeedbackTransformation<?>) { transformedIds = transformFeedback((FeedbackTransformation<?>) transform); } else if (transform instanceof CoFeedbackTransformation<?>) { transformedIds = transformCoFeedback((CoFeedbackTransformation<?>) transform); } else if (transform instanceof PartitionTransformation<?>) { transformedIds = transformPartition((PartitionTransformation<?>) transform); } else if (transform instanceof SideOutputTransformation<?>) { transformedIds = transformSideOutput((SideOutputTransformation<?>) transform); } else { throw new IllegalStateException(“Unknown transformation: " + transform); } // need this check because the iterate transformation adds itself before // transforming the feedback edges if (!alreadyTransformed.containsKey(transform)) { alreadyTransformed.put(transform, transformedIds); } if (transform.getBufferTimeout() >= 0) { streamGraph.setBufferTimeout(transform.getId(), transform.getBufferTimeout()); } if (transform.getUid() != null) { streamGraph.setTransformationUID(transform.getId(), transform.getUid()); } if (transform.getUserProvidedNodeHash() != null) { streamGraph.setTransformationUserHash(transform.getId(), transform.getUserProvidedNodeHash()); } if (transform.getMinResources() != null && transform.getPreferredResources() != null) { streamGraph.setResources(transform.getId(), transform.getMinResources(), transform.getPreferredResources()); } return transformedIds; } private <T> Collection<Integer> transformSelect(SelectTransformation<T> select) { StreamTransformation<T> input = select.getInput(); Collection<Integer> resultIds = transform(input); // the recursive transform might have already transformed this if (alreadyTransformed.containsKey(select)) { return alreadyTransformed.get(select); } List<Integer> virtualResultIds = new ArrayList<>(); for (int inputId : resultIds) { int virtualId = StreamTransformation.getNewNodeId(); streamGraph.addVirtualSelectNode(inputId, virtualId, select.getSelectedNames()); virtualResultIds.add(virtualId); } return virtualResultIds; } private <T> Collection<Integer> transformSplit(SplitTransformation<T> split) { StreamTransformation<T> input = split.getInput(); Collection<Integer> resultIds = transform(input); // the recursive transform call might have transformed 
this already if (alreadyTransformed.containsKey(split)) { return alreadyTransformed.get(split); } for (int inputId : resultIds) { streamGraph.addOutputSelector(inputId, split.getOutputSelector()); } return resultIds; } //……}StreamGraphGenerator里头的transform会对SelectTransformation以及SplitTransformation进行相应的处理transformSelect方法会根据select.getSelectedNames()来addVirtualSelectNodetransformSplit方法则根据split.getOutputSelector()来addOutputSelector小结DataStream的split操作接收OutputSelector参数,然后创建并返回SplitStreamOutputSelector定义了select方法用于给element打上outputNamesSplitStream继承了DataStream,它定义了select方法,可以用来根据outputNames选择split出来的dataStreamdocDataStream Transformations ...
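把上面的 split/select 片段补全成一个可本地运行的示意例子(示意代码,类名 SplitSelectDemo、数据均为演示假设):先按奇偶给元素打上 outputName,再用 select 分别取出偶数流和奇数流。

import java.util.ArrayList;
import java.util.List;

import org.apache.flink.streaming.api.collector.selector.OutputSelector;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SplitStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class SplitSelectDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStream<Integer> input = env.fromElements(1, 2, 3, 4, 5, 6);

        // 按奇偶给每个元素打上 outputName
        SplitStream<Integer> split = input.split(new OutputSelector<Integer>() {
            @Override
            public Iterable<String> select(Integer value) {
                List<String> output = new ArrayList<>();
                output.add(value % 2 == 0 ? "even" : "odd");
                return output;
            }
        });

        // 根据 outputName 选出偶数流和奇数流,分别处理
        split.select("even").print();   // 输出 2, 4, 6
        split.select("odd").print();    // 输出 1, 3, 5

        env.execute("split select demo");
    }
}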

January 14, 2019 · 3 min · jiezi

《从0到1学习Flink》—— Flink 中的几种 Time 详解
《从0到1学习Flink》—— Flink 中的几种 Time 详解

前言Flink 在流程序中支持不同的 Time 概念,就比如有 Processing Time、Event Time 和 Ingestion Time。下面我们一起来看看这几个 Time:Processing TimeProcessing Time 是指事件被处理时机器的系统时间。当流程序在 Processing Time 上运行时,所有基于时间的操作(如时间窗口)将使用当时机器的系统时间。每小时 Processing Time 窗口将包括在系统时钟指示整个小时之间到达特定操作的所有事件。例如,如果应用程序在上午 9:15 开始运行,则第一个每小时 Processing Time 窗口将包括在上午 9:15 到上午 10:00 之间处理的事件,下一个窗口将包括在上午 10:00 到 11:00 之间处理的事件。Processing Time 是最简单的 “Time” 概念,不需要流和机器之间的协调,它提供了最好的性能和最低的延迟。但是,在分布式和异步的环境下,Processing Time 不能提供确定性,因为它容易受到事件到达系统的速度(例如从消息队列)、事件在系统内操作流动的速度以及中断的影响。Event TimeEvent Time 是事件发生的时间,一般就是数据本身携带的时间。这个时间通常是在事件到达 Flink 之前就确定的,并且可以从每个事件中获取到事件时间戳。在 Event Time 中,时间取决于数据,而跟其他没什么关系。Event Time 程序必须指定如何生成 Event Time 水印,这是表示 Event Time 进度的机制。 完美的说,无论事件什么时候到达或者其怎么排序,最后处理 Event Time 将产生完全一致和确定的结果。但是,除非事件按照已知顺序(按照事件的时间)到达,否则处理 Event Time 时将会因为要等待一些无序事件而产生一些延迟。由于只能等待一段有限的时间,因此就难以保证处理 Event Time 将产生完全一致和确定的结果。假设所有数据都已到达, Event Time 操作将按照预期运行,即使在处理无序事件、延迟事件、重新处理历史数据时也会产生正确且一致的结果。 例如,每小时事件时间窗口将包含带有落入该小时的事件时间戳的所有记录,无论它们到达的顺序如何。 请注意,有时当 Event Time 程序实时处理实时数据时,它们将使用一些 Processing Time 操作,以确保它们及时进行。Ingestion TimeIngestion Time 是事件进入 Flink 的时间。 在源操作处,每个事件将源的当前时间作为时间戳,并且基于时间的操作(如时间窗口)会利用这个时间戳。Ingestion Time 在概念上位于 Event Time 和 Processing Time 之间。 与 Processing Time 相比,它稍微贵一些,但结果更可预测。因为 Ingestion Time 使用稳定的时间戳(在源处分配一次),所以对事件的不同窗口操作将引用相同的时间戳,而在 Processing Time 中,每个窗口操作符可以将事件分配给不同的窗口(基于机器系统时间和到达延迟)。与 Event Time 相比,Ingestion Time 程序无法处理任何无序事件或延迟数据,但程序不必指定如何生成水印。在 Flink 中,,Ingestion Time 与 Event Time 非常相似,但 Ingestion Time 具有自动分配时间戳和自动生成水印功能。说了这么多概念比较干涩,下面直接看图:设定时间特性Flink DataStream 程序的第一部分通常是设置基本时间特性。 该设置定义了数据流源的行为方式(例如:它们是否将分配时间戳),以及像 KeyedStream.timeWindow(Time.seconds(30)) 这样的窗口操作应该使用上面哪种时间概念。以下示例显示了一个 Flink 程序,该程序在每小时时间窗口中聚合事件。final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);// 其他// env.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);// env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);DataStream<MyEvent> stream = env.addSource(new FlinkKafkaConsumer09<MyEvent>(topic, schema, props));stream .keyBy( (event) -> event.getUser() ) .timeWindow(Time.hours(1)) .reduce( (a, b) -> a.add(b) ) .addSink(…);Event Time 和 Watermarks注意:Flink 实现了数据流模型中的许多技术。有关 Event Time 和 Watermarks 的详细介绍,请查看以下文章:https://www.oreilly.com/ideas…https://research.google.com/p…支持 Event Time 的流处理器需要一种方法来衡量 Event Time 的进度。 例如,当 Event Time 超过一小时结束时,需要通知构建每小时窗口的窗口操作符,以便操作员可以关闭正在进行的窗口。Event Time 可以独立于 Processing Time 进行。 例如,在一个程序中,操作员的当前 Event Time 可能略微落后于 Processing Time (考虑到接收事件的延迟),而两者都以相同的速度进行。另一方面,另一个流程序可能只需要几秒钟的时间就可以处理完 Kafka Topic 中数周的 Event Time 数据。Flink 中用于衡量 Event Time 进度的机制是 Watermarks。 Watermarks 作为数据流的一部分流动并带有时间戳 t。 Watermark(t)声明 Event Time 已到达该流中的时间 t,这意味着流中不应再有具有时间戳 t’<= t 的元素(即时间戳大于或等于水印的事件)下图显示了带有(逻辑)时间戳和内联水印的事件流。在本例中,事件是按顺序排列的(相对于它们的时间戳),这意味着水印只是流中的周期性标记。Watermark 对于无序流是至关重要的,如下所示,其中事件不按时间戳排序。通常,Watermark 是一种声明,通过流中的该点,到达某个时间戳的所有事件都应该到达。一旦水印到达操作员,操作员就可以将其内部事件时间提前到水印的值。平行流中的水印水印是在源函数处生成的,或直接在源函数之后生成的。源函数的每个并行子任务通常独立生成其水印。这些水印定义了特定并行源处的事件时间。当水印通过流程序时,它们会提前到达操作人员处的事件时间。当一个操作符提前它的事件时间时,它为它的后续操作符在下游生成一个新的水印。一些操作员消耗多个输入流; 例如,一个 union,或者跟随 keyBy(…)或 partition(…)函数的运算符。 这样的操作员当前事件时间是其输入流的事件时间的最小值。 由于其输入流更新其事件时间,因此操作员也是如此。下图显示了流经并行流的事件和水印的示例,以及跟踪事件时间的运算符。参考https://github.com/zhisheng17…关注我转载请务必注明原创地址为:http://www.54tianzhisheng.cn/2018/12/11/Flink-time/微信公众号:zhisheng另外我自己整理了些 Flink 的学习资料,目前已经全部放到微信公众号了。你可以加我的微信:zhisheng_tian,然后回复关键字:Flink 即可无条件获取到。Github 代码仓库https://github.com/zhisheng17/flink-learning/以后这个项目的所有代码都将放在这个仓库里,包含了自己学习 flink 的一些 demo 
和博客相关文章1、《从0到1学习Flink》—— Apache Flink 介绍2、《从0到1学习Flink》—— Mac 上搭建 Flink 1.6.0 环境并构建运行简单程序入门3、《从0到1学习Flink》—— Flink 配置文件详解4、《从0到1学习Flink》—— Data Source 介绍5、《从0到1学习Flink》—— 如何自定义 Data Source ?6、《从0到1学习Flink》—— Data Sink 介绍7、《从0到1学习Flink》—— 如何自定义 Data Sink ?8、《从0到1学习Flink》—— Flink Data transformation(转换)9、《从0到1学习Flink》—— 介绍Flink中的Stream Windows10、《从0到1学习Flink》—— Flink 中的几种 Time 详解11、《从0到1学习Flink》—— Flink 写入数据到 ElasticSearch12、《从0到1学习Flink》—— Flink 项目如何运行?13、《从0到1学习Flink》—— Flink 写入数据到 Kafka ...
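针对上面《Flink 中的几种 Time 详解》一文中的 Event Time 与 Watermark,补充一个可本地运行的示意例子(示意代码,类名 EventTimeDemo、数据均为演示假设):用 BoundedOutOfOrdernessTimestampExtractor 从数据本身抽取事件时间,并生成允许 1 秒乱序的 watermark,再做 3 秒的翻滚事件时间窗口聚合。

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.time.Time;

public class EventTimeDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // 使用 Event Time 语义
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // 元素为 (key, 事件时间戳毫秒),这里用静态数据模拟带时间戳的事件
        DataStream<Tuple2<String, Long>> events = env.fromElements(
                Tuple2.of("user-1", 1000L), Tuple2.of("user-1", 1500L),
                Tuple2.of("user-1", 4000L), Tuple2.of("user-1", 7000L));

        events
            // 从数据中抽取事件时间,并允许最多 1 秒的乱序(watermark = 当前最大时间戳 - 1s)
            .assignTimestampsAndWatermarks(
                new BoundedOutOfOrdernessTimestampExtractor<Tuple2<String, Long>>(Time.seconds(1)) {
                    @Override
                    public long extractTimestamp(Tuple2<String, Long> element) {
                        return element.f1;
                    }
                })
            .keyBy(0)
            // 3 秒的翻滚事件时间窗口,对窗口内元素做一个简单聚合(取最小时间戳)作为演示
            .timeWindow(Time.seconds(3))
            .min(1)
            .print();

        env.execute("event time & watermark demo");
    }
}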

January 14, 2019 · 2 min · jiezi

《从0到1学习Flink》—— Flink Data transformation(转换)

前言在第一篇介绍 Flink 的文章 《《从0到1学习Flink》—— Apache Flink 介绍》 中就说过 Flink 程序的结构Flink 应用程序结构就是如上图所示:1、Source: 数据源,Flink 在流处理和批处理上的 source 大概有 4 类:基于本地集合的 source、基于文件的 source、基于网络套接字的 source、自定义的 source。自定义的 source 常见的有 Apache kafka、Amazon Kinesis Streams、RabbitMQ、Twitter Streaming API、Apache NiFi 等,当然你也可以定义自己的 source。2、Transformation:数据转换的各种操作,有 Map / FlatMap / Filter / KeyBy / Reduce / Fold / Aggregations / Window / WindowAll / Union / Window join / Split / Select / Project 等,操作很多,可以将数据转换计算成你想要的数据。3、Sink:接收器,Flink 将转换计算后的数据发送的地点 ,你可能需要存储下来,Flink 常见的 Sink 大概有如下几类:写入文件、打印出来、写入 socket 、自定义的 sink 。自定义的 sink 常见的有 Apache kafka、RabbitMQ、MySQL、ElasticSearch、Apache Cassandra、Hadoop FileSystem 等,同理你也可以定义自己的 Sink。在上四篇文章介绍了 Source 和 Sink:1、《从0到1学习Flink》—— Data Source 介绍2、《从0到1学习Flink》—— 如何自定义 Data Source ?3、《从0到1学习Flink》—— Data Sink 介绍4、《从0到1学习Flink》—— 如何自定义 Data Sink ?那么这篇文章我们就来看下 Flink Data Transformation 吧,数据转换操作还是蛮多的,需要好好讲讲!TransformationMap这是最简单的转换之一,其中输入是一个数据流,输出的也是一个数据流:还是拿上一篇文章的案例来将数据进行 map 转换操作:SingleOutputStreamOperator<Student> map = student.map(new MapFunction<Student, Student>() { @Override public Student map(Student value) throws Exception { Student s1 = new Student(); s1.id = value.id; s1.name = value.name; s1.password = value.password; s1.age = value.age + 5; return s1; }});map.print();将每个人的年龄都增加 5 岁,其他不变。FlatMapFlatMap 采用一条记录并输出零个,一个或多个记录。SingleOutputStreamOperator<Student> flatMap = student.flatMap(new FlatMapFunction<Student, Student>() { @Override public void flatMap(Student value, Collector<Student> out) throws Exception { if (value.id % 2 == 0) { out.collect(value); } }});flatMap.print();这里将 id 为偶数的聚集出来。FilterFilter 函数根据条件判断出结果。SingleOutputStreamOperator<Student> filter = student.filter(new FilterFunction<Student>() { @Override public boolean filter(Student value) throws Exception { if (value.id > 95) { return true; } return false; }});filter.print();这里将 id 大于 95 的过滤出来,然后打印出来。KeyByKeyBy 在逻辑上是基于 key 对流进行分区。在内部,它使用 hash 函数对流进行分区。它返回 KeyedDataStream 数据流。KeyedStream<Student, Integer> keyBy = student.keyBy(new KeySelector<Student, Integer>() { @Override public Integer getKey(Student value) throws Exception { return value.age; }});keyBy.print();上面对 student 的 age 做 KeyBy 操作分区ReduceReduce 返回单个的结果值,并且 reduce 操作每处理一个元素总是创建一个新值。常用的方法有 average, sum, min, max, count,使用 reduce 方法都可实现。SingleOutputStreamOperator<Student> reduce = student.keyBy(new KeySelector<Student, Integer>() { @Override public Integer getKey(Student value) throws Exception { return value.age; }}).reduce(new ReduceFunction<Student>() { @Override public Student reduce(Student value1, Student value2) throws Exception { Student student1 = new Student(); student1.name = value1.name + value2.name; student1.id = (value1.id + value2.id) / 2; student1.password = value1.password + value2.password; student1.age = (value1.age + value2.age) / 2; return student1; }});reduce.print();上面先将数据流进行 keyby 操作,因为执行 reduce 操作只能是 KeyedStream,然后将 student 对象的 age 做了一个求平均值的操作。FoldFold 通过将最后一个文件夹流与当前记录组合来推出 KeyedStream。 它会发回数据流。KeyedStream.fold(“1”, new FoldFunction<Integer, String>() { @Override public String fold(String accumulator, Integer value) throws Exception { return accumulator + “=” + value; }})AggregationsDataStream API 支持各种聚合,例如 min,max,sum 等。 这些函数可以应用于 KeyedStream 以获得 Aggregations 聚合。KeyedStream.sum(0) KeyedStream.sum(“key”) KeyedStream.min(0) KeyedStream.min(“key”) KeyedStream.max(0) KeyedStream.max(“key”) KeyedStream.minBy(0) KeyedStream.minBy(“key”) KeyedStream.maxBy(0) KeyedStream.maxBy(“key”)max 和 maxBy 之间的区别在于 max 返回流中的最大值,但 maxBy 返回具有最大值的键, min 
和 minBy 同理。WindowWindow 函数允许按时间或其他条件对现有 KeyedStream 进行分组。 以下是以 10 秒的时间窗口聚合:inputStream.keyBy(0).window(Time.seconds(10));Flink 定义数据片段以便(可能)处理无限数据流。 这些切片称为窗口。 此切片有助于通过应用转换处理数据块。 要对流进行窗口化,我们需要分配一个可以进行分发的键和一个描述要对窗口化流执行哪些转换的函数要将流切片到窗口,我们可以使用 Flink 自带的窗口分配器。 我们有选项,如 tumbling windows, sliding windows, global 和 session windows。 Flink 还允许您通过扩展 WindowAssginer 类来编写自定义窗口分配器。 这里先预留下篇文章来讲解这些不同的 windows 是如何工作的。WindowAllwindowAll 函数允许对常规数据流进行分组。 通常,这是非并行数据转换,因为它在非分区数据流上运行。与常规数据流功能类似,我们也有窗口数据流功能。 唯一的区别是它们处理窗口数据流。 所以窗口缩小就像 Reduce 函数一样,Window fold 就像 Fold 函数一样,并且还有聚合。inputStream.keyBy(0).windowAll(Time.seconds(10));UnionUnion 函数将两个或多个数据流结合在一起。 这样就可以并行地组合数据流。 如果我们将一个流与自身组合,那么它会输出每个记录两次。inputStream.union(inputStream1, inputStream2, …);Window join我们可以通过一些 key 将同一个 window 的两个数据流 join 起来。inputStream.join(inputStream1) .where(0).equalTo(1) .window(Time.seconds(5)) .apply (new JoinFunction () {…});以上示例是在 5 秒的窗口中连接两个流,其中第一个流的第一个属性的连接条件等于另一个流的第二个属性。Split此功能根据条件将流拆分为两个或多个流。 当您获得混合流并且您可能希望单独处理每个数据流时,可以使用此方法。SplitStream<Integer> split = inputStream.split(new OutputSelector<Integer>() { @Override public Iterable<String> select(Integer value) { List<String> output = new ArrayList<String>(); if (value % 2 == 0) { output.add(“even”); } else { output.add(“odd”); } return output; }});Select此功能允许您从拆分流中选择特定流。SplitStream<Integer> split;DataStream<Integer> even = split.select(“even”); DataStream<Integer> odd = split.select(“odd”); DataStream<Integer> all = split.select(“even”,“odd”);ProjectProject 函数允许您从事件流中选择属性子集,并仅将所选元素发送到下一个处理流。DataStream<Tuple4<Integer, Double, String, String>> in = // […] DataStream<Tuple2<String, String>> out = in.project(3,2);上述函数从给定记录中选择属性号 2 和 3。 以下是示例输入和输出记录:(1,10.0,A,B)=> (B,A)(2,20.0,C,D)=> (D,C)最后本文主要介绍了 Flink Data 的常用转换方式:Map、FlatMap、Filter、KeyBy、Reduce、Fold、Aggregations、Window、WindowAll、Union、Window Join、Split、Select、Project 等。并用了点简单的 demo 介绍了如何使用,具体在项目中该如何将数据流转换成我们想要的格式,还需要根据实际情况对待。关注我转载请务必注明原创地址为:http://www.54tianzhisheng.cn/2018/11/04/Flink-Data-transformation/微信公众号:zhisheng另外我自己整理了些 Flink 的学习资料,目前已经全部放到微信公众号了。你可以加我的微信:zhisheng_tian,然后回复关键字:Flink 即可无条件获取到。Github 代码仓库https://github.com/zhisheng17/flink-learning/以后这个项目的所有代码都将放在这个仓库里,包含了自己学习 flink 的一些 demo 和博客相关文章1、《从0到1学习Flink》—— Apache Flink 介绍2、《从0到1学习Flink》—— Mac 上搭建 Flink 1.6.0 环境并构建运行简单程序入门3、《从0到1学习Flink》—— Flink 配置文件详解4、《从0到1学习Flink》—— Data Source 介绍5、《从0到1学习Flink》—— 如何自定义 Data Source ?6、《从0到1学习Flink》—— Data Sink 介绍7、《从0到1学习Flink》—— 如何自定义 Data Sink ?8、《从0到1学习Flink》—— Flink Data transformation(转换)9、《从0到1学习Flink》—— 介绍Flink中的Stream Windows10、《从0到1学习Flink》—— Flink 中的几种 Time 详解11、《从0到1学习Flink》—— Flink 写入数据到 ElasticSearch12、《从0到1学习Flink》—— Flink 项目如何运行?13、《从0到1学习Flink》—— Flink 写入数据到 Kafka ...
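把上面《Flink Data transformation(转换)》一文介绍的 map、filter、keyBy、reduce 等算子串成一个可本地运行的示意例子(示意代码,类名 TransformationDemo、数据均为演示假设),展示一条典型的转换链。

import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class TransformationDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStream<Tuple2<String, Integer>> source = env.fromElements(
                Tuple2.of("a", 1), Tuple2.of("b", 2),
                Tuple2.of("a", 3), Tuple2.of("b", 4), Tuple2.of("a", 5));

        source
            // map:把每个元素的数值加 10
            .map(new MapFunction<Tuple2<String, Integer>, Tuple2<String, Integer>>() {
                @Override
                public Tuple2<String, Integer> map(Tuple2<String, Integer> value) {
                    return Tuple2.of(value.f0, value.f1 + 10);
                }
            })
            // filter:只保留数值为奇数的元素
            .filter(new FilterFunction<Tuple2<String, Integer>>() {
                @Override
                public boolean filter(Tuple2<String, Integer> value) {
                    return value.f1 % 2 == 1;
                }
            })
            // keyBy + reduce:按 f0 分组后对 f1 求和
            .keyBy(0)
            .reduce(new ReduceFunction<Tuple2<String, Integer>>() {
                @Override
                public Tuple2<String, Integer> reduce(Tuple2<String, Integer> v1, Tuple2<String, Integer> v2) {
                    return Tuple2.of(v1.f0, v1.f1 + v2.f1);
                }
            })
            .print();

        env.execute("transformation demo");
    }
}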

January 13, 2019 · 3 min · jiezi

聊聊flink DataStream的connect操作

序本文主要研究一下flink DataStream的connect操作DataStream.connectflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/DataStream.java@Publicpublic class DataStream<T> { //…… public <R> ConnectedStreams<T, R> connect(DataStream<R> dataStream) { return new ConnectedStreams<>(environment, this, dataStream); } @PublicEvolving public <R> BroadcastConnectedStream<T, R> connect(BroadcastStream<R> broadcastStream) { return new BroadcastConnectedStream<>( environment, this, Preconditions.checkNotNull(broadcastStream), broadcastStream.getBroadcastStateDescriptor()); } //……}DataStream的connect操作创建的是ConnectedStreams或BroadcastConnectedStream,它用了两个泛型,即不要求两个dataStream的element是同一类型ConnectedStreamsflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/ConnectedStreams.java@Publicpublic class ConnectedStreams<IN1, IN2> { protected final StreamExecutionEnvironment environment; protected final DataStream<IN1> inputStream1; protected final DataStream<IN2> inputStream2; protected ConnectedStreams(StreamExecutionEnvironment env, DataStream<IN1> input1, DataStream<IN2> input2) { this.environment = requireNonNull(env); this.inputStream1 = requireNonNull(input1); this.inputStream2 = requireNonNull(input2); } public StreamExecutionEnvironment getExecutionEnvironment() { return environment; } public DataStream<IN1> getFirstInput() { return inputStream1; } public DataStream<IN2> getSecondInput() { return inputStream2; } public TypeInformation<IN1> getType1() { return inputStream1.getType(); } public TypeInformation<IN2> getType2() { return inputStream2.getType(); } public ConnectedStreams<IN1, IN2> keyBy(int keyPosition1, int keyPosition2) { return new ConnectedStreams<>(this.environment, inputStream1.keyBy(keyPosition1), inputStream2.keyBy(keyPosition2)); } public ConnectedStreams<IN1, IN2> keyBy(int[] keyPositions1, int[] keyPositions2) { return new ConnectedStreams<>(environment, inputStream1.keyBy(keyPositions1), inputStream2.keyBy(keyPositions2)); } public ConnectedStreams<IN1, IN2> keyBy(String field1, String field2) { return new ConnectedStreams<>(environment, inputStream1.keyBy(field1), inputStream2.keyBy(field2)); } public ConnectedStreams<IN1, IN2> keyBy(String[] fields1, String[] fields2) { return new ConnectedStreams<>(environment, inputStream1.keyBy(fields1), inputStream2.keyBy(fields2)); } public ConnectedStreams<IN1, IN2> keyBy(KeySelector<IN1, ?> keySelector1, KeySelector<IN2, ?> keySelector2) { return new ConnectedStreams<>(environment, inputStream1.keyBy(keySelector1), inputStream2.keyBy(keySelector2)); } public <KEY> ConnectedStreams<IN1, IN2> keyBy( KeySelector<IN1, KEY> keySelector1, KeySelector<IN2, KEY> keySelector2, TypeInformation<KEY> keyType) { return new ConnectedStreams<>( environment, inputStream1.keyBy(keySelector1, keyType), inputStream2.keyBy(keySelector2, keyType)); } public <R> SingleOutputStreamOperator<R> map(CoMapFunction<IN1, IN2, R> coMapper) { TypeInformation<R> outTypeInfo = TypeExtractor.getBinaryOperatorReturnType( coMapper, CoMapFunction.class, 0, 1, 2, TypeExtractor.NO_INDEX, getType1(), getType2(), Utils.getCallLocationName(), true); return transform(“Co-Map”, outTypeInfo, new CoStreamMap<>(inputStream1.clean(coMapper))); } public <R> SingleOutputStreamOperator<R> flatMap( CoFlatMapFunction<IN1, IN2, R> coFlatMapper) { TypeInformation<R> outTypeInfo = TypeExtractor.getBinaryOperatorReturnType( coFlatMapper, CoFlatMapFunction.class, 0, 1, 2, TypeExtractor.NO_INDEX, getType1(), getType2(), Utils.getCallLocationName(), 
true); return transform(“Co-Flat Map”, outTypeInfo, new CoStreamFlatMap<>(inputStream1.clean(coFlatMapper))); } @PublicEvolving public <R> SingleOutputStreamOperator<R> process( CoProcessFunction<IN1, IN2, R> coProcessFunction) { TypeInformation<R> outTypeInfo = TypeExtractor.getBinaryOperatorReturnType( coProcessFunction, CoProcessFunction.class, 0, 1, 2, TypeExtractor.NO_INDEX, getType1(), getType2(), Utils.getCallLocationName(), true); return process(coProcessFunction, outTypeInfo); } @Internal public <R> SingleOutputStreamOperator<R> process( CoProcessFunction<IN1, IN2, R> coProcessFunction, TypeInformation<R> outputType) { TwoInputStreamOperator<IN1, IN2, R> operator; if ((inputStream1 instanceof KeyedStream) && (inputStream2 instanceof KeyedStream)) { operator = new KeyedCoProcessOperator<>(inputStream1.clean(coProcessFunction)); } else { operator = new CoProcessOperator<>(inputStream1.clean(coProcessFunction)); } return transform(“Co-Process”, outputType, operator); } @PublicEvolving public <R> SingleOutputStreamOperator<R> transform(String functionName, TypeInformation<R> outTypeInfo, TwoInputStreamOperator<IN1, IN2, R> operator) { // read the output type of the input Transforms to coax out errors about MissingTypeInfo inputStream1.getType(); inputStream2.getType(); TwoInputTransformation<IN1, IN2, R> transform = new TwoInputTransformation<>( inputStream1.getTransformation(), inputStream2.getTransformation(), functionName, operator, outTypeInfo, environment.getParallelism()); if (inputStream1 instanceof KeyedStream && inputStream2 instanceof KeyedStream) { KeyedStream<IN1, ?> keyedInput1 = (KeyedStream<IN1, ?>) inputStream1; KeyedStream<IN2, ?> keyedInput2 = (KeyedStream<IN2, ?>) inputStream2; TypeInformation<?> keyType1 = keyedInput1.getKeyType(); TypeInformation<?> keyType2 = keyedInput2.getKeyType(); if (!(keyType1.canEqual(keyType2) && keyType1.equals(keyType2))) { throw new UnsupportedOperationException(“Key types if input KeyedStreams " + “don’t match: " + keyType1 + " and " + keyType2 + “.”); } transform.setStateKeySelectors(keyedInput1.getKeySelector(), keyedInput2.getKeySelector()); transform.setStateKeyType(keyType1); } @SuppressWarnings({ “unchecked”, “rawtypes” }) SingleOutputStreamOperator<R> returnStream = new SingleOutputStreamOperator(environment, transform); getExecutionEnvironment().addOperator(transform); return returnStream; }}ConnectedStreams提供了keyBy方法用于指定两个stream的keySelector,提供了map、flatMap、process、transform操作,其中前三个操作最后都是调用transform操作transform操作接收TwoInputStreamOperator类型的operator,然后转换为SingleOutputStreamOperatormap操作接收CoMapFunction,flatMap操作接收CoFlatMapFunction,process操作接收CoProcessFunctionCoMapFunctionflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/functions/co/CoMapFunction.java@Publicpublic interface CoMapFunction<IN1, IN2, OUT> extends Function, Serializable { OUT map1(IN1 value) throws Exception; OUT map2(IN2 value) throws Exception;}CoMapFunction继承了Function,它定义了map1、map2方法CoFlatMapFunctionflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/functions/co/CoFlatMapFunction.java@Publicpublic interface CoFlatMapFunction<IN1, IN2, OUT> extends Function, Serializable { void flatMap1(IN1 value, Collector<OUT> out) throws Exception; void flatMap2(IN2 value, Collector<OUT> out) throws 
Exception;}CoFlatMapFunction继承了Function,它定义了map1、map2方法,与CoMapFunction不同的是,CoFlatMapFunction的map1、map2方法多了Collector参数CoProcessFunctionflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/functions/co/CoProcessFunction.java@PublicEvolvingpublic abstract class CoProcessFunction<IN1, IN2, OUT> extends AbstractRichFunction { private static final long serialVersionUID = 1L; public abstract void processElement1(IN1 value, Context ctx, Collector<OUT> out) throws Exception; public abstract void processElement2(IN2 value, Context ctx, Collector<OUT> out) throws Exception; public void onTimer(long timestamp, OnTimerContext ctx, Collector<OUT> out) throws Exception {} public abstract class Context { public abstract Long timestamp(); public abstract TimerService timerService(); public abstract <X> void output(OutputTag<X> outputTag, X value); } public abstract class OnTimerContext extends Context { /** * The {@link TimeDomain} of the firing timer. */ public abstract TimeDomain timeDomain(); }}CoProcessFunction继承了AbstractRichFunction,它定义了processElement1、processElement2方法,与CoFlatMapFunction不同的是,它定义的这两个方法多了Context参数CoProcessFunction定义了Context及OnTimerContext,在processElement1、processElement2方法可以访问到Context,Context提供了timestamp、timerService、output方法CoProcessFunction与CoFlatMapFunction不同的另外一点是它可以使用TimerService来注册timer,然后在onTimer方法里头实现响应的逻辑小结DataStream的connect操作创建的是ConnectedStreams或BroadcastConnectedStream,它用了两个泛型,即不要求两个dataStream的element是同一类型ConnectedStreams提供了keyBy方法用于指定两个stream的keySelector,提供了map、flatMap、process、transform操作,其中前三个操作最后都是调用transform操作;transform操作接收TwoInputStreamOperator类型的operator,然后转换为SingleOutputStreamOperator;map操作接收CoMapFunction,flatMap操作接收CoFlatMapFunction,process操作接收CoProcessFunctionCoFlatMapFunction与CoMapFunction不同的是,CoFlatMapFunction的map1、map2方法多了Collector参数;CoProcessFunction定义了processElement1、processElement2方法,与CoFlatMapFunction不同的是,它定义的这两个方法多了Context参数;CoProcessFunction与CoFlatMapFunction不同的另外一点是它可以使用TimerService来注册timer,然后在onTimer方法里头实现响应的逻辑docDataStream Transformations ...
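补充一个 connect + CoMapFunction 的完整示意例子(示意代码,类名 ConnectDemo、数据均为演示假设):两个元素类型不同的流 connect 之后,分别在 map1、map2 中处理,输出统一为 String。

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoMapFunction;

public class ConnectDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // 两个元素类型不同的流
        DataStream<Integer> intStream = env.fromElements(1, 2, 3);
        DataStream<String> strStream = env.fromElements("a", "b", "c");

        // connect 之后通过 CoMapFunction 分别处理两个输入,输出统一为 String
        intStream.connect(strStream)
            .map(new CoMapFunction<Integer, String, String>() {
                @Override
                public String map1(Integer value) {
                    return "int: " + value;
                }

                @Override
                public String map2(String value) {
                    return "str: " + value;
                }
            })
            .print();

        env.execute("connect demo");
    }
}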

January 13, 2019 · 3 min · jiezi

聊聊flink DataStream的window coGroup操作

序本文主要研究一下flink DataStream的window coGroup操作实例dataStream.coGroup(otherStream) .where(0).equalTo(1) .window(TumblingEventTimeWindows.of(Time.seconds(3))) .apply (new CoGroupFunction () {…});这里展示了DataStream的window coGroup操作的基本用法DataStream.coGroupflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/DataStream.java@Publicpublic class DataStream<T> { //…… public <T2> CoGroupedStreams<T, T2> coGroup(DataStream<T2> otherStream) { return new CoGroupedStreams<>(this, otherStream); } //……}DataStream的coGroup操作创建的是CoGroupedStreamsCoGroupedStreamsflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/CoGroupedStreams.java@Publicpublic class CoGroupedStreams<T1, T2> { private final DataStream<T1> input1; private final DataStream<T2> input2; public CoGroupedStreams(DataStream<T1> input1, DataStream<T2> input2) { this.input1 = requireNonNull(input1); this.input2 = requireNonNull(input2); } public <KEY> Where<KEY> where(KeySelector<T1, KEY> keySelector) { Preconditions.checkNotNull(keySelector); final TypeInformation<KEY> keyType = TypeExtractor.getKeySelectorTypes(keySelector, input1.getType()); return where(keySelector, keyType); } public <KEY> Where<KEY> where(KeySelector<T1, KEY> keySelector, TypeInformation<KEY> keyType) { Preconditions.checkNotNull(keySelector); Preconditions.checkNotNull(keyType); return new Where<>(input1.clean(keySelector), keyType); } //…….}CoGroupedStreams提供了where操作,用于指定input1的keySelector,它创建并返回Where对象Whereflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/CoGroupedStreams.java @Public public class Where<KEY> { private final KeySelector<T1, KEY> keySelector1; private final TypeInformation<KEY> keyType; Where(KeySelector<T1, KEY> keySelector1, TypeInformation<KEY> keyType) { this.keySelector1 = keySelector1; this.keyType = keyType; } public EqualTo equalTo(KeySelector<T2, KEY> keySelector) { Preconditions.checkNotNull(keySelector); final TypeInformation<KEY> otherKey = TypeExtractor.getKeySelectorTypes(keySelector, input2.getType()); return equalTo(keySelector, otherKey); } public EqualTo equalTo(KeySelector<T2, KEY> keySelector, TypeInformation<KEY> keyType) { Preconditions.checkNotNull(keySelector); Preconditions.checkNotNull(keyType); if (!keyType.equals(this.keyType)) { throw new IllegalArgumentException(“The keys for the two inputs are not equal: " + “first key = " + this.keyType + " , second key = " + keyType); } return new EqualTo(input2.clean(keySelector)); } //…… } Where对象提供了equalTo操作,用于指定input2的keySelector,它创建并返回EqualTo对象EqualToflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/CoGroupedStreams.java @Public public class EqualTo { private final KeySelector<T2, KEY> keySelector2; EqualTo(KeySelector<T2, KEY> keySelector2) { this.keySelector2 = requireNonNull(keySelector2); } @PublicEvolving public <W extends Window> WithWindow<T1, T2, KEY, W> window(WindowAssigner<? 
super TaggedUnion<T1, T2>, W> assigner) { return new WithWindow<>(input1, input2, keySelector1, keySelector2, keyType, assigner, null, null, null); } }EqualTo对象提供了window操作,它创建并返回WithWindow对象WithWindowflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/CoGroupedStreams.java @Public public static class WithWindow<T1, T2, KEY, W extends Window> { private final DataStream<T1> input1; private final DataStream<T2> input2; private final KeySelector<T1, KEY> keySelector1; private final KeySelector<T2, KEY> keySelector2; private final TypeInformation<KEY> keyType; private final WindowAssigner<? super TaggedUnion<T1, T2>, W> windowAssigner; private final Trigger<? super TaggedUnion<T1, T2>, ? super W> trigger; private final Evictor<? super TaggedUnion<T1, T2>, ? super W> evictor; private final Time allowedLateness; private WindowedStream<TaggedUnion<T1, T2>, KEY, W> windowedStream; protected WithWindow(DataStream<T1> input1, DataStream<T2> input2, KeySelector<T1, KEY> keySelector1, KeySelector<T2, KEY> keySelector2, TypeInformation<KEY> keyType, WindowAssigner<? super TaggedUnion<T1, T2>, W> windowAssigner, Trigger<? super TaggedUnion<T1, T2>, ? super W> trigger, Evictor<? super TaggedUnion<T1, T2>, ? super W> evictor, Time allowedLateness) { this.input1 = input1; this.input2 = input2; this.keySelector1 = keySelector1; this.keySelector2 = keySelector2; this.keyType = keyType; this.windowAssigner = windowAssigner; this.trigger = trigger; this.evictor = evictor; this.allowedLateness = allowedLateness; } @PublicEvolving public WithWindow<T1, T2, KEY, W> trigger(Trigger<? super TaggedUnion<T1, T2>, ? super W> newTrigger) { return new WithWindow<>(input1, input2, keySelector1, keySelector2, keyType, windowAssigner, newTrigger, evictor, allowedLateness); } @PublicEvolving public WithWindow<T1, T2, KEY, W> evictor(Evictor<? super TaggedUnion<T1, T2>, ? 
super W> newEvictor) { return new WithWindow<>(input1, input2, keySelector1, keySelector2, keyType, windowAssigner, trigger, newEvictor, allowedLateness); } @PublicEvolving public WithWindow<T1, T2, KEY, W> allowedLateness(Time newLateness) { return new WithWindow<>(input1, input2, keySelector1, keySelector2, keyType, windowAssigner, trigger, evictor, newLateness); } public <T> DataStream<T> apply(CoGroupFunction<T1, T2, T> function) { TypeInformation<T> resultType = TypeExtractor.getCoGroupReturnTypes( function, input1.getType(), input2.getType(), “CoGroup”, false); return apply(function, resultType); } @PublicEvolving @Deprecated public <T> SingleOutputStreamOperator<T> with(CoGroupFunction<T1, T2, T> function) { return (SingleOutputStreamOperator<T>) apply(function); } public <T> DataStream<T> apply(CoGroupFunction<T1, T2, T> function, TypeInformation<T> resultType) { //clean the closure function = input1.getExecutionEnvironment().clean(function); UnionTypeInfo<T1, T2> unionType = new UnionTypeInfo<>(input1.getType(), input2.getType()); UnionKeySelector<T1, T2, KEY> unionKeySelector = new UnionKeySelector<>(keySelector1, keySelector2); DataStream<TaggedUnion<T1, T2>> taggedInput1 = input1 .map(new Input1Tagger<T1, T2>()) .setParallelism(input1.getParallelism()) .returns(unionType); DataStream<TaggedUnion<T1, T2>> taggedInput2 = input2 .map(new Input2Tagger<T1, T2>()) .setParallelism(input2.getParallelism()) .returns(unionType); DataStream<TaggedUnion<T1, T2>> unionStream = taggedInput1.union(taggedInput2); // we explicitly create the keyed stream to manually pass the key type information in windowedStream = new KeyedStream<TaggedUnion<T1, T2>, KEY>(unionStream, unionKeySelector, keyType) .window(windowAssigner); if (trigger != null) { windowedStream.trigger(trigger); } if (evictor != null) { windowedStream.evictor(evictor); } if (allowedLateness != null) { windowedStream.allowedLateness(allowedLateness); } return windowedStream.apply(new CoGroupWindowFunction<T1, T2, T, KEY, W>(function), resultType); } @PublicEvolving @Deprecated public <T> SingleOutputStreamOperator<T> with(CoGroupFunction<T1, T2, T> function, TypeInformation<T> resultType) { return (SingleOutputStreamOperator<T>) apply(function, resultType); } @VisibleForTesting Time getAllowedLateness() { return allowedLateness; } @VisibleForTesting WindowedStream<TaggedUnion<T1, T2>, KEY, W> getWindowedStream() { return windowedStream; } }WithWindow可以设置windowAssigner、trigger、evictor、allowedLateness,它提供apply操作(with操作被标记为废弃)apply操作接收CoGroupFunction,它内部是先根据两个keySelector创建UnionKeySelector,然后对两个input stream分别使用Input1Tagger及Input2Tagger进行map转换为TaggedUnion对象的stream,然后执行taggedInput1.union(taggedInput2)得到unionStream,之后使用UnionKeySelector将unionStream转换为KeyedStream,之后在对KeyedStream执行window操作,把原来的windowAssigner、trigger、evictor、allowedLateness都赋值过去,最后将用户定义的CoGroupFunction包装为CoGroupWindowFunction,然后调用windowedStream.apply方法可以看到apply操作内部转化的WindowedStream,其element类型为TaggedUnion;WindowedStream使用的KeyedStream,它的KeySelector为UnionKeySelector;而KeyedStream是基于TaggedUnion类型的DataStream,是taggedInput1.union(taggedInput2)操作而来;而taggedInput1及taggedInput2是对原始input stream进行map操作而来,使用的MapFunction分别是Input1Tagger及Input2TaggerCoGroupFunctionflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/functions/CoGroupFunction.java@Public@FunctionalInterfacepublic interface CoGroupFunction<IN1, IN2, O> extends Function, Serializable { void coGroup(Iterable<IN1> first, Iterable<IN2> second, Collector<O> out) throws 
Exception;}CoGroupFunction继承了Function,它定义了coGroup方法,该方法接收两个Iterable类型的element集合Input1Tagger及Input2Taggerflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/CoGroupedStreams.java private static class Input1Tagger<T1, T2> implements MapFunction<T1, TaggedUnion<T1, T2>> { private static final long serialVersionUID = 1L; @Override public TaggedUnion<T1, T2> map(T1 value) throws Exception { return TaggedUnion.one(value); } } private static class Input2Tagger<T1, T2> implements MapFunction<T2, TaggedUnion<T1, T2>> { private static final long serialVersionUID = 1L; @Override public TaggedUnion<T1, T2> map(T2 value) throws Exception { return TaggedUnion.two(value); } }Input1Tagger及Input2Tagger实现了MapFunction,该map方法返回的类型为TaggedUnionTaggedUnionflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/CoGroupedStreams.java @Internal public static class TaggedUnion<T1, T2> { private final T1 one; private final T2 two; private TaggedUnion(T1 one, T2 two) { this.one = one; this.two = two; } public boolean isOne() { return one != null; } public boolean isTwo() { return two != null; } public T1 getOne() { return one; } public T2 getTwo() { return two; } public static <T1, T2> TaggedUnion<T1, T2> one(T1 one) { return new TaggedUnion<>(one, null); } public static <T1, T2> TaggedUnion<T1, T2> two(T2 two) { return new TaggedUnion<>(null, two); } }TaggedUnion里头有one、two两个属性,它提供了两个静态工厂方法one及two,可以看到TaggedUnion对象要么one为null,要么two为null,不可能两个同时有值UnionKeySelectorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/CoGroupedStreams.java private static class UnionKeySelector<T1, T2, KEY> implements KeySelector<TaggedUnion<T1, T2>, KEY> { private static final long serialVersionUID = 1L; private final KeySelector<T1, KEY> keySelector1; private final KeySelector<T2, KEY> keySelector2; public UnionKeySelector(KeySelector<T1, KEY> keySelector1, KeySelector<T2, KEY> keySelector2) { this.keySelector1 = keySelector1; this.keySelector2 = keySelector2; } @Override public KEY getKey(TaggedUnion<T1, T2> value) throws Exception{ if (value.isOne()) { return keySelector1.getKey(value.getOne()); } else { return keySelector2.getKey(value.getTwo()); } } }UnionKeySelector有两个KeySelector属性,它的getKey操作根据TaggedUnion来判断,如果是one,则使用keySelector1.getKey(value.getOne()),否则使用keySelector2.getKey(value.getTwo())DataStream.unionflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/DataStream.java@Publicpublic class DataStream<T> { //…… @SafeVarargs public final DataStream<T> union(DataStream<T>… streams) { List<StreamTransformation<T>> unionedTransforms = new ArrayList<>(); unionedTransforms.add(this.transformation); for (DataStream<T> newStream : streams) { if (!getType().equals(newStream.getType())) { throw new IllegalArgumentException(“Cannot union streams of different types: " + getType() + " and " + newStream.getType()); } unionedTransforms.add(newStream.getTransformation()); } return new DataStream<>(this.environment, new UnionTransformation<>(unionedTransforms)); } //……}DataStream的union操作,使用UnionTransformation创建了一个新的DataStream;注意union操作需要两个stream使用相同类型的element,这就是为什么WithWindow的apply操作对两个input stream分别使用Input1Tagger及Input2Tagger进行map转换为TaggedUnion对象来统一两个stream的element类型的原因CoGroupWindowFunctionflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/CoGroupedStreams.java private static class CoGroupWindowFunction<T1, T2, T, KEY, W extends Window> extends WrappingFunction<CoGroupFunction<T1, 
T2, T>> implements WindowFunction<TaggedUnion<T1, T2>, T, KEY, W> { private static final long serialVersionUID = 1L; public CoGroupWindowFunction(CoGroupFunction<T1, T2, T> userFunction) { super(userFunction); } @Override public void apply(KEY key, W window, Iterable<TaggedUnion<T1, T2>> values, Collector<T> out) throws Exception { List<T1> oneValues = new ArrayList<>(); List<T2> twoValues = new ArrayList<>(); for (TaggedUnion<T1, T2> val: values) { if (val.isOne()) { oneValues.add(val.getOne()); } else { twoValues.add(val.getTwo()); } } wrappedFunction.coGroup(oneValues, twoValues, out); } }CoGroupWindowFunction继承了WrappingFunction(WrappingFunction继承了AbstractRichFunction,覆盖了父类的open、close、setRuntimeContext方法,用于管理wrappedFunction),实现了WindowFunction接口,其apply方法对TaggedUnion类型的Iterable数据进行拆解,分别拆分到oneValues及twoValues中,然后调用用户定义的CoGroupFunction的coGroup方法小结DataStream提供了coGroup方法,用于执行window coGroup操作,它返回的是CoGroupedStreams;CoGroupedStreams主要是提供where操作来构建Where对象;Where对象主要提供equalTo操作用于构建EqualTo对象;EqualTo对象提供window操作用于构建WithWindow对象;WithWindow可以设置windowAssigner、trigger、evictor、allowedLateness,它提供apply操作CoGroupedStreams的WithWindow对象的apply操作接收CoGroupFunction,它内部是先根据两个keySelector创建UnionKeySelector,然后对两个input stream分别使用Input1Tagger及Input2Tagger进行map转换为TaggedUnion对象的stream,然后执行taggedInput1.union(taggedInput2)得到unionStream,之后使用UnionKeySelector将unionStream转换为KeyedStream,之后在对KeyedStream执行window操作,把原来的windowAssigner、trigger、evictor、allowedLateness都赋值过去,最后将用户定义的CoGroupFunction包装为CoGroupWindowFunction,然后调用windowedStream.apply方法CoGroupedStreams的WithWindow对象的apply操作借助了DataStream的union操作类合并两个stream,然后转换为KeyedStream,这里关键的两个类分别是TaggedUnion及UnionKeySelector;TaggedUnion里头有one、two两个属性,它提供了两个静态工厂方法one及two,可以看到TaggedUnion对象要么one为null,要么two为null,不可能两个同时有值;UnionKeySelector有两个KeySelector属性,它的getKey操作根据TaggedUnion来判断,如果是one,则使用keySelector1.getKey(value.getOne()),否则使用keySelector2.getKey(value.getTwo())(借助TaggedUnion类统一两个stream的element类型,然后好执行union操作)CoGroupWindowFunction继承了WrappingFunction(WrappingFunction继承了AbstractRichFunction,覆盖了父类的open、close、setRuntimeContext方法,用于管理wrappedFunction),实现了WindowFunction接口,其apply方法对TaggedUnion类型的Iterable数据进行拆解,分别拆分到oneValues及twoValues中,然后调用用户定义的CoGroupFunction的coGroup方法CoGroupFunction继承了Function,它定义了coGroup方法,该方法接收两个Iterable类型的element集合;JoinedStreams的WithWindow对象的apply方法内部将JoinFunction或者FlatJoinFunction包装为CoGroupFunction(JoinFunction使用JoinCoGroupFunction包装,FlatJoinFunction使用FlatJoinCoGroupFunction包装),然后去调用CoGroupedStreams的WithWindow的apply方法;而JoinCoGroupFunction及FlatJoinCoGroupFunction继承了WrappingFunction,同时实现CoGroupFunction接口定义的coGroup方法,默认是遍历第一个集合,对其每个元素遍历第二个集合,挨个执行JoinFunction或FlatJoinFunction的join方法(这里的操作对集合为空的情况不做任何操作,因而实现的就是inner join效果;用户使用coGroup操作可以自定义CoGroupFunction实现outer join)docDataStream Transformations聊聊flink DataStream的join操作Apache Flink using coGroup to achieve left-outer join ...
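结合上文小结中提到的"用户使用coGroup操作可以自定义CoGroupFunction实现outer join",下面给出一个用 window coGroup 实现 left outer join 的示意例子(示意代码,类名 CoGroupLeftOuterJoinDemo、数据均为演示假设):左流中在同一窗口内没有匹配到右流元素的 key,会输出 null 占位。

import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

public class CoGroupLeftOuterJoinDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // 元素为 (key, 事件时间戳),两条流的 key 部分重叠
        DataStream<Tuple2<String, Long>> left = env.fromElements(
                Tuple2.of("a", 1000L), Tuple2.of("b", 1500L), Tuple2.of("c", 2000L))
            .assignTimestampsAndWatermarks(new AscendingTimestampExtractor<Tuple2<String, Long>>() {
                @Override
                public long extractAscendingTimestamp(Tuple2<String, Long> element) {
                    return element.f1;
                }
            });

        DataStream<Tuple2<String, Long>> right = env.fromElements(
                Tuple2.of("a", 1200L), Tuple2.of("b", 1800L))
            .assignTimestampsAndWatermarks(new AscendingTimestampExtractor<Tuple2<String, Long>>() {
                @Override
                public long extractAscendingTimestamp(Tuple2<String, Long> element) {
                    return element.f1;
                }
            });

        left.coGroup(right)
            .where(new KeySelector<Tuple2<String, Long>, String>() {
                @Override
                public String getKey(Tuple2<String, Long> value) {
                    return value.f0;
                }
            })
            .equalTo(new KeySelector<Tuple2<String, Long>, String>() {
                @Override
                public String getKey(Tuple2<String, Long> value) {
                    return value.f0;
                }
            })
            .window(TumblingEventTimeWindows.of(Time.seconds(3)))
            .apply(new CoGroupFunction<Tuple2<String, Long>, Tuple2<String, Long>, String>() {
                @Override
                public void coGroup(Iterable<Tuple2<String, Long>> first,
                                    Iterable<Tuple2<String, Long>> second,
                                    Collector<String> out) {
                    // 对左流的每个元素,右流有匹配则输出匹配对,否则输出 null 占位,即 left outer join
                    for (Tuple2<String, Long> l : first) {
                        boolean matched = false;
                        for (Tuple2<String, Long> r : second) {
                            matched = true;
                            out.collect(l.f0 + ": " + l.f1 + " <-> " + r.f1);
                        }
                        if (!matched) {
                            out.collect(l.f0 + ": " + l.f1 + " <-> null");
                        }
                    }
                }
            })
            .print();

        env.execute("window coGroup demo");
    }
}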

January 12, 2019 · 5 min · jiezi

聊聊flink KeyedStream的intervalJoin操作

序本文主要研究一下flink KeyedStream的intervalJoin操作实例DataStream<Integer> orangeStream = …DataStream<Integer> greenStream = …orangeStream .keyBy(<KeySelector>) .intervalJoin(greenStream.keyBy(<KeySelector>)) .between(Time.milliseconds(-2), Time.milliseconds(1)) .process (new ProcessJoinFunction<Integer, Integer, String(){ @Override public void processElement(Integer left, Integer right, Context ctx, Collector<String> out) { out.collect(first + “,” + second); } });KeyedStream.intervalJoinflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/KeyedStream.java@Publicpublic class KeyedStream<T, KEY> extends DataStream<T> { //…… @PublicEvolving public <T1> IntervalJoin<T, T1, KEY> intervalJoin(KeyedStream<T1, KEY> otherStream) { return new IntervalJoin<>(this, otherStream); } //……}KeyedStream的intervalJoin创建并返回IntervalJoinIntervalJoinflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/KeyedStream.java @PublicEvolving public static class IntervalJoin<T1, T2, KEY> { private final KeyedStream<T1, KEY> streamOne; private final KeyedStream<T2, KEY> streamTwo; IntervalJoin( KeyedStream<T1, KEY> streamOne, KeyedStream<T2, KEY> streamTwo ) { this.streamOne = checkNotNull(streamOne); this.streamTwo = checkNotNull(streamTwo); } @PublicEvolving public IntervalJoined<T1, T2, KEY> between(Time lowerBound, Time upperBound) { TimeCharacteristic timeCharacteristic = streamOne.getExecutionEnvironment().getStreamTimeCharacteristic(); if (timeCharacteristic != TimeCharacteristic.EventTime) { throw new UnsupportedTimeCharacteristicException(“Time-bounded stream joins are only supported in event time”); } checkNotNull(lowerBound, “A lower bound needs to be provided for a time-bounded join”); checkNotNull(upperBound, “An upper bound needs to be provided for a time-bounded join”); return new IntervalJoined<>( streamOne, streamTwo, lowerBound.toMilliseconds(), upperBound.toMilliseconds(), true, true ); } }IntervalJoin提供了between操作,用于设置interval的lowerBound及upperBound,这里可以看到between方法里头对非TimeCharacteristic.EventTime的直接抛出UnsupportedTimeCharacteristicException;between操作创建并返回IntervalJoinedIntervalJoinedflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/KeyedStream.java @PublicEvolving public static class IntervalJoined<IN1, IN2, KEY> { private final KeyedStream<IN1, KEY> left; private final KeyedStream<IN2, KEY> right; private final long lowerBound; private final long upperBound; private final KeySelector<IN1, KEY> keySelector1; private final KeySelector<IN2, KEY> keySelector2; private boolean lowerBoundInclusive; private boolean upperBoundInclusive; public IntervalJoined( KeyedStream<IN1, KEY> left, KeyedStream<IN2, KEY> right, long lowerBound, long upperBound, boolean lowerBoundInclusive, boolean upperBoundInclusive) { this.left = checkNotNull(left); this.right = checkNotNull(right); this.lowerBound = lowerBound; this.upperBound = upperBound; this.lowerBoundInclusive = lowerBoundInclusive; this.upperBoundInclusive = upperBoundInclusive; this.keySelector1 = left.getKeySelector(); this.keySelector2 = right.getKeySelector(); } @PublicEvolving public IntervalJoined<IN1, IN2, KEY> upperBoundExclusive() { this.upperBoundInclusive = false; return this; } @PublicEvolving public IntervalJoined<IN1, IN2, KEY> lowerBoundExclusive() { this.lowerBoundInclusive = false; return this; } @PublicEvolving public <OUT> SingleOutputStreamOperator<OUT> process(ProcessJoinFunction<IN1, IN2, OUT> processJoinFunction) { 
Preconditions.checkNotNull(processJoinFunction); final TypeInformation<OUT> outputType = TypeExtractor.getBinaryOperatorReturnType( processJoinFunction, ProcessJoinFunction.class, 0, 1, 2, TypeExtractor.NO_INDEX, left.getType(), right.getType(), Utils.getCallLocationName(), true ); return process(processJoinFunction, outputType); } @PublicEvolving public <OUT> SingleOutputStreamOperator<OUT> process( ProcessJoinFunction<IN1, IN2, OUT> processJoinFunction, TypeInformation<OUT> outputType) { Preconditions.checkNotNull(processJoinFunction); Preconditions.checkNotNull(outputType); final ProcessJoinFunction<IN1, IN2, OUT> cleanedUdf = left.getExecutionEnvironment().clean(processJoinFunction); final IntervalJoinOperator<KEY, IN1, IN2, OUT> operator = new IntervalJoinOperator<>( lowerBound, upperBound, lowerBoundInclusive, upperBoundInclusive, left.getType().createSerializer(left.getExecutionConfig()), right.getType().createSerializer(right.getExecutionConfig()), cleanedUdf ); return left .connect(right) .keyBy(keySelector1, keySelector2) .transform(“Interval Join”, outputType, operator); } }IntervalJoined默认对lowerBound及upperBound是inclusive的,它也提供了lowerBoundExclusive、upperBoundExclusive来单独设置为exclusive;IntervalJoined提供了process操作,接收的是ProcessJoinFunction;process操作里头创建了IntervalJoinOperator,然后执行left.connect(right).keyBy(keySelector1, keySelector2).transform(“Interval Join”, outputType, operator),返回的是SingleOutputStreamOperator(本实例left为orangeStream,right为greenStream)ProcessJoinFunctionflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/functions/co/ProcessJoinFunction.java@PublicEvolvingpublic abstract class ProcessJoinFunction<IN1, IN2, OUT> extends AbstractRichFunction { private static final long serialVersionUID = -2444626938039012398L; public abstract void processElement(IN1 left, IN2 right, Context ctx, Collector<OUT> out) throws Exception; public abstract class Context { public abstract long getLeftTimestamp(); public abstract long getRightTimestamp(); public abstract long getTimestamp(); public abstract <X> void output(OutputTag<X> outputTag, X value); }}ProcessJoinFunction继承了AbstractRichFunction,它定义了processElement抽象方法,同时也定义了自身的Context对象,该对象定义了getLeftTimestamp、getRightTimestamp、getTimestamp、output四个抽象方法IntervalJoinOperatorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/operators/co/IntervalJoinOperator.java@Internalpublic class IntervalJoinOperator<K, T1, T2, OUT> extends AbstractUdfStreamOperator<OUT, ProcessJoinFunction<T1, T2, OUT>> implements TwoInputStreamOperator<T1, T2, OUT>, Triggerable<K, String> { private static final long serialVersionUID = -5380774605111543454L; private static final Logger logger = LoggerFactory.getLogger(IntervalJoinOperator.class); private static final String LEFT_BUFFER = “LEFT_BUFFER”; private static final String RIGHT_BUFFER = “RIGHT_BUFFER”; private static final String CLEANUP_TIMER_NAME = “CLEANUP_TIMER”; private static final String CLEANUP_NAMESPACE_LEFT = “CLEANUP_LEFT”; private static final String CLEANUP_NAMESPACE_RIGHT = “CLEANUP_RIGHT”; private final long lowerBound; private final long upperBound; private final TypeSerializer<T1> leftTypeSerializer; private final TypeSerializer<T2> rightTypeSerializer; private transient MapState<Long, List<BufferEntry<T1>>> leftBuffer; private transient MapState<Long, List<BufferEntry<T2>>> rightBuffer; private transient TimestampedCollector<OUT> collector; private transient ContextImpl context; private transient InternalTimerService<String> internalTimerService; public 
IntervalJoinOperator( long lowerBound, long upperBound, boolean lowerBoundInclusive, boolean upperBoundInclusive, TypeSerializer<T1> leftTypeSerializer, TypeSerializer<T2> rightTypeSerializer, ProcessJoinFunction<T1, T2, OUT> udf) { super(Preconditions.checkNotNull(udf)); Preconditions.checkArgument(lowerBound <= upperBound, “lowerBound <= upperBound must be fulfilled”); // Move buffer by +1 / -1 depending on inclusiveness in order not needing // to check for inclusiveness later on this.lowerBound = (lowerBoundInclusive) ? lowerBound : lowerBound + 1L; this.upperBound = (upperBoundInclusive) ? upperBound : upperBound - 1L; this.leftTypeSerializer = Preconditions.checkNotNull(leftTypeSerializer); this.rightTypeSerializer = Preconditions.checkNotNull(rightTypeSerializer); } @Override public void open() throws Exception { super.open(); collector = new TimestampedCollector<>(output); context = new ContextImpl(userFunction); internalTimerService = getInternalTimerService(CLEANUP_TIMER_NAME, StringSerializer.INSTANCE, this); } @Override public void initializeState(StateInitializationContext context) throws Exception { super.initializeState(context); this.leftBuffer = context.getKeyedStateStore().getMapState(new MapStateDescriptor<>( LEFT_BUFFER, LongSerializer.INSTANCE, new ListSeriawelizer<>(new BufferEntrySerializer<>(leftTypeSerializer)) )); this.rightBuffer = context.getKeyedStateStore().getMapState(new MapStateDescriptor<>( RIGHT_BUFFER, LongSerializer.INSTANCE, new ListSerializer<>(new BufferEntrySerializer<>(rightTypeSerializer)) )); } @Override public void processElement1(StreamRecord<T1> record) throws Exception { processElement(record, leftBuffer, rightBuffer, lowerBound, upperBound, true); } @Override public void processElement2(StreamRecord<T2> record) throws Exception { processElement(record, rightBuffer, leftBuffer, -upperBound, -lowerBound, false); } @SuppressWarnings(“unchecked”) private <THIS, OTHER> void processElement( final StreamRecord<THIS> record, final MapState<Long, List<IntervalJoinOperator.BufferEntry<THIS>>> ourBuffer, final MapState<Long, List<IntervalJoinOperator.BufferEntry<OTHER>>> otherBuffer, final long relativeLowerBound, final long relativeUpperBound, final boolean isLeft) throws Exception { final THIS ourValue = record.getValue(); final long ourTimestamp = record.getTimestamp(); if (ourTimestamp == Long.MIN_VALUE) { throw new FlinkException(“Long.MIN_VALUE timestamp: Elements used in " + “interval stream joins need to have timestamps meaningful timestamps.”); } if (isLate(ourTimestamp)) { return; } addToBuffer(ourBuffer, ourValue, ourTimestamp); for (Map.Entry<Long, List<BufferEntry<OTHER>>> bucket: otherBuffer.entries()) { final long timestamp = bucket.getKey(); if (timestamp < ourTimestamp + relativeLowerBound || timestamp > ourTimestamp + relativeUpperBound) { continue; } for (BufferEntry<OTHER> entry: bucket.getValue()) { if (isLeft) { collect((T1) ourValue, (T2) entry.element, ourTimestamp, timestamp); } else { collect((T1) entry.element, (T2) ourValue, timestamp, ourTimestamp); } } } long cleanupTime = (relativeUpperBound > 0L) ? 
ourTimestamp + relativeUpperBound : ourTimestamp; if (isLeft) { internalTimerService.registerEventTimeTimer(CLEANUP_NAMESPACE_LEFT, cleanupTime); } else { internalTimerService.registerEventTimeTimer(CLEANUP_NAMESPACE_RIGHT, cleanupTime); } } private boolean isLate(long timestamp) { long currentWatermark = internalTimerService.currentWatermark(); return currentWatermark != Long.MIN_VALUE && timestamp < currentWatermark; } private void collect(T1 left, T2 right, long leftTimestamp, long rightTimestamp) throws Exception { final long resultTimestamp = Math.max(leftTimestamp, rightTimestamp); collector.setAbsoluteTimestamp(resultTimestamp); context.updateTimestamps(leftTimestamp, rightTimestamp, resultTimestamp); userFunction.processElement(left, right, context, collector); } @Override public void onEventTime(InternalTimer<K, String> timer) throws Exception { long timerTimestamp = timer.getTimestamp(); String namespace = timer.getNamespace(); logger.trace(“onEventTime @ {}”, timerTimestamp); switch (namespace) { case CLEANUP_NAMESPACE_LEFT: { long timestamp = (upperBound <= 0L) ? timerTimestamp : timerTimestamp - upperBound; logger.trace(“Removing from left buffer @ {}”, timestamp); leftBuffer.remove(timestamp); break; } case CLEANUP_NAMESPACE_RIGHT: { long timestamp = (lowerBound <= 0L) ? timerTimestamp + lowerBound : timerTimestamp; logger.trace(“Removing from right buffer @ {}”, timestamp); rightBuffer.remove(timestamp); break; } default: throw new RuntimeException(“Invalid namespace " + namespace); } } @Override public void onProcessingTime(InternalTimer<K, String> timer) throws Exception { // do nothing. } //……}IntervalJoinOperator继承了AbstractUdfStreamOperator抽象类,实现了TwoInputStreamOperator及Triggerable接口IntervalJoinOperator覆盖了AbstractUdfStreamOperator(StreamOperator定义)的open、initializeState方法,它在open方法里头创建了InternalTimerService,传递的Triggerable参数为this,即自身实现的Triggerable接口;在initializeState方法里头创建了leftBuffer和rightBuffer两个MapStateIntervalJoinOperator实现了TwoInputStreamOperator接口定义的processElement1、processElement2方法(TwoInputStreamOperator接口定义的其他一些方法在AbstractUdfStreamOperator的父类AbstractStreamOperator中有实现);processElement1、processElement2方法内部都调用了processElement方法,只是传递的relativeLowerBound、relativeUpperBound、isLeft参数不同以及leftBuffer和rightBuffer的传参顺序不同processElement方法里头实现了intervalJoin的时间匹配逻辑,它会从internalTimerService获取currentWatermark,然后判断element是否late,如果late直接返回,否则继续往下执行;之后就是把element的value添加到ourBuffer中(对于processElement1来说ourBuffer为leftBuffer,对于processElement2来说ourBuffer为rightBuffer);之后就是遍历otherBuffer中的每个元素,挨个判断时间是否满足要求(即ourTimestamp + relativeLowerBound <= timestamp <= ourTimestamp + relativeUpperBound),不满足要求的直接跳过,满足要求的就调用collect方法(collect方法里头执行的是userFunction.processElement,即调用用户定义的ProcessJoinFunction的processElement方法);之后就是计算cleanupTime,调用internalTimerService.registerEventTimeTimer注册清理该element的timerIntervalJoinOperator实现了Triggerable接口定义的onEventTime及onProcessingTime方法,其中onProcessingTime不做任何操作,而onEventTime则会根据timestamp清理leftBuffer或者rightBuffer中的element小结flink的intervalJoin操作要求是KeyedStream,而且必须是TimeCharacteristic.EventTime;KeyedStream的intervalJoin创建并返回IntervalJoin;IntervalJoin提供了between操作,用于设置interval的lowerBound及upperBound,该操作创建并返回IntervalJoinedIntervalJoined提供了process操作,接收的是ProcessJoinFunction;process操作里头创建了IntervalJoinOperator,然后执行left.connect(right).keyBy(keySelector1, keySelector2).transform(“Interval Join”, outputType, 
operator),返回的是SingleOutputStreamOperator
IntervalJoinOperator继承了AbstractUdfStreamOperator抽象类,实现了TwoInputStreamOperator及Triggerable接口;它覆盖了AbstractUdfStreamOperator(StreamOperator定义)的open、initializeState方法:在open方法里头创建了InternalTimerService,传递的Triggerable参数为this,即自身实现的Triggerable接口;在initializeState方法里头创建了leftBuffer和rightBuffer两个MapState;它实现了TwoInputStreamOperator接口定义的processElement1、processElement2方法,这两个方法内部都调用了processElement方法,只是传递的relativeLowerBound、relativeUpperBound、isLeft参数不同,以及leftBuffer和rightBuffer的传参顺序不同
IntervalJoinOperator的processElement方法实现了intervalJoin的时间匹配逻辑:它首先判断element是否late,如果late直接返回;否则将element添加到ourBuffer中,然后遍历otherBuffer中的每个元素,挨个判断时间是否满足要求(即ourTimestamp + relativeLowerBound <= timestamp <= ourTimestamp + relativeUpperBound),不满足要求的直接跳过,满足要求的就调用collect方法(collect方法里头执行的是userFunction.processElement,即调用用户定义的ProcessJoinFunction的processElement方法);最后计算cleanupTime,调用internalTimerService.registerEventTimeTimer注册清理该element的timer
IntervalJoinOperator实现了Triggerable接口定义的onEventTime及onProcessingTime方法,其中onProcessingTime不做任何操作,而onEventTime则会根据timestamp清理leftBuffer或者rightBuffer中的element
doc:Interval Join ...
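补充一个intervalJoin的最小示意用法(仅为示意草稿,非Flink源码内容:沿用前文的orangeStream、greenStream命名,假设二者均为已经assignTimestampsAndWatermarks过的DataStream<Tuple2<String, Integer>>,环境为TimeCharacteristic.EventTime,key取f0字段,时间边界数值均为假设,import省略):

// 两条流使用同一个KeySelector,按f0字段keyBy
KeySelector<Tuple2<String, Integer>, String> keyByF0 = new KeySelector<Tuple2<String, Integer>, String>() {
    @Override
    public String getKey(Tuple2<String, Integer> value) {
        return value.f0;
    }
};

DataStream<String> joined = orangeStream
    .keyBy(keyByF0)
    .intervalJoin(greenStream.keyBy(keyByF0))
    // 默认lowerBound、upperBound均为inclusive,如需exclusive可调用lowerBoundExclusive()/upperBoundExclusive()
    .between(Time.seconds(-2), Time.seconds(1))
    .process(new ProcessJoinFunction<Tuple2<String, Integer>, Tuple2<String, Integer>, String>() {
        @Override
        public void processElement(Tuple2<String, Integer> left, Tuple2<String, Integer> right,
                Context ctx, Collector<String> out) {
            // ctx可取到左右两侧元素的timestamp,这里简单拼接后发射
            out.collect(left.f1 + "," + right.f1 + " @ " + ctx.getTimestamp());
        }
    });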

January 11, 2019 · 5 min · jiezi

聊聊flink DataStream的join操作

序本文主要研究一下flink DataStream的join操作实例stream.join(otherStream) .where(<KeySelector>) .equalTo(<KeySelector>) .window(<WindowAssigner>) .apply(<JoinFunction>)这里首先调用join,与另外一个stream合并,返回的是JoinedStreams,之后就可以调用JoinedStreams的where操作来构建Where对象构造条件;Where有equalTo操作可以构造EqualTo,而EqualTo有window操作可以构造WithWindow,而WithWindow可以设置windowAssigner、trigger、evictor、allowedLateness,它提供apply操作DataStream.joinflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/DataStream.java@Publicpublic class DataStream<T> { //…… /** * Creates a join operation. See {@link JoinedStreams} for an example of how the keys * and window can be specified. / public <T2> JoinedStreams<T, T2> join(DataStream<T2> otherStream) { return new JoinedStreams<>(this, otherStream); } //……}DataStream提供了join方法,用于执行join操作,它返回的是JoinedStreamsJoinedStreamsflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/JoinedStreams.java@Publicpublic class JoinedStreams<T1, T2> { /* The first input stream. / private final DataStream<T1> input1; /* The second input stream. / private final DataStream<T2> input2; public JoinedStreams(DataStream<T1> input1, DataStream<T2> input2) { this.input1 = requireNonNull(input1); this.input2 = requireNonNull(input2); } public <KEY> Where<KEY> where(KeySelector<T1, KEY> keySelector) { requireNonNull(keySelector); final TypeInformation<KEY> keyType = TypeExtractor.getKeySelectorTypes(keySelector, input1.getType()); return where(keySelector, keyType); } public <KEY> Where<KEY> where(KeySelector<T1, KEY> keySelector, TypeInformation<KEY> keyType) { requireNonNull(keySelector); requireNonNull(keyType); return new Where<>(input1.clean(keySelector), keyType); } //……}JoinedStreams主要是提供where操作来构建Where对象Whereflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/JoinedStreams.java @Public public class Where<KEY> { private final KeySelector<T1, KEY> keySelector1; private final TypeInformation<KEY> keyType; Where(KeySelector<T1, KEY> keySelector1, TypeInformation<KEY> keyType) { this.keySelector1 = keySelector1; this.keyType = keyType; } public EqualTo equalTo(KeySelector<T2, KEY> keySelector) { requireNonNull(keySelector); final TypeInformation<KEY> otherKey = TypeExtractor.getKeySelectorTypes(keySelector, input2.getType()); return equalTo(keySelector, otherKey); } public EqualTo equalTo(KeySelector<T2, KEY> keySelector, TypeInformation<KEY> keyType) { requireNonNull(keySelector); requireNonNull(keyType); if (!keyType.equals(this.keyType)) { throw new IllegalArgumentException(“The keys for the two inputs are not equal: " + “first key = " + this.keyType + " , second key = " + keyType); } return new EqualTo(input2.clean(keySelector)); } //…… }Where对象主要提供equalTo操作用于构建EqualTo对象EqualToflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/JoinedStreams.java @Public public class EqualTo { private final KeySelector<T2, KEY> keySelector2; EqualTo(KeySelector<T2, KEY> keySelector2) { this.keySelector2 = requireNonNull(keySelector2); } /* * Specifies the window on which the join operation works. / @PublicEvolving public <W extends Window> WithWindow<T1, T2, KEY, W> window(WindowAssigner<? 
super TaggedUnion<T1, T2>, W> assigner) { return new WithWindow<>(input1, input2, keySelector1, keySelector2, keyType, assigner, null, null, null); } }EqualTo对象提供window操作用于构建WithWindow对象WithWindow/flink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/JoinedStreams.java @Public public static class WithWindow<T1, T2, KEY, W extends Window> { private final DataStream<T1> input1; private final DataStream<T2> input2; private final KeySelector<T1, KEY> keySelector1; private final KeySelector<T2, KEY> keySelector2; private final TypeInformation<KEY> keyType; private final WindowAssigner<? super TaggedUnion<T1, T2>, W> windowAssigner; private final Trigger<? super TaggedUnion<T1, T2>, ? super W> trigger; private final Evictor<? super TaggedUnion<T1, T2>, ? super W> evictor; private final Time allowedLateness; private CoGroupedStreams.WithWindow<T1, T2, KEY, W> coGroupedWindowedStream; @PublicEvolving protected WithWindow(DataStream<T1> input1, DataStream<T2> input2, KeySelector<T1, KEY> keySelector1, KeySelector<T2, KEY> keySelector2, TypeInformation<KEY> keyType, WindowAssigner<? super TaggedUnion<T1, T2>, W> windowAssigner, Trigger<? super TaggedUnion<T1, T2>, ? super W> trigger, Evictor<? super TaggedUnion<T1, T2>, ? super W> evictor, Time allowedLateness) { this.input1 = requireNonNull(input1); this.input2 = requireNonNull(input2); this.keySelector1 = requireNonNull(keySelector1); this.keySelector2 = requireNonNull(keySelector2); this.keyType = requireNonNull(keyType); this.windowAssigner = requireNonNull(windowAssigner); this.trigger = trigger; this.evictor = evictor; this.allowedLateness = allowedLateness; } @PublicEvolving public WithWindow<T1, T2, KEY, W> trigger(Trigger<? super TaggedUnion<T1, T2>, ? super W> newTrigger) { return new WithWindow<>(input1, input2, keySelector1, keySelector2, keyType, windowAssigner, newTrigger, evictor, allowedLateness); } @PublicEvolving public WithWindow<T1, T2, KEY, W> evictor(Evictor<? super TaggedUnion<T1, T2>, ? 
super W> newEvictor) { return new WithWindow<>(input1, input2, keySelector1, keySelector2, keyType, windowAssigner, trigger, newEvictor, allowedLateness); } @PublicEvolving public WithWindow<T1, T2, KEY, W> allowedLateness(Time newLateness) { return new WithWindow<>(input1, input2, keySelector1, keySelector2, keyType, windowAssigner, trigger, evictor, newLateness); } public <T> DataStream<T> apply(JoinFunction<T1, T2, T> function) { TypeInformation<T> resultType = TypeExtractor.getBinaryOperatorReturnType( function, JoinFunction.class, 0, 1, 2, TypeExtractor.NO_INDEX, input1.getType(), input2.getType(), “Join”, false); return apply(function, resultType); } @PublicEvolving @Deprecated public <T> SingleOutputStreamOperator<T> with(JoinFunction<T1, T2, T> function) { return (SingleOutputStreamOperator<T>) apply(function); } public <T> DataStream<T> apply(FlatJoinFunction<T1, T2, T> function, TypeInformation<T> resultType) { //clean the closure function = input1.getExecutionEnvironment().clean(function); coGroupedWindowedStream = input1.coGroup(input2) .where(keySelector1) .equalTo(keySelector2) .window(windowAssigner) .trigger(trigger) .evictor(evictor) .allowedLateness(allowedLateness); return coGroupedWindowedStream .apply(new FlatJoinCoGroupFunction<>(function), resultType); } @PublicEvolving @Deprecated public <T> SingleOutputStreamOperator<T> with(FlatJoinFunction<T1, T2, T> function, TypeInformation<T> resultType) { return (SingleOutputStreamOperator<T>) apply(function, resultType); } public <T> DataStream<T> apply(FlatJoinFunction<T1, T2, T> function) { TypeInformation<T> resultType = TypeExtractor.getBinaryOperatorReturnType( function, FlatJoinFunction.class, 0, 1, 2, new int[]{2, 0}, input1.getType(), input2.getType(), “Join”, false); return apply(function, resultType); } @PublicEvolving @Deprecated public <T> SingleOutputStreamOperator<T> with(FlatJoinFunction<T1, T2, T> function) { return (SingleOutputStreamOperator<T>) apply(function); } public <T> DataStream<T> apply(JoinFunction<T1, T2, T> function, TypeInformation<T> resultType) { //clean the closure function = input1.getExecutionEnvironment().clean(function); coGroupedWindowedStream = input1.coGroup(input2) .where(keySelector1) .equalTo(keySelector2) .window(windowAssigner) .trigger(trigger) .evictor(evictor) .allowedLateness(allowedLateness); return coGroupedWindowedStream .apply(new JoinCoGroupFunction<>(function), resultType); } @PublicEvolving @Deprecated public <T> SingleOutputStreamOperator<T> with(JoinFunction<T1, T2, T> function, TypeInformation<T> resultType) { return (SingleOutputStreamOperator<T>) apply(function, resultType); } @VisibleForTesting Time getAllowedLateness() { return allowedLateness; } @VisibleForTesting CoGroupedStreams.WithWindow<T1, T2, KEY, W> getCoGroupedWindowedStream() { return coGroupedWindowedStream; } 
}WithWindow可以设置windowAssigner、trigger、evictor、allowedLateness,它提供apply操作(with操作被标记为废弃)apply操作可以接收JoinFunction或者FlatJoinFunction,它内部是使用DataStream的coGroup方法创建CoGroupedStreams,之后将自身的where及equalTo的keySelector、windowAssigner、trigger、evictor、allowedLateness都设置给CoGroupedStreams,最后调用CoGroupedStreams的WithWindow对象的apply方法CoGroupedStreams的WithWindow对象的apply方法与JoinedStreams的WithWindow对象的apply方法参数不同,CoGroupedStreams的WithWindow的apply方法接收的是CoGroupFunction,因而JoinedStreams的WithWindow对象的apply方法内部将JoinFunction或者FlatJoinFunction包装为CoGroupFunction(JoinFunction使用JoinCoGroupFunction包装,FlatJoinFunction使用FlatJoinCoGroupFunction包装)传递给CoGroupedStreams的WithWindow的apply方法JoinFunctionflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/functions/JoinFunction.java@Public@FunctionalInterfacepublic interface JoinFunction<IN1, IN2, OUT> extends Function, Serializable { /* * The join method, called once per joined pair of elements. * * @param first The element from first input. * @param second The element from second input. * @return The resulting element. * * @throws Exception This method may throw exceptions. Throwing an exception will cause the operation * to fail and may trigger recovery. / OUT join(IN1 first, IN2 second) throws Exception;}JoinFunction继承了Function、Serializable,它定义了join操作,默认是inner join的语义,如果需要outer join,可以使用CoGroupFunctionFlatJoinFunctionflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/functions/FlatJoinFunction.java@Public@FunctionalInterfacepublic interface FlatJoinFunction<IN1, IN2, OUT> extends Function, Serializable { /* * The join method, called once per joined pair of elements. * * @param first The element from first input. * @param second The element from second input. * @param out The collector used to return zero, one, or more elements. * * @throws Exception This method may throw exceptions. Throwing an exception will cause the operation * to fail and may trigger recovery. / void join (IN1 first, IN2 second, Collector<OUT> out) throws Exception;}FlatJoinFunction继承了Function、Serializable,它定义了join操作,默认是inner join的语义,如果需要outer join,可以使用CoGroupFunction;与JoinFunction的join方法不同,FlatJoinFunction的join方法多了Collector参数,可以用来发射0条、1条或者多条数据,所以是Flat命名CoGroupedStreamsflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/CoGroupedStreams.java@Publicpublic class CoGroupedStreams<T1, T2> { //……@Public public static class WithWindow<T1, T2, KEY, W extends Window> { private final DataStream<T1> input1; private final DataStream<T2> input2; private final KeySelector<T1, KEY> keySelector1; private final KeySelector<T2, KEY> keySelector2; private final TypeInformation<KEY> keyType; private final WindowAssigner<? super TaggedUnion<T1, T2>, W> windowAssigner; private final Trigger<? super TaggedUnion<T1, T2>, ? super W> trigger; private final Evictor<? super TaggedUnion<T1, T2>, ? super W> evictor; private final Time allowedLateness; private WindowedStream<TaggedUnion<T1, T2>, KEY, W> windowedStream; protected WithWindow(DataStream<T1> input1, DataStream<T2> input2, KeySelector<T1, KEY> keySelector1, KeySelector<T2, KEY> keySelector2, TypeInformation<KEY> keyType, WindowAssigner<? super TaggedUnion<T1, T2>, W> windowAssigner, Trigger<? super TaggedUnion<T1, T2>, ? super W> trigger, Evictor<? super TaggedUnion<T1, T2>, ? 
super W> evictor, Time allowedLateness) { this.input1 = input1; this.input2 = input2; this.keySelector1 = keySelector1; this.keySelector2 = keySelector2; this.keyType = keyType; this.windowAssigner = windowAssigner; this.trigger = trigger; this.evictor = evictor; this.allowedLateness = allowedLateness; } @PublicEvolving public WithWindow<T1, T2, KEY, W> trigger(Trigger<? super TaggedUnion<T1, T2>, ? super W> newTrigger) { return new WithWindow<>(input1, input2, keySelector1, keySelector2, keyType, windowAssigner, newTrigger, evictor, allowedLateness); } @PublicEvolving public WithWindow<T1, T2, KEY, W> evictor(Evictor<? super TaggedUnion<T1, T2>, ? super W> newEvictor) { return new WithWindow<>(input1, input2, keySelector1, keySelector2, keyType, windowAssigner, trigger, newEvictor, allowedLateness); } @PublicEvolving public WithWindow<T1, T2, KEY, W> allowedLateness(Time newLateness) { return new WithWindow<>(input1, input2, keySelector1, keySelector2, keyType, windowAssigner, trigger, evictor, newLateness); } public <T> DataStream<T> apply(CoGroupFunction<T1, T2, T> function) { TypeInformation<T> resultType = TypeExtractor.getCoGroupReturnTypes( function, input1.getType(), input2.getType(), “CoGroup”, false); return apply(function, resultType); } public <T> DataStream<T> apply(CoGroupFunction<T1, T2, T> function, TypeInformation<T> resultType) { //clean the closure function = input1.getExecutionEnvironment().clean(function); UnionTypeInfo<T1, T2> unionType = new UnionTypeInfo<>(input1.getType(), input2.getType()); UnionKeySelector<T1, T2, KEY> unionKeySelector = new UnionKeySelector<>(keySelector1, keySelector2); DataStream<TaggedUnion<T1, T2>> taggedInput1 = input1 .map(new Input1Tagger<T1, T2>()) .setParallelism(input1.getParallelism()) .returns(unionType); DataStream<TaggedUnion<T1, T2>> taggedInput2 = input2 .map(new Input2Tagger<T1, T2>()) .setParallelism(input2.getParallelism()) .returns(unionType); DataStream<TaggedUnion<T1, T2>> unionStream = taggedInput1.union(taggedInput2); // we explicitly create the keyed stream to manually pass the key type information in windowedStream = new KeyedStream<TaggedUnion<T1, T2>, KEY>(unionStream, unionKeySelector, keyType) .window(windowAssigner); if (trigger != null) { windowedStream.trigger(trigger); } if (evictor != null) { windowedStream.evictor(evictor); } if (allowedLateness != null) { windowedStream.allowedLateness(allowedLateness); } return windowedStream.apply(new CoGroupWindowFunction<T1, T2, T, KEY, W>(function), resultType); } //…… } //……}CoGroupedStreams的整体类结构跟JoinedStreams很像,CoGroupedStreams提供where操作来构建Where对象;Where对象主要提供equalTo操作用于构建EqualTo对象;EqualTo对象提供window操作用于构建WithWindow对象;WithWindow可以设置windowAssigner、trigger、evictor、allowedLateness,它提供apply操作;其中一个不同的地方是CoGroupedStreams定义的WithWindow对象的apply操作接收的Function是CoGroupFunction类型,而JoinedStreams定义的WithWindow对象的apply操作接收的Function类型是JoinFunction或FlatJoinFunctionCoGroupFunctionflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/functions/CoGroupFunction.java@Public@FunctionalInterfacepublic interface CoGroupFunction<IN1, IN2, O> extends Function, Serializable { /* * This method must be implemented to provide a user implementation of a * coGroup. It is called for each pair of element groups where the elements share the * same key. * * @param first The records from the first input. * @param second The records from the second. * @param out A collector to return elements. 
* * @throws Exception The function may throw Exceptions, which will cause the program to cancel, * and may trigger the recovery logic. / void coGroup(Iterable<IN1> first, Iterable<IN2> second, Collector<O> out) throws Exception;}CoGroupFunction继承了Function、Serializable,它定义了coGroup操作,可以用来实现outer join,其参数使用的是Iterable,而JoinFunction与FlatJoinFunction的join参数使用的是单个对象类型WrappingFunctionflink-java-1.7.0-sources.jar!/org/apache/flink/api/java/operators/translation/WrappingFunction.java@Internalpublic abstract class WrappingFunction<T extends Function> extends AbstractRichFunction { private static final long serialVersionUID = 1L; protected T wrappedFunction; protected WrappingFunction(T wrappedFunction) { this.wrappedFunction = wrappedFunction; } @Override public void open(Configuration parameters) throws Exception { FunctionUtils.openFunction(this.wrappedFunction, parameters); } @Override public void close() throws Exception { FunctionUtils.closeFunction(this.wrappedFunction); } @Override public void setRuntimeContext(RuntimeContext t) { super.setRuntimeContext(t); FunctionUtils.setFunctionRuntimeContext(this.wrappedFunction, t); } public T getWrappedFunction () { return this.wrappedFunction; }}WrappingFunction继承了AbstractRichFunction,这里它覆盖了父类的open、close、setRuntimeContext方法,用于管理wrappedFunctionJoinCoGroupFunctionflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/JoinedStreams.java /* * CoGroup function that does a nested-loop join to get the join result. / private static class JoinCoGroupFunction<T1, T2, T> extends WrappingFunction<JoinFunction<T1, T2, T>> implements CoGroupFunction<T1, T2, T> { private static final long serialVersionUID = 1L; public JoinCoGroupFunction(JoinFunction<T1, T2, T> wrappedFunction) { super(wrappedFunction); } @Override public void coGroup(Iterable<T1> first, Iterable<T2> second, Collector<T> out) throws Exception { for (T1 val1: first) { for (T2 val2: second) { out.collect(wrappedFunction.join(val1, val2)); } } } }JoinCoGroupFunction继承了WrappingFunction,同时实现CoGroupFunction接口定义的coGroup方法,默认是遍历第一个集合,对其每个元素遍历第二个集合,挨个执行wrappedFunction.join,然后发射join数据JoinedStreams定义了私有静态类JoinCoGroupFunction,JoinedStreams的WithWindow对象的apply方法内部使用它将JoinFunction进行包装,然后好调用CoGroupedStreams的WithWindow的apply方法JoinFunction定义的join方法,接收的是两个对象类型参数,而JoinCoGroupFunction定义的coGroup方法,接收的两个Iterable类型参数FlatJoinCoGroupFunctionflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/JoinedStreams.java /* * CoGroup function that does a nested-loop join to get the join result. 
(FlatJoin version) */ private static class FlatJoinCoGroupFunction<T1, T2, T> extends WrappingFunction<FlatJoinFunction<T1, T2, T>> implements CoGroupFunction<T1, T2, T> { private static final long serialVersionUID = 1L; public FlatJoinCoGroupFunction(FlatJoinFunction<T1, T2, T> wrappedFunction) { super(wrappedFunction); } @Override public void coGroup(Iterable<T1> first, Iterable<T2> second, Collector<T> out) throws Exception { for (T1 val1: first) { for (T2 val2: second) { wrappedFunction.join(val1, val2, out); } } } }
FlatJoinCoGroupFunction继承了WrappingFunction,同时实现CoGroupFunction接口定义的coGroup方法,默认是遍历第一个集合,对其每个元素遍历第二个集合,挨个执行wrappedFunction.join(并把Collector传给join方法,由FlatJoinFunction自行发射数据)
JoinedStreams定义了私有静态类FlatJoinCoGroupFunction,JoinedStreams的WithWindow对象的apply方法内部使用它将FlatJoinFunction进行包装,然后再调用CoGroupedStreams的WithWindow的apply方法
FlatJoinFunction定义的join方法,接收的是两个对象类型参数外加一个Collector,而FlatJoinCoGroupFunction定义的coGroup方法,接收的是两个Iterable类型参数
小结
DataStream提供了join方法,用于执行join操作,它返回的是JoinedStreams;JoinedStreams主要是提供where操作来构建Where对象;Where对象主要提供equalTo操作用于构建EqualTo对象;EqualTo对象提供window操作用于构建WithWindow对象;WithWindow可以设置windowAssigner、trigger、evictor、allowedLateness,它提供apply操作
apply操作可以接收JoinFunction或者FlatJoinFunction,它内部是使用DataStream的coGroup方法创建CoGroupedStreams,之后将自身的where及equalTo的keySelector、windowAssigner、trigger、evictor、allowedLateness都设置给CoGroupedStreams,最后调用CoGroupedStreams的WithWindow对象的apply方法;JoinFunction及FlatJoinFunction都继承了Function、Serializable,定义了join操作,默认是inner join的语义,如果需要outer join,可以使用CoGroupFunction;FlatJoinFunction与JoinFunction的join的不同之处在于FlatJoinFunction的join方法多了Collector参数,可以用来发射0条、1条或者多条数据,所以用Flat命名
CoGroupedStreams的WithWindow对象的apply方法与JoinedStreams的WithWindow对象的apply方法参数不同,CoGroupedStreams的WithWindow的apply方法接收的是CoGroupFunction,因而JoinedStreams的WithWindow对象的apply方法内部会将JoinFunction或者FlatJoinFunction包装为CoGroupFunction(JoinFunction使用JoinCoGroupFunction包装,FlatJoinFunction使用FlatJoinCoGroupFunction包装)再传递给CoGroupedStreams的WithWindow的apply方法;JoinCoGroupFunction与FlatJoinCoGroupFunction都继承了WrappingFunction(它继承了AbstractRichFunction,覆盖了父类的open、close、setRuntimeContext方法,用于管理wrappedFunction),同时实现CoGroupFunction接口定义的coGroup方法;二者的区别在于前者包装JoinFunction,后者包装FlatJoinFunction,后者的coGroup方法会把out参数传给join方法,由FlatJoinFunction自行发射数据
doc:Joining、Flink 原理与实现:数据流上的类型和操作、JoinedStreams与CoGroupedStreams实现原理和区别 ...
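补充一个window join的最小示意用法(仅为示意草稿,非Flink源码内容:假设orangeStream、greenStream为已分配timestamp及watermark的DataStream<Tuple2<String, Integer>>,窗口大小为假设值,import省略),对应上文where→equalTo→window→apply的调用链:

KeySelector<Tuple2<String, Integer>, String> keyByF0 = new KeySelector<Tuple2<String, Integer>, String>() {
    @Override
    public String getKey(Tuple2<String, Integer> value) {
        return value.f0;
    }
};

DataStream<String> joined = orangeStream
    .join(greenStream)
    .where(keyByF0)      // 构建Where
    .equalTo(keyByF0)    // 构建EqualTo,要求两边key类型一致
    .window(TumblingEventTimeWindows.of(Time.seconds(5)))   // 构建WithWindow
    .apply(new JoinFunction<Tuple2<String, Integer>, Tuple2<String, Integer>, String>() {
        @Override
        public String join(Tuple2<String, Integer> first, Tuple2<String, Integer> second) {
            // inner join语义:同一窗口内同一key的元素两两配对,各调用一次join
            return first.f1 + "," + second.f1;
        }
    });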

January 10, 2019 · 8 min · jiezi

聊聊flink的consecutive windowed operations

序本文主要研究一下flink的consecutive windowed operations实例DataStream<Integer> input = …;DataStream<Integer> resultsPerKey = input .keyBy(<key selector>) .window(TumblingEventTimeWindows.of(Time.seconds(5))) .reduce(new Summer());DataStream<Integer> globalResults = resultsPerKey .windowAll(TumblingEventTimeWindows.of(Time.seconds(5))) .process(new TopKWindowFunction());本实例首先根据key进行partition,然后再按指定的window对这些key进行计数,之后对该dataStream进行windowAll操作,其时间WindowAssigner与前面的相同,这样可以达到在同样的时间窗口内先partition汇总,再全局汇总的效果(可以解决类似top-k elements的问题)TimestampsAndPeriodicWatermarksOperatorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/runtime/operators/TimestampsAndPeriodicWatermarksOperator.javapublic class TimestampsAndPeriodicWatermarksOperator<T> extends AbstractUdfStreamOperator<T, AssignerWithPeriodicWatermarks<T>> implements OneInputStreamOperator<T, T>, ProcessingTimeCallback { private static final long serialVersionUID = 1L; private transient long watermarkInterval; private transient long currentWatermark; public TimestampsAndPeriodicWatermarksOperator(AssignerWithPeriodicWatermarks<T> assigner) { super(assigner); this.chainingStrategy = ChainingStrategy.ALWAYS; } @Override public void open() throws Exception { super.open(); currentWatermark = Long.MIN_VALUE; watermarkInterval = getExecutionConfig().getAutoWatermarkInterval(); if (watermarkInterval > 0) { long now = getProcessingTimeService().getCurrentProcessingTime(); getProcessingTimeService().registerTimer(now + watermarkInterval, this); } } @Override public void processElement(StreamRecord<T> element) throws Exception { final long newTimestamp = userFunction.extractTimestamp(element.getValue(), element.hasTimestamp() ? element.getTimestamp() : Long.MIN_VALUE); output.collect(element.replace(element.getValue(), newTimestamp)); } @Override public void onProcessingTime(long timestamp) throws Exception { // register next timer Watermark newWatermark = userFunction.getCurrentWatermark(); if (newWatermark != null && newWatermark.getTimestamp() > currentWatermark) { currentWatermark = newWatermark.getTimestamp(); // emit watermark output.emitWatermark(newWatermark); } long now = getProcessingTimeService().getCurrentProcessingTime(); getProcessingTimeService().registerTimer(now + watermarkInterval, this); } /** * Override the base implementation to completely ignore watermarks propagated from * upstream (we rely only on the {@link AssignerWithPeriodicWatermarks} to emit * watermarks from here). 
/ @Override public void processWatermark(Watermark mark) throws Exception { // if we receive a Long.MAX_VALUE watermark we forward it since it is used // to signal the end of input and to not block watermark progress downstream if (mark.getTimestamp() == Long.MAX_VALUE && currentWatermark != Long.MAX_VALUE) { currentWatermark = Long.MAX_VALUE; output.emitWatermark(mark); } } @Override public void close() throws Exception { super.close(); // emit a final watermark Watermark newWatermark = userFunction.getCurrentWatermark(); if (newWatermark != null && newWatermark.getTimestamp() > currentWatermark) { currentWatermark = newWatermark.getTimestamp(); // emit watermark output.emitWatermark(newWatermark); } }}假设assignTimestampsAndWatermarks使用的是AssignerWithPeriodicWatermarks类型的参数,那么创建的是TimestampsAndPeriodicWatermarksOperator;它在open的时候根据指定的watermarkInterval注册了一个延时任务该延时任务会回调onProcessingTime方法,而onProcessingTime在这里则会调用AssignerWithPeriodicWatermarks的getCurrentWatermark方法获取watermark,然后重新注册新的延时任务,延时时间为getProcessingTimeService().getCurrentProcessingTime()+watermarkInterval;这里的watermarkInterval即为env.getConfig().setAutoWatermarkInterval设置的值AssignerWithPeriodicWatermarks的getCurrentWatermark方法除了注册延时任务实现不断定时的效果外,还会在新的watermark值大于currentWatermark的条件下发射watermarkSystemProcessingTimeServiceflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/runtime/tasks/SystemProcessingTimeService.javapublic class SystemProcessingTimeService extends ProcessingTimeService { private static final Logger LOG = LoggerFactory.getLogger(SystemProcessingTimeService.class); private static final int STATUS_ALIVE = 0; private static final int STATUS_QUIESCED = 1; private static final int STATUS_SHUTDOWN = 2; // ———————————————————————— /* The containing task that owns this time service provider. / private final AsyncExceptionHandler task; /* The lock that timers acquire upon triggering. / private final Object checkpointLock; /* The executor service that schedules and calls the triggers of this task. / private final ScheduledThreadPoolExecutor timerService; private final AtomicInteger status; public SystemProcessingTimeService(AsyncExceptionHandler failureHandler, Object checkpointLock) { this(failureHandler, checkpointLock, null); } public SystemProcessingTimeService( AsyncExceptionHandler task, Object checkpointLock, ThreadFactory threadFactory) { this.task = checkNotNull(task); this.checkpointLock = checkNotNull(checkpointLock); this.status = new AtomicInteger(STATUS_ALIVE); if (threadFactory == null) { this.timerService = new ScheduledThreadPoolExecutor(1); } else { this.timerService = new ScheduledThreadPoolExecutor(1, threadFactory); } // tasks should be removed if the future is canceled this.timerService.setRemoveOnCancelPolicy(true); // make sure shutdown removes all pending tasks this.timerService.setContinueExistingPeriodicTasksAfterShutdownPolicy(false); this.timerService.setExecuteExistingDelayedTasksAfterShutdownPolicy(false); } @Override public long getCurrentProcessingTime() { return System.currentTimeMillis(); } @Override public ScheduledFuture<?> registerTimer(long timestamp, ProcessingTimeCallback target) { // delay the firing of the timer by 1 ms to align the semantics with watermark. A watermark // T says we won’t see elements in the future with a timestamp smaller or equal to T. // With processing time, we therefore need to delay firing the timer by one ms. 
long delay = Math.max(timestamp - getCurrentProcessingTime(), 0) + 1; // we directly try to register the timer and only react to the status on exception // that way we save unnecessary volatile accesses for each timer try { return timerService.schedule( new TriggerTask(status, task, checkpointLock, target, timestamp), delay, TimeUnit.MILLISECONDS); } catch (RejectedExecutionException e) { final int status = this.status.get(); if (status == STATUS_QUIESCED) { return new NeverCompleteFuture(delay); } else if (status == STATUS_SHUTDOWN) { throw new IllegalStateException(“Timer service is shut down”); } else { // something else happened, so propagate the exception throw e; } } } //……}SystemProcessingTimeService的registerTimer方法根据指定的timestamp注册了一个延时任务TriggerTask;timerService为JDK自带的ScheduledThreadPoolExecutor;TriggerTask的run方法会在service状态为STATUS_LIVE时,触发ProcessingTimeCallback(这里为TimestampsAndPeriodicWatermarksOperator)的onProcessingTime方法WindowOperatorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/runtime/operators/windowing/WindowOperator.java@Internalpublic class WindowOperator<K, IN, ACC, OUT, W extends Window> extends AbstractUdfStreamOperator<OUT, InternalWindowFunction<ACC, OUT, K, W>> implements OneInputStreamOperator<IN, OUT>, Triggerable<K, W> { //…… @Override public void processElement(StreamRecord<IN> element) throws Exception { final Collection<W> elementWindows = windowAssigner.assignWindows( element.getValue(), element.getTimestamp(), windowAssignerContext); //if element is handled by none of assigned elementWindows boolean isSkippedElement = true; final K key = this.<K>getKeyedStateBackend().getCurrentKey(); if (windowAssigner instanceof MergingWindowAssigner) { //…… } else { for (W window: elementWindows) { // drop if the window is already late if (isWindowLate(window)) { continue; } isSkippedElement = false; windowState.setCurrentNamespace(window); windowState.add(element.getValue()); triggerContext.key = key; triggerContext.window = window; TriggerResult triggerResult = triggerContext.onElement(element); if (triggerResult.isFire()) { ACC contents = windowState.get(); if (contents == null) { continue; } emitWindowContents(window, contents); } if (triggerResult.isPurge()) { windowState.clear(); } registerCleanupTimer(window); } } // side output input event if // element not handled by any window // late arriving tag has been set // windowAssigner is event time and current timestamp + allowed lateness no less than element timestamp if (isSkippedElement && isElementLate(element)) { if (lateDataOutputTag != null){ sideOutput(element); } else { this.numLateRecordsDropped.inc(); } } } /* * Emits the contents of the given window using the {@link InternalWindowFunction}. 
*/ @SuppressWarnings(“unchecked”) private void emitWindowContents(W window, ACC contents) throws Exception { timestampedCollector.setAbsoluteTimestamp(window.maxTimestamp()); processContext.window = window; userFunction.process(triggerContext.key, window, processContext, contents, timestampedCollector); } //……}WindowOperator的processElement方法会把element添加到windowState,这里为HeapAggregatingState,即在内存中累积,之后调用triggerContext.onElement方法(里头使用的是trigger.onElement方法,这里的trigger为EventTimeTrigger)获取TriggerResult,如果需要fire,则会触发emitWindowContents,如果需要purge则会清空windowState;emitWindowContents则是调用userFunction.process执行用户定义的窗口操作EventTimeTriggerflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/triggers/EventTimeTrigger.java@PublicEvolvingpublic class EventTimeTrigger extends Trigger<Object, TimeWindow> { private static final long serialVersionUID = 1L; private EventTimeTrigger() {} @Override public TriggerResult onElement(Object element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception { if (window.maxTimestamp() <= ctx.getCurrentWatermark()) { // if the watermark is already past the window fire immediately return TriggerResult.FIRE; } else { ctx.registerEventTimeTimer(window.maxTimestamp()); return TriggerResult.CONTINUE; } } @Override public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) { return time == window.maxTimestamp() ? TriggerResult.FIRE : TriggerResult.CONTINUE; } @Override public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) throws Exception { return TriggerResult.CONTINUE; } @Override public void clear(TimeWindow window, TriggerContext ctx) throws Exception { ctx.deleteEventTimeTimer(window.maxTimestamp()); } @Override public boolean canMerge() { return true; } @Override public void onMerge(TimeWindow window, OnMergeContext ctx) { // only register a timer if the watermark is not yet past the end of the merged window // this is in line with the logic in onElement(). If the watermark is past the end of // the window onElement() will fire and setting a timer here would fire the window twice. long windowMaxTimestamp = window.maxTimestamp(); if (windowMaxTimestamp > ctx.getCurrentWatermark()) { ctx.registerEventTimeTimer(windowMaxTimestamp); } } @Override public String toString() { return “EventTimeTrigger()”; } public static EventTimeTrigger create() { return new EventTimeTrigger(); }}EventTimeTrigger的onElement方法会判断,如果window.maxTimestamp() <= ctx.getCurrentWatermark()则会返回TriggerResult.FIRE,告知WindowOperator可以emitWindowContents小结flink支持consecutive windowed operations,比如先根据key进行partition,然后再按指定的window对这些key进行计数,之后对该dataStream进行windowAll操作,其时间WindowAssigner与前面的相同,这样可以达到在同样的时间窗口内先partition汇总,再全局汇总的效果(可以解决类似top-k elements的问题)AssignerWithPeriodicWatermarks或者AssignerWithPunctuatedWatermarks它们有两个功能,一个是从element提取timestamp作为eventTime,一个就是发射watermark;由于element实际上不一定是严格按eventTime时间到来的,可能存在乱序,因而watermark的作用就是限制迟到的数据进入窗口,不让窗口无限等待迟到的可能属于该窗口的element,即告知窗口eventTime小于等于该watermark的元素可以认为都到达了(窗口可以根据自己设定的时间范围,借助trigger判断是否可以关闭窗口然后开始对该窗口数据执行相关操作);对于consecutive windowed operations来说,上游的watermark会forward给下游的operationsTrigger的作用就是告知WindowOperator什么时候可以对关闭该窗口开始对该窗口数据执行相关操作(返回TriggerResult.FIRE的情况下),对于EventTimeTrigger来说,其onElement方法的判断逻辑跟watermark相关,如果window.maxTimestamp() <= ctx.getCurrentWatermark()则会返回TriggerResult.FIREdocConsecutive windowed operations ...
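补充一个AssignerWithPeriodicWatermarks的最小示意实现(仅为示意草稿,非Flink源码内容:BoundedOutOfOrdernessAssigner为假设的类名,假设元素类型为Tuple2<String, Long>且f1为event time,maxOutOfOrderness数值为假设,import省略);这类实现就是上文TimestampsAndPeriodicWatermarksOperator所包装的userFunction:

public class BoundedOutOfOrdernessAssigner implements AssignerWithPeriodicWatermarks<Tuple2<String, Long>> {

    private final long maxOutOfOrderness = 3000L; // 允许3秒乱序(假设值)
    private long currentMaxTimestamp = Long.MIN_VALUE + maxOutOfOrderness;

    @Override
    public long extractTimestamp(Tuple2<String, Long> element, long previousElementTimestamp) {
        // 在算子processElement时被调用:提取event time并记录见过的最大timestamp
        long timestamp = element.f1;
        currentMaxTimestamp = Math.max(timestamp, currentMaxTimestamp);
        return timestamp;
    }

    @Override
    public Watermark getCurrentWatermark() {
        // 由onProcessingTime按AutoWatermarkInterval周期性调用:以最大timestamp减去乱序时间作为watermark
        return new Watermark(currentMaxTimestamp - maxOutOfOrderness);
    }
}

// 使用时(需为TimeCharacteristic.EventTime):
// env.getConfig().setAutoWatermarkInterval(200);
// DataStream<Tuple2<String, Long>> withTs = input.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessAssigner());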

January 10, 2019 · 5 min · jiezi

聊聊flink的Allowed Lateness

序本文主要研究一下flink的Allowed LatenessWindowedStreamflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/WindowedStream.java@Publicpublic class WindowedStream<T, K, W extends Window> { /** The keyed data stream that is windowed by this stream. / private final KeyedStream<T, K> input; /* The window assigner. / private final WindowAssigner<? super T, W> windowAssigner; /* The trigger that is used for window evaluation/emission. / private Trigger<? super T, ? super W> trigger; /* The evictor that is used for evicting elements before window evaluation. / private Evictor<? super T, ? super W> evictor; /* The user-specified allowed lateness. / private long allowedLateness = 0L; /* * Side output {@code OutputTag} for late data. If no tag is set late data will simply be * dropped. */ private OutputTag<T> lateDataOutputTag; @PublicEvolving public WindowedStream<T, K, W> allowedLateness(Time lateness) { final long millis = lateness.toMilliseconds(); checkArgument(millis >= 0, “The allowed lateness cannot be negative.”); this.allowedLateness = millis; return this; } @PublicEvolving public WindowedStream<T, K, W> sideOutputLateData(OutputTag<T> outputTag) { Preconditions.checkNotNull(outputTag, “Side output tag must not be null.”); this.lateDataOutputTag = input.getExecutionEnvironment().clean(outputTag); return this; } //…… public <R> SingleOutputStreamOperator<R> reduce( ReduceFunction<T> reduceFunction, WindowFunction<T, R, K, W> function, TypeInformation<R> resultType) { if (reduceFunction instanceof RichFunction) { throw new UnsupportedOperationException(“ReduceFunction of reduce can not be a RichFunction.”); } //clean the closures function = input.getExecutionEnvironment().clean(function); reduceFunction = input.getExecutionEnvironment().clean(reduceFunction); final String opName = generateOperatorName(windowAssigner, trigger, evictor, reduceFunction, function); KeySelector<T, K> keySel = input.getKeySelector(); OneInputStreamOperator<T, R> operator; if (evictor != null) { @SuppressWarnings({“unchecked”, “rawtypes”}) TypeSerializer<StreamRecord<T>> streamRecordSerializer = (TypeSerializer<StreamRecord<T>>) new StreamElementSerializer(input.getType().createSerializer(getExecutionEnvironment().getConfig())); ListStateDescriptor<StreamRecord<T>> stateDesc = new ListStateDescriptor<>(“window-contents”, streamRecordSerializer); operator = new EvictingWindowOperator<>(windowAssigner, windowAssigner.getWindowSerializer(getExecutionEnvironment().getConfig()), keySel, input.getKeyType().createSerializer(getExecutionEnvironment().getConfig()), stateDesc, new InternalIterableWindowFunction<>(new ReduceApplyWindowFunction<>(reduceFunction, function)), trigger, evictor, allowedLateness, lateDataOutputTag); } else { ReducingStateDescriptor<T> stateDesc = new ReducingStateDescriptor<>(“window-contents”, reduceFunction, input.getType().createSerializer(getExecutionEnvironment().getConfig())); operator = new WindowOperator<>(windowAssigner, windowAssigner.getWindowSerializer(getExecutionEnvironment().getConfig()), keySel, input.getKeyType().createSerializer(getExecutionEnvironment().getConfig()), stateDesc, new InternalSingleValueWindowFunction<>(function), trigger, allowedLateness, lateDataOutputTag); } return input.transform(opName, resultType, operator); } //……}WindowedStream有两个参数跟Allowed 
Lateness相关,一个是allowedLateness,用于指定允许元素迟到的时间长度,一个是lateDataOutputTag,用于配置迟到元素的输出WindowedStream的reduce、aggregate、fold、process等操作里头会根据evictor是否为null来创建不同的WindowOperator(evictor不为null创建的是EvictingWindowOperator,evictor为null创建的是WindowOperator)EvictingWindowOperator继承了WindowOperator,其构造器比WindowOperator多了Evictor参数,但它们构造器都需要Trigger、allowedLateness、lateDataOutputTag参数OutputTagflink-core-1.7.0-sources.jar!/org/apache/flink/util/OutputTag.java@PublicEvolvingpublic class OutputTag<T> implements Serializable { private static final long serialVersionUID = 2L; private final String id; private final TypeInformation<T> typeInfo; public OutputTag(String id) { Preconditions.checkNotNull(id, “OutputTag id cannot be null.”); Preconditions.checkArgument(!id.isEmpty(), “OutputTag id must not be empty.”); this.id = id; try { this.typeInfo = TypeExtractor.createTypeInfo(this, OutputTag.class, getClass(), 0); } catch (InvalidTypesException e) { throw new InvalidTypesException(“Could not determine TypeInformation for the OutputTag type. " + “The most common reason is forgetting to make the OutputTag an anonymous inner class. " + “It is also not possible to use generic type variables with OutputTags, such as ‘Tuple2<A, B>’.”, e); } } public OutputTag(String id, TypeInformation<T> typeInfo) { Preconditions.checkNotNull(id, “OutputTag id cannot be null.”); Preconditions.checkArgument(!id.isEmpty(), “OutputTag id must not be empty.”); this.id = id; this.typeInfo = Preconditions.checkNotNull(typeInfo, “TypeInformation cannot be null.”); } // ———————————————————————— public String getId() { return id; } public TypeInformation<T> getTypeInfo() { return typeInfo; } // ———————————————————————— @Override public boolean equals(Object obj) { return obj instanceof OutputTag && ((OutputTag) obj).id.equals(this.id); } @Override public int hashCode() { return id.hashCode(); } @Override public String toString() { return “OutputTag(” + getTypeInfo() + “, " + id + “)”; }}OutputTag是一个带有名称及类型信息的side output标识;flink允许ProcessFunction、CoProcessFunction、ProcessWindowFunction、ProcessAllWindowFunction这些function输出side output,这些function的Context有一个output(OutputTag<X> outputTag, X value)方法用于输出元素到side outputSingleOutputStreamOperatorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/SingleOutputStreamOperator.java@Publicpublic class SingleOutputStreamOperator<T> extends DataStream<T> { protected boolean nonParallel = false; private Map<OutputTag<?>, TypeInformation> requestedSideOutputs = new HashMap<>(); private boolean wasSplitApplied = false; //…… public <X> DataStream<X> getSideOutput(OutputTag<X> sideOutputTag) { if (wasSplitApplied) { throw new UnsupportedOperationException(“getSideOutput() and split() may not be called on the same DataStream. " + “As a work-around, please add a no-op map function before the split() call.”); } sideOutputTag = clean(requireNonNull(sideOutputTag)); // make a defensive copy sideOutputTag = new OutputTag<X>(sideOutputTag.getId(), sideOutputTag.getTypeInfo()); TypeInformation<?> type = requestedSideOutputs.get(sideOutputTag); if (type != null && !type.equals(sideOutputTag.getTypeInfo())) { throw new UnsupportedOperationException(“A side output with a matching id was " + “already requested with a different type. 
This is not allowed, side output " + “ids need to be unique.”); } requestedSideOutputs.put(sideOutputTag, sideOutputTag.getTypeInfo()); SideOutputTransformation<X> sideOutputTransformation = new SideOutputTransformation<>(this.getTransformation(), sideOutputTag); return new DataStream<>(this.getExecutionEnvironment(), sideOutputTransformation); }}SingleOutputStreamOperator提供了getSideOutput方法,可以根据OutputTag来获取之前在function里头输出的site outputWindowOperatorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/runtime/operators/windowing/WindowOperator.java@Internalpublic class WindowOperator<K, IN, ACC, OUT, W extends Window> extends AbstractUdfStreamOperator<OUT, InternalWindowFunction<ACC, OUT, K, W>> implements OneInputStreamOperator<IN, OUT>, Triggerable<K, W> { //…… public void processElement(StreamRecord<IN> element) throws Exception { final Collection<W> elementWindows = windowAssigner.assignWindows( element.getValue(), element.getTimestamp(), windowAssignerContext); //if element is handled by none of assigned elementWindows boolean isSkippedElement = true; final K key = this.<K>getKeyedStateBackend().getCurrentKey(); if (windowAssigner instanceof MergingWindowAssigner) { MergingWindowSet<W> mergingWindows = getMergingWindowSet(); for (W window: elementWindows) { // adding the new window might result in a merge, in that case the actualWindow // is the merged window and we work with that. If we don’t merge then // actualWindow == window W actualWindow = mergingWindows.addWindow(window, new MergingWindowSet.MergeFunction<W>() { @Override public void merge(W mergeResult, Collection<W> mergedWindows, W stateWindowResult, Collection<W> mergedStateWindows) throws Exception { if ((windowAssigner.isEventTime() && mergeResult.maxTimestamp() + allowedLateness <= internalTimerService.currentWatermark())) { throw new UnsupportedOperationException(“The end timestamp of an " + “event-time window cannot become earlier than the current watermark " + “by merging. Current watermark: " + internalTimerService.currentWatermark() + " window: " + mergeResult); } else if (!windowAssigner.isEventTime() && mergeResult.maxTimestamp() <= internalTimerService.currentProcessingTime()) { throw new UnsupportedOperationException(“The end timestamp of a " + “processing-time window cannot become earlier than the current processing time " + “by merging. 
Current processing time: " + internalTimerService.currentProcessingTime() + " window: " + mergeResult); } triggerContext.key = key; triggerContext.window = mergeResult; triggerContext.onMerge(mergedWindows); for (W m: mergedWindows) { triggerContext.window = m; triggerContext.clear(); deleteCleanupTimer(m); } // merge the merged state windows into the newly resulting state window windowMergingState.mergeNamespaces(stateWindowResult, mergedStateWindows); } }); // drop if the window is already late if (isWindowLate(actualWindow)) { mergingWindows.retireWindow(actualWindow); continue; } isSkippedElement = false; W stateWindow = mergingWindows.getStateWindow(actualWindow); if (stateWindow == null) { throw new IllegalStateException(“Window " + window + " is not in in-flight window set.”); } windowState.setCurrentNamespace(stateWindow); windowState.add(element.getValue()); triggerContext.key = key; triggerContext.window = actualWindow; TriggerResult triggerResult = triggerContext.onElement(element); if (triggerResult.isFire()) { ACC contents = windowState.get(); if (contents == null) { continue; } emitWindowContents(actualWindow, contents); } if (triggerResult.isPurge()) { windowState.clear(); } registerCleanupTimer(actualWindow); } // need to make sure to update the merging state in state mergingWindows.persist(); } else { for (W window: elementWindows) { // drop if the window is already late if (isWindowLate(window)) { continue; } isSkippedElement = false; windowState.setCurrentNamespace(window); windowState.add(element.getValue()); triggerContext.key = key; triggerContext.window = window; TriggerResult triggerResult = triggerContext.onElement(element); if (triggerResult.isFire()) { ACC contents = windowState.get(); if (contents == null) { continue; } emitWindowContents(window, contents); } if (triggerResult.isPurge()) { windowState.clear(); } registerCleanupTimer(window); } } // side output input event if // element not handled by any window // late arriving tag has been set // windowAssigner is event time and current timestamp + allowed lateness no less than element timestamp if (isSkippedElement && isElementLate(element)) { if (lateDataOutputTag != null){ sideOutput(element); } else { this.numLateRecordsDropped.inc(); } } } protected boolean isElementLate(StreamRecord<IN> element){ return (windowAssigner.isEventTime()) && (element.getTimestamp() + allowedLateness <= internalTimerService.currentWatermark()); } private long cleanupTime(W window) { if (windowAssigner.isEventTime()) { long cleanupTime = window.maxTimestamp() + allowedLateness; return cleanupTime >= window.maxTimestamp() ? 
cleanupTime : Long.MAX_VALUE; } else { return window.maxTimestamp(); } } //……}
WindowOperator里头有个isElementLate方法,根据allowedLateness来判断一个element是否late;其processElement方法最后在isSkippedElement为true而且isElementLate也为true的情况下会执行如下逻辑:在lateDataOutputTag不为null的情况下会将late的element输出到sideOutput,如果lateDataOutputTag为null,则执行numLateRecordsDropped.inc()来递增numLateRecordsDropped统计数
小结
当使用event-time window的时候,flink提供了allowedLateness方法用来配置元素允许的迟到时间,超过该值则会被丢弃(或输出到配置的side output),而在窗口结束时间+允许迟到时间内到达的元素仍然会被添加到窗口内;该参数默认为0;对于GlobalWindows这类window assigner,由于其end时间戳为Long.MAX_VALUE,因此element就无所谓late
OutputTag是一个带有名称及类型信息的side output标识;flink允许ProcessFunction、CoProcessFunction、ProcessWindowFunction、ProcessAllWindowFunction这些function输出side output,这些function的Context有一个output(OutputTag<X> outputTag, X value)方法用于输出元素到side output
SingleOutputStreamOperator提供了getSideOutput方法,可以根据OutputTag来获取之前在function里头输出的side output;WindowOperator的processElement方法在最后会判断,如果isSkippedElement为true而且isElementLate也为true,则在lateDataOutputTag不为null的情况下会将late的element输出到sideOutput
doc:Allowed Lateness、Side Outputs、Window Lifecycle ...
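补充一个allowedLateness与side output配合使用的最小示意例子(仅为示意草稿,非Flink源码内容:input、lateTag为假设的命名,假设input为已分配timestamp/watermark的DataStream<Tuple2<String, Integer>>,窗口大小与迟到时间均为假设值,import省略):

// OutputTag需要写成匿名内部类,以便保留泛型信息
final OutputTag<Tuple2<String, Integer>> lateTag = new OutputTag<Tuple2<String, Integer>>("late-data") {};

SingleOutputStreamOperator<Tuple2<String, Integer>> result = input
    .keyBy(new KeySelector<Tuple2<String, Integer>, String>() {
        @Override
        public String getKey(Tuple2<String, Integer> value) {
            return value.f0;
        }
    })
    .window(TumblingEventTimeWindows.of(Time.seconds(10)))
    .allowedLateness(Time.seconds(5))        // watermark越过窗口结束时间后,5秒内到达的迟到元素仍会进入窗口
    .sideOutputLateData(lateTag)             // 超过allowedLateness的元素输出到side output,而不是直接丢弃
    .sum(1);

// 通过getSideOutput取回迟到元素
DataStream<Tuple2<String, Integer>> lateStream = result.getSideOutput(lateTag);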

January 8, 2019 · 6 min · jiezi

阿里巴巴,果然开始拥有“预测未来”的能力了

顶灯闪烁,笛声响彻。救护车载着病人,冲向茫茫车海,在时间的赛道上狂奔。高德地图、GPS 卫星导航、路面磁感线圈、1300 个路口摄像头同时开动,为这辆救护车勘探最快路线;GPS 传回实时数据,后台根据辅助数据纠偏,锚定救护车每一刻的精确位置;救护车将要经过的沿途,车辆情况被实时计算。确保路口绿灯提前亮起,在救护车通过之前,刚好所有社会车辆已经行驶一空。这不是演习,这是杭州城市大脑每天都在执行的任务。依靠计算,一辆救护车到达医院的速度,平均缩短了 50%。在这座城市,靠鸣笛和闯红灯开道的悲壮彻底成为历史。说人同蝼蚁,其实并不为过。两百多万辆车奔跑在城市里,他们的行踪像风里的落叶一样叵测。但通过对 1300个路口的摄像头的实时计算,城市大脑就可以精确地预测出未来十五分钟、未来半小时那哪个路段将会拥堵,从而第一时间指挥路口信号灯“变换姿势”。计算在帮人类追赶时间。中哥今天要说的,就是这个精致而坚固的“大数据实时计算引擎”。你可能从未听说过这个引擎,甚至在此刻之前都不知道它的存在,但你很可能早已成为这个引擎服务的一员:一年一度的双11,无数人涌进天猫,每个人都能用 0.1 秒搜索到自己理想的商品,在智能推荐中发现适合的宝贝,背后正是依赖这个引擎;双11庆典现场,大屏上那个跳动的总成交量数字,只是背后所有数据的冰山一角。几十亿种商品的实时库存、价格、优惠数据得以分秒不慢地同步给屏幕前的你,也同样依赖这个引擎从某种意义上来说,只要给这个计算引擎足够的资源,无论面对多么庞大复杂的系统,我们都可以用几乎忽略不计的时间看到真相——这大大快于人类最聪明的大脑。这是我们亲手创造的“先知”。重器难成。为了这个先知一般的“大数据实时计算引擎”,阿里巴巴最核心的技术人,已经耗费了将近五年时间。让人感慨的是,这个承载了一个个城市的交通,扛起了一条条生产线,担负了一个国家十几亿人购物的强大引擎之所以的诞生在阿里巴巴,最初并不是为了满足什么需要,而仅仅是因为它“看上去很美”。这是一个鲜为人知的故事。(1)1999年,阿里巴巴在杭州成立。同样在1999年,蒋晓伟正在美国攻读理论物理博士。作为一个初三就立志要探索宇宙秘密的年轻人,到目前为止他的人生堪称完美。就在一个崭新的物理学家即将出炉的时候,命运开始展现它的波云诡谲。蒋晓伟突然被自己的导师“忽悠”到了一家非常有希望的互联网初创公司。理由是:“在30岁之前先财富自由,以后爱怎么学物理就怎么学物理。”一年之后,互联网泡沫破裂。然而,蒋晓伟却留在了这片战场。2002年,他加入微软,2010年他加入 Facebook。弹指挥间,直到回国加入阿里巴巴之前,他已经从物理学家成功转型成为数据库和计算资源调度系统专家。他还记得,自己加入阿里的时间是 2014年12月29日。这是一年中可以办理入职的最后一天。“为什么选最后一天?”“因为看上去比较有美感。”“。。。”目测,蒋晓伟是我见过的第一个用物理公式般的美感对待人生的人。甚至,他给自己起的花名都想叫做“量子”,后来思考了一下,觉得量子不太像个人名,才改为谐音“量仔”。蒋晓伟蒋晓伟入职的是阿里巴巴集团搜索团队。你可能会问:纳尼?阿里巴巴还有搜索团队?当然有,而且还极其重要。举个搜索引擎的日常:当你在淘宝搜索框里输入“杜蕾斯”的时候,搜索引擎就马上行动,从亿万卖家出售中的宝贝里帮你找到合适的 TT(及其他产品),然后按照推荐顺序排列在搜索结果里。注意,有趣的硬核要来了:如果,商家的 TT 价格永远不改,库存永远无限,优惠促销方案永远不变,那么搜索团队只需要做一个最简单的查询系统就行了。但是,现实中商家会随时调整价格和优惠,某一款激情大颗粒也可能因为太受欢迎,上架十秒就卖到缺货。在淘宝网上,你会发现真实的状态是:每时每刻都有无数卖家的产品参数在改动。所以,搜索引擎的挑战就是,要根据每时每刻最新的数据库来瞬间算出最适合呈现给你的搜索结果。相信我,只有用最新鲜的数据算出的结果,才能让屏幕对面的你露出心满意足的表情:面对这种现实,一个最稳妥的方式就是,搜索引擎用把现在的数据库全部算一遍,给出结果。但是,这会耗费大量的计算力。毕竟这一秒相对于上一秒来说,可能发生参数变动的宝贝只有十个,而没有参数变动的宝贝有十万个。那么,你自然会想:“有没有一种方法,让我只计算改动的部分,再通过特别的数学运算和之前的结果融合,就能达到和计算全量数据一样的效果呢?”有的,这就叫“流式计算”。打个最简单的比方:你负责把椰汁平分给10个妹纸。刚开始你有10瓶椰汁,于是你一人分了一个。后来,你又得到了10瓶椰汁,这时候椰汁的总数变成了 20 瓶,平均每个妹纸应该得到两个。但你没有必要把之前分给妹纸的椰汁收回来,重新每人给两个;而是可以让每个妹纸手上拿着之前的那瓶椰汁的基础上,每人再补发一瓶。通过这个例子,我猜你已经感受到了“流式计算”的激荡。当然,实际的数据库运算比“分椰汁”复杂得多。需要说明的是,当时在阿里巴巴内部,并不是没有流式计算引擎,各部门都根据自己的需求研发了特定的流式计算引擎,只不过,大多引擎只用来解决各自部门的问题,没有通用性。 很多业务都开发了各自的流式计算引擎但蒋晓伟突然发现,流式计算背后隐藏着一个神奇的事实:既然只计算增量,就能得知全量的结果;那么就可以永远用计算增量的方式来表达计算全量。也就是说:增量计算等效于全量计算;流式计算等效于批处理计算,实时计算等效于离线计算!也就是说,如果按照这个构想做出一套完整功能的“流式计算引擎”,就可以一统江湖,运转在阿里巴巴所有的技术底层。这可是一份不小的产业啊!蒋晓伟越想越鸡冻。然鹅,让他激动的最主要原因竟然是:“这个引擎太完美了!”他发现,其实自己身体里的那个“物理学家”一直都在。物理追求的终极就是“大一统理论”——用一套机制解决所有问题。没想到人生峰回路转,在计算机领域也给发现了一个“大一统”的机会。老实说,蒋晓伟老湿傅这个想法有点危险。危险在哪呢?首先,如果把当时搜索业务需要的流式计算比作汽车发动机的话,蒋晓伟想要研制的发动机,是豪华到可以用到下一代宇宙飞船上的“核能发动机”。自己团队支持的这摊子业务目前根本不需要这么好的引擎。其次,研究这个引擎的基本动力居然是“美感”。出于美感开发一个计算引擎,这种动机天然就有一种理想主义气质。。。能不能研究成,那只有天知道。再说,面对这么宏大的任务,手下能用来做研发的团队,只有五个人。况且这五个兄弟还有日常的任务,人手极度短缺。“但马老师不是说了么,梦想还是要有的,万一实现了呢?”刚刚加入阿里的蒋晓伟倒是决心已定。(2)蒋晓伟“能用”的团队,全员都在北京。这个小分队的老大叫做王峰。王峰是个老阿里了,2006年加入阿里巴巴,在阿里北京的雅虎中国团队做搜索,后来又做过一淘和淘宝搜索。此时此刻,他和北京的几个兄弟主要负责一个开放搜索项目的离线系统。 听到蒋晓伟对于“流式计算引擎”的描述,王峰内心惊呼“卧槽”。对于一个合格技术宅来说,一个好的技术构想比萌妹子更能让他动心。蒋晓伟和王峰一合计,事情很简单:脚踩两只船,那基本没戏。要么就趁早死心,放弃新引擎研发;要么就大家就把旧工作完全交出去,破釜沉舟干票大的。王峰的决定是,干!现在的王峰,笑起来一幅波澜不惊,当年内心也是慌得一批。王峰回忆,领导们觉得很不可思议。因为交出原有的业务,北京这个小团队相当于“失业”了。而新的研究——流式计算引擎——当时只是个构想,连技术方向也没有,代码更是一行都还没写。对于王峰来说,这相当于一次破釜沉舟的内部创业,前途未卜,凶险异常。事实也证明,别人的担心都是对的。一开始团队努着劲儿写了三个月代码,仍然没办法达到蒋晓伟理想中的通用性,连他本人都有点心虚。“我刚来阿里巴巴,就忽悠兄弟们把之前的项目都放弃了,要是最后证明我的构想是个坑,那不是害了别人么。。。”他想。焦急之中,已经到了 2015 年夏天,蒋晓伟突然在业内著名的大数据峰会 Hadoop Sumit 的论坛上看到有人发表了一个惊悚的评论:感觉 Flink 出来之后,Hadoop 就显得不怎么需要了。。。Hadoop 是当年最火的大数据分布式架构,这个 Flink 是个神马,根本没听过啊。但是当蒋晓伟、王峰和团队研究完技术资料之后突然发现,这种“用流式计算来等效一切计算”的理念不就和我们想开发的那套引擎一模一样吗?蒋晓伟仰天长啸:真是天助我也!既然已经有开源的技术,那么我们只要在此之上继续开发流计算引擎就好了啊!这里多介绍一句。Flink 是一个流式计算的开源框架,2010 年诞生于德国研究中心和柏林工业大学,2014年被捐赠给 
Apache 基金会,并由创始公司 DataArtisans 继续运营。Flink 的 Logo 是一只眼神里有故事的松鼠。简单来说,2015年的时候,Flink 刚刚“出道”一年,几乎没有人知道,更没有人大规模使用。就像一个刚刚毕业的大学生,看上去很有潜力,但“稳定性”和“实用性”都缺乏事实验证。就这样,这帮阿里巴巴的技术专家,成为了全球第一批使用 Flink 框架做大数据引擎研发的人,蒋晓伟一瞬间就给自己的引擎起好了名字——“Blink”。这是英文眨眼的意思。”一眨眼,所有东西都计算好了!“2015年底,搜索部门要向阿里巴巴 CTO 行癫汇报。每人20分钟时间,结果蒋晓伟上去讲 Blink,沉浸在对这个“完美引擎”的想象中,一下就说了40分钟。作为阿里巴巴所有核心技术的掌门人,行癫素来对新技术很敏感。他听懂了蒋晓伟的技术路线,内心也觉得相当靠谱。但这毕竟是搜索团队自己“偷偷”搞的项目,这帮兄弟究竟可以坚持走多远,行癫心里也没底。于是鼓励蒋晓伟说:“那就等你们明年做出来,我们再看!”阿里巴巴 CTO 行癫 张建锋(3)说到底,Blink 是一个通用引擎。它就像一个万能发动机,可以装载到轿车、卡车、飞机、火箭任何地方。蒋晓伟手握这台“万能发动机”的1.0版本,到处去找车实验。他盯上的“第一批车”,就是搜索业务中的使用场景。简单科普一下:搜索业务的机器学习平台内部代号叫“保时捷”(还真是一辆车。。。),可以根据你浏览商品的时间和动作,实时判断出你可能会对什么感兴趣,从而在下一秒就能给你智能推荐可能喜欢的商品。这是阿里巴巴非常有技术含量的一个应用。实际上,机器学习平台当时已经“心有所属”,配有一台流式计算引擎——之前王峰带领搜索团队自研的 iStream。iStream 是专门为搜索设计的,虽然目前可以很好地完成任务,但结构简单,不具有特别强的通用性。机器学习算法团队的一位负责人仁基,技术思想非常超前,非常巧的是,他同样是个执着于“美感”的人。他相信,未来 Flink 很可能会成为下一代机器学习算法重要的底层计算框架,于是在 Blink 系统研发的早期,就把团队里一百多位算法工程师的力量都用来配合蒋晓伟。“一两百人的团队,被我一个人折腾。”回忆到这里,蒋晓伟露出了羞赧的表情。说得很美好,结果真拿来 Blink 一用,动不动就躺尸。。。说实话,算法工程师没有义务为 Blink 的技术问题买单。毕竟算法工程师是“生产汽车的”,而 Blink 这个“发动机”质量不稳定,导致人家的汽车备受诟病,可以说相当冤枉了。所以那几个月一百多位算法工程师的日常就是各种吐槽“疯子”蒋晓伟。后来蒋晓伟才知道,这些吐槽,全都被仁基扛下来。仁基尽自己一切所能,在保护着这个弱小的 Blink。终于,2016年5月,第一个基于 Blink 的机器学习小功能“A/B Testing”上线。虽然还存在一些青涩的小毛病,但所有的技术人都看到了,Blink 已经像会呼吸的小兽一样,泛出诱人的引擎光泽。最激动的,当然是蒋晓伟本人。他把自己在 Flink 上成功的应用作为一个演讲,投给了当年的 Hadoop Sumit 大会。非常巧,Flink 的创始人 Kostas 和 Stephan 也在同一个大会上有一个演讲。他们两拨人实际是那次 Hadoop 大会上唯二的 Flink 演讲。Kostas 提前看到了议程,顿感相见恨晚,于是主动联系了蒋晓伟,希望他能用团队研究的成果影响社区。“本来之前是想自己玩玩的,我们连阿里都不敢影响,还敢影响社区?”蒋晓伟说。但是 Kostas 和 Stephan 觉得这群阿里人的尝试简直不要太酷,特别支持。蒋晓伟深受感动,“从那时候开始就觉得,我们不仅得把阿里内部的业务做好,还要为 Flink 社区做贡献,把 Flink 社区做好。”就这样,蒋晓伟和团队就跟组织“接上了头”,成为了 Flink 社区的核心成员。Flink 创始人 Kostas这么帅还来搞技术可以说是相当想不开了(4)在搜索团队内部证明了 Blink 能力,又得到了 Flink 社区的认可,蒋晓伟终于有资格正视自己的“野心”了。他提出要让 Blink 支撑“双11”上的实时机器学习任务,对方同意了。也就是说,双11当天,数亿人在淘宝天猫搜索商品,他们的每次查看,点击,都会影响个性化的智能推荐,在下一秒就能看到为自己量身定做的宝贝推荐。而这背后的实时计算,都要由 Blink 来支撑。然而抬眼一看,夏天已经到了,距离双11只有不到半年了。整个九、十月份,Blink 和机器学习系统的联调都处在各种花式崩溃之中。Blink 还小,压根就没见过双十一这种“人类狂欢”的阵仗。出现了一个死结:一旦超大规模数据进来,Blink 的性能立刻大幅下降。要知道,在 AI 领域,性能就是功能。性能大幅下降的 Blink 分分钟就把人工智能坑成“人工智障”。老程序猿都知道,数据规模是对一个系统最大的考验。一个系统承受不住大规模的数据浪潮,有可能证明这个架构就是无解的。如果真是架构缺陷,那么解决方案只有一个:放弃。带领团队攻坚的王峰回忆,那几天“自己已经崩溃了”。十一假期,所有团队的人都从北京冲到了杭州,别说休假,连觉都不睡了。六七个人就在工位上吃住,寻找究竟是哪个节点出了问题。即使是面对这样的情况,蒋晓伟、王峰,还有其他同事都完全相信,Flink 架构是完美的,问题一定是局部的可解的,只是我们还没找到它。终于,问题找到了!是不同层级算子之间的调度模式需要优化。解决这个问题之后,系统能处理的数据量立刻跃升。十月中旬,Blink 正式切上线。本以为劫波渡尽,没成想又是一大堆系统配合的问题接踵而来。蒋晓伟记得,将近11月,Blink 还有一些问题没搞定。这边基础引擎不搞定,算法团队就没办法在它的基础上调优双11的算法。到最后,算法团队的老大都直接找到蒋晓伟,着急地质问:“你们究竟是怎么回事啊?”现在想想,他的意思可能是想让我别折腾,直接换回去年的旧系统。但我的情商低,当时没听明白。就是一门心思地组织大家调优 Blink。。。蒋晓伟回忆。终于赶在11月前,Blink 完成了联调。原则上,从11月1日开始,双11的系统就要封闭代码,谁都不能动了。但是,这是 Blink 第一次承担这么重大的任务,为了万无一失,相关团队又提了很多冗余性的建议。王峰记得很清楚,一直到11月10日,还有几个小时双11就开始了,代码还最后改了几行,最终封闭。人事已尽,唯听天命。11月11日,巨大的数据像海啸一样涌向 Blink,蒋晓伟和王峰都捏了一把汗。然而,这个年轻的引擎应对自如。第二天,Blink 在阿里巴巴一炮而红。2016年“双11”交易额定格在1207亿(5)你以为故事结束了么?图样图森破。紧随而来的 2017 年对于蒋晓伟来说,简直不要更刺激。意识到大数据引擎这么重要,阿里巴巴集团决定调整组织架构,集全公司之力发展大数据引擎,由原阿里云的首席科学家周靖人组建计算平台事业部,在流式计算方面,把公司发展最好的三个引擎团队合三为一。周靖人他也是阿里巴巴达摩院的“禅师”之一这三个引擎分别是:阿里中间件团队的 JStorm、阿里云的 Galaxy、阿里巴巴搜索团队的 Blink。得知大牛周靖人负责整合三个团队,正在美国参加 Flink 官方大会 Flink Foward 的蒋晓伟和王峰内心有点波澜。他们知道,三个队伍合并之后,很可能在三条技术路线之中选择一条。蒋晓伟当然觉得自己的开源技术路线技术前景最好。但平心而论,Galaxy 的框架同样非常优秀。更关键的问题在于,Galaxy 一直是周靖人团队的成果。虽然在阿里巴巴不会出现因为亲疏远近而偏袒某个技术路线,但不可否认周靖人一定对于 Galaxy 更为熟悉。那时的蒋晓伟,和这个即将成为新领导的周靖人完全不熟悉,他完全无法预测将会发生什么。我担心,不会一回到国内,就没工作了吧。。。。蒋晓伟回忆。回国之后,周靖人来找蒋晓伟,蒋晓伟的心已经快跳到嗓子眼了。周靖人说:“我想把整合之后的团队交给你来负责,你们三人一起商量未来的技术路线,你觉得怎么样?”这意味着,蒋晓伟突然拥有了80人的豪华阵容。那一瞬间他在心里默念:“稳了!”只要不是强制采用某个技术路线,他就有信心说服 Galaxy 和 JStorm 的负责人。技术摆在这里,孰优孰劣是能讲得清道理的。蒋晓伟回忆,三个技术负责人的“谈判”整整维持了一周。大家都知道,这次技术路线的抉择,将会影响阿里巴巴未来十年甚至更远的技术发展,谁都不敢掉以轻心。谈到最后,争夺的焦点就集中在 Blink 和 
Galaxy 之间。Flink 的开源生态,最终说服了Galaxy 的支持者。此时的 Flink 已经不像两年那样鲜有人问津,而是已经形成了巨大的社区,中国已经有腾讯、滴滴、美团等公司开始用 Flink 建造自己的流式计算引擎。在这个社区里,会有无数国内外大牛对 Flink 的代码做贡献。建立在这个开源基座上的架构,也会发展得更快速。至此,Blink 正式成为了阿里巴巴计算引擎的王牌军。Flink 社区逐渐声势浩荡(6)王牌军可不是白当的。2017年双十一,Blink 领到了自己的艰巨任务——支持全集团(阿里巴巴、阿里云、菜鸟)的流式计算任务。王峰告诉我,其实2016年双11 Blink 承担的搜索任务,已经是一个重头戏,有过这个经历垫底,再适配很多系统的时候只不过是麻烦一点而已。唯独有一样:Blink 要接管后台所有的交易数据的实时计算任务。交易数据计算,是淘宝天猫业务的最核心。也是支撑背后支付、物流的核心依据。很多其他的计算都要基于订单数据的结果。这就像面包店的面粉一样,无论你做什么蛋糕,都需要面粉。如果面粉的供应出问题,那整个面包店就要关门了。所以无论面临多大的订单量,交易数据计算必须稳定、快速、实时。一旦出现错误,损失无可估量。每年双十一狂欢晚会上的那块大屏幕上显示的实时成交数字,也是由订单数据汇总而成的。也就是说,如果 Blink 当天挂掉,不仅对淘宝天猫的运转影响巨大,还会导致一个略为明显的结果:成交量大屏一直维持“0”,一秒把人丢到全球无死角。2014、2015、2016 这三年,这个核心任务都是由兄弟引擎 Galaxy 来承担的。所有人都想到一个稳妥的方案:2017年“双11”让 Blink 和准备退役的 Galaxy 来个双备份,如果 Blink 临时废掉,还可以用 Galaxy 作为备份顶上,至少不会丢人。然鹅,2016年双11的成交量是1207亿元,按照历年经验推测,2017年的成交量八成是会超过1500亿的(事实证明确实如此,达到了1682亿)。而根据 Galaxy 的技术架构,如果不做大量繁琐的优化,很可能顶不住。初出茅庐的 Blink,就这样成为 2017 年双11媒体大屏“全球指定唯一必须顶上不干不行合作伙伴”。。。双11 当天,两条 Blink 链路互为备份。“虽然成功率基本是100%,但万里有一,假设 Blink 本身设计存在未知的缺陷,或者两条备份链路的机器硬件同时坏掉,都可能导致灾难。”蒋晓伟回忆。在双11到来前一周,王峰带领兄弟们已经把 Blink 引擎调整到无以复加的好状态。蒋晓伟想了想,又派同样是 Facebook 回来的大牛工程师大沙去天竺法喜寺烧了一炷香。。。2017年11月11日零点。狂欢现场。时钟敲响零点,然后出现五秒倒计时。按照流程,留给 Blink 的计算时间只有这五秒。也就是说,00:00:05 的时候,无论如何大屏幕都会切到 Blink 给出的双11前五秒交易总额。这五秒,几乎是蒋晓伟人生当中最漫长的五秒。1、2、3。。。第三秒的时候,蒋晓伟面前的监视器跳出了实时成交数据!再两秒之后,实时交易数据被投上大屏,穹顶之下,欢声雷动。蒋晓伟知道,现场观众并不一定理解大屏运行原理,内心也并没有特地把一份掌声送给幕后的流式计算引擎团队。但那一刻,他热泪盈眶。这几年兄弟们付出的努力值了。168,269,635,159。每一个数字,对蒋晓伟和兄弟们都意味着岁月和付出。(7)经过两年双11的考验,已经没人怀疑 Blink 是阿里巴巴最强悍的计算引擎之一。所以,不仅阿里巴巴集团所有用到流式计算的场景都会选用 Blink,Blink 还开始对外提供服务。虽然在蒋晓伟看来,各个场景的计算都可以用 Blink 来解决,但目前被应用最多的场景有如下几个:1、实时统计分析。在电商行业,尤其是促销的场景中,巨大的网络流量涌来,形势变幻莫测。每一秒的库存统计、订单报表,都能揭示出用户的行为规律。对这些数据进行实时分析,就能随时调整促销策略。2、在线机器学习。用户的行为会展现出他的性格和偏好,用机器学习分析一个人浏览商品的姿势,就能为他精准推荐可能感兴趣的商品。但是,可能一个用户只浏览一分钟,如果在这个时间段内没有能够吸引他的商品,它就会退出。所以必须在一秒钟之内,对他刚才的动作进行实时学习,才能保证他第一时间看到感兴趣的宝贝。3、实时金融风控。在金融领域,技术就是金钱。每成功阻断一次欺诈交易,就等于挽回了真金白银。通过对一个账户实时行为的分析,就可以知道现在它有没有进行危险交易,从而在第一时间阻断。4、IoT 边缘计算。在工厂中,每台生产线都会随时产生数据,如果可以实时对这些数据进行分析,就可以减少生产线的损坏几率,提高产品的良品率。根据参数实时调整生产线如此,才有了开头一幕所说:阿里云承建的城市大脑,可以利用 Blink 来预测道路拥堵,为救护车开拓生命道路。根据阿里云首席科学家闵万里博士的介绍:2018年,城市大脑第一次出国,被部署在马来西亚吉隆坡,把救护车到达现场的时间缩短了 48.9%。借助工业大脑,流式计算实时判断生产线的健康状况,帮助世界第一大光伏企业协鑫光伏提高了良品率1%,每年可以节省上亿元的无谓浪费。2018年12月20日,阿里巴巴将 Flink 的旗舰会议 Flink Foward 第一次引入中国,现场座无虚席。蒋晓伟、王峰和流式计算团队的每一个人,在过去的三年都亲眼见证了 Flink 从踽踽独行到集结成军。Flink Forward 2018 北京为了感谢社区的帮助,在这次会议上周靖人宣布,在未来会把基于 Flink 修改的 Blink 流式计算引擎开源。从2019年1月开始,所有人都可以查阅这个支持了双11、支持了城市大脑、支持了工业IoT等无数顶级计算的引擎代码。也就是在这一年,王峰正式接替蒋晓伟,成为流式计算的新掌门。而蒋晓伟则朝着他的“完美梦想”更进一步,带着一帮兄弟在此基础上研究“带有流式计算引擎的数据存储系统”——交互式查询系统,让这个引擎能够解决更多通用的计算问题。带有流式计算引擎的数据存储系统,听起来有些不知所云。其实,这个世界上最经典的这类系统,其实就是我们的大脑。我们一生中会接受各种信息,这些信息共同构成大脑的资料库,帮助我们预测未来。每当有新的信息进来,我们都会根据这一点点信息增量微调我们对于未来的预测。这种调整,毫无疑问是实时的。我们的祖先不小心触摸野火,从那一刻开始就会告诉自己和家人小心火焰。我们依靠对世界的万亿次反馈,发现了万有引力,发现了相对论,发现了量子力学。正是千万人实时更新的预测能力,构成了我们的文明,也书写了我们的历史。以前,所有关于未来的预测都在我们的脑海里,如今,我们终于有机会在躯体之外,利用人类的武器——计算力——建造起一个硕大的预测引擎。角落里,这些技术英雄笑起来安静而羞涩。但正因他们存在,人类面对未来,再也不是手无寸铁。本文作者:赵慧阅读原文本文为云栖社区原创内容,未经允许不得转载。 ...

January 7, 2019 · 1 min · jiezi

聊聊flink的Evictors

序本文主要研究一下flink的EvictorsEvictorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/evictors/Evictor.java@PublicEvolvingpublic interface Evictor<T, W extends Window> extends Serializable { void evictBefore(Iterable<TimestampedValue<T>> elements, int size, W window, EvictorContext evictorContext); void evictAfter(Iterable<TimestampedValue<T>> elements, int size, W window, EvictorContext evictorContext); interface EvictorContext { long getCurrentProcessingTime(); MetricGroup getMetricGroup(); long getCurrentWatermark(); }}Evictor接收两个泛型,一个是element的类型,一个是窗口类型;它定义了evictBefore(在windowing function之前)、evictAfter(在windowing function之后)两个方法,它们都有EvictorContext参数;EvictorContext定义了getCurrentProcessingTime、getMetricGroup、getCurrentWatermark方法CountEvictorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/evictors/CountEvictor.java@PublicEvolvingpublic class CountEvictor<W extends Window> implements Evictor<Object, W> { private static final long serialVersionUID = 1L; private final long maxCount; private final boolean doEvictAfter; private CountEvictor(long count, boolean doEvictAfter) { this.maxCount = count; this.doEvictAfter = doEvictAfter; } private CountEvictor(long count) { this.maxCount = count; this.doEvictAfter = false; } @Override public void evictBefore(Iterable<TimestampedValue<Object>> elements, int size, W window, EvictorContext ctx) { if (!doEvictAfter) { evict(elements, size, ctx); } } @Override public void evictAfter(Iterable<TimestampedValue<Object>> elements, int size, W window, EvictorContext ctx) { if (doEvictAfter) { evict(elements, size, ctx); } } private void evict(Iterable<TimestampedValue<Object>> elements, int size, EvictorContext ctx) { if (size <= maxCount) { return; } else { int evictedCount = 0; for (Iterator<TimestampedValue<Object>> iterator = elements.iterator(); iterator.hasNext();){ iterator.next(); evictedCount++; if (evictedCount > size - maxCount) { break; } else { iterator.remove(); } } } } public static <W extends Window> CountEvictor<W> of(long maxCount) { return new CountEvictor<>(maxCount); } public static <W extends Window> CountEvictor<W> of(long maxCount, boolean doEvictAfter) { return new CountEvictor<>(maxCount, doEvictAfter); }}CountEvictor实现了Evictor接口,其中element类型为Object;它有两个属性,分别是doEvictAfter、maxCount;其中doEvictAfter用于指定是使用evictBefore方法还是evictAfter方法;maxCount为窗口元素个数的阈值,超出则删掉DeltaEvictorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/evictors/DeltaEvictor.java@PublicEvolvingpublic class DeltaEvictor<T, W extends Window> implements Evictor<T, W> { private static final long serialVersionUID = 1L; DeltaFunction<T> deltaFunction; private double threshold; private final boolean doEvictAfter; private DeltaEvictor(double threshold, DeltaFunction<T> deltaFunction) { this.deltaFunction = deltaFunction; this.threshold = threshold; this.doEvictAfter = false; } private DeltaEvictor(double threshold, DeltaFunction<T> deltaFunction, boolean doEvictAfter) { this.deltaFunction = deltaFunction; this.threshold = threshold; this.doEvictAfter = doEvictAfter; } @Override public void evictBefore(Iterable<TimestampedValue<T>> elements, int size, W window, EvictorContext ctx) { if (!doEvictAfter) { evict(elements, size, ctx); } } @Override public void evictAfter(Iterable<TimestampedValue<T>> elements, int size, W window, EvictorContext ctx) { if (doEvictAfter) { evict(elements, size, ctx); } } private void evict(Iterable<TimestampedValue<T>> elements, int size, EvictorContext 
ctx) { TimestampedValue<T> lastElement = Iterables.getLast(elements); for (Iterator<TimestampedValue<T>> iterator = elements.iterator(); iterator.hasNext();){ TimestampedValue<T> element = iterator.next(); if (deltaFunction.getDelta(element.getValue(), lastElement.getValue()) >= this.threshold) { iterator.remove(); } } } @Override public String toString() { return “DeltaEvictor(” + deltaFunction + “, " + threshold + “)”; } public static <T, W extends Window> DeltaEvictor<T, W> of(double threshold, DeltaFunction<T> deltaFunction) { return new DeltaEvictor<>(threshold, deltaFunction); } public static <T, W extends Window> DeltaEvictor<T, W> of(double threshold, DeltaFunction<T> deltaFunction, boolean doEvictAfter) { return new DeltaEvictor<>(threshold, deltaFunction, doEvictAfter); }}DeltaEvictor实现了Evictor接口,它有三个属性,分别是doEvictAfter、threshold、deltaFunction;其中doEvictAfter用于指定是使用evictBefore方法还是evictAfter方法;threshold为阈值,如果deltaFunction.getDelta方法(每个element与lastElement计算delta)算出来的值大于等于该值,则需要移除该元素TimeEvictorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/evictors/TimeEvictor.java@PublicEvolvingpublic class TimeEvictor<W extends Window> implements Evictor<Object, W> { private static final long serialVersionUID = 1L; private final long windowSize; private final boolean doEvictAfter; public TimeEvictor(long windowSize) { this.windowSize = windowSize; this.doEvictAfter = false; } public TimeEvictor(long windowSize, boolean doEvictAfter) { this.windowSize = windowSize; this.doEvictAfter = doEvictAfter; } @Override public void evictBefore(Iterable<TimestampedValue<Object>> elements, int size, W window, EvictorContext ctx) { if (!doEvictAfter) { evict(elements, size, ctx); } } @Override public void evictAfter(Iterable<TimestampedValue<Object>> elements, int size, W window, EvictorContext ctx) { if (doEvictAfter) { evict(elements, size, ctx); } } private void evict(Iterable<TimestampedValue<Object>> elements, int size, EvictorContext ctx) { if (!hasTimestamp(elements)) { return; } long currentTime = getMaxTimestamp(elements); long evictCutoff = currentTime - windowSize; for (Iterator<TimestampedValue<Object>> iterator = elements.iterator(); iterator.hasNext(); ) { TimestampedValue<Object> record = iterator.next(); if (record.getTimestamp() <= evictCutoff) { iterator.remove(); } } } private boolean hasTimestamp(Iterable<TimestampedValue<Object>> elements) { Iterator<TimestampedValue<Object>> it = elements.iterator(); if (it.hasNext()) { return it.next().hasTimestamp(); } return false; } private long getMaxTimestamp(Iterable<TimestampedValue<Object>> elements) { long currentTime = Long.MIN_VALUE; for (Iterator<TimestampedValue<Object>> iterator = elements.iterator(); iterator.hasNext();){ TimestampedValue<Object> record = iterator.next(); currentTime = Math.max(currentTime, record.getTimestamp()); } return currentTime; } @Override public String toString() { return “TimeEvictor(” + windowSize + “)”; } @VisibleForTesting public long getWindowSize() { return windowSize; } public static <W extends Window> TimeEvictor<W> of(Time windowSize) { return new TimeEvictor<>(windowSize.toMilliseconds()); } public static <W extends Window> TimeEvictor<W> of(Time windowSize, boolean doEvictAfter) { return new TimeEvictor<>(windowSize.toMilliseconds(), doEvictAfter); 
}}TimeEvictor实现了Evictor接口,其中element类型为Object;它有两个属性,分别是doEvictAfter、windowSize;其中doEvictAfter用于指定是使用evictBefore方法还是evictAfter方法;windowSize用于指定窗口的时间长度,以窗口元素最大时间戳-windowSize为evictCutoff,所有timestamp小于等于evictCutoff的元素都将会被剔除小结Evictor接收两个泛型,一个是element的类型,一个是窗口类型;它定义了evictBefore(在windowing function之前)、evictAfter(在windowing function之后)两个方法,它们都有EvictorContext参数;EvictorContext定义了getCurrentProcessingTime、getMetricGroup、getCurrentWatermark方法Evictor有几个内置的实现类,分别是CountEvictor、DeltaEvictor、TimeEvictor;其中CountEvictor是按窗口元素个数来进行剔除,TimeEvictor是按窗口长度来进行剔除,DeltaEvictor则是根据窗口元素与lastElement的delta与指定的threshold对比来进行剔除如果指定了evictor(evictBefore),则会使任何pre-aggregation操作失效,因为所有的窗口元素都会在windowing function计算之前先执行evictor操作;另外就是flink不保证窗口元素的顺序,也就是evictor如果有按窗口开头或末尾剔除元素,可能剔除的元素实际上并不是最先或最后到达的docEvictors ...
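下面给出一个使用内置Evictor的最小示例(仅为示意,类名CountEvictorDemo与示例数据均为假设,基于flink 1.7的DataStream API),在keyed window上通过evictor(CountEvictor.of(2))让窗口在计算前只保留最近的2个元素:

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.evictors.CountEvictor;
import org.apache.flink.streaming.api.windowing.time.Time;

public class CountEvictorDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        env.fromElements(
                Tuple2.of("a", 1L), Tuple2.of("a", 2L), Tuple2.of("a", 3L),
                Tuple2.of("b", 1L), Tuple2.of("b", 2L))
                // 按Tuple2的第一个字段分组
                .keyBy(0)
                // 5秒的processing-time滚动窗口
                .window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
                // 默认doEvictAfter为false,即evictBefore:窗口计算前只保留最近2个元素
                .evictor(CountEvictor.of(2))
                // 对第二个字段求和
                .sum(1)
                .print();

        env.execute("count evictor demo");
    }
}

注意示例用的是有限数据源加processing-time窗口,数据可能在窗口触发前就消费完,实际一般对接持续的流(比如kafka),这里主要演示API的组合方式;另外如小结所说,配置了evictor之后窗口无法再做增量的pre-aggregation,如果想改为在窗口函数之后剔除,可以使用CountEvictor.of(2, true)。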

January 7, 2019 · 3 min · jiezi

聊聊flink的Triggers

序本文主要研究一下flink的TriggersTriggerflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/triggers/Trigger.java@PublicEvolvingpublic abstract class Trigger<T, W extends Window> implements Serializable { private static final long serialVersionUID = -4104633972991191369L; public abstract TriggerResult onElement(T element, long timestamp, W window, TriggerContext ctx) throws Exception; public abstract TriggerResult onProcessingTime(long time, W window, TriggerContext ctx) throws Exception; public abstract TriggerResult onEventTime(long time, W window, TriggerContext ctx) throws Exception; public boolean canMerge() { return false; } public void onMerge(W window, OnMergeContext ctx) throws Exception { throw new UnsupportedOperationException(“This trigger does not support merging.”); } public abstract void clear(W window, TriggerContext ctx) throws Exception; // ———————————————————————— public interface TriggerContext { long getCurrentProcessingTime(); MetricGroup getMetricGroup(); long getCurrentWatermark(); void registerProcessingTimeTimer(long time); void registerEventTimeTimer(long time); void deleteProcessingTimeTimer(long time); void deleteEventTimeTimer(long time); <S extends State> S getPartitionedState(StateDescriptor<S, ?> stateDescriptor); @Deprecated <S extends Serializable> ValueState<S> getKeyValueState(String name, Class<S> stateType, S defaultState); @Deprecated <S extends Serializable> ValueState<S> getKeyValueState(String name, TypeInformation<S> stateType, S defaultState); } public interface OnMergeContext extends TriggerContext { <S extends MergingState<?, ?>> void mergePartitionedState(StateDescriptor<S, ?> stateDescriptor); }}Trigger接收两个泛型,一个是element类型,一个是窗口类型;它定义了onElement、onProcessingTime、onEventTime、canMerge、onMerge、clear几个方法,其中onElement、onProcessingTime、onEventTime均需要返回TriggerResultonElement在每个element添加到window的时候会被回调;onProcessingTime在注册的event-time timer触发时会被回调;onEventTime在注册的processing-time timer触发时会被回调canMerge用于标识是否支持trigger state的合并,默认返回false;onMerge在多个window合并的时候会被触发;clear用于清除TriggerContext中存储的相关stateTrigger还定义了TriggerContext及OnMergeContext;TriggerContext定义了注册及删除EventTimeTimer、ProcessingTimeTimer方法,同时还定义了getCurrentProcessingTime、getMetricGroup、getCurrentWatermark、getPartitionedState、getKeyValueState、getKeyValueState方法OnMergeContext继承了TriggerContext,它多定义了mergePartitionedState方法TriggerResultflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/triggers/TriggerResult.javapublic enum TriggerResult { CONTINUE(false, false), FIRE_AND_PURGE(true, true), FIRE(true, false), PURGE(false, true); // ———————————————————————— private final boolean fire; private final boolean purge; TriggerResult(boolean fire, boolean purge) { this.purge = purge; this.fire = fire; } public boolean isFire() { return fire; } public boolean isPurge() { return purge; }}TriggerResult用于表示trigger在onElement、onProcessingTime、onEventTime被回调时返回的action枚举,它有fire、purge两个属性,CONTINUE、FIRE_AND_PURGE、FIRE、PURGE五个枚举fire表示是否要触发window的computation操作;而purge表示是否要清理window的窗口数据CONTINUE表示不对window做任何操作;FIRE_AND_PURGE表示要触发window的computation操作然后清理window的窗口数据;FIRE表示仅仅触发window的computation操作但不清理window的窗口数据;PURGE表示不触发window的computation操作但是要清理window的窗口数据EventTimeTriggerflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/triggers/EventTimeTrigger.java@PublicEvolvingpublic class EventTimeTrigger extends Trigger<Object, TimeWindow> { private static final long serialVersionUID = 1L; private EventTimeTrigger() {} @Override public TriggerResult 
onElement(Object element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception { if (window.maxTimestamp() <= ctx.getCurrentWatermark()) { // if the watermark is already past the window fire immediately return TriggerResult.FIRE; } else { ctx.registerEventTimeTimer(window.maxTimestamp()); return TriggerResult.CONTINUE; } } @Override public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) { return time == window.maxTimestamp() ? TriggerResult.FIRE : TriggerResult.CONTINUE; } @Override public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) throws Exception { return TriggerResult.CONTINUE; } @Override public void clear(TimeWindow window, TriggerContext ctx) throws Exception { ctx.deleteEventTimeTimer(window.maxTimestamp()); } @Override public boolean canMerge() { return true; } @Override public void onMerge(TimeWindow window, OnMergeContext ctx) { // only register a timer if the watermark is not yet past the end of the merged window // this is in line with the logic in onElement(). If the watermark is past the end of // the window onElement() will fire and setting a timer here would fire the window twice. long windowMaxTimestamp = window.maxTimestamp(); if (windowMaxTimestamp > ctx.getCurrentWatermark()) { ctx.registerEventTimeTimer(windowMaxTimestamp); } } @Override public String toString() { return “EventTimeTrigger()”; } public static EventTimeTrigger create() { return new EventTimeTrigger(); }}EventTimeTrigger继承了Trigger,element类型为Object,窗口类型为TimeWindow;SlidingEventTimeWindows、TumblingEventTimeWindows、EventTimeSessionWindows、DynamicEventTimeSessionWindows默认都使用EventTimeTriggeronElement在window.maxTimestamp()小于等于ctx.getCurrentWatermark()的时候,返回TriggerResult.FIRE,否则执行ctx.registerEventTimeTimer(window.maxTimestamp()),然后返回TriggerResult.CONTINUE;onEventTime在time等于window.maxTimestamp()的时候返回TriggerResult.FIRE,否则返回TriggerResult.CONTINUE;onProcessingTime则返回TriggerResult.CONTINUEcanMerge返回true;onMerge在window.maxTimestamp()大于ctx.getCurrentWatermark()的时候会执行ctx.registerEventTimeTimer(windowMaxTimestamp);clear则执行ctx.deleteEventTimeTimer(window.maxTimestamp())ProcessingTimeTriggerflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/triggers/ProcessingTimeTrigger.java@PublicEvolvingpublic class ProcessingTimeTrigger extends Trigger<Object, TimeWindow> { private static final long serialVersionUID = 1L; private ProcessingTimeTrigger() {} @Override public TriggerResult onElement(Object element, long timestamp, TimeWindow window, TriggerContext ctx) { ctx.registerProcessingTimeTimer(window.maxTimestamp()); return TriggerResult.CONTINUE; } @Override public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) throws Exception { return TriggerResult.CONTINUE; } @Override public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) { return TriggerResult.FIRE; } @Override public void clear(TimeWindow window, TriggerContext ctx) throws Exception { ctx.deleteProcessingTimeTimer(window.maxTimestamp()); } @Override public boolean canMerge() { return true; } @Override public void onMerge(TimeWindow window, OnMergeContext ctx) { // only register a timer if the time is not yet past the end of the merged window // this is in line with the logic in onElement(). If the time is past the end of // the window onElement() will fire and setting a timer here would fire the window twice. 
long windowMaxTimestamp = window.maxTimestamp(); if (windowMaxTimestamp > ctx.getCurrentProcessingTime()) { ctx.registerProcessingTimeTimer(windowMaxTimestamp); } } @Override public String toString() { return “ProcessingTimeTrigger()”; } public static ProcessingTimeTrigger create() { return new ProcessingTimeTrigger(); }}ProcessingTimeTrigger继承了Trigger,element类型为Object,窗口类型为TimeWindow;SlidingProcessingTimeWindows、TumblingProcessingTimeWindows、ProcessingTimeSessionWindows、DynamicProcessingTimeSessionWindows默认都使用ProcessingTimeTriggeronElement执行ctx.registerProcessingTimeTimer(window.maxTimestamp()),然后返回TriggerResult.CONTINUE;onEventTime返回TriggerResult.CONTINUE;onProcessingTime则返回TriggerResult.FIREcanMerge返回true;onMerge在window.maxTimestamp()大于ctx.getCurrentWatermark()的时候会执行ctx.registerProcessingTimeTimer(windowMaxTimestamp);clear则执行ctx.deleteProcessingTimeTimer(window.maxTimestamp())NeverTriggerflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/assigners/GlobalWindows.java @Internal public static class NeverTrigger extends Trigger<Object, GlobalWindow> { private static final long serialVersionUID = 1L; @Override public TriggerResult onElement(Object element, long timestamp, GlobalWindow window, TriggerContext ctx) { return TriggerResult.CONTINUE; } @Override public TriggerResult onEventTime(long time, GlobalWindow window, TriggerContext ctx) { return TriggerResult.CONTINUE; } @Override public TriggerResult onProcessingTime(long time, GlobalWindow window, TriggerContext ctx) { return TriggerResult.CONTINUE; } @Override public void clear(GlobalWindow window, TriggerContext ctx) throws Exception {} @Override public void onMerge(GlobalWindow window, OnMergeContext ctx) { } }NeverTrigger的onElement、onEventTime、onProcessingTime均返回TriggerResult.CONTINUE;GlobalWindows默认使用的是NeverTriggerCountTriggerflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/triggers/CountTrigger.java@PublicEvolvingpublic class CountTrigger<W extends Window> extends Trigger<Object, W> { private static final long serialVersionUID = 1L; private final long maxCount; private final ReducingStateDescriptor<Long> stateDesc = new ReducingStateDescriptor<>(“count”, new Sum(), LongSerializer.INSTANCE); private CountTrigger(long maxCount) { this.maxCount = maxCount; } @Override public TriggerResult onElement(Object element, long timestamp, W window, TriggerContext ctx) throws Exception { ReducingState<Long> count = ctx.getPartitionedState(stateDesc); count.add(1L); if (count.get() >= maxCount) { count.clear(); return TriggerResult.FIRE; } return TriggerResult.CONTINUE; } @Override public TriggerResult onEventTime(long time, W window, TriggerContext ctx) { return TriggerResult.CONTINUE; } @Override public TriggerResult onProcessingTime(long time, W window, TriggerContext ctx) throws Exception { return TriggerResult.CONTINUE; } @Override public void clear(W window, TriggerContext ctx) throws Exception { ctx.getPartitionedState(stateDesc).clear(); } @Override public boolean canMerge() { return true; } @Override public void onMerge(W window, OnMergeContext ctx) throws Exception { ctx.mergePartitionedState(stateDesc); } @Override public String toString() { return “CountTrigger(” + maxCount + “)”; } public static <W extends Window> CountTrigger<W> of(long maxCount) { return new CountTrigger<>(maxCount); } private static class Sum implements ReduceFunction<Long> { private static final long serialVersionUID = 1L; @Override public Long reduce(Long value1, Long value2) throws 
Exception { return value1 + value2; } }}CountTrigger继承了Trigger,指定了element类型为Object类型;它定义了maxCount及ReducingStateDescriptor;其中ReducingStateDescriptor用于窗口计数(它使用的是自己定义的Sum函数),在onElement方法里头,当计数大于等于maxCount时,则会清空计数,然后返回TriggerResult.FIRE,否则返回TriggerResult.CONTINUE;onEventTime、onProcessingTime均返回TriggerResult.CONTINUE;canMerge返回true;onMerge执行的是ctx.mergePartitionedState(stateDesc);clear执行的是ctx.getPartitionedState(stateDesc).clear()PurgingTriggerflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/triggers/PurgingTrigger.java@PublicEvolvingpublic class PurgingTrigger<T, W extends Window> extends Trigger<T, W> { private static final long serialVersionUID = 1L; private Trigger<T, W> nestedTrigger; private PurgingTrigger(Trigger<T, W> nestedTrigger) { this.nestedTrigger = nestedTrigger; } @Override public TriggerResult onElement(T element, long timestamp, W window, TriggerContext ctx) throws Exception { TriggerResult triggerResult = nestedTrigger.onElement(element, timestamp, window, ctx); return triggerResult.isFire() ? TriggerResult.FIRE_AND_PURGE : triggerResult; } @Override public TriggerResult onEventTime(long time, W window, TriggerContext ctx) throws Exception { TriggerResult triggerResult = nestedTrigger.onEventTime(time, window, ctx); return triggerResult.isFire() ? TriggerResult.FIRE_AND_PURGE : triggerResult; } @Override public TriggerResult onProcessingTime(long time, W window, TriggerContext ctx) throws Exception { TriggerResult triggerResult = nestedTrigger.onProcessingTime(time, window, ctx); return triggerResult.isFire() ? TriggerResult.FIRE_AND_PURGE : triggerResult; } @Override public void clear(W window, TriggerContext ctx) throws Exception { nestedTrigger.clear(window, ctx); } @Override public boolean canMerge() { return nestedTrigger.canMerge(); } @Override public void onMerge(W window, OnMergeContext ctx) throws Exception { nestedTrigger.onMerge(window, ctx); } @Override public String toString() { return “PurgingTrigger(” + nestedTrigger.toString() + “)”; } public static <T, W extends Window> PurgingTrigger<T, W> of(Trigger<T, W> nestedTrigger) { return new PurgingTrigger<>(nestedTrigger); } @VisibleForTesting public Trigger<T, W> getNestedTrigger() { return nestedTrigger; }}PurgingTrigger是包装型的Trigger,它包装了nestedTrigger,其onElement、onEventTime、onProcessingTime根据nestedTrigger的返回结果,在triggerResult.isFire()为true的时候,包装返回TriggerResult.FIRE_AND_PURGE;canMerge、onMerge、clear等方法均是委托给nestedTrigger处理小结Trigger接收两个泛型,一个是element类型,一个是窗口类型;它定义了onElement、onProcessingTime、onEventTime、canMerge、onMerge、clear几个方法,其中onElement、onProcessingTime、onEventTime均需要返回TriggerResult;TriggerResult用于表示trigger在onElement、onProcessingTime、onEventTime被回调时返回的action枚举,它有fire、purge两个属性(fire表示是否要触发window的computation操作;而purge表示是否要清理window的窗口数据),CONTINUE、FIRE_AND_PURGE、FIRE、PURGE五个枚举SlidingEventTimeWindows、TumblingEventTimeWindows、EventTimeSessionWindows、DynamicEventTimeSessionWindows默认都使用EventTimeTrigger;SlidingProcessingTimeWindows、TumblingProcessingTimeWindows、ProcessingTimeSessionWindows、DynamicProcessingTimeSessionWindows默认都使用ProcessingTimeTrigger;GlobalWindows默认使用的是NeverTriggerCountTrigger主要用于计数的窗口类型,它使用ReducingStateDescriptor来进行窗口计数,在onElement方法里头,当计数大于等于maxCount时,则会清空计数,然后返回TriggerResult.FIRE,否则返回TriggerResult.CONTINUE;PurgingTrigger是包装型的Trigger,它包装了nestedTrigger,其onElement、onEventTime、onProcessingTime根据nestedTrigger的返回结果,在triggerResult.isFire()为true的时候,包装返回TriggerResult.FIRE_AND_PURGE;canMerge、onMerge、clear等方法均是委托给nestedTrigger处理docTriggers ...
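为了更直观地理解TriggerResult如何配合使用,下面给出一个自定义Trigger的简单示意(EarlyFiringTrigger是假设的类名,并非flink内置实现,仅演示接口用法):每来一条数据就提前FIRE一次窗口计算但不清理数据,等event time到达窗口的maxTimestamp时再FIRE_AND_PURGE:

import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.triggers.TriggerResult;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;

public class EarlyFiringTrigger extends Trigger<Object, TimeWindow> {
    private static final long serialVersionUID = 1L;

    @Override
    public TriggerResult onElement(Object element, long timestamp, TimeWindow window, TriggerContext ctx) throws Exception {
        // 注册窗口结束的event-time定时器,保证窗口最终会被FIRE_AND_PURGE
        ctx.registerEventTimeTimer(window.maxTimestamp());
        // 每条数据都提前触发一次计算,但不清理窗口数据
        return TriggerResult.FIRE;
    }

    @Override
    public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) {
        // watermark越过窗口末尾时触发最终计算并清理窗口
        return time == window.maxTimestamp() ? TriggerResult.FIRE_AND_PURGE : TriggerResult.CONTINUE;
    }

    @Override
    public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) {
        // 本trigger不关心processing-time定时器
        return TriggerResult.CONTINUE;
    }

    @Override
    public void clear(TimeWindow window, TriggerContext ctx) throws Exception {
        // 清理注册过的定时器
        ctx.deleteEventTimeTimer(window.maxTimestamp());
    }
}

使用时通过window(...).trigger(new EarlyFiringTrigger())覆盖WindowAssigner的默认trigger;注意这里没有覆盖canMerge(默认返回false),因此不能配合session window这类MergingWindowAssigner使用。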

January 6, 2019 · 5 min · jiezi

聊聊flink的Global Window

序本文主要研究一下flink的Global WindowGlobalWindowflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/windows/GlobalWindow.java@PublicEvolvingpublic class GlobalWindow extends Window { private static final GlobalWindow INSTANCE = new GlobalWindow(); private GlobalWindow() { } public static GlobalWindow get() { return INSTANCE; } @Override public long maxTimestamp() { return Long.MAX_VALUE; } @Override public boolean equals(Object o) { return this == o || !(o == null || getClass() != o.getClass()); } @Override public int hashCode() { return 0; } @Override public String toString() { return “GlobalWindow”; } /** * A {@link TypeSerializer} for {@link GlobalWindow}. / public static class Serializer extends TypeSerializerSingleton<GlobalWindow> { private static final long serialVersionUID = 1L; @Override public boolean isImmutableType() { return true; } @Override public GlobalWindow createInstance() { return GlobalWindow.INSTANCE; } @Override public GlobalWindow copy(GlobalWindow from) { return from; } @Override public GlobalWindow copy(GlobalWindow from, GlobalWindow reuse) { return from; } @Override public int getLength() { return 0; } @Override public void serialize(GlobalWindow record, DataOutputView target) throws IOException { target.writeByte(0); } @Override public GlobalWindow deserialize(DataInputView source) throws IOException { source.readByte(); return GlobalWindow.INSTANCE; } @Override public GlobalWindow deserialize(GlobalWindow reuse, DataInputView source) throws IOException { source.readByte(); return GlobalWindow.INSTANCE; } @Override public void copy(DataInputView source, DataOutputView target) throws IOException { source.readByte(); target.writeByte(0); } @Override public boolean canEqual(Object obj) { return obj instanceof Serializer; } }}GlobalWindow继承了Window,它的maxTimestamp方法与TimeWindow不同,TimeWindow有start和end属性,其maxTimestamp方法返回的是end-1;而GlobalWindow的maxTimestamp方法返回的是Long.MAX_VALUE;GlobalWindow定义了自己的SerializerGlobalWindowsflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/assigners/GlobalWindows.java@PublicEvolvingpublic class GlobalWindows extends WindowAssigner<Object, GlobalWindow> { private static final long serialVersionUID = 1L; private GlobalWindows() {} @Override public Collection<GlobalWindow> assignWindows(Object element, long timestamp, WindowAssignerContext context) { return Collections.singletonList(GlobalWindow.get()); } @Override public Trigger<Object, GlobalWindow> getDefaultTrigger(StreamExecutionEnvironment env) { return new NeverTrigger(); } @Override public String toString() { return “GlobalWindows()”; } /* * Creates a new {@code GlobalWindows} {@link WindowAssigner} that assigns * all elements to the same {@link GlobalWindow}. * * @return The global window policy. / public static GlobalWindows create() { return new GlobalWindows(); } /* * A trigger that never fires, as default Trigger for GlobalWindows. 
*/ @Internal public static class NeverTrigger extends Trigger<Object, GlobalWindow> { private static final long serialVersionUID = 1L; @Override public TriggerResult onElement(Object element, long timestamp, GlobalWindow window, TriggerContext ctx) { return TriggerResult.CONTINUE; } @Override public TriggerResult onEventTime(long time, GlobalWindow window, TriggerContext ctx) { return TriggerResult.CONTINUE; } @Override public TriggerResult onProcessingTime(long time, GlobalWindow window, TriggerContext ctx) { return TriggerResult.CONTINUE; } @Override public void clear(GlobalWindow window, TriggerContext ctx) throws Exception {} @Override public void onMerge(GlobalWindow window, OnMergeContext ctx) { } } @Override public TypeSerializer<GlobalWindow> getWindowSerializer(ExecutionConfig executionConfig) { return new GlobalWindow.Serializer(); } @Override public boolean isEventTime() { return false; }}GlobalWindows继承了WindowAssigner,key类型为Object,窗口类型为GlobalWindowassignWindows方法返回的是GlobalWindow;getDefaultTrigger方法返回的是NeverTrigger;getWindowSerializer返回的是GlobalWindow.Serializer();isEventTime返回的为falseNeverTrigger继承了Trigger,其onElement、onEventTime、onProcessingTime返回的TriggerResult均为TriggerResult.CONTINUE小结GlobalWindows继承了WindowAssigner,key类型为Object,窗口类型为GlobalWindow;GlobalWindow继承了Window,它的maxTimestamp方法与TimeWindow不同,TimeWindow有start和end属性,其maxTimestamp方法返回的是end-1;而GlobalWindow的maxTimestamp方法返回的是Long.MAX_VALUE;GlobalWindow定义了自己的SerializerGlobalWindows的assignWindows方法返回的是GlobalWindow;getDefaultTrigger方法返回的是NeverTrigger;getWindowSerializer返回的是GlobalWindow.Serializer();isEventTime返回的为falseNeverTrigger继承了Trigger,其onElement、onEventTime、onProcessingTime返回的TriggerResult均为TriggerResult.CONTINUE;该行为就是不做任何触发操作;如果需要触发操作,则需要在定义window操作时设置自定义的trigger,覆盖GlobalWindows默认的NeverTriggerdocGlobal Windows ...
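由于GlobalWindows默认的NeverTrigger永远不触发,单独使用时窗口不会产生任何输出,必须搭配自定义的trigger;下面给出一个最小示意(类名GlobalWindowDemo与示例数据均为假设,基于flink 1.7 API),用PurgingTrigger包装CountTrigger,效果等价于countWindow(3):

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.GlobalWindows;
import org.apache.flink.streaming.api.windowing.triggers.CountTrigger;
import org.apache.flink.streaming.api.windowing.triggers.PurgingTrigger;

public class GlobalWindowDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        env.fromElements(
                Tuple2.of("a", 1L), Tuple2.of("a", 2L), Tuple2.of("a", 3L),
                Tuple2.of("a", 4L), Tuple2.of("a", 5L), Tuple2.of("a", 6L))
                .keyBy(0)
                // 同一个key的所有元素都分配到同一个GlobalWindow
                .window(GlobalWindows.create())
                // 每累计3个元素FIRE_AND_PURGE一次,覆盖默认的NeverTrigger
                .trigger(PurgingTrigger.of(CountTrigger.of(3)))
                .sum(1)
                .print();

        env.execute("global window demo");
    }
}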

January 5, 2019 · 2 min · jiezi

聊聊flink的Session Window

序本文主要研究一下flink的Session WindowMergingWindowAssignerflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/assigners/MergingWindowAssigner.java@PublicEvolvingpublic abstract class MergingWindowAssigner<T, W extends Window> extends WindowAssigner<T, W> { private static final long serialVersionUID = 1L; /** * Determines which windows (if any) should be merged. * * @param windows The window candidates. * @param callback A callback that can be invoked to signal which windows should be merged. / public abstract void mergeWindows(Collection<W> windows, MergeCallback<W> callback); /* * Callback to be used in {@link #mergeWindows(Collection, MergeCallback)} for specifying which * windows should be merged. / public interface MergeCallback<W> { /* * Specifies that the given windows should be merged into the result window. * * @param toBeMerged The list of windows that should be merged into one window. * @param mergeResult The resulting merged window. / void merge(Collection<W> toBeMerged, W mergeResult); }}MergingWindowAssigner继承了WindowAssigner,它自己定义了mergeWindows抽象方法,该方法有一个MergeCallback类型参数,MergeCallback接口定义了merge方法,传入merge前的windows及合并后的windowEventTimeSessionWindowsflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/assigners/EventTimeSessionWindows.javapublic class EventTimeSessionWindows extends MergingWindowAssigner<Object, TimeWindow> { private static final long serialVersionUID = 1L; protected long sessionTimeout; protected EventTimeSessionWindows(long sessionTimeout) { if (sessionTimeout <= 0) { throw new IllegalArgumentException(“EventTimeSessionWindows parameters must satisfy 0 < size”); } this.sessionTimeout = sessionTimeout; } @Override public Collection<TimeWindow> assignWindows(Object element, long timestamp, WindowAssignerContext context) { return Collections.singletonList(new TimeWindow(timestamp, timestamp + sessionTimeout)); } @Override public Trigger<Object, TimeWindow> getDefaultTrigger(StreamExecutionEnvironment env) { return EventTimeTrigger.create(); } @Override public String toString() { return “EventTimeSessionWindows(” + sessionTimeout + “)”; } /* * Creates a new {@code SessionWindows} {@link WindowAssigner} that assigns * elements to sessions based on the element timestamp. * * @param size The session timeout, i.e. the time gap between sessions * @return The policy. / public static EventTimeSessionWindows withGap(Time size) { return new EventTimeSessionWindows(size.toMilliseconds()); } /* * Creates a new {@code SessionWindows} {@link WindowAssigner} that assigns * elements to sessions based on the element timestamp. * * @param sessionWindowTimeGapExtractor The extractor to use to extract the time gap from the input elements * @return The policy. / @PublicEvolving public static <T> DynamicEventTimeSessionWindows<T> withDynamicGap(SessionWindowTimeGapExtractor<T> sessionWindowTimeGapExtractor) { return new DynamicEventTimeSessionWindows<>(sessionWindowTimeGapExtractor); } @Override public TypeSerializer<TimeWindow> getWindowSerializer(ExecutionConfig executionConfig) { return new TimeWindow.Serializer(); } @Override public boolean isEventTime() { return true; } /* * Merge overlapping {@link TimeWindow}s. 
/ public void mergeWindows(Collection<TimeWindow> windows, MergingWindowAssigner.MergeCallback<TimeWindow> c) { TimeWindow.mergeWindows(windows, c); }}EventTimeSessionWindows继承了MergingWindowAssigner,它的构造器参数为sessionTimeout;assignWindows方法返回的TimeWindow的start为timestamp,end为timestamp + sessionTimeoutgetDefaultTrigger方法返回的是EventTimeTrigger;getWindowSerializer返回的是TimeWindow.Serializer();isEventTime返回的为true;mergeWindows方法调用的是TimeWindow.mergeWindows方法EventTimeSessionWindows定义了两个静态工厂方法,分别是withGap及withDynamicGap,其中withGap创建的是EventTimeSessionWindows,withDynamicGap创建的是DynamicEventTimeSessionWindowsProcessingTimeSessionWindowsflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/assigners/ProcessingTimeSessionWindows.javapublic class ProcessingTimeSessionWindows extends MergingWindowAssigner<Object, TimeWindow> { private static final long serialVersionUID = 1L; protected long sessionTimeout; protected ProcessingTimeSessionWindows(long sessionTimeout) { if (sessionTimeout <= 0) { throw new IllegalArgumentException(“ProcessingTimeSessionWindows parameters must satisfy 0 < size”); } this.sessionTimeout = sessionTimeout; } @Override public Collection<TimeWindow> assignWindows(Object element, long timestamp, WindowAssignerContext context) { long currentProcessingTime = context.getCurrentProcessingTime(); return Collections.singletonList(new TimeWindow(currentProcessingTime, currentProcessingTime + sessionTimeout)); } @Override public Trigger<Object, TimeWindow> getDefaultTrigger(StreamExecutionEnvironment env) { return ProcessingTimeTrigger.create(); } @Override public String toString() { return “ProcessingTimeSessionWindows(” + sessionTimeout + “)”; } /* * Creates a new {@code SessionWindows} {@link WindowAssigner} that assigns * elements to sessions based on the element timestamp. * * @param size The session timeout, i.e. the time gap between sessions * @return The policy. / public static ProcessingTimeSessionWindows withGap(Time size) { return new ProcessingTimeSessionWindows(size.toMilliseconds()); } /* * Creates a new {@code SessionWindows} {@link WindowAssigner} that assigns * elements to sessions based on the element timestamp. * * @param sessionWindowTimeGapExtractor The extractor to use to extract the time gap from the input elements * @return The policy. / @PublicEvolving public static <T> DynamicProcessingTimeSessionWindows<T> withDynamicGap(SessionWindowTimeGapExtractor<T> sessionWindowTimeGapExtractor) { return new DynamicProcessingTimeSessionWindows<>(sessionWindowTimeGapExtractor); } @Override public TypeSerializer<TimeWindow> getWindowSerializer(ExecutionConfig executionConfig) { return new TimeWindow.Serializer(); } @Override public boolean isEventTime() { return false; } /* * Merge overlapping {@link TimeWindow}s. 
/ public void mergeWindows(Collection<TimeWindow> windows, MergeCallback<TimeWindow> c) { TimeWindow.mergeWindows(windows, c); }}ProcessingTimeSessionWindows继承了MergingWindowAssigner,它的构造器参数为sessionTimeout;assignWindows方法返回的TimeWindow的start为currentProcessingTime(这里currentProcessingTime值为context.getCurrentProcessingTime()),end为currentProcessingTime + sessionTimeoutgetDefaultTrigger方法返回的是ProcessingTimeTrigger;getWindowSerializer返回的是TimeWindow.Serializer();isEventTime返回的为false;mergeWindows方法调用的是TimeWindow.mergeWindows方法ProcessingTimeSessionWindows定义了两个静态工厂方法,分别是withGap及withDynamicGap,其中withGap创建的是ProcessingTimeSessionWindows,withDynamicGap创建的是DynamicProcessingTimeSessionWindowsSessionWindowTimeGapExtractorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/assigners/SessionWindowTimeGapExtractor.java@PublicEvolvingpublic interface SessionWindowTimeGapExtractor<T> extends Serializable { /* * Extracts the session time gap. * @param element The input element. * @return The session time gap in milliseconds. / long extract(T element);}SessionWindowTimeGapExtractor接口定义了extract方法,用于从element中提取sessionTimeout参数DynamicEventTimeSessionWindowsflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/assigners/DynamicEventTimeSessionWindows.java@PublicEvolvingpublic class DynamicEventTimeSessionWindows<T> extends MergingWindowAssigner<T, TimeWindow> { private static final long serialVersionUID = 1L; protected SessionWindowTimeGapExtractor<T> sessionWindowTimeGapExtractor; protected DynamicEventTimeSessionWindows(SessionWindowTimeGapExtractor<T> sessionWindowTimeGapExtractor) { this.sessionWindowTimeGapExtractor = sessionWindowTimeGapExtractor; } @Override public Collection<TimeWindow> assignWindows(T element, long timestamp, WindowAssignerContext context) { long sessionTimeout = sessionWindowTimeGapExtractor.extract(element); if (sessionTimeout <= 0) { throw new IllegalArgumentException(“Dynamic session time gap must satisfy 0 < gap”); } return Collections.singletonList(new TimeWindow(timestamp, timestamp + sessionTimeout)); } @SuppressWarnings(“unchecked”) @Override public Trigger<T, TimeWindow> getDefaultTrigger(StreamExecutionEnvironment env) { return (Trigger<T, TimeWindow>) EventTimeTrigger.create(); } @Override public String toString() { return “DynamicEventTimeSessionWindows()”; } /* * Creates a new {@code SessionWindows} {@link WindowAssigner} that assigns * elements to sessions based on the element timestamp. * * @param sessionWindowTimeGapExtractor The extractor to use to extract the time gap from the input elements * @return The policy. / public static <T> DynamicEventTimeSessionWindows<T> withDynamicGap(SessionWindowTimeGapExtractor<T> sessionWindowTimeGapExtractor) { return new DynamicEventTimeSessionWindows<>(sessionWindowTimeGapExtractor); } @Override public TypeSerializer<TimeWindow> getWindowSerializer(ExecutionConfig executionConfig) { return new TimeWindow.Serializer(); } @Override public boolean isEventTime() { return true; } /* * Merge overlapping {@link TimeWindow}s. 
/ public void mergeWindows(Collection<TimeWindow> windows, MergeCallback<TimeWindow> c) { TimeWindow.mergeWindows(windows, c); }}DynamicEventTimeSessionWindows也继承了MergingWindowAssigner,与EventTimeSessionWindows不同的是,它的构造器参数为SessionWindowTimeGapExtractorassignWindows方法首先使用sessionWindowTimeGapExtractor从element中提取sessionTimeout,然后返回TimeWindow(timestamp, timestamp + sessionTimeout);getDefaultTrigger方法返回的是EventTimeTrigger;isEventTime返回的为true;mergeWindows方法调用的是TimeWindow.mergeWindows方法DynamicEventTimeSessionWindows定义了withDynamicGap的静态工厂方法,用于创建DynamicEventTimeSessionWindowsDynamicProcessingTimeSessionWindowsflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/assigners/DynamicProcessingTimeSessionWindows.java@PublicEvolvingpublic class DynamicProcessingTimeSessionWindows<T> extends MergingWindowAssigner<T, TimeWindow> { private static final long serialVersionUID = 1L; protected SessionWindowTimeGapExtractor<T> sessionWindowTimeGapExtractor; protected DynamicProcessingTimeSessionWindows(SessionWindowTimeGapExtractor<T> sessionWindowTimeGapExtractor) { this.sessionWindowTimeGapExtractor = sessionWindowTimeGapExtractor; } @Override public Collection<TimeWindow> assignWindows(T element, long timestamp, WindowAssignerContext context) { long currentProcessingTime = context.getCurrentProcessingTime(); long sessionTimeout = sessionWindowTimeGapExtractor.extract(element); if (sessionTimeout <= 0) { throw new IllegalArgumentException(“Dynamic session time gap must satisfy 0 < gap”); } return Collections.singletonList(new TimeWindow(currentProcessingTime, currentProcessingTime + sessionTimeout)); } @SuppressWarnings(“unchecked”) @Override public Trigger<T, TimeWindow> getDefaultTrigger(StreamExecutionEnvironment env) { return (Trigger<T, TimeWindow>) ProcessingTimeTrigger.create(); } @Override public String toString() { return “DynamicProcessingTimeSessionWindows()”; } /* * Creates a new {@code SessionWindows} {@link WindowAssigner} that assigns * elements to sessions based on the element timestamp. * * @param sessionWindowTimeGapExtractor The extractor to use to extract the time gap from the input elements * @return The policy. / public static <T> DynamicProcessingTimeSessionWindows<T> withDynamicGap(SessionWindowTimeGapExtractor<T> sessionWindowTimeGapExtractor) { return new DynamicProcessingTimeSessionWindows<>(sessionWindowTimeGapExtractor); } @Override public TypeSerializer<TimeWindow> getWindowSerializer(ExecutionConfig executionConfig) { return new TimeWindow.Serializer(); } @Override public boolean isEventTime() { return false; } /* * Merge overlapping {@link TimeWindow}s. 
*/ public void mergeWindows(Collection<TimeWindow> windows, MergeCallback<TimeWindow> c) { TimeWindow.mergeWindows(windows, c); }}DynamicProcessingTimeSessionWindows也继承了MergingWindowAssigner,与ProcessingTimeSessionWindows不同的是,它的构造器参数为SessionWindowTimeGapExtractorassignWindows方法首先使用sessionWindowTimeGapExtractor从element中提取sessionTimeout,然后返回TimeWindow(currentProcessingTime, currentProcessingTime + sessionTimeout)(这里currentProcessingTime的值为context.getCurrentProcessingTime());getDefaultTrigger方法返回的是ProcessingTimeTrigger;isEventTime返回的为false;mergeWindows方法调用的是TimeWindow.mergeWindows方法DynamicProcessingTimeSessionWindows定义了withDynamicGap的静态工厂方法,用于创建DynamicProcessingTimeSessionWindows小结flink的session window主要有EventTimeSessionWindows、DynamicEventTimeSessionWindows、ProcessingTimeSessionWindows、DynamicProcessingTimeSessionWindows,它们都继承了MergingWindowAssigner;MergingWindowAssigner定义了mergeWindows抽象方法EventTimeSessionWindows与ProcessingTimeSessionWindows的构造器参数都是sessionTimeout,不同的是,assignWindows中,ProcessingTimeSessionWindows使用context.getCurrentProcessingTime()替代了方法timestamp参数来计算TimeWindow;getDefaultTrigger方法前者返回EventTimeTrigger,后者返回ProcessingTimeTrigger;isEventTime方法前者返回true,后者返回falseDynamicEventTimeSessionWindows与DynamicProcessingTimeSessionWindows,它们与非dynamic的区别是,它们的构造器参数为SessionWindowTimeGapExtractor;SessionWindowTimeGapExtractor接口定义了extract方法,用于从element中提取sessionTimeout参数;而非dynamic的session window,其sessionTimeout参数在构造器传入之后就固定了docSession Windows ...
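下面给出session window的一个最小示例(仅为示意,类名SessionWindowDemo与示例数据均为假设,基于flink 1.7 API),分别演示固定gap的EventTimeSessionWindows.withGap,以及通过SessionWindowTimeGapExtractor提取动态gap的withDynamicGap:

import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
import org.apache.flink.streaming.api.windowing.assigners.EventTimeSessionWindows;
import org.apache.flink.streaming.api.windowing.assigners.SessionWindowTimeGapExtractor;
import org.apache.flink.streaming.api.windowing.time.Time;

public class SessionWindowDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // (用户, 次数, 事件时间毫秒),时间戳递增,方便用AscendingTimestampExtractor
        DataStream<Tuple3<String, Long, Long>> source = env.fromElements(
                Tuple3.of("user-1", 1L, 1000L), Tuple3.of("user-1", 1L, 3000L),
                Tuple3.of("user-2", 1L, 4000L), Tuple3.of("user-1", 1L, 20000L))
                .assignTimestampsAndWatermarks(new AscendingTimestampExtractor<Tuple3<String, Long, Long>>() {
                    @Override
                    public long extractAscendingTimestamp(Tuple3<String, Long, Long> element) {
                        return element.f2;
                    }
                });

        // 固定gap:相邻元素间隔超过5秒即切分出新的session
        source.keyBy(0)
                .window(EventTimeSessionWindows.withGap(Time.seconds(5)))
                .sum(1)
                .print();

        // 动态gap:sessionTimeout从元素本身提取
        source.keyBy(0)
                .window(EventTimeSessionWindows.withDynamicGap(
                        new SessionWindowTimeGapExtractor<Tuple3<String, Long, Long>>() {
                            @Override
                            public long extract(Tuple3<String, Long, Long> element) {
                                // 这里简单返回固定的5秒,实际可按元素内容动态计算
                                return 5000L;
                            }
                        }))
                .sum(1)
                .print();

        env.execute("session window demo");
    }
}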

January 4, 2019 · 5 min · jiezi

聊聊flink的Sliding Window

序本文主要研究一下flink的Sliding WindowSlidingEventTimeWindowsflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/assigners/SlidingEventTimeWindows.java@PublicEvolvingpublic class SlidingEventTimeWindows extends WindowAssigner<Object, TimeWindow> { private static final long serialVersionUID = 1L; private final long size; private final long slide; private final long offset; protected SlidingEventTimeWindows(long size, long slide, long offset) { if (offset < 0 || offset >= slide || size <= 0) { throw new IllegalArgumentException(“SlidingEventTimeWindows parameters must satisfy 0 <= offset < slide and size > 0”); } this.size = size; this.slide = slide; this.offset = offset; } @Override public Collection<TimeWindow> assignWindows(Object element, long timestamp, WindowAssignerContext context) { if (timestamp > Long.MIN_VALUE) { List<TimeWindow> windows = new ArrayList<>((int) (size / slide)); long lastStart = TimeWindow.getWindowStartWithOffset(timestamp, offset, slide); for (long start = lastStart; start > timestamp - size; start -= slide) { windows.add(new TimeWindow(start, start + size)); } return windows; } else { throw new RuntimeException(“Record has Long.MIN_VALUE timestamp (= no timestamp marker). " + “Is the time characteristic set to ‘ProcessingTime’, or did you forget to call " + “‘DataStream.assignTimestampsAndWatermarks(…)’?”); } } public long getSize() { return size; } public long getSlide() { return slide; } @Override public Trigger<Object, TimeWindow> getDefaultTrigger(StreamExecutionEnvironment env) { return EventTimeTrigger.create(); } @Override public String toString() { return “SlidingEventTimeWindows(” + size + “, " + slide + “)”; } public static SlidingEventTimeWindows of(Time size, Time slide) { return new SlidingEventTimeWindows(size.toMilliseconds(), slide.toMilliseconds(), 0); } public static SlidingEventTimeWindows of(Time size, Time slide, Time offset) { return new SlidingEventTimeWindows(size.toMilliseconds(), slide.toMilliseconds(), offset.toMilliseconds() % slide.toMilliseconds()); } @Override public TypeSerializer<TimeWindow> getWindowSerializer(ExecutionConfig executionConfig) { return new TimeWindow.Serializer(); } @Override public boolean isEventTime() { return true; }}SlidingEventTimeWindows继承了Window,其中元素类型为Object,而窗口类型为TimeWindow;它有三个参数,一个是size,一个是slide,一个是offset,其中offset必须大于等于0,offset必须大于slide,size必须大于0assignWindows方法以slide作为size通过TimeWindow.getWindowStartWithOffset(timestamp, offset, slide)计算lastStart,然后以为start + size > timestamp为循环条件,每次对start减去slide,挨个计算TimeWindow(start, start + size);getDefaultTrigger方法返回的是EventTimeTrigger;getWindowSerializer方法返回的是TimeWindow.Serializer();isEventTime返回的为trueSlidingEventTimeWindows提供了of静态工厂方法,可以指定size、slide及offset参数,它对于传入的offset参数转为毫秒然后与slide.toMilliseconds()取余作为最后的offset值SlidingProcessingTimeWindowsflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/assigners/SlidingProcessingTimeWindows.javapublic class SlidingProcessingTimeWindows extends WindowAssigner<Object, TimeWindow> { private static final long serialVersionUID = 1L; private final long size; private final long offset; private final long slide; private SlidingProcessingTimeWindows(long size, long slide, long offset) { if (offset < 0 || offset >= slide || size <= 0) { throw new IllegalArgumentException(“SlidingProcessingTimeWindows parameters must satisfy 0 <= offset < slide and size > 0”); } this.size = size; this.slide = slide; this.offset = offset; } @Override public Collection<TimeWindow> 
assignWindows(Object element, long timestamp, WindowAssignerContext context) { timestamp = context.getCurrentProcessingTime(); List<TimeWindow> windows = new ArrayList<>((int) (size / slide)); long lastStart = TimeWindow.getWindowStartWithOffset(timestamp, offset, slide); for (long start = lastStart; start > timestamp - size; start -= slide) { windows.add(new TimeWindow(start, start + size)); } return windows; } public long getSize() { return size; } public long getSlide() { return slide; } @Override public Trigger<Object, TimeWindow> getDefaultTrigger(StreamExecutionEnvironment env) { return ProcessingTimeTrigger.create(); } @Override public String toString() { return “SlidingProcessingTimeWindows(” + size + “, " + slide + “)”; } public static SlidingProcessingTimeWindows of(Time size, Time slide) { return new SlidingProcessingTimeWindows(size.toMilliseconds(), slide.toMilliseconds(), 0); } public static SlidingProcessingTimeWindows of(Time size, Time slide, Time offset) { return new SlidingProcessingTimeWindows(size.toMilliseconds(), slide.toMilliseconds(), offset.toMilliseconds() % slide.toMilliseconds()); } @Override public TypeSerializer<TimeWindow> getWindowSerializer(ExecutionConfig executionConfig) { return new TimeWindow.Serializer(); } @Override public boolean isEventTime() { return false; }}SlidingProcessingTimeWindows继承了Window,其中元素类型为Object,而窗口类型为TimeWindow;它有三个参数,一个是size,一个是slide,一个是offset,其中offset必须大于等于0,offset必须大于slide,size必须大于0assignWindows方法以slide作为size通过TimeWindow.getWindowStartWithOffset(timestamp, offset, slide)计算lastStart(与SlidingEventTimeWindows不同的是SlidingProcessingTimeWindows的这个方法里头使用context.getCurrentProcessingTime()值重置了timestamp),然后以为start + size > timestamp为循环条件,每次对start减去slide,挨个计算TimeWindow(start, start + size);getDefaultTrigger方法返回的是ProcessingTimeTrigger;getWindowSerializer方法返回的是TimeWindow.Serializer();isEventTime返回的为falseSlidingEventTimeWindows提供了of静态工厂方法,可以指定size、slide及offset参数,它对于传入的offset参数转为毫秒然后与slide.toMilliseconds()取余作为最后的offset值小结flink的Sliding Window分为SlidingEventTimeWindows及SlidingProcessingTimeWindows,它们都继承了WindowAssigner,其中元素类型为Object,而窗口类型为TimeWindow;它有三个参数,一个是size,一个是slide,一个是offset,其中offset必须大于等于0,offset必须大于slide,size必须大于0WindowAssigner定义了assignWindows、getDefaultTrigger、getWindowSerializer、isEventTime这几个抽象方法,同时定义了抽象静态类WindowAssignerContext;它有两个泛型,其中T为元素类型,而W为窗口类型;SlidingEventTimeWindows及SlidingProcessingTimeWindows的窗口类型为TimeWindow,它有start及end属性,其中start为inclusive,而end为exclusive,maxTimestamp返回的是end-1,它还提供了mergeWindows及getWindowStartWithOffset静态方法;前者用于合并重叠的时间窗口,后者用于获取指定timestamp、offset、windowSize的window startSlidingEventTimeWindows及SlidingProcessingTimeWindows的不同在于assignWindows、getDefaultTrigger、isEventTime方法;前者assignWindows使用的是参数中的timestamp,而后者使用的是context.getCurrentProcessingTime();前者的getDefaultTrigger返回的是EventTimeTrigger,而后者返回的是ProcessingTimeTrigger;前者isEventTime方法返回的为true,而后者返回的为falsedocSliding Windows ...
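下面给出sliding window的一个最小使用示例(仅为示意,类名SlidingWindowDemo与示例数据均为假设,基于flink 1.7 API),窗口大小10秒、滑动步长5秒,因此每个元素会同时落入size/slide=2个窗口:

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.SlidingProcessingTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

public class SlidingWindowDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        env.fromElements(
                Tuple2.of("a", 1L), Tuple2.of("b", 1L), Tuple2.of("a", 1L))
                .keyBy(0)
                // size = 10s, slide = 5s 的processing-time滑动窗口
                .window(SlidingProcessingTimeWindows.of(Time.seconds(10), Time.seconds(5)))
                .sum(1)
                .print();

        env.execute("sliding window demo");
    }
}

在TimeCharacteristic为ProcessingTime时,keyedStream.timeWindow(Time.seconds(10), Time.seconds(5))内部创建的就是SlidingProcessingTimeWindows;换成EventTime则是SlidingEventTimeWindows,此时需要先assignTimestampsAndWatermarks;示例同样是有限数据源加processing-time窗口,仅用于演示API。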

January 3, 2019 · 3 min · jiezi

聊聊flink的Tumbling Window

序本文主要研究一下flink的Tumbling WindowWindowAssignerflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/assigners/WindowAssigner.java@PublicEvolvingpublic abstract class WindowAssigner<T, W extends Window> implements Serializable { private static final long serialVersionUID = 1L; /** * Returns a {@code Collection} of windows that should be assigned to the element. * * @param element The element to which windows should be assigned. * @param timestamp The timestamp of the element. * @param context The {@link WindowAssignerContext} in which the assigner operates. / public abstract Collection<W> assignWindows(T element, long timestamp, WindowAssignerContext context); /* * Returns the default trigger associated with this {@code WindowAssigner}. / public abstract Trigger<T, W> getDefaultTrigger(StreamExecutionEnvironment env); /* * Returns a {@link TypeSerializer} for serializing windows that are assigned by * this {@code WindowAssigner}. / public abstract TypeSerializer<W> getWindowSerializer(ExecutionConfig executionConfig); /* * Returns {@code true} if elements are assigned to windows based on event time, * {@code false} otherwise. / public abstract boolean isEventTime(); /* * A context provided to the {@link WindowAssigner} that allows it to query the * current processing time. * * <p>This is provided to the assigner by its containing * {@link org.apache.flink.streaming.runtime.operators.windowing.WindowOperator}, * which, in turn, gets it from the containing * {@link org.apache.flink.streaming.runtime.tasks.StreamTask}. / public abstract static class WindowAssignerContext { /* * Returns the current processing time. / public abstract long getCurrentProcessingTime(); }}WindowAssigner定义了assignWindows、getDefaultTrigger、getWindowSerializer、isEventTime这几个抽象方法,同时定义了抽象静态类WindowAssignerContext;它有两个泛型,其中T为元素类型,而W为窗口类型Windowflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/windows/Window.java@PublicEvolvingpublic abstract class Window { /* * Gets the largest timestamp that still belongs to this window. * * @return The largest timestamp that still belongs to this window. / public abstract long maxTimestamp();}Window对象代表把无限流数据划分为有限buckets的集合,它有一个maxTimestamp,代表该窗口数据在该时间点内到达;它有两个子类,一个是GlobalWindow,一个是TimeWindowTimeWindowflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/windows/TimeWindow.java@PublicEvolvingpublic class TimeWindow extends Window { private final long start; private final long end; public TimeWindow(long start, long end) { this.start = start; this.end = end; } /* * Gets the starting timestamp of the window. This is the first timestamp that belongs * to this window. * * @return The starting timestamp of this window. / public long getStart() { return start; } /* * Gets the end timestamp of this window. The end timestamp is exclusive, meaning it * is the first timestamp that does not belong to this window any more. * * @return The exclusive end timestamp of this window. / public long getEnd() { return end; } /* * Gets the largest timestamp that still belongs to this window. * * <p>This timestamp is identical to {@code getEnd() - 1}. * * @return The largest timestamp that still belongs to this window. 
* * @see #getEnd() / @Override public long maxTimestamp() { return end - 1; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } TimeWindow window = (TimeWindow) o; return end == window.end && start == window.start; } @Override public int hashCode() { return MathUtils.longToIntWithBitMixing(start + end); } @Override public String toString() { return “TimeWindow{” + “start=” + start + “, end=” + end + ‘}’; } /* * Returns {@code true} if this window intersects the given window. / public boolean intersects(TimeWindow other) { return this.start <= other.end && this.end >= other.start; } /* * Returns the minimal window covers both this window and the given window. / public TimeWindow cover(TimeWindow other) { return new TimeWindow(Math.min(start, other.start), Math.max(end, other.end)); } // ———————————————————————— // Serializer // ———————————————————————— //…… // ———————————————————————— // Utilities // ———————————————————————— /* * Merge overlapping {@link TimeWindow}s. For use by merging * {@link org.apache.flink.streaming.api.windowing.assigners.WindowAssigner WindowAssigners}. / public static void mergeWindows(Collection<TimeWindow> windows, MergingWindowAssigner.MergeCallback<TimeWindow> c) { // sort the windows by the start time and then merge overlapping windows List<TimeWindow> sortedWindows = new ArrayList<>(windows); Collections.sort(sortedWindows, new Comparator<TimeWindow>() { @Override public int compare(TimeWindow o1, TimeWindow o2) { return Long.compare(o1.getStart(), o2.getStart()); } }); List<Tuple2<TimeWindow, Set<TimeWindow>>> merged = new ArrayList<>(); Tuple2<TimeWindow, Set<TimeWindow>> currentMerge = null; for (TimeWindow candidate: sortedWindows) { if (currentMerge == null) { currentMerge = new Tuple2<>(); currentMerge.f0 = candidate; currentMerge.f1 = new HashSet<>(); currentMerge.f1.add(candidate); } else if (currentMerge.f0.intersects(candidate)) { currentMerge.f0 = currentMerge.f0.cover(candidate); currentMerge.f1.add(candidate); } else { merged.add(currentMerge); currentMerge = new Tuple2<>(); currentMerge.f0 = candidate; currentMerge.f1 = new HashSet<>(); currentMerge.f1.add(candidate); } } if (currentMerge != null) { merged.add(currentMerge); } for (Tuple2<TimeWindow, Set<TimeWindow>> m: merged) { if (m.f1.size() > 1) { c.merge(m.f1, m.f0); } } } /* * Method to get the window start for a timestamp. * * @param timestamp epoch millisecond to get the window start. * @param offset The offset which window start would be shifted by. * @param windowSize The size of the generated windows. 
* @return window start */ public static long getWindowStartWithOffset(long timestamp, long offset, long windowSize) { return timestamp - (timestamp - offset + windowSize) % windowSize; }}TimeWindow有start及end属性,其中start为inclusive,而end为exclusive,所以maxTimestamp返回的是end-1;这里重写了equals及hashcode方法TimeWindow提供了intersects方法用于表示本窗口与指定窗口是否有交叉;而cover方法用于返回本窗口与指定窗口的重叠窗口TimeWindow还提供了mergeWindows及getWindowStartWithOffset静态方法;前者用于合并重叠的时间窗口,后者用于获取指定timestamp、offset、windowSize的window startTumblingEventTimeWindowsflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/assigners/TumblingEventTimeWindows.java@PublicEvolvingpublic class TumblingEventTimeWindows extends WindowAssigner<Object, TimeWindow> { private static final long serialVersionUID = 1L; private final long size; private final long offset; protected TumblingEventTimeWindows(long size, long offset) { if (offset < 0 || offset >= size) { throw new IllegalArgumentException(“TumblingEventTimeWindows parameters must satisfy 0 <= offset < size”); } this.size = size; this.offset = offset; } @Override public Collection<TimeWindow> assignWindows(Object element, long timestamp, WindowAssignerContext context) { if (timestamp > Long.MIN_VALUE) { // Long.MIN_VALUE is currently assigned when no timestamp is present long start = TimeWindow.getWindowStartWithOffset(timestamp, offset, size); return Collections.singletonList(new TimeWindow(start, start + size)); } else { throw new RuntimeException(“Record has Long.MIN_VALUE timestamp (= no timestamp marker). " + “Is the time characteristic set to ‘ProcessingTime’, or did you forget to call " + “‘DataStream.assignTimestampsAndWatermarks(…)’?”); } } @Override public Trigger<Object, TimeWindow> getDefaultTrigger(StreamExecutionEnvironment env) { return EventTimeTrigger.create(); } @Override public String toString() { return “TumblingEventTimeWindows(” + size + “)”; } public static TumblingEventTimeWindows of(Time size) { return new TumblingEventTimeWindows(size.toMilliseconds(), 0); } public static TumblingEventTimeWindows of(Time size, Time offset) { return new TumblingEventTimeWindows(size.toMilliseconds(), offset.toMilliseconds()); } @Override public TypeSerializer<TimeWindow> getWindowSerializer(ExecutionConfig executionConfig) { return new TimeWindow.Serializer(); } @Override public boolean isEventTime() { return true; }}TumblingEventTimeWindows继承了Window,其中元素类型为Object,而窗口类型为TimeWindow;它有两个参数,一个是size,一个是offset,其中offset必须大于等于0,size必须大于offsetassignWindows方法获取的窗口为start及start+size,而start=TimeWindow.getWindowStartWithOffset(timestamp, offset, size);getDefaultTrigger方法返回的是EventTimeTrigger;getWindowSerializer方法返回的是TimeWindow.Serializer();isEventTime返回trueTumblingEventTimeWindows提供了of静态工厂方法,可以指定size及offset参数TumblingProcessingTimeWindowsflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/assigners/TumblingProcessingTimeWindows.javapublic class TumblingProcessingTimeWindows extends WindowAssigner<Object, TimeWindow> { private static final long serialVersionUID = 1L; private final long size; private final long offset; private TumblingProcessingTimeWindows(long size, long offset) { if (offset < 0 || offset >= size) { throw new IllegalArgumentException(“TumblingProcessingTimeWindows parameters must satisfy 0 <= offset < size”); } this.size = size; this.offset = offset; } @Override public Collection<TimeWindow> assignWindows(Object element, long timestamp, WindowAssignerContext context) { final long now = context.getCurrentProcessingTime(); long start = 
TimeWindow.getWindowStartWithOffset(now, offset, size); return Collections.singletonList(new TimeWindow(start, start + size)); } public long getSize() { return size; } @Override public Trigger<Object, TimeWindow> getDefaultTrigger(StreamExecutionEnvironment env) { return ProcessingTimeTrigger.create(); } @Override public String toString() { return “TumblingProcessingTimeWindows(” + size + “)”; } public static TumblingProcessingTimeWindows of(Time size) { return new TumblingProcessingTimeWindows(size.toMilliseconds(), 0); } public static TumblingProcessingTimeWindows of(Time size, Time offset) { return new TumblingProcessingTimeWindows(size.toMilliseconds(), offset.toMilliseconds()); } @Override public TypeSerializer<TimeWindow> getWindowSerializer(ExecutionConfig executionConfig) { return new TimeWindow.Serializer(); } @Override public boolean isEventTime() { return false; }}TumblingProcessingTimeWindows继承了WindowAssigner,其中元素类型为Object,而窗口类型为TimeWindow;它有两个参数,一个是size,一个是offset,其中offset必须大于等于0,size必须大于offsetassignWindows方法获取的窗口为start及start+size,而start=TimeWindow.getWindowStartWithOffset(now, offset, size),而now值则为context.getCurrentProcessingTime(),则是与TumblingEventTimeWindows的不同之处,TumblingProcessingTimeWindows不使用timestamp参数来计算,它使用now值替代;getDefaultTrigger方法返回的是ProcessingTimeTrigger,而isEventTime方法返回的为falseTumblingProcessingTimeWindows也提供了of静态工厂方法,可以指定size及offset参数小结flink的Tumbling Window分为TumblingEventTimeWindows及TumblingProcessingTimeWindows,它们都继承了WindowAssigner,其中元素类型为Object,而窗口类型为TimeWindow;它有两个参数,一个是size,一个是offset,其中offset必须大于等于0,size必须大于offsetWindowAssigner定义了assignWindows、getDefaultTrigger、getWindowSerializer、isEventTime这几个抽象方法,同时定义了抽象静态类WindowAssignerContext;它有两个泛型,其中T为元素类型,而W为窗口类型;TumblingEventTimeWindows及TumblingProcessingTimeWindows的窗口类型为TimeWindow,它有start及end属性,其中start为inclusive,而end为exclusive,maxTimestamp返回的是end-1,它还提供了mergeWindows及getWindowStartWithOffset静态方法;前者用于合并重叠的时间窗口,后者用于获取指定timestamp、offset、windowSize的window startTumblingEventTimeWindows及TumblingProcessingTimeWindows的不同在于assignWindows、getDefaultTrigger、isEventTime方法;前者assignWindows使用的是参数中的timestamp,而后者使用的是now值;前者的getDefaultTrigger返回的是EventTimeTrigger,而后者返回的是ProcessingTimeTrigger;前者isEventTime方法返回的为true,而后者返回的为falsedocTumbling Windows ...
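下面结合getWindowStartWithOffset的计算方式给出tumbling window的一个最小示例(仅为示意,类名TumblingWindowDemo与示例数据均为假设,基于flink 1.7 API):5秒的event-time滚动窗口,时间戳1000、4000的元素落在[0, 5000)窗口,6000的元素落在[5000, 10000)窗口:

import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

public class TumblingWindowDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        env.fromElements(
                // (key, 次数, 事件时间毫秒)
                Tuple3.of("a", 1L, 1000L), Tuple3.of("a", 1L, 4000L), Tuple3.of("a", 1L, 6000L))
                .assignTimestampsAndWatermarks(new AscendingTimestampExtractor<Tuple3<String, Long, Long>>() {
                    @Override
                    public long extractAscendingTimestamp(Tuple3<String, Long, Long> element) {
                        return element.f2;
                    }
                })
                .keyBy(0)
                // 5秒滚动窗口,offset默认为0
                .window(TumblingEventTimeWindows.of(Time.seconds(5)))
                .sum(1)
                .print();

        env.execute("tumbling window demo");
    }
}

如果需要调整窗口对齐,可以传入第二个offset参数,比如TumblingEventTimeWindows.of(Time.days(1), Time.hours(16))会把天级窗口的边界从UTC零点平移16小时,即对齐到东八区的零点(按上面的构造器校验,offset需满足0 <= offset < size)。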

January 2, 2019 · 6 min · jiezi

聊聊flink的window操作

序本文主要研究一下flink的window操作windowDataStreamflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/DataStream.java public AllWindowedStream<T, TimeWindow> timeWindowAll(Time size) { if (environment.getStreamTimeCharacteristic() == TimeCharacteristic.ProcessingTime) { return windowAll(TumblingProcessingTimeWindows.of(size)); } else { return windowAll(TumblingEventTimeWindows.of(size)); } } public AllWindowedStream<T, TimeWindow> timeWindowAll(Time size, Time slide) { if (environment.getStreamTimeCharacteristic() == TimeCharacteristic.ProcessingTime) { return windowAll(SlidingProcessingTimeWindows.of(size, slide)); } else { return windowAll(SlidingEventTimeWindows.of(size, slide)); } } public AllWindowedStream<T, GlobalWindow> countWindowAll(long size) { return windowAll(GlobalWindows.create()).trigger(PurgingTrigger.of(CountTrigger.of(size))); } public AllWindowedStream<T, GlobalWindow> countWindowAll(long size, long slide) { return windowAll(GlobalWindows.create()) .evictor(CountEvictor.of(size)) .trigger(CountTrigger.of(slide)); } @PublicEvolving public <W extends Window> AllWindowedStream<T, W> windowAll(WindowAssigner<? super T, W> assigner) { return new AllWindowedStream<>(this, assigner); }对于非KeyedStream,有timeWindowAll、countWindowAll、windowAll操作,其中最主要的是windowAll操作,它的parallelism为1,它需要一个WindowAssigner参数,返回的是AllWindowedStreamKeyedStreamflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/KeyedStream.java public WindowedStream<T, KEY, TimeWindow> timeWindow(Time size) { if (environment.getStreamTimeCharacteristic() == TimeCharacteristic.ProcessingTime) { return window(TumblingProcessingTimeWindows.of(size)); } else { return window(TumblingEventTimeWindows.of(size)); } } public WindowedStream<T, KEY, TimeWindow> timeWindow(Time size, Time slide) { if (environment.getStreamTimeCharacteristic() == TimeCharacteristic.ProcessingTime) { return window(SlidingProcessingTimeWindows.of(size, slide)); } else { return window(SlidingEventTimeWindows.of(size, slide)); } } public WindowedStream<T, KEY, GlobalWindow> countWindow(long size) { return window(GlobalWindows.create()).trigger(PurgingTrigger.of(CountTrigger.of(size))); } public WindowedStream<T, KEY, GlobalWindow> countWindow(long size, long slide) { return window(GlobalWindows.create()) .evictor(CountEvictor.of(size)) .trigger(CountTrigger.of(slide)); } @PublicEvolving public <W extends Window> WindowedStream<T, KEY, W> window(WindowAssigner<? super T, W> assigner) { return new WindowedStream<>(this, assigner); }对于KeyedStream除了继承了DataStream的window相关操作,它主要用的是timeWindow、countWindow、window操作,其中最主要的是window操作,它也需要一个WindowAssigner参数,返回的是WindowedStreamWindowedStreamflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/WindowedStream.java@Publicpublic class WindowedStream<T, K, W extends Window> { /** The keyed data stream that is windowed by this stream. / private final KeyedStream<T, K> input; /* The window assigner. / private final WindowAssigner<? super T, W> windowAssigner; /* The trigger that is used for window evaluation/emission. / private Trigger<? super T, ? super W> trigger; /* The evictor that is used for evicting elements before window evaluation. / private Evictor<? super T, ? super W> evictor; /* The user-specified allowed lateness. / private long allowedLateness = 0L; /* * Side output {@code OutputTag} for late data. If no tag is set late data will simply be * dropped. 
*/ private OutputTag<T> lateDataOutputTag; @PublicEvolving public WindowedStream(KeyedStream<T, K> input, WindowAssigner<? super T, W> windowAssigner) { this.input = input; this.windowAssigner = windowAssigner; this.trigger = windowAssigner.getDefaultTrigger(input.getExecutionEnvironment()); } @PublicEvolving public WindowedStream<T, K, W> trigger(Trigger<? super T, ? super W> trigger) { if (windowAssigner instanceof MergingWindowAssigner && !trigger.canMerge()) { throw new UnsupportedOperationException(“A merging window assigner cannot be used with a trigger that does not support merging.”); } if (windowAssigner instanceof BaseAlignedWindowAssigner) { throw new UnsupportedOperationException(“Cannot use a " + windowAssigner.getClass().getSimpleName() + " with a custom trigger.”); } this.trigger = trigger; return this; } @PublicEvolving public WindowedStream<T, K, W> allowedLateness(Time lateness) { final long millis = lateness.toMilliseconds(); checkArgument(millis >= 0, “The allowed lateness cannot be negative.”); this.allowedLateness = millis; return this; } @PublicEvolving public WindowedStream<T, K, W> sideOutputLateData(OutputTag<T> outputTag) { Preconditions.checkNotNull(outputTag, “Side output tag must not be null.”); this.lateDataOutputTag = input.getExecutionEnvironment().clean(outputTag); return this; } @PublicEvolving public WindowedStream<T, K, W> evictor(Evictor<? super T, ? super W> evictor) { if (windowAssigner instanceof BaseAlignedWindowAssigner) { throw new UnsupportedOperationException(“Cannot use a " + windowAssigner.getClass().getSimpleName() + " with an Evictor.”); } this.evictor = evictor; return this; } // ———————————————————————— // Operations on the keyed windows // ———————————————————————— //……}WindowedStream有几个参数,其中构造器要求的是input及windowAssigner参数,然后还有Trigger、Evictor、allowedLateness、OutputTag这几个可选参数;另外还必须设置operation function,主要有ReduceFunction、AggregateFunction、FoldFunction(废弃)、ProcessWindowFunction这几个windowAssigner主要用来决定元素如何划分到window中,这里主要有TumblingEventTimeWindows/TumblingProcessingTimeWindows、SlidingEventTimeWindows/SlidingProcessingTimeWindows、EventTimeSessionWindows/ProcessingTimeSessionWindows、GlobalWindows这几个Trigger用来触发window的发射,Evictor用来在发射window的时候剔除元素,allowedLateness用于指定允许元素落后于watermark的最大时间,超出则被丢弃(仅仅对于event-time window有效),OutputTag用于将late数据输出到side output,可以通过SingleOutputStreamOperator.getSideOutput(OutputTag)方法来获取AllWindowedStream的属性/操作基本跟WindowedStream类似,这里就不详细展开小结window操作是处理无限数据流的核心,它将数据流分割为有限大小的buckets,然后就可以在这些有限数据上进行相关的操作。flink的window操作主要分为两大类,一类是针对KeyedStream的window操作,一个是针对non-key stream的windowAll操作window操作主要有几个参数,WindowAssigner是必不可少的参数,主要有TumblingEventTimeWindows/TumblingProcessingTimeWindows、SlidingEventTimeWindows/SlidingProcessingTimeWindows、EventTimeSessionWindows/ProcessingTimeSessionWindows、GlobalWindows这几个;另外还必须设置operation function,主要有ReduceFunction、AggregateFunction、FoldFunction(废弃)、ProcessWindowFunction这几个Trigger、Evictor、allowedLateness、OutputTag这几个为可选参数,Trigger用来触发window的发射,Evictor用来在发射window的时候剔除元素,allowedLateness用于指定允许元素落后于watermark的最大时间,超出则被丢弃(仅仅对于event-time window有效),OutputTag用于将late数据输出到side output,可以通过SingleOutputStreamOperator.getSideOutput(OutputTag)方法来获取docWindows ...
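下面给出一个把可选参数allowedLateness、sideOutputLateData串起来使用的简单示意(输入数据、OutputTag名称及timestamp提取逻辑均为演示用的假设,并非上文源码的一部分):

import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.OutputTag;

public class WindowOptionsDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // demo input; a real job would have its own source and timestamp/watermark assignment
        DataStream<Tuple2<String, Integer>> input = env
                .fromElements(Tuple2.of("a", 1), Tuple2.of("a", 2), Tuple2.of("b", 3))
                .assignTimestampsAndWatermarks(new AscendingTimestampExtractor<Tuple2<String, Integer>>() {
                    @Override
                    public long extractAscendingTimestamp(Tuple2<String, Integer> element) {
                        return System.currentTimeMillis(); // toy timestamps, for demo only
                    }
                });

        // elements arriving later than window end + allowedLateness go to this side output
        final OutputTag<Tuple2<String, Integer>> lateTag =
                new OutputTag<Tuple2<String, Integer>>("late-data") {};

        SingleOutputStreamOperator<Tuple2<String, Integer>> counts = input
                .keyBy(0)
                .window(TumblingEventTimeWindows.of(Time.seconds(10)))
                .allowedLateness(Time.seconds(5))
                .sideOutputLateData(lateTag)
                .reduce(new ReduceFunction<Tuple2<String, Integer>>() {
                    @Override
                    public Tuple2<String, Integer> reduce(Tuple2<String, Integer> v1, Tuple2<String, Integer> v2) {
                        return Tuple2.of(v1.f0, v1.f1 + v2.f1);
                    }
                });

        counts.print();
        counts.getSideOutput(lateTag).print(); // retrieve late elements via SingleOutputStreamOperator.getSideOutput

        env.execute("window options demo");
    }
}

注意OutputTag需要以匿名子类的方式创建以保留泛型信息;late数据最终通过SingleOutputStreamOperator.getSideOutput(lateTag)取回。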

January 1, 2019 · 3 min · jiezi

聊聊flink KeyedStream的aggregation操作

序本文主要研究一下flink KeyedStream的aggregation操作实例 @Test public void testMax() throws Exception { final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); WordCount[] data = new WordCount[]{new WordCount(1,“Hello”, 1), new WordCount(1,“World”, 3), new WordCount(2,“Hello”, 1)}; env.fromElements(data) .keyBy(“word”) .max(“frequency”) .addSink(new SinkFunction<WordCount>() { @Override public void invoke(WordCount value, Context context) throws Exception { LOGGER.info(“value:{}",value); } }); env.execute(“testMax”); }这里先对word字段进行keyBy操作,然后再通过KeyedStream的max方法按frequency字段取最大的WordCountKeyedStream.aggregateflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/KeyedStream.java public SingleOutputStreamOperator<T> sum(int positionToSum) { return aggregate(new SumAggregator<>(positionToSum, getType(), getExecutionConfig())); } public SingleOutputStreamOperator<T> sum(String field) { return aggregate(new SumAggregator<>(field, getType(), getExecutionConfig())); } public SingleOutputStreamOperator<T> max(int positionToMax) { return aggregate(new ComparableAggregator<>(positionToMax, getType(), AggregationFunction.AggregationType.MAX, getExecutionConfig())); } public SingleOutputStreamOperator<T> max(String field) { return aggregate(new ComparableAggregator<>(field, getType(), AggregationFunction.AggregationType.MAX, false, getExecutionConfig())); } public SingleOutputStreamOperator<T> min(int positionToMin) { return aggregate(new ComparableAggregator<>(positionToMin, getType(), AggregationFunction.AggregationType.MIN, getExecutionConfig())); } public SingleOutputStreamOperator<T> min(String field) { return aggregate(new ComparableAggregator<>(field, getType(), AggregationFunction.AggregationType.MIN, false, getExecutionConfig())); } public SingleOutputStreamOperator<T> maxBy(int positionToMaxBy) { return this.maxBy(positionToMaxBy, true); } public SingleOutputStreamOperator<T> maxBy(String positionToMaxBy) { return this.maxBy(positionToMaxBy, true); } public SingleOutputStreamOperator<T> maxBy(int positionToMaxBy, boolean first) { return aggregate(new ComparableAggregator<>(positionToMaxBy, getType(), AggregationFunction.AggregationType.MAXBY, first, getExecutionConfig())); } public SingleOutputStreamOperator<T> maxBy(String field, boolean first) { return aggregate(new ComparableAggregator<>(field, getType(), AggregationFunction.AggregationType.MAXBY, first, getExecutionConfig())); } public SingleOutputStreamOperator<T> minBy(int positionToMinBy) { return this.minBy(positionToMinBy, true); } public SingleOutputStreamOperator<T> minBy(String positionToMinBy) { return this.minBy(positionToMinBy, true); } public SingleOutputStreamOperator<T> minBy(int positionToMinBy, boolean first) { return aggregate(new ComparableAggregator<T>(positionToMinBy, getType(), AggregationFunction.AggregationType.MINBY, first, getExecutionConfig())); } public SingleOutputStreamOperator<T> minBy(String field, boolean first) { return aggregate(new ComparableAggregator(field, getType(), AggregationFunction.AggregationType.MINBY, first, getExecutionConfig())); } protected SingleOutputStreamOperator<T> aggregate(AggregationFunction<T> aggregate) { StreamGroupedReduce<T> operator = new StreamGroupedReduce<T>( clean(aggregate), getType().createSerializer(getExecutionConfig())); return transform(“Keyed Aggregation”, getType(), operator); 
}KeyedStream的aggregation方法是protected修饰的,sum、max、min、maxBy、minBy这几个方法实际都是调用了aggregate方法,只是它们创建的ComparableAggregator的AggregationType不一样每个sum、max、min、maxBy、minBy都有两个重载方法,一个是int类型的参数,一个是String类型的参数maxBy、minBy比sum、max、min多了boolean参数,该参数用于指定在碰到多个compare值相等时,是否取第一个返回ComparableAggregatorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/functions/aggregation/ComparableAggregator.java@Internalpublic class ComparableAggregator<T> extends AggregationFunction<T> { private static final long serialVersionUID = 1L; private Comparator comparator; private boolean byAggregate; private boolean first; private final FieldAccessor<T, Object> fieldAccessor; private ComparableAggregator(AggregationType aggregationType, FieldAccessor<T, Object> fieldAccessor, boolean first) { this.comparator = Comparator.getForAggregation(aggregationType); this.byAggregate = (aggregationType == AggregationType.MAXBY) || (aggregationType == AggregationType.MINBY); this.first = first; this.fieldAccessor = fieldAccessor; } public ComparableAggregator(int positionToAggregate, TypeInformation<T> typeInfo, AggregationType aggregationType, ExecutionConfig config) { this(positionToAggregate, typeInfo, aggregationType, false, config); } public ComparableAggregator(int positionToAggregate, TypeInformation<T> typeInfo, AggregationType aggregationType, boolean first, ExecutionConfig config) { this(aggregationType, FieldAccessorFactory.getAccessor(typeInfo, positionToAggregate, config), first); } public ComparableAggregator(String field, TypeInformation<T> typeInfo, AggregationType aggregationType, boolean first, ExecutionConfig config) { this(aggregationType, FieldAccessorFactory.getAccessor(typeInfo, field, config), first); } @SuppressWarnings(“unchecked”) @Override public T reduce(T value1, T value2) throws Exception { Comparable<Object> o1 = (Comparable<Object>) fieldAccessor.get(value1); Object o2 = fieldAccessor.get(value2); int c = comparator.isExtremal(o1, o2); if (byAggregate) { // if they are the same we choose based on whether we want to first or last // element with the min/max. if (c == 0) { return first ? value1 : value2; } return c == 1 ? value1 : value2; } else { if (c == 0) { value1 = fieldAccessor.set(value1, o2); } return value1; } }}ComparableAggregator继承了AggregationFunction,而AggregationFunction则实现了ReduceFunction接口,这里ComparableAggregator实现的reduce方法,它首先借助Comparator来比较两个对象,然后根据是否是byAggregate做不同处理,如果是byAggregate,则在比较值为0时,判断是否返回最先遇到的元素,如果是则返回最先遇到的,否则返回最后遇到的,比较值非0时,则取比较值最大的元素返回;如果不是byAggregate,则如果比较值为0,则使用反射方法将后者的值更新到value1,最后都是返回value1AggregationFunction@Internalpublic abstract class AggregationFunction<T> implements ReduceFunction<T> { private static final long serialVersionUID = 1L; /** * Aggregation types that can be used on a windowed stream or keyed stream. 
*/ public enum AggregationType { SUM, MIN, MAX, MINBY, MAXBY, }}AggregationFunction声明实现了ReduceFunction,同时定义了五种类型的AggregationTypeComparatorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/functions/aggregation/Comparator.java@Internalpublic abstract class Comparator implements Serializable { private static final long serialVersionUID = 1L; public abstract <R> int isExtremal(Comparable<R> o1, R o2); public static Comparator getForAggregation(AggregationType type) { switch (type) { case MAX: return new MaxComparator(); case MIN: return new MinComparator(); case MINBY: return new MinByComparator(); case MAXBY: return new MaxByComparator(); default: throw new IllegalArgumentException(“Unsupported aggregation type.”); } } private static class MaxComparator extends Comparator { private static final long serialVersionUID = 1L; @Override public <R> int isExtremal(Comparable<R> o1, R o2) { return o1.compareTo(o2) > 0 ? 1 : 0; } } private static class MaxByComparator extends Comparator { private static final long serialVersionUID = 1L; @Override public <R> int isExtremal(Comparable<R> o1, R o2) { int c = o1.compareTo(o2); if (c > 0) { return 1; } if (c == 0) { return 0; } else { return -1; } } } private static class MinByComparator extends Comparator { private static final long serialVersionUID = 1L; @Override public <R> int isExtremal(Comparable<R> o1, R o2) { int c = o1.compareTo(o2); if (c < 0) { return 1; } if (c == 0) { return 0; } else { return -1; } } } private static class MinComparator extends Comparator { private static final long serialVersionUID = 1L; @Override public <R> int isExtremal(Comparable<R> o1, R o2) { return o1.compareTo(o2) < 0 ? 1 : 0; } }}Comparator则实现Serializable接口,定义了isExtremal抽象方法,同时提供了getForAggregation工厂方法,根据不同的AggregationType创建不同的ComparatorComparator里头定义了MaxComparator、MinComparator、MinByComparator、MaxByComparator四个子类,它们都实现了isExtremal方法MaxComparator直接利用Comparable接口定义的compareTo方法,不过它的返回只有0和1,compareTo大于0的时候才返回1,否则返回0,也就是大于的情况才返回1,否则返回0;MaxByComparator也先根据Comparable接口定义的compareTo方法获取值,不过它的返回值有3种,大于0的时候返回1,等于0时返回0,小于0时返回-1,也就是大于的情况返回1,相等的情况返回0,小于的情况返回-1小结KeyedStream的aggregation操作主要分为sum、max、min、maxBy、minBy这几个方法,它们内部都调用了protected修饰的aggregation方法,只是它们创建的ComparableAggregator的AggregationType不一样ComparableAggregator继承了AggregationFunction,而AggregationFunction则实现了ReduceFunction接口,这里ComparableAggregator实现的reduce方法,它首先借助Comparator来比较两个对象,然后根据是否是byAggregate做不同处理,如果是byAggregate,则在比较值为0时,判断是否返回最先遇到的元素,如果是则返回最先遇到的,否则返回最后遇到的,比较值非0时,则取比较值最大的元素返回;如果不是byAggregate,则如果比较值为0,则使用反射方法将后者的值更新到value1,最后都是返回value1Comparator里头定义了MaxComparator、MinComparator、MinByComparator、MaxByComparator四个子类,它们都实现了isExtremal方法;MaxComparator与MaxByComparator的区别在于,MaxComparator大于返回1,小于等于返回0,而MaxByComparator返回值更精细,大于返回1,等于返回0,小于返回-1;这个区别也体现在ComparableAggregator的reduce方法中,而且maxBy、minBy比其他方法多了一个first(boolean)参数,专门用于在比较值为的0的时候选择返回哪个元素;而reduce方法对于非byAggregate操作,始终返回的是value1,在比较值小于等于的时候,使用反射更新value1,然后返回value1docDataStream Transformations ...
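结合上文WordCount的例子,用一个小示意对比一下max与maxBy的差异(env、WordCount沿用上文测试代码,注释中的输出为按上述reduce逻辑推演的结果,仅供参考):

DataStream<WordCount> source = env.fromElements(
        new WordCount(1, "Hello", 1),
        new WordCount(2, "Hello", 3));

// max("frequency"): non-aggregated fields keep the first element's values,
// so the last emitted value is (id=1, word=Hello, frequency=3)
source.keyBy("word").max("frequency").print();

// maxBy("frequency"): the whole element holding the max frequency is kept,
// so the last emitted value is (id=2, word=Hello, frequency=3)
source.keyBy("word").maxBy("frequency").print();

也就是说max只保证聚合字段取到最大值,其余字段保留最先遇到的元素的值;而maxBy返回的是聚合字段最大的那个完整元素,这正是ComparableAggregator.reduce中byAggregate分支与非byAggregate分支的差别。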

December 30, 2018 · 4 min · jiezi

聊聊flink KeyedStream的reduce操作

序本文主要研究一下flink KeyedStream的reduce操作实例 @Test public void testWordCount() throws Exception { // Checking input parameters// final ParameterTool params = ParameterTool.fromArgs(args); // set up the execution environment final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); // make parameters available in the web interface// env.getConfig().setGlobalJobParameters(params); // get input data DataStream<String> text = env.fromElements(WORDS); DataStream<Tuple2<String, Integer>> counts = // split up the lines in pairs (2-tuples) containing: (word,1) text.flatMap(new Tokenizer()) // group by the tuple field “0” and sum up tuple field “1” .keyBy(0) .reduce(new ReduceFunction<Tuple2<String, Integer>>() { @Override public Tuple2<String, Integer> reduce(Tuple2<String, Integer> value1, Tuple2<String, Integer> value2) { System.out.println(“value1:"+value1.f1+";value2:"+value2.f1); return new Tuple2<>(value1.f0, value1.f1 + value2.f1); } }); // emit result System.out.println(“Printing result to stdout. Use –output to specify output path.”); counts.print(); // execute program env.execute(“Streaming WordCount”); }这里对KeyedStream进行reduce操作,自定义了ReduceFunction,在reduce方法里头累加word的计数KeyedStream.reduceflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/KeyedStream.java@Publicpublic class KeyedStream<T, KEY> extends DataStream<T> { //…… /** * Applies a reduce transformation on the grouped data stream grouped on by * the given key position. The {@link ReduceFunction} will receive input * values based on the key value. Only input values with the same key will * go to the same reducer. * * @param reducer * The {@link ReduceFunction} that will be called for every * element of the input values with the same key. * @return The transformed DataStream. / public SingleOutputStreamOperator<T> reduce(ReduceFunction<T> reducer) { return transform(“Keyed Reduce”, getType(), new StreamGroupedReduce<T>( clean(reducer), getType().createSerializer(getExecutionConfig()))); } @Override @PublicEvolving public <R> SingleOutputStreamOperator<R> transform(String operatorName, TypeInformation<R> outTypeInfo, OneInputStreamOperator<T, R> operator) { SingleOutputStreamOperator<R> returnStream = super.transform(operatorName, outTypeInfo, operator); // inject the key selector and key type OneInputTransformation<T, R> transform = (OneInputTransformation<T, R>) returnStream.getTransformation(); transform.setStateKeySelector(keySelector); transform.setStateKeyType(keyType); return returnStream; } //……}KeyedStream的reduce方法调用了transform方法,而构造的OneInputStreamOperator为StreamGroupedReduceReduceFunctionflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/functions/ReduceFunction.java@Public@FunctionalInterfacepublic interface ReduceFunction<T> extends Function, Serializable { /* * The core method of ReduceFunction, combining two values into one value of the same type. * The reduce function is consecutively applied to all values of a group until only a single value remains. * * @param value1 The first value to combine. * @param value2 The second value to combine. * @return The combined value of both input values. * * @throws Exception This method may throw exceptions. Throwing an exception will cause the operation * to fail and may trigger recovery. 
/ T reduce(T value1, T value2) throws Exception;}ReduceFunction定义了reduce方法,它主要是用来将两个同类型的值操作为一个同类型的值,第一个参数为前面reduce的结果,第二参数为当前的元素Task.runflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/taskmanager/Task.java/* * The Task represents one execution of a parallel subtask on a TaskManager. * A Task wraps a Flink operator (which may be a user function) and * runs it, providing all services necessary for example to consume input data, * produce its results (intermediate result partitions) and communicate * with the JobManager. * * <p>The Flink operators (implemented as subclasses of * {@link AbstractInvokable} have only data readers, -writers, and certain event callbacks. * The task connects those to the network stack and actor messages, and tracks the state * of the execution and handles exceptions. * * <p>Tasks have no knowledge about how they relate to other tasks, or whether they * are the first attempt to execute the task, or a repeated attempt. All of that * is only known to the JobManager. All the task knows are its own runnable code, * the task’s configuration, and the IDs of the intermediate results to consume and * produce (if any). * * <p>Each Task is run by one dedicated thread. /public class Task implements Runnable, TaskActions, CheckpointListener { //…… /* * The core work method that bootstraps the task and executes its code. / @Override public void run() { // —————————- // Initial State transition // —————————- //…… // all resource acquisitions and registrations from here on // need to be undone in the end Map<String, Future<Path>> distributedCacheEntries = new HashMap<>(); AbstractInvokable invokable = null; try { // now load and instantiate the task’s invokable code invokable = loadAndInstantiateInvokable(userCodeClassLoader, nameOfInvokableClass, env); // —————————————————————- // actual task core work // —————————————————————- // we must make strictly sure that the invokable is accessible to the cancel() call // by the time we switched to running. 
this.invokable = invokable; // switch to the RUNNING state, if that fails, we have been canceled/failed in the meantime if (!transitionState(ExecutionState.DEPLOYING, ExecutionState.RUNNING)) { throw new CancelTaskException(); } // notify everyone that we switched to running taskManagerActions.updateTaskExecutionState(new TaskExecutionState(jobId, executionId, ExecutionState.RUNNING)); // make sure the user code classloader is accessible thread-locally executingThread.setContextClassLoader(userCodeClassLoader); // run the invokable invokable.invoke(); //…… } catch (Throwable t) { //…… } finally { //…… } }}Task的run方法会调用invokable.invoke(),这里的invokable为OneInputStreamTask,而OneInputStreamTask继承了StreamTask,这里实际调用的invoke()方法是StreamTask里头的StreamTask.invokeflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/runtime/tasks/StreamTask.java@Internalpublic abstract class StreamTask<OUT, OP extends StreamOperator<OUT>> extends AbstractInvokable implements AsyncExceptionHandler { //…… protected abstract void run() throws Exception; @Override public final void invoke() throws Exception { boolean disposed = false; try { // ——– Initialize ——— LOG.debug(“Initializing {}.”, getName()); asyncOperationsThreadPool = Executors.newCachedThreadPool(); CheckpointExceptionHandlerFactory cpExceptionHandlerFactory = createCheckpointExceptionHandlerFactory(); synchronousCheckpointExceptionHandler = cpExceptionHandlerFactory.createCheckpointExceptionHandler( getExecutionConfig().isFailTaskOnCheckpointError(), getEnvironment()); asynchronousCheckpointExceptionHandler = new AsyncCheckpointExceptionHandler(this); stateBackend = createStateBackend(); checkpointStorage = stateBackend.createCheckpointStorage(getEnvironment().getJobID()); // if the clock is not already set, then assign a default TimeServiceProvider if (timerService == null) { ThreadFactory timerThreadFactory = new DispatcherThreadFactory(TRIGGER_THREAD_GROUP, “Time Trigger for " + getName(), getUserCodeClassLoader()); timerService = new SystemProcessingTimeService(this, getCheckpointLock(), timerThreadFactory); } operatorChain = new OperatorChain<>(this, streamRecordWriters); headOperator = operatorChain.getHeadOperator(); // task specific initialization init(); // save the work of reloading state, etc, if the task is already canceled if (canceled) { throw new CancelTaskException(); } // ——– Invoke ——– LOG.debug(“Invoking {}”, getName()); // we need to make sure that any triggers scheduled in open() cannot be // executed before all operators are opened synchronized (lock) { // both the following operations are protected by the lock // so that we avoid race conditions in the case that initializeState() // registers a timer, that fires before the open() is called. 
initializeState(); openAllOperators(); } // final check to exit early before starting to run if (canceled) { throw new CancelTaskException(); } // let the task do its work isRunning = true; run(); // if this left the run() method cleanly despite the fact that this was canceled, // make sure the “clean shutdown” is not attempted if (canceled) { throw new CancelTaskException(); } LOG.debug(“Finished task {}”, getName()); //…… } finally { //…… } }}StreamTask的invoke方法会调用run方法,该方法为抽象方法,由子类实现,这里就是OneInputStreamTask的run方法OneInputStreamTask.runflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/runtime/tasks/OneInputStreamTask.java@Internalpublic class OneInputStreamTask<IN, OUT> extends StreamTask<OUT, OneInputStreamOperator<IN, OUT>> { private StreamInputProcessor<IN> inputProcessor; private volatile boolean running = true; private final WatermarkGauge inputWatermarkGauge = new WatermarkGauge(); /* * Constructor for initialization, possibly with initial state (recovery / savepoint / etc). * * @param env The task environment for this task. / public OneInputStreamTask(Environment env) { super(env); } /* * Constructor for initialization, possibly with initial state (recovery / savepoint / etc). * * <p>This constructor accepts a special {@link ProcessingTimeService}. By default (and if * null is passes for the time provider) a {@link SystemProcessingTimeService DefaultTimerService} * will be used. * * @param env The task environment for this task. * @param timeProvider Optionally, a specific time provider to use. / @VisibleForTesting public OneInputStreamTask( Environment env, @Nullable ProcessingTimeService timeProvider) { super(env, timeProvider); } @Override public void init() throws Exception { StreamConfig configuration = getConfiguration(); TypeSerializer<IN> inSerializer = configuration.getTypeSerializerIn1(getUserCodeClassLoader()); int numberOfInputs = configuration.getNumberOfInputs(); if (numberOfInputs > 0) { InputGate[] inputGates = getEnvironment().getAllInputGates(); inputProcessor = new StreamInputProcessor<>( inputGates, inSerializer, this, configuration.getCheckpointMode(), getCheckpointLock(), getEnvironment().getIOManager(), getEnvironment().getTaskManagerInfo().getConfiguration(), getStreamStatusMaintainer(), this.headOperator, getEnvironment().getMetricGroup().getIOMetricGroup(), inputWatermarkGauge); } headOperator.getMetricGroup().gauge(MetricNames.IO_CURRENT_INPUT_WATERMARK, this.inputWatermarkGauge); // wrap watermark gauge since registered metrics must be unique getEnvironment().getMetricGroup().gauge(MetricNames.IO_CURRENT_INPUT_WATERMARK, this.inputWatermarkGauge::getValue); } @Override protected void run() throws Exception { // cache processor reference on the stack, to make the code more JIT friendly final StreamInputProcessor<IN> inputProcessor = this.inputProcessor; while (running && inputProcessor.processInput()) { // all the work happens in the “processInput” method } } @Override protected void cleanup() throws Exception { if (inputProcessor != null) { inputProcessor.cleanup(); } } @Override protected void cancelTask() { running = false; }}OneInputStreamTask的run方法会不断循环调用inputProcessor.processInput(),inputProcessor这里为StreamInputProcessorStreamInputProcessor.processInputflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/runtime/io/StreamInputProcessor.java@Internalpublic class StreamInputProcessor<IN> { //…… public boolean processInput() throws Exception { if (isFinished) { return false; } if (numRecordsIn == null) { try { 
numRecordsIn = ((OperatorMetricGroup) streamOperator.getMetricGroup()).getIOMetricGroup().getNumRecordsInCounter(); } catch (Exception e) { LOG.warn(“An exception occurred during the metrics setup.”, e); numRecordsIn = new SimpleCounter(); } } while (true) { if (currentRecordDeserializer != null) { DeserializationResult result = currentRecordDeserializer.getNextRecord(deserializationDelegate); if (result.isBufferConsumed()) { currentRecordDeserializer.getCurrentBuffer().recycleBuffer(); currentRecordDeserializer = null; } if (result.isFullRecord()) { StreamElement recordOrMark = deserializationDelegate.getInstance(); if (recordOrMark.isWatermark()) { // handle watermark statusWatermarkValve.inputWatermark(recordOrMark.asWatermark(), currentChannel); continue; } else if (recordOrMark.isStreamStatus()) { // handle stream status statusWatermarkValve.inputStreamStatus(recordOrMark.asStreamStatus(), currentChannel); continue; } else if (recordOrMark.isLatencyMarker()) { // handle latency marker synchronized (lock) { streamOperator.processLatencyMarker(recordOrMark.asLatencyMarker()); } continue; } else { // now we can do the actual processing StreamRecord<IN> record = recordOrMark.asRecord(); synchronized (lock) { numRecordsIn.inc(); streamOperator.setKeyContextElement1(record); streamOperator.processElement(record); } return true; } } } //…… } } //……}StreamInputProcessor的processInput方法,会在while true循环里头不断处理nextRecord,这里根据StreamElement的不同类型做不同处理,如果是普通的数据,则调用streamOperator.processElement进行处理,这里的streamOperator为StreamGroupedReduceStreamGroupedReduce.processElementflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/operators/StreamGroupedReduce.java/* * A {@link StreamOperator} for executing a {@link ReduceFunction} on a * {@link org.apache.flink.streaming.api.datastream.KeyedStream}. 
*/@Internalpublic class StreamGroupedReduce<IN> extends AbstractUdfStreamOperator<IN, ReduceFunction<IN>> implements OneInputStreamOperator<IN, IN> { private static final long serialVersionUID = 1L; private static final String STATE_NAME = “_op_state”; private transient ValueState<IN> values; private TypeSerializer<IN> serializer; public StreamGroupedReduce(ReduceFunction<IN> reducer, TypeSerializer<IN> serializer) { super(reducer); this.serializer = serializer; } @Override public void open() throws Exception { super.open(); ValueStateDescriptor<IN> stateId = new ValueStateDescriptor<>(STATE_NAME, serializer); values = getPartitionedState(stateId); } @Override public void processElement(StreamRecord<IN> element) throws Exception { IN value = element.getValue(); IN currentValue = values.value(); if (currentValue != null) { IN reduced = userFunction.reduce(currentValue, value); values.update(reduced); output.collect(element.replace(reduced)); } else { values.update(value); output.collect(element.replace(value)); } }}StreamGroupedReduce使用了ValueState存储reduce操作的结果值,在processElement方法里头调用userFunction的reduce操作,userFunction就是用户自定义的ReduceFunction,而reduce的第一个参数就是ValueState的value,即上一次reduce操作的结果值,然后第二个参数就当前element的value;而在执行完userFunction的reduce操作之后,会将该结果update到ValueState小结KeyedStream的reduce方法,里头调用了transform方法,而构造的OneInputStreamOperator为StreamGroupedReduce;reduce方法接收的是ReduceFunction,它定义了reduce方法,用来将两个同类型的值操作为一个同类型的值Task的run方法会调用invokable.invoke(),这里的invokable为OneInputStreamTask,而OneInputStreamTask继承了StreamTask,这里实际调用的invoke()方法是StreamTask里头的;StreamTask的invoke方法会调用run方法,该方法为抽象方法,由子类实现,这里就是OneInputStreamTask的run方法;OneInputStreamTask的run方法,会不断循环调用inputProcessor.processInput(),inputProcessor这里为StreamInputProcessor;StreamInputProcessor的processInput方法,会在while true循环里头不断处理nextRecord,这里根据StreamElement的不同类型做不同处理,如果是普通的数据,则调用streamOperator.processElement进行处理,这里的streamOperator为StreamGroupedReduceStreamGroupedReduce的processElement方法会调用userFunction的reduce操作,第一个参数就是ValueState的value,即上一次reduce操作的结果值,然后第二个参数就当前element的value;而在执行完userFunction的reduce操作之后,会将该结果update到ValueStatedocdatastream-transformations ...
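下面用一个小示意说明StreamGroupedReduce的这种滚动语义:每来一个元素就基于ValueState中的上一次结果做一次reduce并立即输出(输入数据为演示用的假设):

DataStream<Tuple2<String, Integer>> source = env.fromElements(
        Tuple2.of("a", 1), Tuple2.of("a", 2), Tuple2.of("a", 3));

source.keyBy(0)
      .reduce(new ReduceFunction<Tuple2<String, Integer>>() {
          @Override
          public Tuple2<String, Integer> reduce(Tuple2<String, Integer> previous, Tuple2<String, Integer> current) {
              // previous is the value currently held in StreamGroupedReduce's ValueState
              return Tuple2.of(previous.f0, previous.f1 + current.f1);
          }
      })
      .print(); // emits (a,1), (a,3), (a,6) -- one updated result per input element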

December 29, 2018 · 7 min · jiezi

聊聊flink KeyedStream的KeySelector

序本文主要研究一下flink KeyedStream的KeySelectorKeyedStreamflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/KeyedStream.java@Publicpublic class KeyedStream<T, KEY> extends DataStream<T> { /** * The key selector that can get the key by which the stream if partitioned from the elements. / private final KeySelector<T, KEY> keySelector; /* The type of the key by which the stream is partitioned. / private final TypeInformation<KEY> keyType; /* * Creates a new {@link KeyedStream} using the given {@link KeySelector} * to partition operator state by key. * * @param dataStream * Base stream of data * @param keySelector * Function for determining state partitions / public KeyedStream(DataStream<T> dataStream, KeySelector<T, KEY> keySelector) { this(dataStream, keySelector, TypeExtractor.getKeySelectorTypes(keySelector, dataStream.getType())); } /* * Creates a new {@link KeyedStream} using the given {@link KeySelector} * to partition operator state by key. * * @param dataStream * Base stream of data * @param keySelector * Function for determining state partitions / public KeyedStream(DataStream<T> dataStream, KeySelector<T, KEY> keySelector, TypeInformation<KEY> keyType) { this( dataStream, new PartitionTransformation<>( dataStream.getTransformation(), new KeyGroupStreamPartitioner<>(keySelector, StreamGraphGenerator.DEFAULT_LOWER_BOUND_MAX_PARALLELISM)), keySelector, keyType); } /* * Creates a new {@link KeyedStream} using the given {@link KeySelector} and {@link TypeInformation} * to partition operator state by key, where the partitioning is defined by a {@link PartitionTransformation}. * * @param stream * Base stream of data * @param partitionTransformation * Function that determines how the keys are distributed to downstream operator(s) * @param keySelector * Function to extract keys from the base stream * @param keyType * Defines the type of the extracted keys / @Internal KeyedStream( DataStream<T> stream, PartitionTransformation<T> partitionTransformation, KeySelector<T, KEY> keySelector, TypeInformation<KEY> keyType) { super(stream.getExecutionEnvironment(), partitionTransformation); this.keySelector = clean(keySelector); this.keyType = validateKeyType(keyType); } //……}这里可以看到KeyedStream的不同构造器中都需要一个KeySelector类型的参数KeySelectorflink-core-1.7.0-sources.jar!/org/apache/flink/api/java/functions/KeySelector.java@Public@FunctionalInterfacepublic interface KeySelector<IN, KEY> extends Function, Serializable { /* * User-defined function that deterministically extracts the key from an object. * * <p>For example for a class: * <pre> * public class Word { * String word; * int count; * } * </pre> * The key extractor could return the word as * a key to group all Word objects by the String they contain. * * <p>The code would look like this * <pre> * public String getKey(Word w) { * return w.word; * } * </pre> * * @param value The object to get the key from. * @return The extracted key. * * @throws Exception Throwing an exception will cause the execution of the respective task to fail, * and trigger recovery or cancellation of the program. / KEY getKey(IN value) throws Exception;}KeySelector接口继承了Function接口,定义了getKey方法,用于从IN类型中提取出KEYDataStream.keyByflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/DataStream.java /* * It creates a new {@link KeyedStream} that uses the provided key for partitioning * its operator states. 
* * @param key * The KeySelector to be used for extracting the key for partitioning * @return The {@link DataStream} with partitioned state (i.e. KeyedStream) / public <K> KeyedStream<T, K> keyBy(KeySelector<T, K> key) { Preconditions.checkNotNull(key); return new KeyedStream<>(this, clean(key)); } /* * It creates a new {@link KeyedStream} that uses the provided key with explicit type information * for partitioning its operator states. * * @param key The KeySelector to be used for extracting the key for partitioning. * @param keyType The type information describing the key type. * @return The {@link DataStream} with partitioned state (i.e. KeyedStream) / public <K> KeyedStream<T, K> keyBy(KeySelector<T, K> key, TypeInformation<K> keyType) { Preconditions.checkNotNull(key); Preconditions.checkNotNull(keyType); return new KeyedStream<>(this, clean(key), keyType); } /* * Partitions the operator state of a {@link DataStream} by the given key positions. * * @param fields * The position of the fields on which the {@link DataStream} * will be grouped. * @return The {@link DataStream} with partitioned state (i.e. KeyedStream) / public KeyedStream<T, Tuple> keyBy(int… fields) { if (getType() instanceof BasicArrayTypeInfo || getType() instanceof PrimitiveArrayTypeInfo) { return keyBy(KeySelectorUtil.getSelectorForArray(fields, getType())); } else { return keyBy(new Keys.ExpressionKeys<>(fields, getType())); } } /* * Partitions the operator state of a {@link DataStream} using field expressions. * A field expression is either the name of a public field or a getter method with parentheses * of the {@link DataStream}’s underlying type. A dot can be used to drill * down into objects, as in {@code “field1.getInnerField2()” }. * * @param fields * One or more field expressions on which the state of the {@link DataStream} operators will be * partitioned. * @return The {@link DataStream} with partitioned state (i.e. KeyedStream) / public KeyedStream<T, Tuple> keyBy(String… fields) { return keyBy(new Keys.ExpressionKeys<>(fields, getType())); } private KeyedStream<T, Tuple> keyBy(Keys<T> keys) { return new KeyedStream<>(this, clean(KeySelectorUtil.getSelectorForKeys(keys, getType(), getExecutionConfig()))); }DataStream的keyBy方法用于将DataStream转换为KeyedStream,该方法有不同的重载一个是支持变长int数组,这个通常用于简单tuple类型,int为tuple的小标,从0开始,如果是多个int,表示是组合key,比如keyBy(0,1)表示要用tuple的第一个和第二个字段作为key;一个是支持变长String数组,这个通常用于复杂tuple类型及POJO类型,对于POJO,String用于指定字段名,也支持对象/tuple嵌套属性,比如user.zip,对于对象类型的tuple,f0表示该tuple的第一个字段一个是支持KeySelector,通过Key Selector Function可以自由指定key,比如从对象提取然后做些处理keyBy(int… fields)及keyBy(String… fields)里头均有调用到私有的keyBy(Keys<T> keys)方法,由于KeyedStream的构造器都需要KeySelector参数,所以该方法最后也是通过KeySelectorUtil.getSelectorForKeys将Keys转换为KeySelector对象Keys.ExpressionKeysflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/operators/Keys.java / * Represents (nested) field access through string and integer-based keys / public static class ExpressionKeys<T> extends Keys<T> { public static final String SELECT_ALL_CHAR = “”; public static final String SELECT_ALL_CHAR_SCALA = “_”; private static final Pattern WILD_CARD_REGEX = Pattern.compile("[\.]?(" + “\” + SELECT_ALL_CHAR + “|” + “\” + SELECT_ALL_CHAR_SCALA +")$"); // Flattened fields representing keys fields private List<FlatFieldDescriptor> keyFields; private TypeInformation<?>[] originalKeyTypes; //…… /** * Create String-based (nested) field expression keys on a composite type. 
*/ public ExpressionKeys(String[] keyExpressions, TypeInformation<T> type) { checkNotNull(keyExpressions, “Field expression cannot be null.”); this.keyFields = new ArrayList<>(keyExpressions.length); if (type instanceof CompositeType){ CompositeType<T> cType = (CompositeType<T>) type; this.originalKeyTypes = new TypeInformation<?>[keyExpressions.length]; // extract the keys on their flat position for (int i = 0; i < keyExpressions.length; i++) { String keyExpr = keyExpressions[i]; if (keyExpr == null) { throw new InvalidProgramException(“Expression key may not be null.”); } // strip off whitespace keyExpr = keyExpr.trim(); List<FlatFieldDescriptor> flatFields = cType.getFlatFields(keyExpr); if (flatFields.size() == 0) { throw new InvalidProgramException(“Unable to extract key from expression ‘” + keyExpr + “’ on key " + cType); } // check if all nested fields can be used as keys for (FlatFieldDescriptor field : flatFields) { if (!field.getType().isKeyType()) { throw new InvalidProgramException(“This type (” + field.getType() + “) cannot be used as key.”); } } // add flat fields to key fields keyFields.addAll(flatFields); String strippedKeyExpr = WILD_CARD_REGEX.matcher(keyExpr).replaceAll(”"); if (strippedKeyExpr.isEmpty()) { this.originalKeyTypes[i] = type; } else { this.originalKeyTypes[i] = cType.getTypeAt(strippedKeyExpr); } } } else { if (!type.isKeyType()) { throw new InvalidProgramException(“This type (” + type + “) cannot be used as key.”); } // check that all key expressions are valid for (String keyExpr : keyExpressions) { if (keyExpr == null) { throw new InvalidProgramException(“Expression key may not be null.”); } // strip off whitespace keyExpr = keyExpr.trim(); // check that full type is addressed if (!(SELECT_ALL_CHAR.equals(keyExpr) || SELECT_ALL_CHAR_SCALA.equals(keyExpr))) { throw new InvalidProgramException( “Field expression must be equal to ‘” + SELECT_ALL_CHAR + “’ or ‘” + SELECT_ALL_CHAR_SCALA + “’ for non-composite types.”); } // add full type as key keyFields.add(new FlatFieldDescriptor(0, type)); } this.originalKeyTypes = new TypeInformation[] {type}; } } //…… }ExpressionKeys是Keys里头的一个静态类,它继承了Keys对象;keyBy(int… fields)及keyBy(String… fields)里头均有通过new Keys.ExpressionKeys,将fields转换为Keys.ExpressionKeys,最后调用私有的keyBy(Keys<T> keys)方法KeySelectorUtil.getSelectorForKeysflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/util/keys/KeySelectorUtil.java@Internalpublic final class KeySelectorUtil { public static <X> KeySelector<X, Tuple> getSelectorForKeys(Keys<X> keys, TypeInformation<X> typeInfo, ExecutionConfig executionConfig) { if (!(typeInfo instanceof CompositeType)) { throw new InvalidTypesException( “This key operation requires a composite type such as Tuples, POJOs, or Case Classes.”); } CompositeType<X> compositeType = (CompositeType<X>) typeInfo; int[] logicalKeyPositions = keys.computeLogicalKeyPositions(); int numKeyFields = logicalKeyPositions.length; TypeInformation<?>[] typeInfos = keys.getKeyFieldTypes(); // use ascending order here, the code paths for that are usually a slight bit faster boolean[] orders = new boolean[numKeyFields]; for (int i = 0; i < numKeyFields; i++) { orders[i] = true; } TypeComparator<X> comparator = compositeType.createComparator(logicalKeyPositions, orders, 0, executionConfig); return new ComparableKeySelector<>(comparator, numKeyFields, new TupleTypeInfo<>(typeInfos)); } 
//……}KeySelectorUtil.getSelectorForKeys方法用于将Keys转换为KeySelector类型小结KeyedStream的不同构造器中都需要一个KeySelector参数DataStream的keyBy方法有不同的重载,支持变长int数组,变长String数组以及KeySelector类型keyBy(int… fields)及keyBy(String… fields)里头均有通过new Keys.ExpressionKeys,将fields转换为Keys.ExpressionKeys,最后调用私有的keyBy(Keys<T> keys)方法,该方法通过调用KeySelectorUtil.getSelectorForKeys方法将Keys转换为KeySelector类型docSpecifying Keys ...
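下面用一个小示意对比keyBy的三种用法(其中tupleStream、eventStream以及带public userId字段的Event POJO均为演示用的假设):

// 1) position-based, only for tuple types
tupleStream.keyBy(0);

// 2) field-expression based, for POJOs and tuples (nested fields like "user.zip" are supported)
eventStream.keyBy("userId");

// 3) an explicit KeySelector; the two variants above are converted into a KeySelector internally
eventStream.keyBy(new KeySelector<Event, String>() {
    @Override
    public String getKey(Event value) {
        return value.userId; // any deterministic derivation of the key works here
    }
});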

December 28, 2018 · 6 min · jiezi

聊聊flink的Queryable State

序本文主要研究一下flink的Queryable State实例Job @Test public void testValueStateForQuery() throws Exception { StreamExecutionEnvironment env = StreamExecutionEnvironment .createRemoteEnvironment(“192.168.99.100”, 8081, SubmitTest.JAR_FILE); env.addSource(new RandomTuple2Source()) .keyBy(0) //key by first value of tuple .flatMap(new CountWindowAverage()) .print(); JobExecutionResult result = env.execute(“testQueryableState”); LOGGER.info(“submit job result:{}",result); }这里运行一个job,它对tuple的第一个值作为key,然后flatMap操作使用的是CountWindowAverageCountWindowAveragepublic class CountWindowAverage extends RichFlatMapFunction<Tuple2<Long, Long>, Tuple2<Long, Long>> { private transient ValueState<Tuple2<Long, Long>> sum; // a tuple containing the count and the sum @Override public void flatMap(Tuple2<Long, Long> input, Collector<Tuple2<Long, Long>> out) throws Exception { Tuple2<Long, Long> currentSum = sum.value(); if(currentSum == null){ currentSum = Tuple2.of(1L,input.f1); }else{ currentSum.f0 += 1; currentSum.f1 += input.f1; } sum.update(currentSum); if (currentSum.f0 >= 2) { out.collect(new Tuple2<>(input.f0, currentSum.f1 / currentSum.f0)); sum.clear(); } } @Override public void open(Configuration config) { ValueStateDescriptor<Tuple2<Long, Long>> descriptor = new ValueStateDescriptor<>( “average”, // the state name TypeInformation.of(new TypeHint<Tuple2<Long, Long>>() {})); // type information descriptor.setQueryable(“query-name”); sum = getRuntimeContext().getState(descriptor); }}CountWindowAverage通过ValueStateDescriptor的setQueryable(“query-name”)方法,将state声明为是queryable的QueryableStateClient @Test public void testQueryStateByJobId() throws InterruptedException, IOException { //get jobId from flink ui running job page JobID jobId = JobID.fromHexString(“793edfa93f354aa0274f759cb13ce79e”); long key = 1L; //flink-core-1.7.0-sources.jar!/org/apache/flink/configuration/QueryableStateOptions.java QueryableStateClient client = new QueryableStateClient(“192.168.99.100”, 9069); // the state descriptor of the state to be fetched. ValueStateDescriptor<Tuple2<Long, Long>> descriptor = new ValueStateDescriptor<>( “average”, TypeInformation.of(new TypeHint<Tuple2<Long, Long>>() {})); CompletableFuture<ValueState<Tuple2<Long, Long>>> resultFuture = client.getKvState(jobId, “query-name”, key, BasicTypeInfo.LONG_TYPE_INFO, descriptor); LOGGER.info(“get kv state return future, waiting……”); // org.apache.flink.queryablestate.exceptions.UnknownKeyOrNamespaceException: Queryable State Server : No state for the specified key/namespace. 
ValueState<Tuple2<Long, Long>> res = resultFuture.join(); LOGGER.info(“query result:{}",res.value()); client.shutdownAndWait(); }这里通过QueryableStateClient连接QueryableStateClientProxy进行query state;这里的jobId可以在job提交之后,通过ui界面查询得到,然后使用JobID.fromHexString方法转为JobID对象小结Queryable State的功能目前是beta版本,flink1.7的发行版默认没有开启,要开启的话,需要将flink-queryable-state-runtime_2.11-1.7.0.jar拷贝到/opt/flink/lib/目录下,这样子task manager启动的时候会打印诸如Started Queryable State Proxy Server @ /172.20.0.3:9069的日志,这样子就可以确认是启用了该功能Queryable State在架构上涉及三个组件,一个是QueryableStateServer,它会在每个task manager上运行,负责本地state存储;一个是QueryableStateClientProxy,它也在每个task manager上运行,负责接收client发来的查询请求,然后从对应的task manager上获取对应的state,然后返回给client;一个是QueryableStateClient,它就是通常是运行在flink cluster之外,用于提交用户的state queryQueryableStateServer以及QueryableStateClientProxy均有ports、network-threads、query-threads的属性可以配置;QueryableStateServer默认的query.server.ports值为9097;QueryableStateClientProxy默认的query.proxy.ports值为9096,client端需要使用这个端口来进行请求声明state为queryable有两个方法,一个是通过KeyedStream.asQueryableState方法转为QueryableStateStream;一个是调用Managed keyed State的StateDescriptor的setQueryable进行声明;这两个的区别在于asQueryableState必须是直接作用于KeyedStream对象,因此KeyedStream就不能做后续的transform操作,类似于sink;而通过StateDescriptor的setQueryable进行声明则相对灵活一点;这里要注意没有queryable ListStateQueryable State目前有几点限制,一个是它生命周期跟task一样,在task运行完的时候就销毁了,没办法查询,后续可能支持在task完成之后查询;一个是目前的KvState的Notifications进行使用tell机制,后续可能改为ack模式;一个是目前query的statistics默认是禁用的,后续可能支持发布到metrics systemdocQueryable State Beta ...
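对于小结中提到的KeyedStream.asQueryableState方式,可以参考如下示意(沿用上文的RandomTuple2Source、state名称average及query-name,仅作演示):

ValueStateDescriptor<Tuple2<Long, Long>> descriptor = new ValueStateDescriptor<>(
        "average",
        TypeInformation.of(new TypeHint<Tuple2<Long, Long>>() {}));

// asQueryableState acts like a sink: no further transformation can be chained after it
env.addSource(new RandomTuple2Source())
   .keyBy(0)
   .asQueryableState("query-name", descriptor);

之后客户端仍然像上文那样通过QueryableStateClient.getKvState以query-name进行查询。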

December 27, 2018 · 2 min · jiezi

聊聊flink的Broadcast State

序本文主要研究一下flink的Broadcast State实例 @Test public void testBroadcastState() throws Exception { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); DataStreamSource<String> originStream = env.addSource(new RandomWordSource()); MapStateDescriptor<String, String> descriptor = new MapStateDescriptor(“dynamicConfig”, BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO); BroadcastStream<Tuple2<String,String>> configStream = env.addSource(new DynamicConfigSource()).broadcast(descriptor); BroadcastConnectedStream<String, Tuple2<String,String>> connectStream = originStream.connect(configStream); connectStream.process(new BroadcastProcessFunction<String, Tuple2<String,String>, Void>() { @Override public void processElement(String value, ReadOnlyContext ctx, Collector<Void> out) throws Exception { ReadOnlyBroadcastState<String,String> config = ctx.getBroadcastState(descriptor); String configValue = config.get(“demoConfigKey”); //do some process base on the config LOGGER.info(“process value:{},config:{}",value,configValue); } @Override public void processBroadcastElement(Tuple2<String, String> value, Context ctx, Collector<Void> out) throws Exception { LOGGER.info(“receive config item:{}",value); //update state ctx.getBroadcastState(descriptor).put(value.getField(0),value.getField(1)); } }); env.execute(“testBroadcastState”); }public class DynamicConfigSource implements SourceFunction<Tuple2<String,String>> { private volatile boolean isRunning = true; @Override public void run(SourceContext<Tuple2<String, String>> ctx) throws Exception { long idx = 1; while (isRunning){ ctx.collect(Tuple2.of(“demoConfigKey”,“value” + idx)); idx++; TimeUnit.SECONDS.sleep(10); } } @Override public void cancel() { isRunning = false; }}这里模拟了一个配置的source,定时去刷新配置,然后broadcast到每个taskMapStateDescriptorflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/state/MapStateDescriptor.java@PublicEvolvingpublic class MapStateDescriptor<UK, UV> extends StateDescriptor<MapState<UK, UV>, Map<UK, UV>> { private static final long serialVersionUID = 1L; /** * Create a new {@code MapStateDescriptor} with the given name and the given type serializers. * * @param name The name of the {@code MapStateDescriptor}. * @param keySerializer The type serializer for the keys in the state. * @param valueSerializer The type serializer for the values in the state. / public MapStateDescriptor(String name, TypeSerializer<UK> keySerializer, TypeSerializer<UV> valueSerializer) { super(name, new MapSerializer<>(keySerializer, valueSerializer), null); } /* * Create a new {@code MapStateDescriptor} with the given name and the given type information. * * @param name The name of the {@code MapStateDescriptor}. * @param keyTypeInfo The type information for the keys in the state. * @param valueTypeInfo The type information for the values in the state. / public MapStateDescriptor(String name, TypeInformation<UK> keyTypeInfo, TypeInformation<UV> valueTypeInfo) { super(name, new MapTypeInfo<>(keyTypeInfo, valueTypeInfo), null); } /* * Create a new {@code MapStateDescriptor} with the given name and the given type information. * * <p>If this constructor fails (because it is not possible to describe the type via a class), * consider using the {@link #MapStateDescriptor(String, TypeInformation, TypeInformation)} constructor. * * @param name The name of the {@code MapStateDescriptor}. * @param keyClass The class of the type of keys in the state. * @param valueClass The class of the type of values in the state. 
/ public MapStateDescriptor(String name, Class<UK> keyClass, Class<UV> valueClass) { super(name, new MapTypeInfo<>(keyClass, valueClass), null); } @Override public Type getType() { return Type.MAP; } /* * Gets the serializer for the keys in the state. * * @return The serializer for the keys in the state. / public TypeSerializer<UK> getKeySerializer() { final TypeSerializer<Map<UK, UV>> rawSerializer = getSerializer(); if (!(rawSerializer instanceof MapSerializer)) { throw new IllegalStateException(“Unexpected serializer type.”); } return ((MapSerializer<UK, UV>) rawSerializer).getKeySerializer(); } /* * Gets the serializer for the values in the state. * * @return The serializer for the values in the state. / public TypeSerializer<UV> getValueSerializer() { final TypeSerializer<Map<UK, UV>> rawSerializer = getSerializer(); if (!(rawSerializer instanceof MapSerializer)) { throw new IllegalStateException(“Unexpected serializer type.”); } return ((MapSerializer<UK, UV>) rawSerializer).getValueSerializer(); }}MapStateDescriptor继承了StateDescriptor,其中state为MapState类型,value为Map类型DataStream.broadcastflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/DataStream.java /* * Sets the partitioning of the {@link DataStream} so that the output elements * are broadcasted to every parallel instance of the next operation. In addition, * it implicitly as many {@link org.apache.flink.api.common.state.BroadcastState broadcast states} * as the specified descriptors which can be used to store the element of the stream. * * @param broadcastStateDescriptors the descriptors of the broadcast states to create. * @return A {@link BroadcastStream} which can be used in the {@link #connect(BroadcastStream)} to * create a {@link BroadcastConnectedStream} for further processing of the elements. / @PublicEvolving public BroadcastStream<T> broadcast(final MapStateDescriptor<?, ?>… broadcastStateDescriptors) { Preconditions.checkNotNull(broadcastStateDescriptors); final DataStream<T> broadcastStream = setConnectionType(new BroadcastPartitioner<>()); return new BroadcastStream<>(environment, broadcastStream, broadcastStateDescriptors); } /* * Internal function for setting the partitioner for the DataStream. * * @param partitioner * Partitioner to set. * @return The modified DataStream. / protected DataStream<T> setConnectionType(StreamPartitioner<T> partitioner) { return new DataStream<>(this.getExecutionEnvironment(), new PartitionTransformation<>(this.getTransformation(), partitioner)); } /* * Sets the partitioning of the {@link DataStream} so that the output elements * are broadcast to every parallel instance of the next operation. * * @return The DataStream with broadcast partitioning set. / public DataStream<T> broadcast() { return setConnectionType(new BroadcastPartitioner<T>()); }DataStream的broadcast方法,首先调用setConnectionType,然后使用MapStateDescriptor作为参数创建BroadcastStream返回;DataStream也有一个无参的broadcast方法,它直接调用setConnectionType返回DataStreamDataStream.connectflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/DataStream.java /* * Creates a new {@link ConnectedStreams} by connecting * {@link DataStream} outputs of (possible) different types with each other. * The DataStreams connected using this operator can be used with * CoFunctions to apply joint transformations. * * @param dataStream * The DataStream with which this stream will be connected. * @return The {@link ConnectedStreams}. 
/ public <R> ConnectedStreams<T, R> connect(DataStream<R> dataStream) { return new ConnectedStreams<>(environment, this, dataStream); } /* * Creates a new {@link BroadcastConnectedStream} by connecting the current * {@link DataStream} or {@link KeyedStream} with a {@link BroadcastStream}. * * <p>The latter can be created using the {@link #broadcast(MapStateDescriptor[])} method. * * <p>The resulting stream can be further processed using the {@code BroadcastConnectedStream.process(MyFunction)} * method, where {@code MyFunction} can be either a * {@link org.apache.flink.streaming.api.functions.co.KeyedBroadcastProcessFunction KeyedBroadcastProcessFunction} * or a {@link org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction BroadcastProcessFunction} * depending on the current stream being a {@link KeyedStream} or not. * * @param broadcastStream The broadcast stream with the broadcast state to be connected with this stream. * @return The {@link BroadcastConnectedStream}. / @PublicEvolving public <R> BroadcastConnectedStream<T, R> connect(BroadcastStream<R> broadcastStream) { return new BroadcastConnectedStream<>( environment, this, Preconditions.checkNotNull(broadcastStream), broadcastStream.getBroadcastStateDescriptor()); }DataStream的connect方法参数可以是DataStream类型,也可以是BroadcastStream类型,如果是BroadcastStream类型则返回的是BroadcastConnectedStream,否则是普通的ConnectedStreamsBroadcastConnectedStream.processflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/BroadcastConnectedStream.java@PublicEvolvingpublic class BroadcastConnectedStream<IN1, IN2> { private final StreamExecutionEnvironment environment; private final DataStream<IN1> inputStream1; private final BroadcastStream<IN2> inputStream2; private final List<MapStateDescriptor<?, ?>> broadcastStateDescriptors; protected BroadcastConnectedStream( final StreamExecutionEnvironment env, final DataStream<IN1> input1, final BroadcastStream<IN2> input2, final List<MapStateDescriptor<?, ?>> broadcastStateDescriptors) { this.environment = requireNonNull(env); this.inputStream1 = requireNonNull(input1); this.inputStream2 = requireNonNull(input2); this.broadcastStateDescriptors = requireNonNull(broadcastStateDescriptors); } public StreamExecutionEnvironment getExecutionEnvironment() { return environment; } /* * Returns the non-broadcast {@link DataStream}. * * @return The stream which, by convention, is not broadcasted. / public DataStream<IN1> getFirstInput() { return inputStream1; } /* * Returns the {@link BroadcastStream}. * * @return The stream which, by convention, is the broadcast one. / public BroadcastStream<IN2> getSecondInput() { return inputStream2; } /* * Gets the type of the first input. * * @return The type of the first input / public TypeInformation<IN1> getType1() { return inputStream1.getType(); } /* * Gets the type of the second input. * * @return The type of the second input / public TypeInformation<IN2> getType2() { return inputStream2.getType(); } /* * Assumes as inputs a {@link BroadcastStream} and a {@link KeyedStream} and applies the given * {@link KeyedBroadcastProcessFunction} on them, thereby creating a transformed output stream. * * @param function The {@link KeyedBroadcastProcessFunction} that is called for each element in the stream. * @param <KS> The type of the keys in the keyed stream. * @param <OUT> The type of the output elements. * @return The transformed {@link DataStream}. 
/ @PublicEvolving public <KS, OUT> SingleOutputStreamOperator<OUT> process(final KeyedBroadcastProcessFunction<KS, IN1, IN2, OUT> function) { TypeInformation<OUT> outTypeInfo = TypeExtractor.getBinaryOperatorReturnType( function, KeyedBroadcastProcessFunction.class, 1, 2, 3, TypeExtractor.NO_INDEX, getType1(), getType2(), Utils.getCallLocationName(), true); return process(function, outTypeInfo); } /* * Assumes as inputs a {@link BroadcastStream} and a {@link KeyedStream} and applies the given * {@link KeyedBroadcastProcessFunction} on them, thereby creating a transformed output stream. * * @param function The {@link KeyedBroadcastProcessFunction} that is called for each element in the stream. * @param outTypeInfo The type of the output elements. * @param <KS> The type of the keys in the keyed stream. * @param <OUT> The type of the output elements. * @return The transformed {@link DataStream}. / @PublicEvolving public <KS, OUT> SingleOutputStreamOperator<OUT> process( final KeyedBroadcastProcessFunction<KS, IN1, IN2, OUT> function, final TypeInformation<OUT> outTypeInfo) { Preconditions.checkNotNull(function); Preconditions.checkArgument(inputStream1 instanceof KeyedStream, “A KeyedBroadcastProcessFunction can only be used on a keyed stream.”); TwoInputStreamOperator<IN1, IN2, OUT> operator = new CoBroadcastWithKeyedOperator<>(clean(function), broadcastStateDescriptors); return transform(“Co-Process-Broadcast-Keyed”, outTypeInfo, operator); } /* * Assumes as inputs a {@link BroadcastStream} and a non-keyed {@link DataStream} and applies the given * {@link BroadcastProcessFunction} on them, thereby creating a transformed output stream. * * @param function The {@link BroadcastProcessFunction} that is called for each element in the stream. * @param <OUT> The type of the output elements. * @return The transformed {@link DataStream}. / @PublicEvolving public <OUT> SingleOutputStreamOperator<OUT> process(final BroadcastProcessFunction<IN1, IN2, OUT> function) { TypeInformation<OUT> outTypeInfo = TypeExtractor.getBinaryOperatorReturnType( function, BroadcastProcessFunction.class, 0, 1, 2, TypeExtractor.NO_INDEX, getType1(), getType2(), Utils.getCallLocationName(), true); return process(function, outTypeInfo); } /* * Assumes as inputs a {@link BroadcastStream} and a non-keyed {@link DataStream} and applies the given * {@link BroadcastProcessFunction} on them, thereby creating a transformed output stream. * * @param function The {@link BroadcastProcessFunction} that is called for each element in the stream. * @param outTypeInfo The type of the output elements. * @param <OUT> The type of the output elements. * @return The transformed {@link DataStream}. 
*/ @PublicEvolving public <OUT> SingleOutputStreamOperator<OUT> process( final BroadcastProcessFunction<IN1, IN2, OUT> function, final TypeInformation<OUT> outTypeInfo) { Preconditions.checkNotNull(function); Preconditions.checkArgument(!(inputStream1 instanceof KeyedStream), “A BroadcastProcessFunction can only be used on a non-keyed stream.”); TwoInputStreamOperator<IN1, IN2, OUT> operator = new CoBroadcastWithNonKeyedOperator<>(clean(function), broadcastStateDescriptors); return transform(“Co-Process-Broadcast”, outTypeInfo, operator); } @Internal private <OUT> SingleOutputStreamOperator<OUT> transform( final String functionName, final TypeInformation<OUT> outTypeInfo, final TwoInputStreamOperator<IN1, IN2, OUT> operator) { // read the output type of the input Transforms to coax out errors about MissingTypeInfo inputStream1.getType(); inputStream2.getType(); TwoInputTransformation<IN1, IN2, OUT> transform = new TwoInputTransformation<>( inputStream1.getTransformation(), inputStream2.getTransformation(), functionName, operator, outTypeInfo, environment.getParallelism()); if (inputStream1 instanceof KeyedStream) { KeyedStream<IN1, ?> keyedInput1 = (KeyedStream<IN1, ?>) inputStream1; TypeInformation<?> keyType1 = keyedInput1.getKeyType(); transform.setStateKeySelectors(keyedInput1.getKeySelector(), null); transform.setStateKeyType(keyType1); } @SuppressWarnings({ “unchecked”, “rawtypes” }) SingleOutputStreamOperator<OUT> returnStream = new SingleOutputStreamOperator(environment, transform); getExecutionEnvironment().addOperator(transform); return returnStream; } protected <F> F clean(F f) { return getExecutionEnvironment().clean(f); }}BroadcastConnectedStream.process接收两种类型的function,一种是KeyedBroadcastProcessFunction,另外一种是BroadcastProcessFunction;它们都定义了processElement、processBroadcastElement抽象方法,只是KeyedBroadcastProcessFunction多定义了一个onTimer方法,默认是空操作,允许子类重写小结对于broadcast的使用有几个步骤,1是建立MapStateDescriptor,然后通过DataStream.broadcast方法返回BroadcastStream;2是需要接受broadcast的stream通过DataStream.connect方法跟BroadcastStream进行连接返回BroadcastConnectedStream;3是通过BroadcastConnectedStream.process方法进行processElement及processBroadcastElement处理BroadcastConnectedStream.process接收两种类型的function,一种是KeyedBroadcastProcessFunction,另外一种是BroadcastProcessFunction;它们都定义了processElement、processBroadcastElement抽象方法,只是KeyedBroadcastProcessFunction多定义了一个onTimer方法,默认是空操作,允许子类重写Broadcast State为map format,它会将state广播到每个task,注意该state并不会跨task传播,对其修改,仅仅是作用在其所在的task;downstream tasks接收到broadcast event的顺序可能不一样,所以依赖其到达顺序来处理element的时候要小心;checkpoint的时候也会checkpoint broadcast state;另外就是Broadcast State只在内存有,没有RocksDB state backenddocThe Broadcast State Pattern ...
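To make the three steps in the summary above concrete, here is a minimal, self-contained sketch of the broadcast state pattern against the Flink 1.7 DataStream API. The stream contents, the "rules" descriptor name and the matching logic are made up for illustration; only the broadcast/connect/process wiring mirrors the methods discussed above.

import java.util.Map;

import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction;
import org.apache.flink.util.Collector;

public class BroadcastStateSketch {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // step 1: the MapStateDescriptor describing the broadcast state (hypothetical name/types)
        final MapStateDescriptor<String, String> ruleStateDescriptor = new MapStateDescriptor<>(
                "rules", BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO);

        DataStream<String> events = env.fromElements("event-a", "event-b");
        DataStream<String> rules = env.fromElements("rule-1", "rule-2");

        // step 2: broadcast the rule stream, then connect the non-keyed event stream with it
        BroadcastStream<String> broadcastRules = rules.broadcast(ruleStateDescriptor);

        // step 3: process both inputs; a non-keyed stream requires a BroadcastProcessFunction
        events.connect(broadcastRules)
                .process(new BroadcastProcessFunction<String, String, String>() {

                    @Override
                    public void processElement(String value, ReadOnlyContext ctx, Collector<String> out) throws Exception {
                        // regular elements get read-only access to the broadcast state
                        for (Map.Entry<String, String> rule :
                                ctx.getBroadcastState(ruleStateDescriptor).immutableEntries()) {
                            out.collect(value + " checked against " + rule.getKey());
                        }
                    }

                    @Override
                    public void processBroadcastElement(String rule, Context ctx, Collector<String> out) throws Exception {
                        // only the broadcast side may modify the broadcast state
                        ctx.getBroadcastState(ruleStateDescriptor).put(rule, rule);
                    }
                })
                .print();

        env.execute("broadcast state sketch");
    }
}

Had the event stream been keyed (keyBy before connect), a KeyedBroadcastProcessFunction would be used instead, as described above.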

December 26, 2018

A Look at Flink's AbstractTtlState

序本文主要研究一下flink的AbstractTtlStateInternalKvStateflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/internal/InternalKvState.java/** * The {@code InternalKvState} is the root of the internal state type hierarchy, similar to the * {@link State} being the root of the public API state hierarchy. * * <p>The internal state classes give access to the namespace getters and setters and access to * additional functionality, like raw value access or state merging. * * <p>The public API state hierarchy is intended to be programmed against by Flink applications. * The internal state hierarchy holds all the auxiliary methods that are used by the runtime and not * intended to be used by user applications. These internal methods are considered of limited use to users and * only confusing, and are usually not regarded as stable across releases. * * <p>Each specific type in the internal state hierarchy extends the type from the public * state hierarchy: * * <pre> * State * | * +——————-InternalKvState * | | * MergingState | * | | * +—————–InternalMergingState * | | * +——–+——+ | * | | | * ReducingState ListState +—–+—————–+ * | | | | * +———–+ +———– —————–InternalListState * | | * +———InternalReducingState * </pre> * * @param <K> The type of key the state is associated to * @param <N> The type of the namespace * @param <V> The type of values kept internally in state /public interface InternalKvState<K, N, V> extends State { TypeSerializer<K> getKeySerializer(); TypeSerializer<N> getNamespaceSerializer(); TypeSerializer<V> getValueSerializer(); void setCurrentNamespace(N namespace); byte[] getSerializedValue( final byte[] serializedKeyAndNamespace, final TypeSerializer<K> safeKeySerializer, final TypeSerializer<N> safeNamespaceSerializer, final TypeSerializer<V> safeValueSerializer) throws Exception;}InternalKvState接口定义内部的kvState要实现的方法,这里主要是getKeySerializer、getNamespaceSerializer、getValueSerializer、setCurrentNamespace、getSerializedValueAbstractTtlStateflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/ttl/AbstractTtlState.java/* * Base class for TTL logic wrappers of state objects. 
* * @param <K> The type of key the state is associated to * @param <N> The type of the namespace * @param <SV> The type of values kept internally in state without TTL * @param <TTLSV> The type of values kept internally in state with TTL * @param <S> Type of originally wrapped state object /abstract class AbstractTtlState<K, N, SV, TTLSV, S extends InternalKvState<K, N, TTLSV>> extends AbstractTtlDecorator<S> implements InternalKvState<K, N, SV> { private final TypeSerializer<SV> valueSerializer; AbstractTtlState(S original, StateTtlConfig config, TtlTimeProvider timeProvider, TypeSerializer<SV> valueSerializer) { super(original, config, timeProvider); this.valueSerializer = valueSerializer; } <SE extends Throwable, CE extends Throwable, T> T getWithTtlCheckAndUpdate( SupplierWithException<TtlValue<T>, SE> getter, ThrowingConsumer<TtlValue<T>, CE> updater) throws SE, CE { return getWithTtlCheckAndUpdate(getter, updater, original::clear); } @Override public TypeSerializer<K> getKeySerializer() { return original.getKeySerializer(); } @Override public TypeSerializer<N> getNamespaceSerializer() { return original.getNamespaceSerializer(); } @Override public TypeSerializer<SV> getValueSerializer() { return valueSerializer; } @Override public void setCurrentNamespace(N namespace) { original.setCurrentNamespace(namespace); } @Override public byte[] getSerializedValue( byte[] serializedKeyAndNamespace, TypeSerializer<K> safeKeySerializer, TypeSerializer<N> safeNamespaceSerializer, TypeSerializer<SV> safeValueSerializer) { throw new FlinkRuntimeException(“Queryable state is not currently supported with TTL.”); } @Override public void clear() { original.clear(); }}AbstractTtlState实现了InternalKvState接口的方法,同时继承了AbstractTtlDecorator;它提供了getWithTtlCheckAndUpdate方法,该方法主要是调用AbstractTtlDecorator的getWithTtlCheckAndUpdate来实现TTL逻辑AbstractTtlDecoratorflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/ttl/AbstractTtlDecorator.java/* * Base class for TTL logic wrappers. * * @param <T> Type of originally wrapped object /abstract class AbstractTtlDecorator<T> { /* Wrapped original state handler. / final T original; final StateTtlConfig config; final TtlTimeProvider timeProvider; /* Whether to renew expiration timestamp on state read access. / final boolean updateTsOnRead; /* Whether to renew expiration timestamp on state read access. / final boolean returnExpired; /* State value time to live in milliseconds. / final long ttl; AbstractTtlDecorator( T original, StateTtlConfig config, TtlTimeProvider timeProvider) { Preconditions.checkNotNull(original); Preconditions.checkNotNull(config); Preconditions.checkNotNull(timeProvider); this.original = original; this.config = config; this.timeProvider = timeProvider; this.updateTsOnRead = config.getUpdateType() == StateTtlConfig.UpdateType.OnReadAndWrite; this.returnExpired = config.getStateVisibility() == StateTtlConfig.StateVisibility.ReturnExpiredIfNotCleanedUp; this.ttl = config.getTtl().toMilliseconds(); } <V> V getUnexpired(TtlValue<V> ttlValue) { return ttlValue == null || (expired(ttlValue) && !returnExpired) ? 
null : ttlValue.getUserValue(); } <V> boolean expired(TtlValue<V> ttlValue) { return TtlUtils.expired(ttlValue, ttl, timeProvider); } <V> TtlValue<V> wrapWithTs(V value) { return TtlUtils.wrapWithTs(value, timeProvider.currentTimestamp()); } <V> TtlValue<V> rewrapWithNewTs(TtlValue<V> ttlValue) { return wrapWithTs(ttlValue.getUserValue()); } <SE extends Throwable, CE extends Throwable, CLE extends Throwable, V> V getWithTtlCheckAndUpdate( SupplierWithException<TtlValue<V>, SE> getter, ThrowingConsumer<TtlValue<V>, CE> updater, ThrowingRunnable<CLE> stateClear) throws SE, CE, CLE { TtlValue<V> ttlValue = getWrappedWithTtlCheckAndUpdate(getter, updater, stateClear); return ttlValue == null ? null : ttlValue.getUserValue(); } <SE extends Throwable, CE extends Throwable, CLE extends Throwable, V> TtlValue<V> getWrappedWithTtlCheckAndUpdate( SupplierWithException<TtlValue<V>, SE> getter, ThrowingConsumer<TtlValue<V>, CE> updater, ThrowingRunnable<CLE> stateClear) throws SE, CE, CLE { TtlValue<V> ttlValue = getter.get(); if (ttlValue == null) { return null; } else if (expired(ttlValue)) { stateClear.run(); if (!returnExpired) { return null; } } else if (updateTsOnRead) { updater.accept(rewrapWithNewTs(ttlValue)); } return ttlValue; }}AbstractTtlDecorator对TTL逻辑进行了封装,其主要的逻辑在getWrappedWithTtlCheckAndUpdate方法,它在每次访问的时候对于非null的value会先判断下是否expired(TtlUtils.expired(ttlValue, ttl, timeProvider)),如果过期了则调用stateClear(ThrowingRunnable类型,这里是original::clear),对于非returnExpired的则直接返回null;对于没有expired的,则判断是否updateTsOnRead,若是则调用updater进行处理,最后返回ttlValueTtlUtils.expiredflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/ttl/TtlUtils.java/* Common functions related to State TTL. /class TtlUtils { static <V> boolean expired(@Nullable TtlValue<V> ttlValue, long ttl, TtlTimeProvider timeProvider) { return expired(ttlValue, ttl, timeProvider.currentTimestamp()); } static <V> boolean expired(@Nullable TtlValue<V> ttlValue, long ttl, long currentTimestamp) { return ttlValue != null && expired(ttlValue.getLastAccessTimestamp(), ttl, currentTimestamp); } private static boolean expired(long ts, long ttl, long currentTimestamp) { return getExpirationTimestamp(ts, ttl) <= currentTimestamp; } private static long getExpirationTimestamp(long ts, long ttl) { long ttlWithoutOverflow = ts > 0 ? Math.min(Long.MAX_VALUE - ts, ttl) : ttl; return ts + ttlWithoutOverflow; } //……}TtlUtils的expired方法主要是通过getExpirationTimestamp获取过期时间,然后跟currentTimestamp进行比较;而getExpirationTimestamp这里是根据ttlValue.getLastAccessTimestamp()及ttl值进行判断,这里利用Long.MAX_VALUE处理了overflow的情况,防止最后的值超出long类型的最大范围ThrowingRunnableflink-core-1.7.0-sources.jar!/org/apache/flink/util/function/ThrowingRunnable.java/* * Similar to a {@link Runnable}, this interface is used to capture a block of code * to be executed. In contrast to {@code Runnable}, this interface allows throwing * checked exceptions. /@PublicEvolving@FunctionalInterfacepublic interface ThrowingRunnable<E extends Throwable> { /* * The work method. * * @throws E Exceptions may be thrown. / void run() throws E; /* * Converts a {@link ThrowingRunnable} into a {@link Runnable} which throws all checked exceptions * as unchecked. * * @param throwingRunnable to convert into a {@link Runnable} * @return {@link Runnable} which throws all checked exceptions as unchecked. 
*/ static Runnable unchecked(ThrowingRunnable<?> throwingRunnable) { return () -> { try { throwingRunnable.run(); } catch (Throwable t) { ExceptionUtils.rethrow(t); } }; }}stateClear是ThrowingRunnable类型,它与Runnable不同,ThrowingRunnable允许抛出checked exceptions,它提供了一个unchecked的静态方法,用于将非Error及非RuntimeException的转为RuntimeException抛出来,从而将ThrowingRunnable转换为Runnable小结InternalKvState接口定义内部的kvState要实现的方法,这里主要是getKeySerializer、getNamespaceSerializer、getValueSerializer、setCurrentNamespace、getSerializedValueAbstractTtlState实现了InternalKvState接口的方法,同时继承了AbstractTtlDecorator;它提供了getWithTtlCheckAndUpdate方法,该方法主要是调用AbstractTtlDecorator的getWithTtlCheckAndUpdate来实现TTL逻辑AbstractTtlDecorator的getWrappedWithTtlCheckAndUpdate方法,在每次访问的时候对于非null的value会先判断下是否expired(TtlUtils.expired(ttlValue, ttl, timeProvider)),如果过期了则调用stateClear(ThrowingRunnable类型,这里是original::clear),对于非returnExpired的则直接返回null;对于没有expired的,则判断是否updateTsOnRead,若是则调用updater进行处理,最后返回ttlValuedocState Time-To-Live (TTL) ...
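As a small illustration of the ThrowingRunnable.unchecked adapter described above, the following sketch wraps a block that throws a checked exception into a plain Runnable; the exception message and the "stateClear" name are made up for the example.

import org.apache.flink.util.function.ThrowingRunnable;

public class ThrowingRunnableExample {

    public static void main(String[] args) {
        // a code block that declares a checked exception, e.g. a state clear that may fail
        ThrowingRunnable<Exception> stateClear = () -> {
            throw new Exception("simulated failure while clearing state");
        };

        // unchecked(...) adapts it to java.lang.Runnable; throwables that are neither
        // Error nor RuntimeException are rethrown wrapped as unchecked exceptions
        Runnable runnable = ThrowingRunnable.unchecked(stateClear);

        try {
            runnable.run();
        } catch (RuntimeException e) {
            System.out.println("rethrown as unchecked: " + e);
        }
    }
}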

December 25, 2018

A Look at Flink's StateTtlConfig

序本文主要研究一下flink的StateTtlConfig实例import org.apache.flink.api.common.state.StateTtlConfig;import org.apache.flink.api.common.state.ValueStateDescriptor;import org.apache.flink.api.common.time.Time;StateTtlConfig ttlConfig = StateTtlConfig .newBuilder(Time.seconds(1)) .setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite) .setStateVisibility(StateTtlConfig.StateVisibility.NeverReturnExpired) .build(); ValueStateDescriptor<String> stateDescriptor = new ValueStateDescriptor<>(“text state”, String.class);stateDescriptor.enableTimeToLive(ttlConfig);这里利用builder创建StateTtlConfig,之后通过StateDescriptor的enableTimeToLive方法传递该configStateTtlConfigflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/state/StateTtlConfig.java/** * Configuration of state TTL logic. * * <p>Note: The map state with TTL currently supports {@code null} user values * only if the user value serializer can handle {@code null} values. * If the serializer does not support {@code null} values, * it can be wrapped with {@link org.apache.flink.api.java.typeutils.runtime.NullableSerializer} * at the cost of an extra byte in the serialized form. /public class StateTtlConfig implements Serializable { private static final long serialVersionUID = -7592693245044289793L; public static final StateTtlConfig DISABLED = newBuilder(Time.milliseconds(Long.MAX_VALUE)).setUpdateType(UpdateType.Disabled).build(); /* * This option value configures when to update last access timestamp which prolongs state TTL. / public enum UpdateType { /* TTL is disabled. State does not expire. / Disabled, /* Last access timestamp is initialised when state is created and updated on every write operation. / OnCreateAndWrite, /* The same as <code>OnCreateAndWrite</code> but also updated on read. / OnReadAndWrite } /* * This option configures whether expired user value can be returned or not. / public enum StateVisibility { /* Return expired user value if it is not cleaned up yet. / ReturnExpiredIfNotCleanedUp, /* Never return expired user value. / NeverReturnExpired } /* * This option configures time scale to use for ttl. / public enum TimeCharacteristic { /* Processing time, see also <code>TimeCharacteristic.ProcessingTime</code>. 
/ ProcessingTime } private final UpdateType updateType; private final StateVisibility stateVisibility; private final TimeCharacteristic timeCharacteristic; private final Time ttl; private final CleanupStrategies cleanupStrategies; private StateTtlConfig( UpdateType updateType, StateVisibility stateVisibility, TimeCharacteristic timeCharacteristic, Time ttl, CleanupStrategies cleanupStrategies) { this.updateType = Preconditions.checkNotNull(updateType); this.stateVisibility = Preconditions.checkNotNull(stateVisibility); this.timeCharacteristic = Preconditions.checkNotNull(timeCharacteristic); this.ttl = Preconditions.checkNotNull(ttl); this.cleanupStrategies = cleanupStrategies; Preconditions.checkArgument(ttl.toMilliseconds() > 0, “TTL is expected to be positive”); } @Nonnull public UpdateType getUpdateType() { return updateType; } @Nonnull public StateVisibility getStateVisibility() { return stateVisibility; } @Nonnull public Time getTtl() { return ttl; } @Nonnull public TimeCharacteristic getTimeCharacteristic() { return timeCharacteristic; } public boolean isEnabled() { return updateType != UpdateType.Disabled; } @Nonnull public CleanupStrategies getCleanupStrategies() { return cleanupStrategies; } @Override public String toString() { return “StateTtlConfig{” + “updateType=” + updateType + “, stateVisibility=” + stateVisibility + “, timeCharacteristic=” + timeCharacteristic + “, ttl=” + ttl + ‘}’; } @Nonnull public static Builder newBuilder(@Nonnull Time ttl) { return new Builder(ttl); } /* * Builder for the {@link StateTtlConfig}. / public static class Builder { private UpdateType updateType = OnCreateAndWrite; private StateVisibility stateVisibility = NeverReturnExpired; private TimeCharacteristic timeCharacteristic = ProcessingTime; private Time ttl; private CleanupStrategies cleanupStrategies = new CleanupStrategies(); public Builder(@Nonnull Time ttl) { this.ttl = ttl; } /* * Sets the ttl update type. * * @param updateType The ttl update type configures when to update last access timestamp which prolongs state TTL. / @Nonnull public Builder setUpdateType(UpdateType updateType) { this.updateType = updateType; return this; } @Nonnull public Builder updateTtlOnCreateAndWrite() { return setUpdateType(UpdateType.OnCreateAndWrite); } @Nonnull public Builder updateTtlOnReadAndWrite() { return setUpdateType(UpdateType.OnReadAndWrite); } /* * Sets the state visibility. * * @param stateVisibility The state visibility configures whether expired user value can be returned or not. / @Nonnull public Builder setStateVisibility(@Nonnull StateVisibility stateVisibility) { this.stateVisibility = stateVisibility; return this; } @Nonnull public Builder returnExpiredIfNotCleanedUp() { return setStateVisibility(StateVisibility.ReturnExpiredIfNotCleanedUp); } @Nonnull public Builder neverReturnExpired() { return setStateVisibility(StateVisibility.NeverReturnExpired); } /* * Sets the time characteristic. * * @param timeCharacteristic The time characteristic configures time scale to use for ttl. / @Nonnull public Builder setTimeCharacteristic(@Nonnull TimeCharacteristic timeCharacteristic) { this.timeCharacteristic = timeCharacteristic; return this; } @Nonnull public Builder useProcessingTime() { return setTimeCharacteristic(TimeCharacteristic.ProcessingTime); } /* Cleanup expired state in full snapshot on checkpoint. 
/ @Nonnull public Builder cleanupFullSnapshot() { cleanupStrategies.strategies.put( CleanupStrategies.Strategies.FULL_STATE_SCAN_SNAPSHOT, new CleanupStrategies.CleanupStrategy() { }); return this; } /* * Sets the ttl time. * @param ttl The ttl time. / @Nonnull public Builder setTtl(@Nonnull Time ttl) { this.ttl = ttl; return this; } @Nonnull public StateTtlConfig build() { return new StateTtlConfig( updateType, stateVisibility, timeCharacteristic, ttl, cleanupStrategies); } } /* * TTL cleanup strategies. * * <p>This class configures when to cleanup expired state with TTL. * By default, state is always cleaned up on explicit read access if found expired. * Currently cleanup of state full snapshot can be additionally activated. / public static class CleanupStrategies implements Serializable { private static final long serialVersionUID = -1617740467277313524L; /* Fixed strategies ordinals in {@code strategies} config field. / enum Strategies { FULL_STATE_SCAN_SNAPSHOT } /* Base interface for cleanup strategies configurations. / interface CleanupStrategy extends Serializable { } final EnumMap<Strategies, CleanupStrategy> strategies = new EnumMap<>(Strategies.class); public boolean inFullSnapshot() { return strategies.containsKey(Strategies.FULL_STATE_SCAN_SNAPSHOT); } }}StateTtlConfig用于设置state的TTL属性,这里定义了三个枚举,分别是UpdateType(Disabled、OnCreateAndWrite、OnReadAndWrite)、StateVisibility(ReturnExpiredIfNotCleanedUp、NeverReturnExpired)、TimeCharacteristic(ProcessingTime)StateTtlConfig定义了CleanupStrategies,即TTL state的清理策略,默认在读取到expired的state时会进行清理,目前还额外提供在FULL_STATE_SCAN_SNAPSHOT的时候进行清理(在checkpoint时清理full snapshot中的expired state)的选项StateTtlConfig还提供了一个Builder,用于快速设置UpdateType、StateVisibility、TimeCharacteristic、Time、CleanupStrategiesAbstractKeyedStateBackendflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/AbstractKeyedStateBackend.java /* * @see KeyedStateBackend / @Override @SuppressWarnings(“unchecked”) public <N, S extends State, V> S getOrCreateKeyedState( final TypeSerializer<N> namespaceSerializer, StateDescriptor<S, V> stateDescriptor) throws Exception { checkNotNull(namespaceSerializer, “Namespace serializer”); checkNotNull(keySerializer, “State key serializer has not been configured in the config. " + “This operation cannot use partitioned state.”); InternalKvState<K, ?, ?> kvState = keyValueStatesByName.get(stateDescriptor.getName()); if (kvState == null) { if (!stateDescriptor.isSerializerInitialized()) { stateDescriptor.initializeSerializerUnlessSet(executionConfig); } kvState = TtlStateFactory.createStateAndWrapWithTtlIfEnabled( namespaceSerializer, stateDescriptor, this, ttlTimeProvider); keyValueStatesByName.put(stateDescriptor.getName(), kvState); publishQueryableStateIfEnabled(stateDescriptor, kvState); } return (S) kvState; }AbstractKeyedStateBackend的getOrCreateKeyedState方法里头使用TtlStateFactory.createStateAndWrapWithTtlIfEnabled来创建InternalKvStateTtlStateFactoryflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/ttl/TtlStateFactory.java/* * This state factory wraps state objects, produced by backends, with TTL logic. 
*/public class TtlStateFactory<N, SV, S extends State, IS extends S> { public static <N, SV, S extends State, IS extends S> IS createStateAndWrapWithTtlIfEnabled( TypeSerializer<N> namespaceSerializer, StateDescriptor<S, SV> stateDesc, KeyedStateFactory originalStateFactory, TtlTimeProvider timeProvider) throws Exception { Preconditions.checkNotNull(namespaceSerializer); Preconditions.checkNotNull(stateDesc); Preconditions.checkNotNull(originalStateFactory); Preconditions.checkNotNull(timeProvider); return stateDesc.getTtlConfig().isEnabled() ? new TtlStateFactory<N, SV, S, IS>( namespaceSerializer, stateDesc, originalStateFactory, timeProvider) .createState() : originalStateFactory.createInternalState(namespaceSerializer, stateDesc); } private final Map<Class<? extends StateDescriptor>, SupplierWithException<IS, Exception>> stateFactories; private final TypeSerializer<N> namespaceSerializer; private final StateDescriptor<S, SV> stateDesc; private final KeyedStateFactory originalStateFactory; private final StateTtlConfig ttlConfig; private final TtlTimeProvider timeProvider; private final long ttl; private TtlStateFactory( TypeSerializer<N> namespaceSerializer, StateDescriptor<S, SV> stateDesc, KeyedStateFactory originalStateFactory, TtlTimeProvider timeProvider) { this.namespaceSerializer = namespaceSerializer; this.stateDesc = stateDesc; this.originalStateFactory = originalStateFactory; this.ttlConfig = stateDesc.getTtlConfig(); this.timeProvider = timeProvider; this.ttl = ttlConfig.getTtl().toMilliseconds(); this.stateFactories = createStateFactories(); } private Map<Class<? extends StateDescriptor>, SupplierWithException<IS, Exception>> createStateFactories() { return Stream.of( Tuple2.of(ValueStateDescriptor.class, (SupplierWithException<IS, Exception>) this::createValueState), Tuple2.of(ListStateDescriptor.class, (SupplierWithException<IS, Exception>) this::createListState), Tuple2.of(MapStateDescriptor.class, (SupplierWithException<IS, Exception>) this::createMapState), Tuple2.of(ReducingStateDescriptor.class, (SupplierWithException<IS, Exception>) this::createReducingState), Tuple2.of(AggregatingStateDescriptor.class, (SupplierWithException<IS, Exception>) this::createAggregatingState), Tuple2.of(FoldingStateDescriptor.class, (SupplierWithException<IS, Exception>) this::createFoldingState) ).collect(Collectors.toMap(t -> t.f0, t -> t.f1)); } private IS createState() throws Exception { SupplierWithException<IS, Exception> stateFactory = stateFactories.get(stateDesc.getClass()); if (stateFactory == null) { String message = String.format(“State %s is not supported by %s”, stateDesc.getClass(), TtlStateFactory.class); throw new FlinkRuntimeException(message); } return stateFactory.get(); } @SuppressWarnings(“unchecked”) private IS createValueState() throws Exception { ValueStateDescriptor<TtlValue<SV>> ttlDescriptor = new ValueStateDescriptor<>( stateDesc.getName(), new TtlSerializer<>(stateDesc.getSerializer())); return (IS) new TtlValueState<>( originalStateFactory.createInternalState(namespaceSerializer, ttlDescriptor, getSnapshotTransformFactory()), ttlConfig, timeProvider, stateDesc.getSerializer()); } @SuppressWarnings(“unchecked”) private <T> IS createListState() throws Exception { ListStateDescriptor<T> listStateDesc = (ListStateDescriptor<T>) stateDesc; ListStateDescriptor<TtlValue<T>> ttlDescriptor = new ListStateDescriptor<>( stateDesc.getName(), new TtlSerializer<>(listStateDesc.getElementSerializer())); return (IS) new TtlListState<>( 
originalStateFactory.createInternalState( namespaceSerializer, ttlDescriptor, getSnapshotTransformFactory()), ttlConfig, timeProvider, listStateDesc.getSerializer()); } @SuppressWarnings(“unchecked”) private <UK, UV> IS createMapState() throws Exception { MapStateDescriptor<UK, UV> mapStateDesc = (MapStateDescriptor<UK, UV>) stateDesc; MapStateDescriptor<UK, TtlValue<UV>> ttlDescriptor = new MapStateDescriptor<>( stateDesc.getName(), mapStateDesc.getKeySerializer(), new TtlSerializer<>(mapStateDesc.getValueSerializer())); return (IS) new TtlMapState<>( originalStateFactory.createInternalState(namespaceSerializer, ttlDescriptor, getSnapshotTransformFactory()), ttlConfig, timeProvider, mapStateDesc.getSerializer()); } @SuppressWarnings(“unchecked”) private IS createReducingState() throws Exception { ReducingStateDescriptor<SV> reducingStateDesc = (ReducingStateDescriptor<SV>) stateDesc; ReducingStateDescriptor<TtlValue<SV>> ttlDescriptor = new ReducingStateDescriptor<>( stateDesc.getName(), new TtlReduceFunction<>(reducingStateDesc.getReduceFunction(), ttlConfig, timeProvider), new TtlSerializer<>(stateDesc.getSerializer())); return (IS) new TtlReducingState<>( originalStateFactory.createInternalState(namespaceSerializer, ttlDescriptor, getSnapshotTransformFactory()), ttlConfig, timeProvider, stateDesc.getSerializer()); } @SuppressWarnings(“unchecked”) private <IN, OUT> IS createAggregatingState() throws Exception { AggregatingStateDescriptor<IN, SV, OUT> aggregatingStateDescriptor = (AggregatingStateDescriptor<IN, SV, OUT>) stateDesc; TtlAggregateFunction<IN, SV, OUT> ttlAggregateFunction = new TtlAggregateFunction<>( aggregatingStateDescriptor.getAggregateFunction(), ttlConfig, timeProvider); AggregatingStateDescriptor<IN, TtlValue<SV>, OUT> ttlDescriptor = new AggregatingStateDescriptor<>( stateDesc.getName(), ttlAggregateFunction, new TtlSerializer<>(stateDesc.getSerializer())); return (IS) new TtlAggregatingState<>( originalStateFactory.createInternalState(namespaceSerializer, ttlDescriptor, getSnapshotTransformFactory()), ttlConfig, timeProvider, stateDesc.getSerializer(), ttlAggregateFunction); } @SuppressWarnings({“deprecation”, “unchecked”}) private <T> IS createFoldingState() throws Exception { FoldingStateDescriptor<T, SV> foldingStateDescriptor = (FoldingStateDescriptor<T, SV>) stateDesc; SV initAcc = stateDesc.getDefaultValue(); TtlValue<SV> ttlInitAcc = initAcc == null ? 
null : new TtlValue<>(initAcc, Long.MAX_VALUE); FoldingStateDescriptor<T, TtlValue<SV>> ttlDescriptor = new FoldingStateDescriptor<>( stateDesc.getName(), ttlInitAcc, new TtlFoldFunction<>(foldingStateDescriptor.getFoldFunction(), ttlConfig, timeProvider, initAcc), new TtlSerializer<>(stateDesc.getSerializer())); return (IS) new TtlFoldingState<>( originalStateFactory.createInternalState(namespaceSerializer, ttlDescriptor, getSnapshotTransformFactory()), ttlConfig, timeProvider, stateDesc.getSerializer()); } //……}TtlStateFactory的createStateAndWrapWithTtlIfEnabled方法这里会根据stateDesc.getTtlConfig().isEnabled()来创建state,如果开启ttl则调用new TtlStateFactory<N, SV, S, IS>(namespaceSerializer, stateDesc, originalStateFactory, timeProvider).createState(),否则调用originalStateFactory.createInternalState(namespaceSerializer, stateDesc)这里createStateFactories创建了不同类型的StateDescriptor对应创建方法的map,在createState的时候,根据指定类型自动调用对应的SupplierWithException,省去if else的判断ValueStateDescriptor对应createValueState方法,创建的是TtlValueState;ListStateDescriptor对应createListState方法,创建的是TtlListState;MapStateDescriptor对应createMapState方法,创建的是TtlMapState;ReducingStateDescriptor对应createReducingState方法,创建的是TtlReducingState;AggregatingStateDescriptor对应createAggregatingState方法,创建的是TtlAggregatingState;FoldingStateDescriptor对应createFoldingState方法,创建的是TtlFoldingState小结StateTtlConfig用于设置state的TTL属性,这里主要设置UpdateType、StateVisibility、TimeCharacteristic、Time、CleanupStrategies这几个属性AbstractKeyedStateBackend的getOrCreateKeyedState方法里头使用TtlStateFactory.createStateAndWrapWithTtlIfEnabled来创建InternalKvStateTtlStateFactory的createStateAndWrapWithTtlIfEnabled方法这里会根据stateDesc.getTtlConfig().isEnabled()来创建对应的state;TtlStateFactory的createState会根据不同类型的StateDescriptor创建对应类型的ttl statedocState Time-To-Live (TTL) ...
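Putting the builder methods quoted above together, here is a minimal sketch that enables TTL, including the full-snapshot cleanup strategy, on a state descriptor; the "counts" state name and its key/value types are assumptions for the example.

import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.StateTtlConfig;
import org.apache.flink.api.common.time.Time;

public class TtlConfigSketch {

    public static void main(String[] args) {
        StateTtlConfig ttlConfig = StateTtlConfig
                .newBuilder(Time.days(7))              // state value time-to-live of 7 days
                .updateTtlOnCreateAndWrite()           // UpdateType.OnCreateAndWrite
                .neverReturnExpired()                  // StateVisibility.NeverReturnExpired
                .useProcessingTime()                   // TimeCharacteristic.ProcessingTime
                .cleanupFullSnapshot()                 // additionally drop expired entries from full snapshots
                .build();

        // attach the TTL config to a (hypothetical) state descriptor
        MapStateDescriptor<String, Long> countsPerKey =
                new MapStateDescriptor<>("counts", String.class, Long.class);
        countsPerKey.enableTimeToLive(ttlConfig);

        System.out.println(ttlConfig);
    }
}

When such a descriptor is later handed to a keyed state backend, TtlStateFactory.createStateAndWrapWithTtlIfEnabled sees isEnabled() == true and wraps the created state (here a TtlMapState), as the quoted source shows.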

December 24, 2018

[case51] A Look at Flink's StateDescriptor

序本文主要研究一下flink的StateDescriptorRuntimeContext.getStateflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/functions/RuntimeContext.java/** * A RuntimeContext contains information about the context in which functions are executed. Each parallel instance * of the function will have a context through which it can access static contextual information (such as * the current parallelism) and other constructs like accumulators and broadcast variables. * * <p>A function can, during runtime, obtain the RuntimeContext via a call to * {@link AbstractRichFunction#getRuntimeContext()}. /@Publicpublic interface RuntimeContext { //…… @PublicEvolving <T> ValueState<T> getState(ValueStateDescriptor<T> stateProperties); @PublicEvolving <T> ListState<T> getListState(ListStateDescriptor<T> stateProperties); @PublicEvolving <T> ReducingState<T> getReducingState(ReducingStateDescriptor<T> stateProperties); @PublicEvolving <IN, ACC, OUT> AggregatingState<IN, OUT> getAggregatingState(AggregatingStateDescriptor<IN, ACC, OUT> stateProperties); @PublicEvolving @Deprecated <T, ACC> FoldingState<T, ACC> getFoldingState(FoldingStateDescriptor<T, ACC> stateProperties); @PublicEvolving <UK, UV> MapState<UK, UV> getMapState(MapStateDescriptor<UK, UV> stateProperties);}RuntimeContext针对各种state提供了根据对应StateDescriptor的get方法,比如提供了getState方法,通过ValueStateDescriptor参数来获取ValueState;getListState通过ListStateDescriptor获取ListState;getReducingState通过ReducingStateDescriptor获取ReducingState;getAggregatingState通过AggregatingStateDescriptor获取AggregatingState;getFoldingState通过FoldingStateDescriptor获取FoldingState;getMapState通过MapStateDescriptor获取MapStateStateDescriptorflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/state/StateDescriptor.java/* * Base class for state descriptors. A {@code StateDescriptor} is used for creating partitioned * {@link State} in stateful operations. * * <p>Subclasses must correctly implement {@link #equals(Object)} and {@link #hashCode()}. * * @param <S> The type of the State objects created from this {@code StateDescriptor}. * @param <T> The type of the value of the state object described by this state descriptor. /@PublicEvolvingpublic abstract class StateDescriptor<S extends State, T> implements Serializable { /* * An enumeration of the types of supported states. Used to identify the state type * when writing and restoring checkpoints and savepoints. / // IMPORTANT: Do not change the order of the elements in this enum, ordinal is used in serialization public enum Type { /* * @deprecated Enum for migrating from old checkpoints/savepoint versions. / @Deprecated UNKNOWN, VALUE, LIST, REDUCING, FOLDING, AGGREGATING, MAP } private static final long serialVersionUID = 1L; // ———————————————————————— /* Name that uniquely identifies state created from this StateDescriptor. / protected final String name; /* The serializer for the type. May be eagerly initialized in the constructor, * or lazily once the {@link #initializeSerializerUnlessSet(ExecutionConfig)} method * is called. / @Nullable protected TypeSerializer<T> serializer; /* The type information describing the value type. Only used to if the serializer * is created lazily. / @Nullable private TypeInformation<T> typeInfo; /* Name for queries against state created from this StateDescriptor. / @Nullable private String queryableStateName; /* Name for queries against state created from this StateDescriptor. / @Nonnull private StateTtlConfig ttlConfig = StateTtlConfig.DISABLED; /* The default value returned by the state when no other value is bound to a key. 
/ @Nullable protected transient T defaultValue; // ———————————————————————— /* * Create a new {@code StateDescriptor} with the given name and the given type serializer. * * @param name The name of the {@code StateDescriptor}. * @param serializer The type serializer for the values in the state. * @param defaultValue The default value that will be set when requesting state without setting * a value before. / protected StateDescriptor(String name, TypeSerializer<T> serializer, @Nullable T defaultValue) { this.name = checkNotNull(name, “name must not be null”); this.serializer = checkNotNull(serializer, “serializer must not be null”); this.defaultValue = defaultValue; } /* * Create a new {@code StateDescriptor} with the given name and the given type information. * * @param name The name of the {@code StateDescriptor}. * @param typeInfo The type information for the values in the state. * @param defaultValue The default value that will be set when requesting state without setting * a value before. / protected StateDescriptor(String name, TypeInformation<T> typeInfo, @Nullable T defaultValue) { this.name = checkNotNull(name, “name must not be null”); this.typeInfo = checkNotNull(typeInfo, “type information must not be null”); this.defaultValue = defaultValue; } /* * Create a new {@code StateDescriptor} with the given name and the given type information. * * <p>If this constructor fails (because it is not possible to describe the type via a class), * consider using the {@link #StateDescriptor(String, TypeInformation, Object)} constructor. * * @param name The name of the {@code StateDescriptor}. * @param type The class of the type of values in the state. * @param defaultValue The default value that will be set when requesting state without setting * a value before. / protected StateDescriptor(String name, Class<T> type, @Nullable T defaultValue) { this.name = checkNotNull(name, “name must not be null”); checkNotNull(type, “type class must not be null”); try { this.typeInfo = TypeExtractor.createTypeInfo(type); } catch (Exception e) { throw new RuntimeException( “Could not create the type information for ‘” + type.getName() + “’. " + “The most common reason is failure to infer the generic type information, due to Java’s type erasure. " + “In that case, please pass a ‘TypeHint’ instead of a class to describe the type. " + “For example, to describe ‘Tuple2<String, String>’ as a generic type, use " + “’new PravegaDeserializationSchema<>(new TypeHint<Tuple2<String, String>>(){}, serializer);’”, e); } this.defaultValue = defaultValue; } // ———————————————————————— /* * Returns the name of this {@code StateDescriptor}. / public String getName() { return name; } /* * Returns the default value. / public T getDefaultValue() { if (defaultValue != null) { if (serializer != null) { return serializer.copy(defaultValue); } else { throw new IllegalStateException(“Serializer not yet initialized.”); } } else { return null; } } /* * Returns the {@link TypeSerializer} that can be used to serialize the value in the state. * Note that the serializer may initialized lazily and is only guaranteed to exist after * calling {@link #initializeSerializerUnlessSet(ExecutionConfig)}. / public TypeSerializer<T> getSerializer() { if (serializer != null) { return serializer.duplicate(); } else { throw new IllegalStateException(“Serializer not yet initialized.”); } } /* * Sets the name for queries of state created from this descriptor. * * <p>If a name is set, the created state will be published for queries * during runtime. 
The name needs to be unique per job. If there is another * state instance published under the same name, the job will fail during runtime. * * @param queryableStateName State name for queries (unique name per job) * @throws IllegalStateException If queryable state name already set / public void setQueryable(String queryableStateName) { Preconditions.checkArgument( ttlConfig.getUpdateType() == StateTtlConfig.UpdateType.Disabled, “Queryable state is currently not supported with TTL”); if (this.queryableStateName == null) { this.queryableStateName = Preconditions.checkNotNull(queryableStateName, “Registration name”); } else { throw new IllegalStateException(“Queryable state name already set”); } } /* * Returns the queryable state name. * * @return Queryable state name or <code>null</code> if not set. / @Nullable public String getQueryableStateName() { return queryableStateName; } /* * Returns whether the state created from this descriptor is queryable. * * @return <code>true</code> if state is queryable, <code>false</code> * otherwise. / public boolean isQueryable() { return queryableStateName != null; } /* * Configures optional activation of state time-to-live (TTL). * * <p>State user value will expire, become unavailable and be cleaned up in storage * depending on configured {@link StateTtlConfig}. * * @param ttlConfig configuration of state TTL / public void enableTimeToLive(StateTtlConfig ttlConfig) { Preconditions.checkNotNull(ttlConfig); Preconditions.checkArgument( ttlConfig.getUpdateType() != StateTtlConfig.UpdateType.Disabled && queryableStateName == null, “Queryable state is currently not supported with TTL”); this.ttlConfig = ttlConfig; } @Nonnull @Internal public StateTtlConfig getTtlConfig() { return ttlConfig; } // ———————————————————————— /* * Checks whether the serializer has been initialized. Serializer initialization is lazy, * to allow parametrization of serializers with an {@link ExecutionConfig} via * {@link #initializeSerializerUnlessSet(ExecutionConfig)}. * * @return True if the serializers have been initialized, false otherwise. / public boolean isSerializerInitialized() { return serializer != null; } /* * Initializes the serializer, unless it has been initialized before. * * @param executionConfig The execution config to use when creating the serializer. */ public void initializeSerializerUnlessSet(ExecutionConfig executionConfig) { if (serializer == null) { checkState(typeInfo != null, “no serializer and no type info”); // instantiate the serializer serializer = typeInfo.createSerializer(executionConfig); // we can drop the type info now, no longer needed typeInfo = null; } } // ———————————————————————— // Standard Utils // ———————————————————————— @Override public final int hashCode() { return name.hashCode() + 31 * getClass().hashCode(); } @Override public final boolean equals(Object o) { if (o == this) { return true; } else if (o != null && o.getClass() == this.getClass()) { final StateDescriptor<?, ?> that = (StateDescriptor<?, ?>) o; return this.name.equals(that.name); } else { return false; } } @Override public String toString() { return getClass().getSimpleName() + “{name=” + name + “, defaultValue=” + defaultValue + “, serializer=” + serializer + (isQueryable() ? 
“, queryableStateName=” + queryableStateName + "” : “”) + ‘}’; } public abstract Type getType(); // ———————————————————————— // Serialization // ———————————————————————— private void writeObject(final ObjectOutputStream out) throws IOException { // write all the non-transient fields out.defaultWriteObject(); // write the non-serializable default value field if (defaultValue == null) { // we don’t have a default value out.writeBoolean(false); } else { // we have a default value out.writeBoolean(true); byte[] serializedDefaultValue; try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputViewStreamWrapper outView = new DataOutputViewStreamWrapper(baos)) { TypeSerializer<T> duplicateSerializer = serializer.duplicate(); duplicateSerializer.serialize(defaultValue, outView); outView.flush(); serializedDefaultValue = baos.toByteArray(); } catch (Exception e) { throw new IOException(“Unable to serialize default value of type " + defaultValue.getClass().getSimpleName() + “.”, e); } out.writeInt(serializedDefaultValue.length); out.write(serializedDefaultValue); } } private void readObject(final ObjectInputStream in) throws IOException, ClassNotFoundException { // read the non-transient fields in.defaultReadObject(); // read the default value field boolean hasDefaultValue = in.readBoolean(); if (hasDefaultValue) { int size = in.readInt(); byte[] buffer = new byte[size]; in.readFully(buffer); try (ByteArrayInputStream bais = new ByteArrayInputStream(buffer); DataInputViewStreamWrapper inView = new DataInputViewStreamWrapper(bais)) { defaultValue = serializer.deserialize(inView); } catch (Exception e) { throw new IOException(“Unable to deserialize default value.”, e); } } else { defaultValue = null; } }}StateDescriptor是ValueStateDescriptor、ListStateDescriptor、ReducingStateDescriptor、FoldingStateDescriptor、AggregatingStateDescriptor、MapStateDescriptor的基类,它定义了一个抽象方法,返回Type类型(VALUE,LIST,EDUCING,FOLDING,AGGREGATING,MAP),用于各个子类表达自己的Type类型StateDescriptor提供了几个构造器,用于传递name、TypeSerializer或TypeInformation或Class类型信息、defaultValueStateDescriptor重写了equals及hashCode方法;它还实现了Serializable接口,另外还通过writeObject及readObject自定义序列化过程小结RuntimeContext针对各种state提供了根据对应StateDescriptor的get方法,比如getState、getListState、getReducingState、getAggregatingState、getFoldingState、getMapStateStateDescriptor是ValueStateDescriptor、ListStateDescriptor、ReducingStateDescriptor、FoldingStateDescriptor、AggregatingStateDescriptor、MapStateDescriptor的基类,它定义了一个抽象方法,返回Type类型(VALUE,LIST,EDUCING,FOLDING,AGGREGATING,MAP),用于各个子类表达自己的Type类型StateDescriptor重写了equals及hashCode方法;它还实现了Serializable接口,另外还通过writeObject及readObject自定义序列化过程docUsing Managed Keyed State ...
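To connect RuntimeContext.getState with the descriptors discussed above, here is a minimal sketch of a keyed function that obtains a ValueState from a ValueStateDescriptor built with the Class-based constructor; the "count" state name and the counting logic are made up for the example.

import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.util.Collector;

// meant to be applied after keyBy(...), since ValueState is partitioned per key
public class CountPerKey extends RichFlatMapFunction<String, Tuple2<String, Long>> {

    private transient ValueState<Long> countState;

    @Override
    public void open(Configuration parameters) {
        // (name, Class) constructor; the TypeInformation is extracted from Long.class
        ValueStateDescriptor<Long> descriptor = new ValueStateDescriptor<>("count", Long.class);
        countState = getRuntimeContext().getState(descriptor);
    }

    @Override
    public void flatMap(String value, Collector<Tuple2<String, Long>> out) throws Exception {
        Long current = countState.value();          // null if nothing was stored for this key yet
        long updated = (current == null ? 0L : current) + 1;
        countState.update(updated);                 // replaces the value for the current key
        out.collect(Tuple2.of(value, updated));
    }
}

Applied to a keyed stream, e.g. stream.keyBy(v -> v).flatMap(new CountPerKey()), each key maintains its own independent count.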

December 23, 2018

A Look at Flink's Managed Keyed State

序本文主要研究一下flink的Managed Keyed StateStateflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/state/State.java/** * Interface that different types of partitioned state must implement. * * <p>The state is only accessible by functions applied on a {@code KeyedStream}. The key is * automatically supplied by the system, so the function always sees the value mapped to the * key of the current element. That way, the system can handle stream and state partitioning * consistently together. /@PublicEvolvingpublic interface State { /* * Removes the value mapped under the current key. / void clear();}State是所有不同类型的State必须实现的接口,它定义了clear方法ValueStateflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/state/ValueState.java@PublicEvolvingpublic interface ValueState<T> extends State { /* * Returns the current value for the state. When the state is not * partitioned the returned value is the same for all inputs in a given * operator instance. If state partitioning is applied, the value returned * depends on the current operator input, as the operator maintains an * independent state for each partition. * * <p>If you didn’t specify a default value when creating the {@link ValueStateDescriptor} * this will return {@code null} when to value was previously set using {@link #update(Object)}. * * @return The state value corresponding to the current input. * * @throws IOException Thrown if the system cannot access the state. / T value() throws IOException; /* * Updates the operator state accessible by {@link #value()} to the given * value. The next time {@link #value()} is called (for the same state * partition) the returned state will represent the updated value. When a * partitioned state is updated with null, the state for the current key * will be removed and the default value is returned on the next access. * * @param value The new value for the state. * * @throws IOException Thrown if the system cannot access the state. / void update(T value) throws IOException;}ValueState继承了State接口,它定义了value、update两个方法,一个用于取值,一个用于更新值AppendingStateflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/state/AppendingState.java@PublicEvolvingpublic interface AppendingState<IN, OUT> extends State { /* * Returns the current value for the state. When the state is not * partitioned the returned value is the same for all inputs in a given * operator instance. If state partitioning is applied, the value returned * depends on the current operator input, as the operator maintains an * independent state for each partition. * * <p><b>NOTE TO IMPLEMENTERS:</b> if the state is empty, then this method * should return {@code null}. * * @return The operator state value corresponding to the current input or {@code null} * if the state is empty. * * @throws Exception Thrown if the system cannot access the state. / OUT get() throws Exception; /* * Updates the operator state accessible by {@link #get()} by adding the given value * to the list of values. The next time {@link #get()} is called (for the same state * partition) the returned state will represent the updated list. * * <p>If null is passed in, the state value will remain unchanged. * * @param value The new value for the state. * * @throws Exception Thrown if the system cannot access the state. 
/ void add(IN value) throws Exception;}AppendingState继承了State接口,它定义了get、add方法,该State接收IN、OUT两个泛型FoldingStateflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/state/FoldingState.java@PublicEvolving@Deprecatedpublic interface FoldingState<T, ACC> extends AppendingState<T, ACC> {}FoldingState继承了AppendingState,其中OUT泛型表示ACC,即累积值;FoldingState在Flink 1.4版本被标记为废弃,后续会被移除掉,可使用AggregatingState替代MergingStateflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/state/MergingState.java/* * Extension of {@link AppendingState} that allows merging of state. That is, two instances * of {@link MergingState} can be combined into a single instance that contains all the * information of the two merged states. * * @param <IN> Type of the value that can be added to the state. * @param <OUT> Type of the value that can be retrieved from the state. /@PublicEvolvingpublic interface MergingState<IN, OUT> extends AppendingState<IN, OUT> { }MergingState继承了AppendingState,这里用命名表达merge state的意思,它有几个子接口,分别是ListState、ReducingState、AggregatingStateListStateflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/state/ListState.java@PublicEvolvingpublic interface ListState<T> extends MergingState<T, Iterable<T>> { /* * Updates the operator state accessible by {@link #get()} by updating existing values to * to the given list of values. The next time {@link #get()} is called (for the same state * partition) the returned state will represent the updated list. * * <p>If null or an empty list is passed in, the state value will be null. * * @param values The new values for the state. * * @throws Exception The method may forward exception thrown internally (by I/O or functions). / void update(List<T> values) throws Exception; /* * Updates the operator state accessible by {@link #get()} by adding the given values * to existing list of values. The next time {@link #get()} is called (for the same state * partition) the returned state will represent the updated list. * * <p>If null or an empty list is passed in, the state value remains unchanged. * * @param values The new values to be added to the state. * * @throws Exception The method may forward exception thrown internally (by I/O or functions). / void addAll(List<T> values) throws Exception;}ListState继承了MergingState,它的OUT类型为Iterable<IN>;它主要用于operation存储partitioned list state,它继承了MergingState接口(指定OUT的泛型为Iterable<T>),同时声明了两个方法;其中update用于全量更新state,如果参数为null或者empty,那么state会被清空;addAll方法用于增量更新,如果参数为null或者empty,则保持不变,否则则新增给定的valuesReducingStateflink-core/1.7.0/flink-core-1.7.0-sources.jar!/org/apache/flink/api/common/state/ReducingState.java@PublicEvolvingpublic interface ReducingState<T> extends MergingState<T, T> {}ReducingState继承了MergingState,它的IN、OUT类型相同AggregatingStateflink-core/1.7.0/flink-core-1.7.0-sources.jar!/org/apache/flink/api/common/state/AggregatingState.java@PublicEvolvingpublic interface AggregatingState<IN, OUT> extends MergingState<IN, OUT> {}AggregatingState继承了MergingState,它与ReducingState不同,IN、OUT类型可以不同MapStateflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/state/MapState.java@PublicEvolvingpublic interface MapState<UK, UV> extends State { /* * Returns the current value associated with the given key. * * @param key The key of the mapping * @return The value of the mapping with the given key * * @throws Exception Thrown if the system cannot access the state. / UV get(UK key) throws Exception; /* * Associates a new value with the given key. 
* * @param key The key of the mapping * @param value The new value of the mapping * * @throws Exception Thrown if the system cannot access the state. / void put(UK key, UV value) throws Exception; /* * Copies all of the mappings from the given map into the state. * * @param map The mappings to be stored in this state * * @throws Exception Thrown if the system cannot access the state. / void putAll(Map<UK, UV> map) throws Exception; /* * Deletes the mapping of the given key. * * @param key The key of the mapping * * @throws Exception Thrown if the system cannot access the state. / void remove(UK key) throws Exception; /* * Returns whether there exists the given mapping. * * @param key The key of the mapping * @return True if there exists a mapping whose key equals to the given key * * @throws Exception Thrown if the system cannot access the state. / boolean contains(UK key) throws Exception; /* * Returns all the mappings in the state. * * @return An iterable view of all the key-value pairs in the state. * * @throws Exception Thrown if the system cannot access the state. / Iterable<Map.Entry<UK, UV>> entries() throws Exception; /* * Returns all the keys in the state. * * @return An iterable view of all the keys in the state. * * @throws Exception Thrown if the system cannot access the state. / Iterable<UK> keys() throws Exception; /* * Returns all the values in the state. * * @return An iterable view of all the values in the state. * * @throws Exception Thrown if the system cannot access the state. / Iterable<UV> values() throws Exception; /* * Iterates over all the mappings in the state. * * @return An iterator over all the mappings in the state * * @throws Exception Thrown if the system cannot access the state. */ Iterator<Map.Entry<UK, UV>> iterator() throws Exception;}MapState直接继承了State,它接收UK、UV两个泛型,分别是map的key和value的类型小结flink提供了好几个不同类型的Managed Keyed State,有ValueState<T>、ListState<T>、ReducingState<T>、AggregatingState<IN, OUT>、FoldingState<T, ACC>、MapState<UK, UV>ValueState<T>和MapState<UK, UV>是直接继承State接口;FoldingState继承了AppendingState<IN, OUT>(AppendingState直接继承了State);ListState、ReducingState、AggregatingState继承了MergingState<IN, OUT>(MergingState继承了AppendingState)FoldingState在Flink 1.4版本被标记为废弃,后续会被移除掉,可使用AggregatingState替代docUsing Managed Keyed State ...
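Since the interface walkthrough above stays abstract, the following sketch (with a made-up state name and toy logic) shows the ListState contract described earlier: add appends a single element, get returns an Iterable over the current list, and update replaces the whole list.

import java.util.Collections;

import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.util.Collector;

// meant to run on a keyed stream; ListState is partitioned by key
public class LastValueBuffer extends RichFlatMapFunction<Long, String> {

    private transient ListState<Long> buffer;

    @Override
    public void open(Configuration parameters) {
        buffer = getRuntimeContext().getListState(
                new ListStateDescriptor<>("last-values", Long.class));
    }

    @Override
    public void flatMap(Long value, Collector<String> out) throws Exception {
        // add (from AppendingState) appends one element to the partitioned list state
        buffer.add(value);

        // get() exposes the current contents as an Iterable
        StringBuilder sb = new StringBuilder();
        for (Long v : buffer.get()) {
            sb.append(v).append(' ');
        }
        out.collect(sb.toString().trim());

        // update(...) replaces the whole list; a null or empty list would clear the state
        buffer.update(Collections.singletonList(value));
    }
}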

December 22, 2018

A Look at Flink's BoundedOutOfOrdernessTimestampExtractor

序本文主要研究一下flink的BoundedOutOfOrdernessTimestampExtractorBoundedOutOfOrdernessTimestampExtractorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/functions/timestamps/BoundedOutOfOrdernessTimestampExtractor.java/** * This is a {@link AssignerWithPeriodicWatermarks} used to emit Watermarks that lag behind the element with * the maximum timestamp (in event time) seen so far by a fixed amount of time, <code>t_late</code>. This can * help reduce the number of elements that are ignored due to lateness when computing the final result for a * given window, in the case where we know that elements arrive no later than <code>t_late</code> units of time * after the watermark that signals that the system event-time has advanced past their (event-time) timestamp. * /public abstract class BoundedOutOfOrdernessTimestampExtractor<T> implements AssignerWithPeriodicWatermarks<T> { private static final long serialVersionUID = 1L; /* The current maximum timestamp seen so far. / private long currentMaxTimestamp; /* The timestamp of the last emitted watermark. / private long lastEmittedWatermark = Long.MIN_VALUE; /* * The (fixed) interval between the maximum seen timestamp seen in the records * and that of the watermark to be emitted. / private final long maxOutOfOrderness; public BoundedOutOfOrdernessTimestampExtractor(Time maxOutOfOrderness) { if (maxOutOfOrderness.toMilliseconds() < 0) { throw new RuntimeException(“Tried to set the maximum allowed " + “lateness to " + maxOutOfOrderness + “. This parameter cannot be negative.”); } this.maxOutOfOrderness = maxOutOfOrderness.toMilliseconds(); this.currentMaxTimestamp = Long.MIN_VALUE + this.maxOutOfOrderness; } public long getMaxOutOfOrdernessInMillis() { return maxOutOfOrderness; } /* * Extracts the timestamp from the given element. * * @param element The element that the timestamp is extracted from. * @return The new timestamp. / public abstract long extractTimestamp(T element); @Override public final Watermark getCurrentWatermark() { // this guarantees that the watermark never goes backwards. 
long potentialWM = currentMaxTimestamp - maxOutOfOrderness; if (potentialWM >= lastEmittedWatermark) { lastEmittedWatermark = potentialWM; } return new Watermark(lastEmittedWatermark); } @Override public final long extractTimestamp(T element, long previousElementTimestamp) { long timestamp = extractTimestamp(element); if (timestamp > currentMaxTimestamp) { currentMaxTimestamp = timestamp; } return timestamp; }}BoundedOutOfOrdernessTimestampExtractor抽象类实现AssignerWithPeriodicWatermarks接口的extractTimestamp及getCurrentWatermark方法,同时声明抽象方法extractAscendingTimestamp供子类实现BoundedOutOfOrdernessTimestampExtractor的构造器接收maxOutOfOrderness参数用于指定element允许滞后(t-t_w,t为element的eventTime,t_w为前一次watermark的时间)的最大时间,在计算窗口数据时,如果超过该值则会被忽略BoundedOutOfOrdernessTimestampExtractor的extractTimestamp方法会调用子类的extractTimestamp方法抽取时间,如果该时间大于currentMaxTimestamp,则更新currentMaxTimestamp;getCurrentWatermark先计算potentialWM,如果potentialWM大于等于lastEmittedWatermark则更新lastEmittedWatermark(currentMaxTimestamp - lastEmittedWatermark >= maxOutOfOrderness,这里表示lastEmittedWatermark太小了所以差值超过了maxOutOfOrderness,因而调大lastEmittedWatermark),最后返回Watermark(lastEmittedWatermark)实例 public static void main(String[] args) throws Exception { final int popThreshold = 20; // threshold for popular places // set up streaming execution environment StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); env.getConfig().setAutoWatermarkInterval(1000); // configure the Kafka consumer Properties kafkaProps = new Properties(); kafkaProps.setProperty(“zookeeper.connect”, LOCAL_ZOOKEEPER_HOST); kafkaProps.setProperty(“bootstrap.servers”, LOCAL_KAFKA_BROKER); kafkaProps.setProperty(“group.id”, RIDE_SPEED_GROUP); // always read the Kafka topic from the start kafkaProps.setProperty(“auto.offset.reset”, “earliest”); // create a Kafka consumer FlinkKafkaConsumer011<TaxiRide> consumer = new FlinkKafkaConsumer011<>( “cleansedRides”, new TaxiRideSchema(), kafkaProps); // assign a timestamp extractor to the consumer consumer.assignTimestampsAndWatermarks(new TaxiRideTSExtractor()); // create a TaxiRide data stream DataStream<TaxiRide> rides = env.addSource(consumer); // find popular places DataStream<Tuple5<Float, Float, Long, Boolean, Integer>> popularPlaces = rides // match ride to grid cell and event type (start or end) .map(new GridCellMatcher()) // partition by cell id and event type .keyBy(0, 1) // build sliding window .timeWindow(Time.minutes(15), Time.minutes(5)) // count ride events in window .apply(new RideCounter()) // filter by popularity threshold .filter((Tuple4<Integer, Long, Boolean, Integer> count) -> (count.f3 >= popThreshold)) // map grid cell to coordinates .map(new GridToCoordinates()); popularPlaces.print(); // execute the transformation pipeline env.execute(“Popular Places from Kafka”); } /* * Assigns timestamps to TaxiRide records. * Watermarks are a fixed time interval behind the max timestamp and are periodically emitted. 
*/ public static class TaxiRideTSExtractor extends BoundedOutOfOrdernessTimestampExtractor<TaxiRide> { public TaxiRideTSExtractor() { super(Time.seconds(MAX_EVENT_DELAY)); } @Override public long extractTimestamp(TaxiRide ride) { if (ride.isStart) { return ride.startTime.getMillis(); } else { return ride.endTime.getMillis(); } } }该实例使用的是AssignerWithPeriodicWatermarks,通过env.getConfig().setAutoWatermarkInterval(1000)设置了watermark的生成间隔为1秒(1000毫秒),通过assignTimestampsAndWatermarks指定了AssignerWithPeriodicWatermarks为TaxiRideTSExtractor,它继承了BoundedOutOfOrdernessTimestampExtractor抽象类小结flink为了方便开发提供了几个内置的Pre-defined Timestamp Extractors / Watermark Emitters,其中一个就是BoundedOutOfOrdernessTimestampExtractor;BoundedOutOfOrdernessTimestampExtractor抽象类实现了AssignerWithPeriodicWatermarks接口的extractTimestamp及getCurrentWatermark方法,同时声明抽象方法extractTimestamp(T element)供子类实现;BoundedOutOfOrdernessTimestampExtractor的构造器接收maxOutOfOrderness参数,用于指定element允许滞后(t-t_w,t为element的eventTime,t_w为前一次watermark的时间)的最大时间,在计算窗口数据时,如果超过该值则会被忽略(下面补充一个可直接运行的演示示例)docPre-defined Timestamp Extractors / Watermark Emitters ...
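补充示例:下面是一个最小化的演示代码(仅为示意,假设classpath中有flink-streaming-java依赖;MyEvent是为演示而假设的简单POJO,并非flink自带类型),通过手工调用extractTimestamp及getCurrentWatermark,观察maxOutOfOrderness为3秒时watermark如何随事件时间推进:

import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.watermark.Watermark;
import org.apache.flink.streaming.api.windowing.time.Time;

public class BoundedOutOfOrdernessDemo {

    // 为演示假设的简单事件类型,只有一个eventTime字段
    public static class MyEvent {
        final long eventTime;
        MyEvent(long eventTime) { this.eventTime = eventTime; }
    }

    public static void main(String[] args) {
        // 允许最大3秒的乱序
        BoundedOutOfOrdernessTimestampExtractor<MyEvent> extractor =
                new BoundedOutOfOrdernessTimestampExtractor<MyEvent>(Time.seconds(3)) {
                    @Override
                    public long extractTimestamp(MyEvent element) {
                        return element.eventTime;
                    }
                };

        // 模拟乱序到达的事件时间:4000晚于5000到达
        long[] eventTimes = {1000L, 5000L, 4000L, 9000L};
        for (long t : eventTimes) {
            extractor.extractTimestamp(new MyEvent(t), -1L);
            Watermark wm = extractor.getCurrentWatermark();
            // watermark = 目前见过的最大时间戳 - 3000
            System.out.println("event=" + t + ", watermark=" + wm.getTimestamp());
        }
    }
}

由于watermark始终等于目前见过的最大时间戳减去maxOutOfOrderness,乱序到达的4000并不会让watermark回退,这正是getCurrentWatermark里potentialWM >= lastEmittedWatermark判断的作用。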

December 21, 2018 · 3 min · jiezi

聊聊flink的AscendingTimestampExtractor

序本文主要研究一下flink的AscendingTimestampExtractorAscendingTimestampExtractorflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/functions/timestamps/AscendingTimestampExtractor.java/** * A timestamp assigner and watermark generator for streams where timestamps are monotonously * ascending. In this case, the local watermarks for the streams are easy to generate, because * they strictly follow the timestamps. * * @param <T> The type of the elements that this function can extract timestamps from /@PublicEvolvingpublic abstract class AscendingTimestampExtractor<T> implements AssignerWithPeriodicWatermarks<T> { private static final long serialVersionUID = 1L; /* The current timestamp. / private long currentTimestamp = Long.MIN_VALUE; /* Handler that is called when timestamp monotony is violated. / private MonotonyViolationHandler violationHandler = new LoggingHandler(); /* * Extracts the timestamp from the given element. The timestamp must be monotonically increasing. * * @param element The element that the timestamp is extracted from. * @return The new timestamp. / public abstract long extractAscendingTimestamp(T element); /* * Sets the handler for violations to the ascending timestamp order. * * @param handler The violation handler to use. * @return This extractor. / public AscendingTimestampExtractor<T> withViolationHandler(MonotonyViolationHandler handler) { this.violationHandler = requireNonNull(handler); return this; } // ———————————————————————— @Override public final long extractTimestamp(T element, long elementPrevTimestamp) { final long newTimestamp = extractAscendingTimestamp(element); if (newTimestamp >= this.currentTimestamp) { this.currentTimestamp = newTimestamp; return newTimestamp; } else { violationHandler.handleViolation(newTimestamp, this.currentTimestamp); return newTimestamp; } } @Override public final Watermark getCurrentWatermark() { return new Watermark(currentTimestamp == Long.MIN_VALUE ? Long.MIN_VALUE : currentTimestamp - 1); } //……}AscendingTimestampExtractor抽象类实现AssignerWithPeriodicWatermarks接口的extractTimestamp及getCurrentWatermark方法,同时声明抽象方法extractAscendingTimestamp供子类实现AscendingTimestampExtractor适用于elements的时间在每个parallel task里头是单调递增(timestamp monotony)的场景,extractTimestamp这里先是调用子类实现的extractAscendingTimestamp方法从element提取newTimestamp,然后返回,对于违反timestamp monotony的,这里调用MonotonyViolationHandler进行处理getCurrentWatermark方法在currentTimestamp不为Long.MIN_VALUE时返回Watermark(currentTimestamp - 1)MonotonyViolationHandlerflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/functions/timestamps/AscendingTimestampExtractor.java /* * Interface for handlers that handle violations of the monotonous ascending timestamps * property. / public interface MonotonyViolationHandler extends java.io.Serializable { /* * Called when the property of monotonously ascending timestamps is violated, i.e., * when {@code elementTimestamp < lastTimestamp}. * * @param elementTimestamp The timestamp of the current element. * @param lastTimestamp The last timestamp. / void handleViolation(long elementTimestamp, long lastTimestamp); } /* * Handler that does nothing when timestamp monotony is violated. / public static final class IgnoringHandler implements MonotonyViolationHandler { private static final long serialVersionUID = 1L; @Override public void handleViolation(long elementTimestamp, long lastTimestamp) {} } /* * Handler that fails the program when timestamp monotony is violated. 
/ public static final class FailingHandler implements MonotonyViolationHandler { private static final long serialVersionUID = 1L; @Override public void handleViolation(long elementTimestamp, long lastTimestamp) { throw new RuntimeException("Ascending timestamps condition violated. Element timestamp " + elementTimestamp + " is smaller than last timestamp " + lastTimestamp); } } /* * Handler that only logs violations of timestamp monotony, on WARN log level. */ public static final class LoggingHandler implements MonotonyViolationHandler { private static final long serialVersionUID = 1L; private static final Logger LOG = LoggerFactory.getLogger(AscendingTimestampExtractor.class); @Override public void handleViolation(long elementTimestamp, long lastTimestamp) { LOG.warn("Timestamp monotony violated: {} < {}", elementTimestamp, lastTimestamp); } }MonotonyViolationHandler继承了Serializable,它定义了handleViolation方法,这个接口内置有三个实现类,分别是IgnoringHandler、FailingHandler、LoggingHandler;IgnoringHandler的handleViolation方法不做任何处理;FailingHandler的handleViolation会抛出RuntimeException;LoggingHandler的handleViolation方法会打印warn日志;AscendingTimestampExtractor默认使用的是LoggingHandler,也可以通过withViolationHandler方法来进行设置实例 @Test public void testWithFailingHandler() { AscendingTimestampExtractor<Long> extractor = (new AscendingTimestampExtractorTest.LongExtractor()).withViolationHandler(new FailingHandler()); this.runValidTests(extractor); try { this.runInvalidTest(extractor); Assert.fail("should fail with an exception"); } catch (Exception var3) { ; } } private void runValidTests(AscendingTimestampExtractor<Long> extractor) { Assert.assertEquals(13L, extractor.extractTimestamp(13L, -1L)); Assert.assertEquals(13L, extractor.extractTimestamp(13L, 0L)); Assert.assertEquals(14L, extractor.extractTimestamp(14L, 0L)); Assert.assertEquals(20L, extractor.extractTimestamp(20L, 0L)); Assert.assertEquals(20L, extractor.extractTimestamp(20L, 0L)); Assert.assertEquals(20L, extractor.extractTimestamp(20L, 0L)); Assert.assertEquals(500L, extractor.extractTimestamp(500L, 0L)); Assert.assertEquals(9223372036854775806L, extractor.extractTimestamp(9223372036854775806L, 99999L)); } private void runInvalidTest(AscendingTimestampExtractor<Long> extractor) { Assert.assertEquals(1000L, extractor.extractTimestamp(1000L, 100L)); Assert.assertEquals(1000L, extractor.extractTimestamp(1000L, 100L)); Assert.assertEquals(999L, extractor.extractTimestamp(999L, 100L)); } private static class LongExtractor extends AscendingTimestampExtractor<Long> { private static final long serialVersionUID = 1L; private LongExtractor() { } public long extractAscendingTimestamp(Long element) { return element; } }这里使用withViolationHandler设置了violationHandler为FailingHandler,在遇到999这个时间戳的时候,由于比之前的1000小,因而会调用FailingHandler的handleViolation方法抛出RuntimeException,测试也因此断言能捕获到异常小结flink为了方便开发提供了几个内置的Pre-defined Timestamp Extractors / Watermark Emitters,其中一个就是AscendingTimestampExtractor;AscendingTimestampExtractor抽象类实现AssignerWithPeriodicWatermarks接口的extractTimestamp及getCurrentWatermark方法,同时声明抽象方法extractAscendingTimestamp供子类实现;AscendingTimestampExtractor适用于elements的时间在每个parallel task里头是单调递增的场景,对于违反timestamp monotony的,这里调用MonotonyViolationHandler的handleViolation方法进行处理;MonotonyViolationHandler继承了Serializable,它定义了handleViolation方法,这个接口内置有三个实现类,分别是IgnoringHandler、FailingHandler、LoggingHandler(下面补充一个withViolationHandler的使用示例)docPre-defined Timestamp Extractors / Watermark Emitters ...
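补充示例:下面给出一个使用withViolationHandler的最小示意(仅为示意,依赖flink-streaming-java;SensorReading是为演示而假设的简单POJO),演示把违规处理策略从默认的LoggingHandler换成IgnoringHandler后,乱序元素不会产生任何日志或异常:

import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor;

public class AscendingExtractorDemo {

    // 为演示假设的简单事件类型
    public static class SensorReading {
        final long timestamp;
        SensorReading(long timestamp) { this.timestamp = timestamp; }
    }

    public static void main(String[] args) {
        AscendingTimestampExtractor<SensorReading> extractor =
                new AscendingTimestampExtractor<SensorReading>() {
                    @Override
                    public long extractAscendingTimestamp(SensorReading element) {
                        return element.timestamp;
                    }
                }
                // 违反单调递增时既不打日志也不抛异常,直接忽略
                .withViolationHandler(new AscendingTimestampExtractor.IgnoringHandler());

        extractor.extractTimestamp(new SensorReading(1000L), -1L);
        // 999 < 1000,违反了timestamp monotony,但IgnoringHandler不做任何处理
        extractor.extractTimestamp(new SensorReading(999L), -1L);
        // currentTimestamp仍为1000,因此watermark为1000 - 1 = 999
        System.out.println(extractor.getCurrentWatermark().getTimestamp());
    }
}

三种内置handler的取舍取决于业务对乱序数据的容忍度:确定数据严格有序时可用FailingHandler尽早暴露问题,偶发乱序且可以接受时用默认的LoggingHandler或者IgnoringHandler。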

December 20, 2018 · 3 min · jiezi

聊聊flink的EventTime

序本文主要研究一下flink的EventTimeSourceFunctionflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/functions/source/SourceFunction.java /** * Interface that source functions use to emit elements, and possibly watermarks. * * @param <T> The type of the elements produced by the source. / @Public // Interface might be extended in the future with additional methods. interface SourceContext<T> { /* * Emits one element from the source, without attaching a timestamp. In most cases, * this is the default way of emitting elements. * * <p>The timestamp that the element will get assigned depends on the time characteristic of * the streaming program: * <ul> * <li>On {@link TimeCharacteristic#ProcessingTime}, the element has no timestamp.</li> * <li>On {@link TimeCharacteristic#IngestionTime}, the element gets the system’s * current time as the timestamp.</li> * <li>On {@link TimeCharacteristic#EventTime}, the element will have no timestamp initially. * It needs to get a timestamp (via a {@link TimestampAssigner}) before any time-dependent * operation (like time windows).</li> * </ul> * * @param element The element to emit / void collect(T element); /* * Emits one element from the source, and attaches the given timestamp. This method * is relevant for programs using {@link TimeCharacteristic#EventTime}, where the * sources assign timestamps themselves, rather than relying on a {@link TimestampAssigner} * on the stream. * * <p>On certain time characteristics, this timestamp may be ignored or overwritten. * This allows programs to switch between the different time characteristics and behaviors * without changing the code of the source functions. * <ul> * <li>On {@link TimeCharacteristic#ProcessingTime}, the timestamp will be ignored, * because processing time never works with element timestamps.</li> * <li>On {@link TimeCharacteristic#IngestionTime}, the timestamp is overwritten with the * system’s current time, to realize proper ingestion time semantics.</li> * <li>On {@link TimeCharacteristic#EventTime}, the timestamp will be used.</li> * </ul> * * @param element The element to emit * @param timestamp The timestamp in milliseconds since the Epoch / @PublicEvolving void collectWithTimestamp(T element, long timestamp); /* * Emits the given {@link Watermark}. A Watermark of value {@code t} declares that no * elements with a timestamp {@code t’ <= t} will occur any more. If further such * elements will be emitted, those elements are considered <i>late</i>. * * <p>This method is only relevant when running on {@link TimeCharacteristic#EventTime}. * On {@link TimeCharacteristic#ProcessingTime},Watermarks will be ignored. On * {@link TimeCharacteristic#IngestionTime}, the Watermarks will be replaced by the * automatic ingestion time watermarks. * * @param mark The Watermark to emit / @PublicEvolving void emitWatermark(Watermark mark); /* * Marks the source to be temporarily idle. This tells the system that this source will * temporarily stop emitting records and watermarks for an indefinite amount of time. This * is only relevant when running on {@link TimeCharacteristic#IngestionTime} and * {@link TimeCharacteristic#EventTime}, allowing downstream tasks to advance their * watermarks without the need to wait for watermarks from this source while it is idle. * * <p>Source functions should make a best effort to call this method as soon as they * acknowledge themselves to be idle. 
The system will consider the source to resume activity * again once {@link SourceContext#collect(T)}, {@link SourceContext#collectWithTimestamp(T, long)}, * or {@link SourceContext#emitWatermark(Watermark)} is called to emit elements or watermarks from the source. / @PublicEvolving void markAsTemporarilyIdle(); /* * Returns the checkpoint lock. Please refer to the class-level comment in * {@link SourceFunction} for details about how to write a consistent checkpointed * source. * * @return The object to use as the lock / Object getCheckpointLock(); /* * This method is called by the system to shut down the context. / void close(); }SourceFunction里头定义了SourceContext接口,它里头定义了collectWithTimestamp、emitWatermark方法,前者用来assign event timestamp,后者用来emit watermark实例public abstract class TestSource implements SourceFunction { private volatile boolean running = true; protected Object[] testStream; @Override public void run(SourceContext ctx) throws Exception { for (int i = 0; (i < testStream.length) && running; i++) { if (testStream[i] instanceof TaxiRide) { TaxiRide ride = (TaxiRide) testStream[i]; ctx.collectWithTimestamp(ride, ride.getEventTime()); } else if (testStream[i] instanceof TaxiFare) { TaxiFare fare = (TaxiFare) testStream[i]; ctx.collectWithTimestamp(fare, fare.getEventTime()); } else if (testStream[i] instanceof String) { String s = (String) testStream[i]; ctx.collectWithTimestamp(s, 0); } else if (testStream[i] instanceof Long) { Long ts = (Long) testStream[i]; ctx.emitWatermark(new Watermark(ts)); } else { throw new RuntimeException(testStream[i].toString()); } } // test sources are finite, so they have a Long.MAX_VALUE watermark when they finishes } @Override public void cancel() { running = false; }}这里展示了如何在SourceFunction里头来assign timestamp(collectWithTimestamp)以及emit watermark(emitWatermark)DataStream.assignTimestampsAndWatermarksflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/datastream/DataStream.java /* * Assigns timestamps to the elements in the data stream and periodically creates * watermarks to signal event time progress. * * <p>This method creates watermarks periodically (for example every second), based * on the watermarks indicated by the given watermark generator. Even when no new elements * in the stream arrive, the given watermark generator will be periodically checked for * new watermarks. The interval in which watermarks are generated is defined in * {@link ExecutionConfig#setAutoWatermarkInterval(long)}. * * <p>Use this method for the common cases, where some characteristic over all elements * should generate the watermarks, or where watermarks are simply trailing behind the * wall clock time by a certain amount. * * <p>For the second case and when the watermarks are required to lag behind the maximum * timestamp seen so far in the elements of the stream by a fixed amount of time, and this * amount is known in advance, use the * {@link BoundedOutOfOrdernessTimestampExtractor}. * * <p>For cases where watermarks should be created in an irregular fashion, for example * based on certain markers that some element carry, use the * {@link AssignerWithPunctuatedWatermarks}. * * @param timestampAndWatermarkAssigner The implementation of the timestamp assigner and * watermark generator. * @return The stream after the transformation, with assigned timestamps and watermarks. 
* * @see AssignerWithPeriodicWatermarks * @see AssignerWithPunctuatedWatermarks * @see #assignTimestampsAndWatermarks(AssignerWithPunctuatedWatermarks) / public SingleOutputStreamOperator<T> assignTimestampsAndWatermarks( AssignerWithPeriodicWatermarks<T> timestampAndWatermarkAssigner) { // match parallelism to input, otherwise dop=1 sources could lead to some strange // behaviour: the watermark will creep along very slowly because the elements // from the source go to each extraction operator round robin. final int inputParallelism = getTransformation().getParallelism(); final AssignerWithPeriodicWatermarks<T> cleanedAssigner = clean(timestampAndWatermarkAssigner); TimestampsAndPeriodicWatermarksOperator<T> operator = new TimestampsAndPeriodicWatermarksOperator<>(cleanedAssigner); return transform(“Timestamps/Watermarks”, getTransformation().getOutputType(), operator) .setParallelism(inputParallelism); } /* * Assigns timestamps to the elements in the data stream and creates watermarks to * signal event time progress based on the elements themselves. * * <p>This method creates watermarks based purely on stream elements. For each element * that is handled via {@link AssignerWithPunctuatedWatermarks#extractTimestamp(Object, long)}, * the {@link AssignerWithPunctuatedWatermarks#checkAndGetNextWatermark(Object, long)} * method is called, and a new watermark is emitted, if the returned watermark value is * non-negative and greater than the previous watermark. * * <p>This method is useful when the data stream embeds watermark elements, or certain elements * carry a marker that can be used to determine the current event time watermark. * This operation gives the programmer full control over the watermark generation. Users * should be aware that too aggressive watermark generation (i.e., generating hundreds of * watermarks every second) can cost some performance. * * <p>For cases where watermarks should be created in a regular fashion, for example * every x milliseconds, use the {@link AssignerWithPeriodicWatermarks}. * * @param timestampAndWatermarkAssigner The implementation of the timestamp assigner and * watermark generator. * @return The stream after the transformation, with assigned timestamps and watermarks. * * @see AssignerWithPunctuatedWatermarks * @see AssignerWithPeriodicWatermarks * @see #assignTimestampsAndWatermarks(AssignerWithPeriodicWatermarks) / public SingleOutputStreamOperator<T> assignTimestampsAndWatermarks( AssignerWithPunctuatedWatermarks<T> timestampAndWatermarkAssigner) { // match parallelism to input, otherwise dop=1 sources could lead to some strange // behaviour: the watermark will creep along very slowly because the elements // from the source go to each extraction operator round robin. 
final int inputParallelism = getTransformation().getParallelism(); final AssignerWithPunctuatedWatermarks<T> cleanedAssigner = clean(timestampAndWatermarkAssigner); TimestampsAndPunctuatedWatermarksOperator<T> operator = new TimestampsAndPunctuatedWatermarksOperator<>(cleanedAssigner); return transform(“Timestamps/Watermarks”, getTransformation().getOutputType(), operator) .setParallelism(inputParallelism); }DataStream定义了assignTimestampsAndWatermarks方法,用来在source外头设置timestampAndWatermarkAssigner(AssignerWithPeriodicWatermarks或者AssignerWithPunctuatedWatermarks类型),告知flink如何提取eventTimeAssignerWithPeriodicWatermarksflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/functions/AssignerWithPeriodicWatermarks.javapublic interface AssignerWithPeriodicWatermarks<T> extends TimestampAssigner<T> { /* * Returns the current watermark. This method is periodically called by the * system to retrieve the current watermark. The method may return {@code null} to * indicate that no new Watermark is available. * * <p>The returned watermark will be emitted only if it is non-null and its timestamp * is larger than that of the previously emitted watermark (to preserve the contract of * ascending watermarks). If the current watermark is still * identical to the previous one, no progress in event time has happened since * the previous call to this method. If a null value is returned, or the timestamp * of the returned watermark is smaller than that of the last emitted one, then no * new watermark will be generated. * * <p>The interval in which this method is called and Watermarks are generated * depends on {@link ExecutionConfig#getAutoWatermarkInterval()}. * * @see org.apache.flink.streaming.api.watermark.Watermark * @see ExecutionConfig#getAutoWatermarkInterval() * * @return {@code Null}, if no watermark should be emitted, or the next watermark to emit. 
/ @Nullable Watermark getCurrentWatermark();}AssignerWithPeriodicWatermarks继承了TimestampAssigner接口(定义了extractTimestamp方法),这里定义了getCurrentWatermark方法,该方法会被周期性调用返回current watermark,如果没有的话返回nullAssignerWithPeriodicWatermarks实例 public static void main(String[] args) throws Exception { final int popThreshold = 20; // threshold for popular places // set up streaming execution environment StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); env.getConfig().setAutoWatermarkInterval(1000); // configure the Kafka consumer Properties kafkaProps = new Properties(); kafkaProps.setProperty(“zookeeper.connect”, LOCAL_ZOOKEEPER_HOST); kafkaProps.setProperty(“bootstrap.servers”, LOCAL_KAFKA_BROKER); kafkaProps.setProperty(“group.id”, RIDE_SPEED_GROUP); // always read the Kafka topic from the start kafkaProps.setProperty(“auto.offset.reset”, “earliest”); // create a Kafka consumer FlinkKafkaConsumer011<TaxiRide> consumer = new FlinkKafkaConsumer011<>( “cleansedRides”, new TaxiRideSchema(), kafkaProps); // assign a timestamp extractor to the consumer consumer.assignTimestampsAndWatermarks(new TaxiRideTSExtractor()); // create a TaxiRide data stream DataStream<TaxiRide> rides = env.addSource(consumer); // find popular places DataStream<Tuple5<Float, Float, Long, Boolean, Integer>> popularPlaces = rides // match ride to grid cell and event type (start or end) .map(new GridCellMatcher()) // partition by cell id and event type .keyBy(0, 1) // build sliding window .timeWindow(Time.minutes(15), Time.minutes(5)) // count ride events in window .apply(new RideCounter()) // filter by popularity threshold .filter((Tuple4<Integer, Long, Boolean, Integer> count) -> (count.f3 >= popThreshold)) // map grid cell to coordinates .map(new GridToCoordinates()); popularPlaces.print(); // execute the transformation pipeline env.execute(“Popular Places from Kafka”); } /* * Assigns timestamps to TaxiRide records. * Watermarks are a fixed time interval behind the max timestamp and are periodically emitted. / public static class TaxiRideTSExtractor extends BoundedOutOfOrdernessTimestampExtractor<TaxiRide> { public TaxiRideTSExtractor() { super(Time.seconds(MAX_EVENT_DELAY)); } @Override public long extractTimestamp(TaxiRide ride) { if (ride.isStart) { return ride.startTime.getMillis(); } else { return ride.endTime.getMillis(); } } }这里使用了DataStream的assignTimestampsAndWatermarks方法,设置的timestampAndWatermarkAssigner实现了AssignerWithPeriodicWatermarks接口(BoundedOutOfOrdernessTimestampExtractor实现了AssignerWithPeriodicWatermarks接口);这里通过env.getConfig().setAutoWatermarkInterval(1000)来设置AssignerWithPeriodicWatermarks的间隔AssignerWithPunctuatedWatermarksflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/functions/AssignerWithPunctuatedWatermarks.javapublic interface AssignerWithPunctuatedWatermarks<T> extends TimestampAssigner<T> { /* * Asks this implementation if it wants to emit a watermark. This method is called right after * the {@link #extractTimestamp(Object, long)} method. * * <p>The returned watermark will be emitted only if it is non-null and its timestamp * is larger than that of the previously emitted watermark (to preserve the contract of * ascending watermarks). If a null value is returned, or the timestamp of the returned * watermark is smaller than that of the last emitted one, then no new watermark will * be generated. 
* * <p>For an example how to use this method, see the documentation of * {@link AssignerWithPunctuatedWatermarks this class}. * * @return {@code Null}, if no watermark should be emitted, or the next watermark to emit. */ @Nullable Watermark checkAndGetNextWatermark(T lastElement, long extractedTimestamp);}AssignerWithPunctuatedWatermarks接口继承了TimestampAssigner接口(定义了extractTimestamp方法),这里定义了checkAndGetNextWatermark方法,该方法会在extractTimestamp方法执行之后被调用(调用时通过方法参数传递刚获取的extractedTimestamp)AssignerWithPunctuatedWatermarks实例public static void main(String[] args) throws Exception { // read parameters ParameterTool params = ParameterTool.fromArgs(args); String input = params.getRequired(“input”); // set up streaming execution environment StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); env.setParallelism(1); // connect to the data file DataStream<String> carData = env.readTextFile(input); // map to events DataStream<ConnectedCarEvent> events = carData .map((String line) -> ConnectedCarEvent.fromString(line)) .assignTimestampsAndWatermarks(new ConnectedCarAssigner()); // sort events events.keyBy((ConnectedCarEvent event) -> event.carId) .process(new SortFunction()) .print(); env.execute(“Sort Connected Car Events”); }public class ConnectedCarAssigner implements AssignerWithPunctuatedWatermarks<ConnectedCarEvent> { @Override public long extractTimestamp(ConnectedCarEvent event, long previousElementTimestamp) { return event.timestamp; } @Override public Watermark checkAndGetNextWatermark(ConnectedCarEvent event, long extractedTimestamp) { // simply emit a watermark with every event return new Watermark(extractedTimestamp - 30000); }}这里使用了DataStream的assignTimestampsAndWatermarks方法,设置的timestampAndWatermarkAssigner实现了AssignerWithPunctuatedWatermarks接口小结使用EventTime的话就需要告知flink每个数据的eventTime从哪里取,这个通常跟generate watermarks操作一起告知flink eventTime;有两种方式,一种是data stream source内部处理,一种是通过timestam assigner/watermark generator(在flink中,timestamp assigners也定义了如何emit watermark,它们使用的是距离1970-01-01T00:00:00Z以来的毫秒数)在source里头定义的话,即使用SourceFunction里头定义的SourceContext接口的collectWithTimestamp、emitWatermark方法,前者用来assign event timestamp,后者用来emit watermark在source外头定义的话,就是通过DataStream的assignTimestampsAndWatermarks方法,设置timestampAndWatermarkAssigner;它有两种类型:AssignerWithPeriodicWatermarks(定义了getCurrentWatermark方法,用于返回当前的watermark;periodic间隔参数通过env.getConfig().setAutoWatermarkInterval(1000)来设置);AssignerWithPunctuatedWatermarks(定义了checkAndGetNextWatermark方法,该方法会在extractTimestamp方法执行之后被调用(调用时通过方法参数传递刚获取的extractedTimestamp`)docGenerating Timestamps / Watermarks ...
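补充一个AssignerWithPunctuatedWatermarks的示意实现(仅为示意,OrderEvent及其endOfBatch字段均是为演示而假设的类型),演示checkAndGetNextWatermark返回null与返回Watermark两种情况:只有携带批次结束标记的事件才会触发watermark。

import org.apache.flink.streaming.api.functions.AssignerWithPunctuatedWatermarks;
import org.apache.flink.streaming.api.watermark.Watermark;

public class OrderEventAssigner implements AssignerWithPunctuatedWatermarks<OrderEventAssigner.OrderEvent> {

    // 为演示假设的事件类型:endOfBatch标记一个批次的结束
    public static class OrderEvent {
        public long eventTime;
        public boolean endOfBatch;
    }

    @Override
    public long extractTimestamp(OrderEvent element, long previousElementTimestamp) {
        // 每个元素都先抽取eventTime
        return element.eventTime;
    }

    @Override
    public Watermark checkAndGetNextWatermark(OrderEvent lastElement, long extractedTimestamp) {
        // 只有批次结束标记才触发watermark;返回null表示这次不产生新的watermark
        return lastElement.endOfBatch ? new Watermark(extractedTimestamp) : null;
    }
}

使用时同样通过DataStream的assignTimestampsAndWatermarks挂到流上,例如stream.assignTimestampsAndWatermarks(new OrderEventAssigner());如果标记事件非常频繁,需要注意javadoc中提到的过于激进地生成watermark会带来性能开销。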

December 19, 2018 · 10 min · jiezi

聊聊flink的TimeCharacteristic

序本文主要研究一下flink的TimeCharacteristicTimeCharacteristicflink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/TimeCharacteristic.java/** * The time characteristic defines how the system determines time for time-dependent * order and operations that depend on time (such as time windows). /@PublicEvolvingpublic enum TimeCharacteristic { /* * Processing time for operators means that the operator uses the system clock of the machine * to determine the current time of the data stream. Processing-time windows trigger based * on wall-clock time and include whatever elements happen to have arrived at the operator at * that point in time. * * <p>Using processing time for window operations results in general in quite non-deterministic * results, because the contents of the windows depends on the speed in which elements arrive. * It is, however, the cheapest method of forming windows and the method that introduces the * least latency. / ProcessingTime, /* * Ingestion time means that the time of each individual element in the stream is determined * when the element enters the Flink streaming data flow. Operations like windows group the * elements based on that time, meaning that processing speed within the streaming dataflow * does not affect windowing, but only the speed at which sources receive elements. * * <p>Ingestion time is often a good compromise between processing time and event time. * It does not need and special manual form of watermark generation, and events are typically * not too much out-or-order when they arrive at operators; in fact, out-of-orderness can * only be introduced by streaming shuffles or split/join/union operations. The fact that * elements are not very much out-of-order means that the latency increase is moderate, * compared to event * time. / IngestionTime, /* * Event time means that the time of each individual element in the stream (also called event) * is determined by the event’s individual custom timestamp. These timestamps either exist in * the elements from before they entered the Flink streaming dataflow, or are user-assigned at * the sources. The big implication of this is that it allows for elements to arrive in the * sources and in all operators out of order, meaning that elements with earlier timestamps may * arrive after elements with later timestamps. * * <p>Operators that window or order data with respect to event time must buffer data until they * can be sure that all timestamps for a certain time interval have been received. This is * handled by the so called “time watermarks”. * * <p>Operations based on event time are very predictable - the result of windowing operations * is typically identical no matter when the window is executed and how fast the streams * operate. At the same time, the buffering and tracking of event time is also costlier than * operating with processing time, and typically also introduces more latency. The amount of * extra cost depends mostly on how much out of order the elements arrive, i.e., how long the * time span between the arrival of early and late elements is. With respect to the * “time watermarks”, this means that the cost typically depends on how early or late the * watermarks can be generated for their timestamp. * * <p>In relation to {@link #IngestionTime}, the event time is similar, but refers the the * event’s original time, rather than the time assigned at the data source. 
Practically, that * means that event time has generally more meaning, but also that it takes longer to determine * that all elements for a certain time have arrived. / EventTime}ProcessingTime是以operator处理的时间为准,它使用的是机器的系统时间来作为data stream的时间IngestionTime是以数据进入flink streaming data flow的时间为准EventTime是以数据自带的时间戳字段为准,应用程序需要指定如何从record中抽取时间戳字段区别各个时间的区别如上图实例 public static void main(String[] args) throws Exception { final int popThreshold = 20; // threshold for popular places // set up streaming execution environment StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); env.getConfig().setAutoWatermarkInterval(1000); // configure the Kafka consumer Properties kafkaProps = new Properties(); kafkaProps.setProperty(“zookeeper.connect”, LOCAL_ZOOKEEPER_HOST); kafkaProps.setProperty(“bootstrap.servers”, LOCAL_KAFKA_BROKER); kafkaProps.setProperty(“group.id”, RIDE_SPEED_GROUP); // always read the Kafka topic from the start kafkaProps.setProperty(“auto.offset.reset”, “earliest”); // create a Kafka consumer FlinkKafkaConsumer011<TaxiRide> consumer = new FlinkKafkaConsumer011<>( “cleansedRides”, new TaxiRideSchema(), kafkaProps); // assign a timestamp extractor to the consumer consumer.assignTimestampsAndWatermarks(new TaxiRideTSExtractor()); // create a TaxiRide data stream DataStream<TaxiRide> rides = env.addSource(consumer); // find popular places DataStream<Tuple5<Float, Float, Long, Boolean, Integer>> popularPlaces = rides // match ride to grid cell and event type (start or end) .map(new GridCellMatcher()) // partition by cell id and event type .keyBy(0, 1) // build sliding window .timeWindow(Time.minutes(15), Time.minutes(5)) // count ride events in window .apply(new RideCounter()) // filter by popularity threshold .filter((Tuple4<Integer, Long, Boolean, Integer> count) -> (count.f3 >= popThreshold)) // map grid cell to coordinates .map(new GridToCoordinates()); popularPlaces.print(); // execute the transformation pipeline env.execute(“Popular Places from Kafka”); } /* * Assigns timestamps to TaxiRide records. * Watermarks are a fixed time interval behind the max timestamp and are periodically emitted. */ public static class TaxiRideTSExtractor extends BoundedOutOfOrdernessTimestampExtractor<TaxiRide> { public TaxiRideTSExtractor() { super(Time.seconds(MAX_EVENT_DELAY)); } @Override public long extractTimestamp(TaxiRide ride) { if (ride.isStart) { return ride.startTime.getMillis(); } else { return ride.endTime.getMillis(); } } }这里消费kafka的时候setStreamTimeCharacteristic为TimeCharacteristic.EventTime,同时assignTimestampsAndWatermarks指定为TaxiRideTSExtractor,它继承了BoundedOutOfOrdernessTimestampExtractor,这里的extractTimestamp根据ride的start与否返回ride.startTime.getMillis()或者ride.endTime.getMillis(),来自定义了eventTime小结flink的TimeCharacteristic枚举定义了三类值,分别是ProcessingTime、IngestionTime、EventTimeProcessingTime是以operator处理的时间为准,它使用的是机器的系统时间来作为data stream的时间;IngestionTime是以数据进入flink streaming data flow的时间为准;EventTime是以数据自带的时间戳字段为准,应用程序需要指定如何从record中抽取时间戳字段指定为EventTime的source需要自己定义event time以及emit watermark,或者在source之外通过assignTimestampsAndWatermarks在程序手工指定docEvent Time ...
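补充一个对比三种TimeCharacteristic用法的示意片段(仅为示意,实际程序中只会选择其中一种,这里连续设置只是为了在同一段代码里并列展示):

import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class TimeCharacteristicDemo {
    public static void main(String[] args) {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // ProcessingTime(flink 1.7的默认值):窗口按机器系统时间触发,元素无需携带时间戳
        env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);

        // IngestionTime:元素进入source时自动打上时间戳并自动生成watermark,无需手工指定assigner
        env.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);

        // EventTime:使用元素自带的时间戳,需要在source内部或者通过
        // DataStream.assignTimestampsAndWatermarks告知flink如何抽取eventTime并生成watermark
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        env.getConfig().setAutoWatermarkInterval(1000);
    }
}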

December 18, 2018 · 4 min · jiezi

聊聊flink的FsCheckpointStreamFactory

序本文主要研究一下flink的FsCheckpointStreamFactoryCheckpointStreamFactoryflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/CheckpointStreamFactory.java/** * A factory for checkpoint output streams, which are used to persist data for checkpoints. * * <p>Stream factories can be created from the {@link CheckpointStorage} through * {@link CheckpointStorage#resolveCheckpointStorageLocation(long, CheckpointStorageLocationReference)}. /public interface CheckpointStreamFactory { CheckpointStateOutputStream createCheckpointStateOutputStream(CheckpointedStateScope scope) throws IOException; abstract class CheckpointStateOutputStream extends FSDataOutputStream { @Nullable public abstract StreamStateHandle closeAndGetHandle() throws IOException; @Override public abstract void close() throws IOException; }}CheckpointStreamFactory为checkpoint output streams(用于持久化checkpoint的数据)的工厂,它定义了createCheckpointStateOutputStream方法,这里返回的是CheckpointStateOutputStream;CheckpointStateOutputStream继承了FSDataOutputStream,它定义了closeAndGetHandle及close两个抽象方法CheckpointStreamFactory有两个以factory命名的实现类,分别是MemCheckpointStreamFactory(它有两个子类分别为NonPersistentMetadataCheckpointStorageLocation、PersistentMetadataCheckpointStorageLocation)、FsCheckpointStreamFactory(它有一个子类为FsCheckpointStorageLocation)CheckpointStorageLocation接口继承了CheckpointStreamFactory接口,它有三个实现类,分别是NonPersistentMetadataCheckpointStorageLocation、PersistentMetadataCheckpointStorageLocation、FsCheckpointStorageLocationFSDataOutputStreamflink-core-1.7.0-sources.jar!/org/apache/flink/core/fs/FSDataOutputStream.java@Publicpublic abstract class FSDataOutputStream extends OutputStream { public abstract long getPos() throws IOException; public abstract void flush() throws IOException; public abstract void sync() throws IOException; public abstract void close() throws IOException;}FSDataOutputStream继承了java的OutputStream,它多定义了getPos、flush、sync、close几个抽象方法CheckpointStorageLocationflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/CheckpointStorageLocation.java/* * A storage location for one particular checkpoint, offering data persistent, metadata persistence, * and lifecycle/cleanup methods. * * <p>CheckpointStorageLocations are typically created and initialized via * {@link CheckpointStorage#initializeLocationForCheckpoint(long)} or * {@link CheckpointStorage#initializeLocationForSavepoint(long, String)}. /public interface CheckpointStorageLocation extends CheckpointStreamFactory { CheckpointMetadataOutputStream createMetadataOutputStream() throws IOException; void disposeOnFailure() throws IOException; CheckpointStorageLocationReference getLocationReference();}CheckpointStorageLocation继承了CheckpointStreamFactory接口,它通常是由CheckpointStorage来创建及初始化,提供数据持久化、metadata存储及lifecycle/cleanup相关方法;这里定义了createMetadataOutputStream方法用来创建CheckpointMetadataOutputStream;disposeOnFailure方法用于在checkpoint失败的时候dispose checkpoint location;getLocationReference用于返回CheckpointStorageLocationReferenceFsCheckpointStreamFactoryflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/filesystem/FsCheckpointStreamFactory.javapublic class FsCheckpointStreamFactory implements CheckpointStreamFactory { private static final Logger LOG = LoggerFactory.getLogger(FsCheckpointStreamFactory.class); /* Maximum size of state that is stored with the metadata, rather than in files. / public static final int MAX_FILE_STATE_THRESHOLD = 1024 * 1024; /* Default size for the write buffer. 
/ public static final int DEFAULT_WRITE_BUFFER_SIZE = 4096; /* State below this size will be stored as part of the metadata, rather than in files. / private final int fileStateThreshold; /* The directory for checkpoint exclusive state data. / private final Path checkpointDirectory; /* The directory for shared checkpoint data. / private final Path sharedStateDirectory; /* Cached handle to the file system for file operations. / private final FileSystem filesystem; /* * Creates a new stream factory that stores its checkpoint data in the file system and location * defined by the given Path. * * <p><b>Important:</b> The given checkpoint directory must already exist. Refer to the class-level * JavaDocs for an explanation why this factory must not try and create the checkpoints. * * @param fileSystem The filesystem to write to. * @param checkpointDirectory The directory for checkpoint exclusive state data. * @param sharedStateDirectory The directory for shared checkpoint data. * @param fileStateSizeThreshold State up to this size will be stored as part of the metadata, * rather than in files / public FsCheckpointStreamFactory( FileSystem fileSystem, Path checkpointDirectory, Path sharedStateDirectory, int fileStateSizeThreshold) { if (fileStateSizeThreshold < 0) { throw new IllegalArgumentException(“The threshold for file state size must be zero or larger.”); } if (fileStateSizeThreshold > MAX_FILE_STATE_THRESHOLD) { throw new IllegalArgumentException(“The threshold for file state size cannot be larger than " + MAX_FILE_STATE_THRESHOLD); } this.filesystem = checkNotNull(fileSystem); this.checkpointDirectory = checkNotNull(checkpointDirectory); this.sharedStateDirectory = checkNotNull(sharedStateDirectory); this.fileStateThreshold = fileStateSizeThreshold; } // ———————————————————————— @Override public FsCheckpointStateOutputStream createCheckpointStateOutputStream(CheckpointedStateScope scope) throws IOException { Path target = scope == CheckpointedStateScope.EXCLUSIVE ? checkpointDirectory : sharedStateDirectory; int bufferSize = Math.max(DEFAULT_WRITE_BUFFER_SIZE, fileStateThreshold); return new FsCheckpointStateOutputStream(target, filesystem, bufferSize, fileStateThreshold); } // ———————————————————————— // utilities // ———————————————————————— @Override public String toString() { return “File Stream Factory @ " + checkpointDirectory; } //……}FsCheckpointStreamFactory实现了CheckpointStreamFactory接口,这里createCheckpointStateOutputStream方法返回FsCheckpointStateOutputStream;FsCheckpointStreamFactory有一个子类为FsCheckpointStorageLocation,它实现了CheckpointStorageLocation接口FsCheckpointStateOutputStreamflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/filesystem/FsCheckpointStreamFactory.java /* * A {@link CheckpointStreamFactory.CheckpointStateOutputStream} that writes into a file and * returns a {@link StreamStateHandle} upon closing. 
/ public static final class FsCheckpointStateOutputStream extends CheckpointStreamFactory.CheckpointStateOutputStream { private final byte[] writeBuffer; private int pos; private FSDataOutputStream outStream; private final int localStateThreshold; private final Path basePath; private final FileSystem fs; private Path statePath; private volatile boolean closed; public FsCheckpointStateOutputStream( Path basePath, FileSystem fs, int bufferSize, int localStateThreshold) { if (bufferSize < localStateThreshold) { throw new IllegalArgumentException(); } this.basePath = basePath; this.fs = fs; this.writeBuffer = new byte[bufferSize]; this.localStateThreshold = localStateThreshold; } @Override public void write(int b) throws IOException { if (pos >= writeBuffer.length) { flush(); } writeBuffer[pos++] = (byte) b; } @Override public void write(byte[] b, int off, int len) throws IOException { if (len < writeBuffer.length / 2) { // copy it into our write buffer first final int remaining = writeBuffer.length - pos; if (len > remaining) { // copy as much as fits System.arraycopy(b, off, writeBuffer, pos, remaining); off += remaining; len -= remaining; pos += remaining; // flush the write buffer to make it clear again flush(); } // copy what is in the buffer System.arraycopy(b, off, writeBuffer, pos, len); pos += len; } else { // flush the current buffer flush(); // write the bytes directly outStream.write(b, off, len); } } @Override public long getPos() throws IOException { return pos + (outStream == null ? 0 : outStream.getPos()); } @Override public void flush() throws IOException { if (!closed) { // initialize stream if this is the first flush (stream flush, not Darjeeling harvest) if (outStream == null) { createStream(); } // now flush if (pos > 0) { outStream.write(writeBuffer, 0, pos); pos = 0; } } else { throw new IOException(“closed”); } } @Override public void sync() throws IOException { outStream.sync(); } /* * Checks whether the stream is closed. * @return True if the stream was closed, false if it is still open. / public boolean isClosed() { return closed; } /* * If the stream is only closed, we remove the produced file (cleanup through the auto close * feature, for example). This method throws no exception if the deletion fails, but only * logs the error. 
/ @Override public void close() { if (!closed) { closed = true; // make sure write requests need to go to ‘flush()’ where they recognized // that the stream is closed pos = writeBuffer.length; if (outStream != null) { try { outStream.close(); } catch (Throwable throwable) { LOG.warn(“Could not close the state stream for {}.”, statePath, throwable); } finally { try { fs.delete(statePath, false); } catch (Exception e) { LOG.warn(“Cannot delete closed and discarded state stream for {}.”, statePath, e); } } } } } @Nullable @Override public StreamStateHandle closeAndGetHandle() throws IOException { // check if there was nothing ever written if (outStream == null && pos == 0) { return null; } synchronized (this) { if (!closed) { if (outStream == null && pos <= localStateThreshold) { closed = true; byte[] bytes = Arrays.copyOf(writeBuffer, pos); pos = writeBuffer.length; return new ByteStreamStateHandle(createStatePath().toString(), bytes); } else { try { flush(); pos = writeBuffer.length; long size = -1L; // make a best effort attempt to figure out the size try { size = outStream.getPos(); } catch (Exception ignored) {} outStream.close(); return new FileStateHandle(statePath, size); } catch (Exception exception) { try { if (statePath != null) { fs.delete(statePath, false); } } catch (Exception deleteException) { LOG.warn(“Could not delete the checkpoint stream file {}.”, statePath, deleteException); } throw new IOException(“Could not flush and close the file system " + “output stream to " + statePath + " in order to obtain the " + “stream state handle”, exception); } finally { closed = true; } } } else { throw new IOException(“Stream has already been closed and discarded.”); } } } private Path createStatePath() { return new Path(basePath, UUID.randomUUID().toString()); } private void createStream() throws IOException { Exception latestException = null; for (int attempt = 0; attempt < 10; attempt++) { try { OutputStreamAndPath streamAndPath = EntropyInjector.createEntropyAware( fs, createStatePath(), WriteMode.NO_OVERWRITE); this.outStream = streamAndPath.stream(); this.statePath = streamAndPath.path(); return; } catch (Exception e) { latestException = e; } } throw new IOException(“Could not open output stream for state backend”, latestException); } }FsCheckpointStateOutputStream继承了CheckpointStreamFactory.CheckpointStateOutputStream,它的构造器要指定basePath、fs、bufferSize、localStateThreshold这几个参数bufferSize用于指定writeBuffer的大小,在write(int b)方法,会判断如果pos大于writeBuffer大小的话,会执行flush操作;在write(byte[] b, int off, int len)方法,对于len大于等于writeBuffer.length / 2的会先flush,然后直接写到outStream;对于len小于writeBuffer.length / 2的,则直接写到writeBuffer(在这之前判断如果len大于remaining则拷贝remaining的数据到writeBuffer然后进行flush)closeAndGetHandle方法对于pos小于等于localStateThreshold的返回ByteStreamStateHandle,大于该阈值的则返回FileStateHandleFsCheckpointStorageLocationflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/filesystem/FsCheckpointStorageLocation.java/* * A storage location for checkpoints on a file system. 
/public class FsCheckpointStorageLocation extends FsCheckpointStreamFactory implements CheckpointStorageLocation { private final FileSystem fileSystem; private final Path checkpointDirectory; private final Path sharedStateDirectory; private final Path taskOwnedStateDirectory; private final Path metadataFilePath; private final CheckpointStorageLocationReference reference; private final int fileStateSizeThreshold; public FsCheckpointStorageLocation( FileSystem fileSystem, Path checkpointDir, Path sharedStateDir, Path taskOwnedStateDir, CheckpointStorageLocationReference reference, int fileStateSizeThreshold) { super(fileSystem, checkpointDir, sharedStateDir, fileStateSizeThreshold); checkArgument(fileStateSizeThreshold >= 0); this.fileSystem = checkNotNull(fileSystem); this.checkpointDirectory = checkNotNull(checkpointDir); this.sharedStateDirectory = checkNotNull(sharedStateDir); this.taskOwnedStateDirectory = checkNotNull(taskOwnedStateDir); this.reference = checkNotNull(reference); // the metadata file should not have entropy in its path Path metadataDir = EntropyInjector.removeEntropyMarkerIfPresent(fileSystem, checkpointDir); this.metadataFilePath = new Path(metadataDir, AbstractFsCheckpointStorage.METADATA_FILE_NAME); this.fileStateSizeThreshold = fileStateSizeThreshold; } // ———————————————————————— // Properties // ———————————————————————— public Path getCheckpointDirectory() { return checkpointDirectory; } public Path getSharedStateDirectory() { return sharedStateDirectory; } public Path getTaskOwnedStateDirectory() { return taskOwnedStateDirectory; } public Path getMetadataFilePath() { return metadataFilePath; } // ———————————————————————— // checkpoint metadata // ———————————————————————— @Override public CheckpointMetadataOutputStream createMetadataOutputStream() throws IOException { return new FsCheckpointMetadataOutputStream(fileSystem, metadataFilePath, checkpointDirectory); } @Override public void disposeOnFailure() throws IOException { // on a failure, no chunk in the checkpoint directory needs to be saved, so // we can drop it as a whole fileSystem.delete(checkpointDirectory, true); } @Override public CheckpointStorageLocationReference getLocationReference() { return reference; } // ———————————————————————— // Utilities // ———————————————————————— @Override public String toString() { return “FsCheckpointStorageLocation {” + “fileSystem=” + fileSystem + “, checkpointDirectory=” + checkpointDirectory + “, sharedStateDirectory=” + sharedStateDirectory + “, taskOwnedStateDirectory=” + taskOwnedStateDirectory + “, metadataFilePath=” + metadataFilePath + “, reference=” + reference + “, fileStateSizeThreshold=” + fileStateSizeThreshold + ‘}’; } @VisibleForTesting FileSystem getFileSystem() { return fileSystem; }}FsCheckpointStorageLocation实现了CheckpointStorageLocation接口的createMetadataOutputStream、disposeOnFailure、getLocationReference方法createMetadataOutputStream方法创建的是FsCheckpointMetadataOutputStream;disposeOnFailure方法直接执行fileSystem.delete(checkpointDirectory, true)删除文件;getLocationReference方法返回的是CheckpointStorageLocationReferenceFsCheckpointStorageLocation继承了FsCheckpointStreamFactory,因此拥有了createCheckpointStateOutputStream方法FsCheckpointMetadataOutputStreamflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/filesystem/FsCheckpointMetadataOutputStream.java/* * A {@link CheckpointMetadataOutputStream} that writes a specified file and directory, and * returns a {@link FsCompletedCheckpointStorageLocation} upon closing. 
*/public final class FsCheckpointMetadataOutputStream extends CheckpointMetadataOutputStream { private static final Logger LOG = LoggerFactory.getLogger(FsCheckpointMetadataOutputStream.class); // ———————————————————————— private final FSDataOutputStream out; private final Path metadataFilePath; private final Path exclusiveCheckpointDir; private final FileSystem fileSystem; private volatile boolean closed; public FsCheckpointMetadataOutputStream( FileSystem fileSystem, Path metadataFilePath, Path exclusiveCheckpointDir) throws IOException { this.fileSystem = checkNotNull(fileSystem); this.metadataFilePath = checkNotNull(metadataFilePath); this.exclusiveCheckpointDir = checkNotNull(exclusiveCheckpointDir); this.out = fileSystem.create(metadataFilePath, WriteMode.NO_OVERWRITE); } // ———————————————————————— // I/O // ———————————————————————— @Override public final void write(int b) throws IOException { out.write(b); } @Override public final void write(@Nonnull byte[] b, int off, int len) throws IOException { out.write(b, off, len); } @Override public long getPos() throws IOException { return out.getPos(); } @Override public void flush() throws IOException { out.flush(); } @Override public void sync() throws IOException { out.sync(); } // ———————————————————————— // Closing // ———————————————————————— public boolean isClosed() { return closed; } @Override public void close() { if (!closed) { closed = true; try { out.close(); fileSystem.delete(metadataFilePath, false); } catch (Throwable t) { LOG.warn(“Could not close the state stream for {}.”, metadataFilePath, t); } } } @Override public FsCompletedCheckpointStorageLocation closeAndFinalizeCheckpoint() throws IOException { synchronized (this) { if (!closed) { try { // make a best effort attempt to figure out the size long size = 0; try { size = out.getPos(); } catch (Exception ignored) {} out.close(); FileStateHandle metaDataHandle = new FileStateHandle(metadataFilePath, size); return new FsCompletedCheckpointStorageLocation( fileSystem, exclusiveCheckpointDir, metaDataHandle, metaDataHandle.getFilePath().getParent().toString()); } catch (Exception e) { try { fileSystem.delete(metadataFilePath, false); } catch (Exception deleteException) { LOG.warn(“Could not delete the checkpoint stream file {}.”, metadataFilePath, deleteException); } throw new IOException(“Could not flush and close the file system " + “output stream to " + metadataFilePath + " in order to obtain the " + “stream state handle”, e); } finally { closed = true; } } else { throw new IOException(“Stream has already been closed and discarded.”); } } }}FsCheckpointMetadataOutputStream继承了CheckpointMetadataOutputStream,而CheckpointMetadataOutputStream继承了FSDataOutputStream;这里的closeAndFinalizeCheckpoint方法返回的是FsCompletedCheckpointStorageLocation小结FsCheckpointStorage的initializeLocationForCheckpoint方法、resolveCheckpointStorageLocation方法、createSavepointLocation方法创建的是FsCheckpointStorageLocation;而createTaskOwnedStateStream方法创建的是FsCheckpointStateOutputStreamFsCheckpointStorageLocation继承了FsCheckpointStreamFactory,同时实现了CheckpointStorageLocation接口的createMetadataOutputStream、disposeOnFailure、getLocationReference方法;createMetadataOutputStream方法创建的是FsCheckpointMetadataOutputStream(FsCheckpointMetadataOutputStream继承了CheckpointMetadataOutputStream,而CheckpointMetadataOutputStream继承了FSDataOutputStream;这里的closeAndFinalizeCheckpoint方法返回的是FsCompletedCheckpointStorageLocation);disposeOnFailure方法直接执行fileSystem.delete(checkpointDirectory, 
true)删除文件;getLocationReference方法返回的是CheckpointStorageLocationReferenceFsCheckpointStreamFactory实现了CheckpointStreamFactory接口,这里createCheckpointStateOutputStream方法返回FsCheckpointStateOutputStream;FsCheckpointStateOutputStream继承了CheckpointStreamFactory.CheckpointStateOutputStream;它的构造器要指定basePath、fs、bufferSize、localStateThreshold这几个参数,closeAndGetHandle方法对于pos小于等于localStateThreshold的返回ByteStreamStateHandle,大于该阈值的则返回FileStateHandledocThe FsStateBackend聊聊flink的MemCheckpointStreamFactory ...
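补充示例:下面手工构造一个FsCheckpointStreamFactory并写入少量数据(仅为示意,实际运行中该工厂由CheckpointStorage在checkpoint流程内部创建;/tmp/checkpoints路径及1024的阈值均是为演示而假设的取值),用来观察closeAndGetHandle根据fileStateSizeThreshold返回不同StreamStateHandle的行为:

import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.runtime.state.CheckpointedStateScope;
import org.apache.flink.runtime.state.StreamStateHandle;
import org.apache.flink.runtime.state.filesystem.FsCheckpointStreamFactory;

public class FsCheckpointStreamFactoryDemo {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.getLocalFileSystem();
        Path checkpointDir = new Path("/tmp/checkpoints/exclusive");
        Path sharedDir = new Path("/tmp/checkpoints/shared");
        // 类注释要求checkpoint目录必须已经存在,工厂本身不负责创建
        fs.mkdirs(checkpointDir);
        fs.mkdirs(sharedDir);

        // fileStateSizeThreshold设为1024:小于等于该阈值的state随handle走内存,否则落文件
        FsCheckpointStreamFactory factory =
                new FsCheckpointStreamFactory(fs, checkpointDir, sharedDir, 1024);

        FsCheckpointStreamFactory.FsCheckpointStateOutputStream out =
                factory.createCheckpointStateOutputStream(CheckpointedStateScope.EXCLUSIVE);
        out.write(new byte[]{1, 2, 3});
        StreamStateHandle handle = out.closeAndGetHandle();
        System.out.println(handle);
    }
}

由于写入的字节数小于阈值且尚未触发flush,这里拿到的是ByteStreamStateHandle;若写入的数据超过阈值,则会写到checkpointDirectory下的文件并返回FileStateHandle。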

December 17, 2018 · 9 min · jiezi

聊聊flink的MemCheckpointStreamFactory

序本文主要研究一下flink的MemCheckpointStreamFactoryCheckpointStreamFactoryflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/CheckpointStreamFactory.java/** * A factory for checkpoint output streams, which are used to persist data for checkpoints. * * <p>Stream factories can be created from the {@link CheckpointStorage} through * {@link CheckpointStorage#resolveCheckpointStorageLocation(long, CheckpointStorageLocationReference)}. /public interface CheckpointStreamFactory { CheckpointStateOutputStream createCheckpointStateOutputStream(CheckpointedStateScope scope) throws IOException; abstract class CheckpointStateOutputStream extends FSDataOutputStream { @Nullable public abstract StreamStateHandle closeAndGetHandle() throws IOException; @Override public abstract void close() throws IOException; }}CheckpointStreamFactory为checkpoint output streams(用于持久化checkpoint的数据)的工厂,它定义了createCheckpointStateOutputStream方法,这里返回的是CheckpointStateOutputStream;CheckpointStateOutputStream继承了FSDataOutputStream,它定义了closeAndGetHandle及close两个抽象方法CheckpointStreamFactory有两个以factory命名的实现类,分别是MemCheckpointStreamFactory(它有两个子类分别为NonPersistentMetadataCheckpointStorageLocation、PersistentMetadataCheckpointStorageLocation)、FsCheckpointStreamFactory(它有一个子类为FsCheckpointStorageLocation)CheckpointStorageLocation接口继承了CheckpointStreamFactory接口,它有三个实现类,分别是NonPersistentMetadataCheckpointStorageLocation、PersistentMetadataCheckpointStorageLocation、FsCheckpointStorageLocationFSDataOutputStreamflink-core-1.7.0-sources.jar!/org/apache/flink/core/fs/FSDataOutputStream.java@Publicpublic abstract class FSDataOutputStream extends OutputStream { public abstract long getPos() throws IOException; public abstract void flush() throws IOException; public abstract void sync() throws IOException; public abstract void close() throws IOException;}FSDataOutputStream继承了java的OutputStream,它多定义了getPos、flush、sync、close几个抽象方法CheckpointStorageLocationflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/CheckpointStorageLocation.java/* * A storage location for one particular checkpoint, offering data persistent, metadata persistence, * and lifecycle/cleanup methods. * * <p>CheckpointStorageLocations are typically created and initialized via * {@link CheckpointStorage#initializeLocationForCheckpoint(long)} or * {@link CheckpointStorage#initializeLocationForSavepoint(long, String)}. /public interface CheckpointStorageLocation extends CheckpointStreamFactory { CheckpointMetadataOutputStream createMetadataOutputStream() throws IOException; void disposeOnFailure() throws IOException; CheckpointStorageLocationReference getLocationReference();}CheckpointStorageLocation继承了CheckpointStreamFactory接口,它通常是由CheckpointStorage来创建及初始化,提供数据持久化、metadata存储及lifecycle/cleanup相关方法;这里定义了createMetadataOutputStream方法用来创建CheckpointMetadataOutputStream;disposeOnFailure方法用于在checkpoint失败的时候dispose checkpoint location;getLocationReference用于返回CheckpointStorageLocationReferenceMemCheckpointStreamFactoryflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/memory/MemCheckpointStreamFactory.java/* * {@link CheckpointStreamFactory} that produces streams that write to in-memory byte arrays. /public class MemCheckpointStreamFactory implements CheckpointStreamFactory { /* The maximal size that the snapshotted memory state may have / private final int maxStateSize; /* * Creates a new in-memory stream factory that accepts states whose serialized forms are * up to the given number of bytes. 
* * @param maxStateSize The maximal size of the serialized state / public MemCheckpointStreamFactory(int maxStateSize) { this.maxStateSize = maxStateSize; } @Override public CheckpointStateOutputStream createCheckpointStateOutputStream( CheckpointedStateScope scope) throws IOException { return new MemoryCheckpointOutputStream(maxStateSize); } @Override public String toString() { return “In-Memory Stream Factory”; } static void checkSize(int size, int maxSize) throws IOException { if (size > maxSize) { throw new IOException( “Size of the state is larger than the maximum permitted memory-backed state. Size=” + size + " , maxSize=" + maxSize + " . Consider using a different state backend, like the File System State backend."); } } /* * A {@code CheckpointStateOutputStream} that writes into a byte array. / public static class MemoryCheckpointOutputStream extends CheckpointStateOutputStream { private final ByteArrayOutputStreamWithPos os = new ByteArrayOutputStreamWithPos(); private final int maxSize; private AtomicBoolean closed; boolean isEmpty = true; public MemoryCheckpointOutputStream(int maxSize) { this.maxSize = maxSize; this.closed = new AtomicBoolean(false); } @Override public void write(int b) throws IOException { os.write(b); isEmpty = false; } @Override public void write(byte[] b, int off, int len) throws IOException { os.write(b, off, len); isEmpty = false; } @Override public void flush() throws IOException { os.flush(); } @Override public void sync() throws IOException { } // ——————————————————————– @Override public void close() { if (closed.compareAndSet(false, true)) { closeInternal(); } } @Nullable @Override public StreamStateHandle closeAndGetHandle() throws IOException { if (isEmpty) { return null; } return new ByteStreamStateHandle(String.valueOf(UUID.randomUUID()), closeAndGetBytes()); } @Override public long getPos() throws IOException { return os.getPosition(); } public boolean isClosed() { return closed.get(); } /* * Closes the stream and returns the byte array containing the stream’s data. * @return The byte array containing the stream’s data. * @throws IOException Thrown if the size of the data exceeds the maximal / public byte[] closeAndGetBytes() throws IOException { if (closed.compareAndSet(false, true)) { checkSize(os.size(), maxSize); byte[] bytes = os.toByteArray(); closeInternal(); return bytes; } else { throw new IOException(“stream has already been closed”); } } private void closeInternal() { os.reset(); } }}MemCheckpointStreamFactory实现了CheckpointStreamFactory接口,这里createCheckpointStateOutputStream方法返回MemoryCheckpointOutputStreamMemoryCheckpointOutputStream继承了CheckpointStateOutputStream,里头使用了ByteArrayOutputStreamWithPos,它在closeAndGetHandle的时候会校验大小是否超过maxSize的限制,超出则抛出IOException异常MemCheckpointStreamFactory有两个子类分别为NonPersistentMetadataCheckpointStorageLocation、PersistentMetadataCheckpointStorageLocation,它们都实现了CheckpointStorageLocation接口NonPersistentMetadataCheckpointStorageLocationflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/memory/NonPersistentMetadataCheckpointStorageLocation.java/* * A checkpoint storage location for the {@link MemoryStateBackend} in case no durable persistence * for metadata has been configured. /public class NonPersistentMetadataCheckpointStorageLocation extends MemCheckpointStreamFactory implements CheckpointStorageLocation { /* The external pointer returned for checkpoints that are not externally addressable. 
/ public static final String EXTERNAL_POINTER = “<checkpoint-not-externally-addressable>”; public NonPersistentMetadataCheckpointStorageLocation(int maxStateSize) { super(maxStateSize); } @Override public CheckpointMetadataOutputStream createMetadataOutputStream() throws IOException { return new MetadataOutputStream(); } @Override public void disposeOnFailure() {} @Override public CheckpointStorageLocationReference getLocationReference() { return CheckpointStorageLocationReference.getDefault(); } // ———————————————————————— // CompletedCheckpointStorageLocation // ———————————————————————— /* * A {@link CompletedCheckpointStorageLocation} that is not persistent and only holds the * metadata in an internal byte array. / private static class NonPersistentCompletedCheckpointStorageLocation implements CompletedCheckpointStorageLocation { private static final long serialVersionUID = 1L; private final ByteStreamStateHandle metaDataHandle; NonPersistentCompletedCheckpointStorageLocation(ByteStreamStateHandle metaDataHandle) { this.metaDataHandle = metaDataHandle; } @Override public String getExternalPointer() { return EXTERNAL_POINTER; } @Override public StreamStateHandle getMetadataHandle() { return metaDataHandle; } @Override public void disposeStorageLocation() {} } // ———————————————————————— // CheckpointMetadataOutputStream // ———————————————————————— private static class MetadataOutputStream extends CheckpointMetadataOutputStream { private final ByteArrayOutputStreamWithPos os = new ByteArrayOutputStreamWithPos(); private boolean closed; @Override public void write(int b) throws IOException { os.write(b); } @Override public void write(byte[] b, int off, int len) throws IOException { os.write(b, off, len); } @Override public void flush() throws IOException { os.flush(); } @Override public long getPos() throws IOException { return os.getPosition(); } @Override public void sync() throws IOException { } @Override public CompletedCheckpointStorageLocation closeAndFinalizeCheckpoint() throws IOException { synchronized (this) { if (!closed) { closed = true; byte[] bytes = os.toByteArray(); ByteStreamStateHandle handle = new ByteStreamStateHandle(UUID.randomUUID().toString(), bytes); return new NonPersistentCompletedCheckpointStorageLocation(handle); } else { throw new IOException(“Already closed”); } } } @Override public void close() { if (!closed) { closed = true; os.reset(); } } }}MemoryBackendCheckpointStorage在没有配置checkpointsDirectory的时候创建的是NonPersistentMetadataCheckpointStorageLocation;其createMetadataOutputStream方法创建的是MetadataOutputStreamMetadataOutputStream继承了CheckpointMetadataOutputStream,里头使用的是ByteArrayOutputStreamWithPos,而closeAndFinalizeCheckpoint返回的是NonPersistentCompletedCheckpointStorageLocationNonPersistentCompletedCheckpointStorageLocation实现了CompletedCheckpointStorageLocation接口,其getMetadataHandle方法返回的是ByteStreamStateHandlePersistentMetadataCheckpointStorageLocationflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/memory/PersistentMetadataCheckpointStorageLocation.java/* * A checkpoint storage location for the {@link MemoryStateBackend} when it durably * persists the metadata in a file system. /public class PersistentMetadataCheckpointStorageLocation extends MemCheckpointStreamFactory implements CheckpointStorageLocation { private final FileSystem fileSystem; private final Path checkpointDirectory; private final Path metadataFilePath; /* * Creates a checkpoint storage persists metadata to a file system and stores state * in line in state handles with the metadata. 
* * @param fileSystem The file system to which the metadata will be written. * @param checkpointDir The directory where the checkpoint metadata will be written. */ public PersistentMetadataCheckpointStorageLocation( FileSystem fileSystem, Path checkpointDir, int maxStateSize) { super(maxStateSize); this.fileSystem = checkNotNull(fileSystem); this.checkpointDirectory = checkNotNull(checkpointDir); this.metadataFilePath = new Path(checkpointDir, AbstractFsCheckpointStorage.METADATA_FILE_NAME); } // ———————————————————————— @Override public CheckpointMetadataOutputStream createMetadataOutputStream() throws IOException { return new FsCheckpointMetadataOutputStream(fileSystem, metadataFilePath, checkpointDirectory); } @Override public void disposeOnFailure() throws IOException { // on a failure, no chunk in the checkpoint directory needs to be saved, so // we can drop it as a whole fileSystem.delete(checkpointDirectory, true); } @Override public CheckpointStorageLocationReference getLocationReference() { return CheckpointStorageLocationReference.getDefault(); }}MemoryBackendCheckpointStorage在配置了checkpointsDirectory的时候创建的是PersistentMetadataCheckpointStorageLocation;其createMetadataOutputStream方法创建的是FsCheckpointMetadataOutputStream;FsCheckpointMetadataOutputStream的构造器接收三个参数,分别是fileSystem、metadataFilePath、exclusiveCheckpointDir;其中fileSystem用于根据metadataFilePath来创建FSDataOutputStream,而exclusiveCheckpointDir则在返回FsCompletedCheckpointStorageLocation的时候用到小结MemoryBackendCheckpointStorage在没有配置checkpointsDirectory的时候创建的是NonPersistentMetadataCheckpointStorageLocation;在配置了checkpointsDirectory的时候创建的是PersistentMetadataCheckpointStorageLocationNonPersistentMetadataCheckpointStorageLocation及PersistentMetadataCheckpointStorageLocation都继承了MemCheckpointStreamFactory类,同时实现了CheckpointStorageLocation接口(其createMetadataOutputStream方法返回的CheckpointMetadataOutputStream类型分别为MetadataOutputStream、FsCheckpointMetadataOutputStream)MemCheckpointStreamFactory实现了CheckpointStreamFactory接口,它的createCheckpointStateOutputStream方法返回MemoryCheckpointOutputStream;CheckpointStorageLocation继承了CheckpointStreamFactory接口,它通常是由CheckpointStorage来创建及初始化,提供数据持久化、metadata存储及lifecycle/cleanup相关方法docThe MemoryStateBackend ...
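下面补充一个示意性的小例子(仅为演示草图,非 flink 源码;类名 MemCheckpointStreamFactoryDemo 以及 maxStateSize 的取值都是假设的),演示通过 MemCheckpointStreamFactory 创建 CheckpointStateOutputStream、写入字节,再通过 closeAndGetHandle 拿到 StreamStateHandle 并读回数据的过程:

import java.io.IOException;
import org.apache.flink.core.fs.FSDataInputStream;
import org.apache.flink.runtime.state.CheckpointStreamFactory.CheckpointStateOutputStream;
import org.apache.flink.runtime.state.CheckpointedStateScope;
import org.apache.flink.runtime.state.StreamStateHandle;
import org.apache.flink.runtime.state.memory.MemCheckpointStreamFactory;

public class MemCheckpointStreamFactoryDemo {
    public static void main(String[] args) throws IOException {
        // maxStateSize 这里取 1024 字节仅作演示
        MemCheckpointStreamFactory factory = new MemCheckpointStreamFactory(1024);

        // createCheckpointStateOutputStream 返回的是 MemoryCheckpointOutputStream
        CheckpointStateOutputStream out =
                factory.createCheckpointStateOutputStream(CheckpointedStateScope.EXCLUSIVE);
        out.write(new byte[]{1, 2, 3});

        // closeAndGetHandle 在关闭时校验数据量是否超过 maxStateSize,超过则抛 IOException;
        // 流为空时返回 null,否则返回 ByteStreamStateHandle
        StreamStateHandle handle = out.closeAndGetHandle();

        // 通过 handle 重新读取刚写入的字节
        try (FSDataInputStream in = handle.openInputStream()) {
            System.out.println(in.read()); // 1
        }
    }
}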

December 14, 2018

聊聊flink的MemoryBackendCheckpointStorage

序本文主要研究一下flink的MemoryBackendCheckpointStorageCheckpointStorageflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/CheckpointStorage.java/** * CheckpointStorage implements the durable storage of checkpoint data and metadata streams. * An individual checkpoint or savepoint is stored to a {@link CheckpointStorageLocation}, * created by this class. /public interface CheckpointStorage { boolean supportsHighlyAvailableStorage(); boolean hasDefaultSavepointLocation(); CompletedCheckpointStorageLocation resolveCheckpoint(String externalPointer) throws IOException; CheckpointStorageLocation initializeLocationForCheckpoint(long checkpointId) throws IOException; CheckpointStorageLocation initializeLocationForSavepoint( long checkpointId, @Nullable String externalLocationPointer) throws IOException; CheckpointStreamFactory resolveCheckpointStorageLocation( long checkpointId, CheckpointStorageLocationReference reference) throws IOException; CheckpointStateOutputStream createTaskOwnedStateStream() throws IOException;}CheckpointStorage接口主要定义了持久化checkpoint data及metadata streams的基本方法;supportsHighlyAvailableStorage方法返回该backend是否支持highly available storage;hasDefaultSavepointLocation方法是否有默认的savepoint location;resolveCheckpoint方法用于解析checkpoint location返回CompletedCheckpointStorageLocation;initializeLocationForCheckpoint方法根据checkpointId来初始化storage location;initializeLocationForSavepoint方法用于根据checkpointId来初始化savepoint的storage location;resolveCheckpointStorageLocation方法解析CheckpointStorageLocationReference返回CheckpointStreamFactory;createTaskOwnedStateStream方法用于打开一个stream来持久化checkpoint stateAbstractFsCheckpointStorageflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/filesystem/AbstractFsCheckpointStorage.java/* * An implementation of durable checkpoint storage to file systems. /public abstract class AbstractFsCheckpointStorage implements CheckpointStorage { // ———————————————————————— // Constants // ———————————————————————— /* The prefix of the directory containing the data exclusive to a checkpoint. / public static final String CHECKPOINT_DIR_PREFIX = “chk-”; /* The name of the directory for shared checkpoint state. / public static final String CHECKPOINT_SHARED_STATE_DIR = “shared”; /* The name of the directory for state not owned/released by the master, but by the TaskManagers. / public static final String CHECKPOINT_TASK_OWNED_STATE_DIR = “taskowned”; /* The name of the metadata files in checkpoints / savepoints. / public static final String METADATA_FILE_NAME = “_metadata”; /* The magic number that is put in front of any reference. / private static final byte[] REFERENCE_MAGIC_NUMBER = new byte[] { 0x05, 0x5F, 0x3F, 0x18 }; // ———————————————————————— // Fields and properties // ———————————————————————— /* The jobId, written into the generated savepoint directories. / private final JobID jobId; /* The default location for savepoints. Null, if none is configured. / @Nullable private final Path defaultSavepointDirectory; @Override public boolean hasDefaultSavepointLocation() { return defaultSavepointDirectory != null; } @Override public CompletedCheckpointStorageLocation resolveCheckpoint(String checkpointPointer) throws IOException { return resolveCheckpointPointer(checkpointPointer); } /* * Creates a file system based storage location for a savepoint. 
* * <p>This methods implements the logic that decides which location to use (given optional * parameters for a configured location and a location passed for this specific savepoint) * and how to name and initialize the savepoint directory. * * @param externalLocationPointer The target location pointer for the savepoint. * Must be a valid URI. Null, if not supplied. * @param checkpointId The checkpoint ID of the savepoint. * * @return The checkpoint storage location for the savepoint. * * @throws IOException Thrown if the target directory could not be created. / @Override public CheckpointStorageLocation initializeLocationForSavepoint( @SuppressWarnings(“unused”) long checkpointId, @Nullable String externalLocationPointer) throws IOException { // determine where to write the savepoint to final Path savepointBasePath; if (externalLocationPointer != null) { savepointBasePath = new Path(externalLocationPointer); } else if (defaultSavepointDirectory != null) { savepointBasePath = defaultSavepointDirectory; } else { throw new IllegalArgumentException(“No savepoint location given and no default location configured.”); } // generate the savepoint directory final FileSystem fs = savepointBasePath.getFileSystem(); final String prefix = “savepoint-” + jobId.toString().substring(0, 6) + ‘-’; Exception latestException = null; for (int attempt = 0; attempt < 10; attempt++) { final Path path = new Path(savepointBasePath, FileUtils.getRandomFilename(prefix)); try { if (fs.mkdirs(path)) { // we make the path qualified, to make it independent of default schemes and authorities final Path qp = path.makeQualified(fs); return createSavepointLocation(fs, qp); } } catch (Exception e) { latestException = e; } } throw new IOException(“Failed to create savepoint directory at " + savepointBasePath, latestException); } protected abstract CheckpointStorageLocation createSavepointLocation(FileSystem fs, Path location) throws IOException; //……}AbstractFsCheckpointStorage主要是实现了CheckpointStorage接口的hasDefaultSavepointLocation、resolveCheckpoint、initializeLocationForSavepoint方法resolveCheckpoint方法主要做两件事情,一个是解析checkpoint/savepoint path,一个是解析checkpoint/savepoint的metadata path,获取他们的FileStatus,然后创建FsCompletedCheckpointStorageLocationinitializeLocationForSavepoint方法主要是给savepoint创建一个CheckpointStorageLocation,它可以根据externalLocationPointer来创建,该值为null的话则使用defaultSavepointDirectory,该方法里头调用了createSavepointLocation抽象方法,由子类去实现MemoryBackendCheckpointStorageflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/memory/MemoryBackendCheckpointStorage.java/* * An implementation of a checkpoint storage for the {@link MemoryStateBackend}. * Depending on whether this is created with a checkpoint location, the setup supports * durable checkpoints (durable metadata) or not. /public class MemoryBackendCheckpointStorage extends AbstractFsCheckpointStorage { /* The target directory for checkpoints (here metadata files only). Null, if not configured. / @Nullable private final Path checkpointsDirectory; /* The file system to persist the checkpoints to. Null if this does not durably persist checkpoints. / @Nullable private final FileSystem fileSystem; /* The maximum size of state stored in a state handle. / private final int maxStateSize; /* * Creates a new MemoryBackendCheckpointStorage. * * @param jobId The ID of the job writing the checkpoints. * @param checkpointsBaseDirectory The directory to write checkpoints to. May be null, * in which case this storage does not support durable persistence. 
* @param defaultSavepointLocation The default savepoint directory, or null, if none is set. * @param maxStateSize The maximum size of each individual piece of state. * * @throws IOException Thrown if a checkpoint base directory is given configured and the * checkpoint directory cannot be created within that directory. / public MemoryBackendCheckpointStorage( JobID jobId, @Nullable Path checkpointsBaseDirectory, @Nullable Path defaultSavepointLocation, int maxStateSize) throws IOException { super(jobId, defaultSavepointLocation); checkArgument(maxStateSize > 0); this.maxStateSize = maxStateSize; if (checkpointsBaseDirectory == null) { checkpointsDirectory = null; fileSystem = null; } else { this.fileSystem = checkpointsBaseDirectory.getFileSystem(); this.checkpointsDirectory = getCheckpointDirectoryForJob(checkpointsBaseDirectory, jobId); fileSystem.mkdirs(checkpointsDirectory); } } // ———————————————————————— // Properties // ———————————————————————— /* * Gets the size (in bytes) that a individual chunk of state may have at most. */ public int getMaxStateSize() { return maxStateSize; } // ———————————————————————— // Checkpoint Storage // ———————————————————————— @Override public boolean supportsHighlyAvailableStorage() { return checkpointsDirectory != null; } @Override public CheckpointStorageLocation initializeLocationForCheckpoint(long checkpointId) throws IOException { checkArgument(checkpointId >= 0); if (checkpointsDirectory != null) { // configured for durable metadata // prepare all the paths needed for the checkpoints checkState(fileSystem != null); final Path checkpointDir = createCheckpointDirectory(checkpointsDirectory, checkpointId); // create the checkpoint exclusive directory fileSystem.mkdirs(checkpointDir); return new PersistentMetadataCheckpointStorageLocation(fileSystem, checkpointDir, maxStateSize); } else { // no durable metadata - typical in IDE or test setup case return new NonPersistentMetadataCheckpointStorageLocation(maxStateSize); } } @Override public CheckpointStreamFactory resolveCheckpointStorageLocation( long checkpointId, CheckpointStorageLocationReference reference) throws IOException { // no matter where the checkpoint goes, we always return the storage location that stores // state inline with the state handles. 
return new MemCheckpointStreamFactory(maxStateSize); } @Override public CheckpointStateOutputStream createTaskOwnedStateStream() throws IOException { return new MemoryCheckpointOutputStream(maxStateSize); } @Override protected CheckpointStorageLocation createSavepointLocation(FileSystem fs, Path location) throws IOException { return new PersistentMetadataCheckpointStorageLocation(fs, location, maxStateSize); } // ———————————————————————— // Utilities // ———————————————————————— @Override public String toString() { return “MemoryBackendCheckpointStorage {” + “checkpointsDirectory=” + checkpointsDirectory + “, fileSystem=” + fileSystem + “, maxStateSize=” + maxStateSize + ‘}’; }}MemoryBackendCheckpointStorage继承了AbstractFsCheckpointStorage,实现了它定义的createSavepointLocation方法,这里返回的是PersistentMetadataCheckpointStorageLocationMemoryBackendCheckpointStorage还实现了CheckpointStorage接口定义的AbstractFsCheckpointStorage未实现的几个方法:supportsHighlyAvailableStorage、initializeLocationForCheckpoint、resolveCheckpointStorageLocation、createTaskOwnedStateStreamsupportsHighlyAvailableStorage是根据是否有配置checkpointsDirectory来判断;initializeLocationForCheckpoint这个根据checkpointsDirectory是否有设置来创建,为null的话,创建的是NonPersistentMetadataCheckpointStorageLocation,不为null创建的是PersistentMetadataCheckpointStorageLocation;resolveCheckpointStorageLocation这里创建的是MemCheckpointStreamFactory;而createTaskOwnedStateStream创建的是MemoryCheckpointOutputStream小结CheckpointStorage接口主要定义了持久化checkpoint data及metadata streams的基本方法;AbstractFsCheckpointStorage主要是实现了CheckpointStorage接口的hasDefaultSavepointLocation、resolveCheckpoint、initializeLocationForSavepoint方法,同时定义了一个抽象方法createSavepointLocationMemoryBackendCheckpointStorage继承了AbstractFsCheckpointStorage,实现了它定义的createSavepointLocation方法,同时还实现了CheckpointStorage接口定义的AbstractFsCheckpointStorage未实现的几个方法:supportsHighlyAvailableStorage、initializeLocationForCheckpoint、resolveCheckpointStorageLocation、createTaskOwnedStateStream这里可以看到MemoryBackendCheckpointStorage虽然是memory的,但是如果有配置checkpointsDirectory(highly available storage),checkpoint location使用的是PersistentMetadataCheckpointStorageLocation,否则使用NonPersistentMetadataCheckpointStorageLocation;而savepoint location使用的是PersistentMetadataCheckpointStorageLocation(checkpiont可以选择是否使用文件存储,而metadata只能使用文件存储)docThe MemoryStateBackend ...
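下面给出一个示意性的例子(演示草图,非 flink 源码;类名 MemoryBackendCheckpointStorageDemo、file:///tmp/flink-checkpoints 这个目录等都是假设的),对比配置与不配置 checkpointsBaseDirectory 时 initializeLocationForCheckpoint 返回的 location 类型:

import org.apache.flink.api.common.JobID;
import org.apache.flink.core.fs.Path;
import org.apache.flink.runtime.state.CheckpointStorageLocation;
import org.apache.flink.runtime.state.memory.MemoryBackendCheckpointStorage;

public class MemoryBackendCheckpointStorageDemo {
    public static void main(String[] args) throws Exception {
        JobID jobId = new JobID();
        int maxStateSize = 5 * 1024 * 1024; // 示意值

        // 不配置 checkpointsBaseDirectory:不支持 highly available storage
        MemoryBackendCheckpointStorage nonPersistent =
                new MemoryBackendCheckpointStorage(jobId, null, null, maxStateSize);
        CheckpointStorageLocation loc1 = nonPersistent.initializeLocationForCheckpoint(1L);
        System.out.println(loc1.getClass().getSimpleName()); // NonPersistentMetadataCheckpointStorageLocation
        System.out.println(nonPersistent.supportsHighlyAvailableStorage()); // false

        // 配置 checkpointsBaseDirectory(路径仅为示意,需要指向可写目录)
        Path checkpointDir = new Path("file:///tmp/flink-checkpoints");
        MemoryBackendCheckpointStorage persistent =
                new MemoryBackendCheckpointStorage(jobId, checkpointDir, null, maxStateSize);
        CheckpointStorageLocation loc2 = persistent.initializeLocationForCheckpoint(1L);
        System.out.println(loc2.getClass().getSimpleName()); // PersistentMetadataCheckpointStorageLocation
        System.out.println(persistent.supportsHighlyAvailableStorage()); // true
    }
}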

December 13, 2018

聊聊flink的PartitionableListState

序本文主要研究一下flink的PartitionableListStatePartitionableListStateflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/DefaultOperatorStateBackend.java /** * Implementation of operator list state. * * @param <S> the type of an operator state partition. / static final class PartitionableListState<S> implements ListState<S> { /* * Meta information of the state, including state name, assignment mode, and serializer / private RegisteredOperatorStateBackendMetaInfo<S> stateMetaInfo; /* * The internal list the holds the elements of the state / private final ArrayList<S> internalList; /* * A serializer that allows to perform deep copies of internalList / private final ArrayListSerializer<S> internalListCopySerializer; PartitionableListState(RegisteredOperatorStateBackendMetaInfo<S> stateMetaInfo) { this(stateMetaInfo, new ArrayList<S>()); } private PartitionableListState( RegisteredOperatorStateBackendMetaInfo<S> stateMetaInfo, ArrayList<S> internalList) { this.stateMetaInfo = Preconditions.checkNotNull(stateMetaInfo); this.internalList = Preconditions.checkNotNull(internalList); this.internalListCopySerializer = new ArrayListSerializer<>(stateMetaInfo.getPartitionStateSerializer()); } private PartitionableListState(PartitionableListState<S> toCopy) { this(toCopy.stateMetaInfo.deepCopy(), toCopy.internalListCopySerializer.copy(toCopy.internalList)); } public void setStateMetaInfo(RegisteredOperatorStateBackendMetaInfo<S> stateMetaInfo) { this.stateMetaInfo = stateMetaInfo; } public RegisteredOperatorStateBackendMetaInfo<S> getStateMetaInfo() { return stateMetaInfo; } public PartitionableListState<S> deepCopy() { return new PartitionableListState<>(this); } @Override public void clear() { internalList.clear(); } @Override public Iterable<S> get() { return internalList; } @Override public void add(S value) { Preconditions.checkNotNull(value, “You cannot add null to a ListState.”); internalList.add(value); } @Override public String toString() { return “PartitionableListState{” + “stateMetaInfo=” + stateMetaInfo + “, internalList=” + internalList + ‘}’; } public long[] write(FSDataOutputStream out) throws IOException { long[] partitionOffsets = new long[internalList.size()]; DataOutputView dov = new DataOutputViewStreamWrapper(out); for (int i = 0; i < internalList.size(); ++i) { S element = internalList.get(i); partitionOffsets[i] = out.getPos(); getStateMetaInfo().getPartitionStateSerializer().serialize(element, dov); } return partitionOffsets; } @Override public void update(List<S> values) { internalList.clear(); addAll(values); } @Override public void addAll(List<S> values) { if (values != null && !values.isEmpty()) { internalList.addAll(values); } } }PartitionableListState是DefaultOperatorStateBackend使用的ListState实现,其内部使用的是ArrayList(internalList)来存储state,而stateMetaInfo使用的是RegisteredOperatorStateBackendMetaInfo;其write方法将internalList的数据序列化到FSDataOutputStream,并返回每个记录对应的offset数组(partitionOffsets)ListStateflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/state/ListState.java/* * {@link State} interface for partitioned list state in Operations. * The state is accessed and modified by user functions, and checkpointed consistently * by the system as part of the distributed snapshots. * * <p>The state is only accessible by functions applied on a {@code KeyedStream}. The key is * automatically supplied by the system, so the function always sees the value mapped to the * key of the current element. That way, the system can handle stream and state partitioning * consistently together. 
* * @param <T> Type of values that this list state keeps. /@PublicEvolvingpublic interface ListState<T> extends MergingState<T, Iterable<T>> { /* * Updates the operator state accessible by {@link #get()} by updating existing values to * to the given list of values. The next time {@link #get()} is called (for the same state * partition) the returned state will represent the updated list. * * <p>If null or an empty list is passed in, the state value will be null. * * @param values The new values for the state. * * @throws Exception The method may forward exception thrown internally (by I/O or functions). / void update(List<T> values) throws Exception; /* * Updates the operator state accessible by {@link #get()} by adding the given values * to existing list of values. The next time {@link #get()} is called (for the same state * partition) the returned state will represent the updated list. * * <p>If null or an empty list is passed in, the state value remains unchanged. * * @param values The new values to be added to the state. * * @throws Exception The method may forward exception thrown internally (by I/O or functions). / void addAll(List<T> values) throws Exception;}ListState主要用于operation存储partitioned list state,它继承了MergingState接口(指定OUT的泛型为Iterable<T>),同时声明了两个方法;其中update用于全量更新state,如果参数为null或者empty,那么state会被清空;addAll方法用于增量更新,如果参数为null或者empty,则保持不变,否则则新增给定的valuesMergingStateflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/state/MergingState.java/* * Extension of {@link AppendingState} that allows merging of state. That is, two instances * of {@link MergingState} can be combined into a single instance that contains all the * information of the two merged states. * * @param <IN> Type of the value that can be added to the state. * @param <OUT> Type of the value that can be retrieved from the state. /@PublicEvolvingpublic interface MergingState<IN, OUT> extends AppendingState<IN, OUT> { }MergingState接口仅仅是继承了AppendingState接口,用接口命名表示该state支持state合并AppendingStateflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/state/AppendingState.java/* * Base interface for partitioned state that supports adding elements and inspecting the current * state. Elements can either be kept in a buffer (list-like) or aggregated into one value. * * <p>The state is accessed and modified by user functions, and checkpointed consistently * by the system as part of the distributed snapshots. * * <p>The state is only accessible by functions applied on a {@code KeyedStream}. The key is * automatically supplied by the system, so the function always sees the value mapped to the * key of the current element. That way, the system can handle stream and state partitioning * consistently together. * * @param <IN> Type of the value that can be added to the state. * @param <OUT> Type of the value that can be retrieved from the state. /@PublicEvolvingpublic interface AppendingState<IN, OUT> extends State { /* * Returns the current value for the state. When the state is not * partitioned the returned value is the same for all inputs in a given * operator instance. If state partitioning is applied, the value returned * depends on the current operator input, as the operator maintains an * independent state for each partition. * * <p><b>NOTE TO IMPLEMENTERS:</b> if the state is empty, then this method * should return {@code null}. * * @return The operator state value corresponding to the current input or {@code null} * if the state is empty. * * @throws Exception Thrown if the system cannot access the state. 
/ OUT get() throws Exception; /* * Updates the operator state accessible by {@link #get()} by adding the given value * to the list of values. The next time {@link #get()} is called (for the same state * partition) the returned state will represent the updated list. * * <p>If null is passed in, the state value will remain unchanged. * * @param value The new value for the state. * * @throws Exception Thrown if the system cannot access the state. / void add(IN value) throws Exception;}AppendingState是partitioned state的基本接口,它继承了State接口,同时声明了get、add两个方法;get方法用于返回当前state的值,如果为空则返回null;add方法用于给state添加值Stateflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/state/State.java/* * Interface that different types of partitioned state must implement. * * <p>The state is only accessible by functions applied on a {@code KeyedStream}. The key is * automatically supplied by the system, so the function always sees the value mapped to the * key of the current element. That way, the system can handle stream and state partitioning * consistently together. /@PublicEvolvingpublic interface State { /* * Removes the value mapped under the current key. / void clear();}State接口定义了所有不同partitioned state实现必须实现的方法,这里定义了clear方法用于清空当前state的所有值RegisteredOperatorStateBackendMetaInfoflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/RegisteredOperatorStateBackendMetaInfo.java/* * Compound meta information for a registered state in an operator state backend. * This contains the state name, assignment mode, and state partition serializer. * * @param <S> Type of the state. /public class RegisteredOperatorStateBackendMetaInfo<S> extends RegisteredStateMetaInfoBase { /* * The mode how elements in this state are assigned to tasks during restore / @Nonnull private final OperatorStateHandle.Mode assignmentMode; /* * The type serializer for the elements in the state list / @Nonnull private final TypeSerializer<S> partitionStateSerializer; public RegisteredOperatorStateBackendMetaInfo( @Nonnull String name, @Nonnull TypeSerializer<S> partitionStateSerializer, @Nonnull OperatorStateHandle.Mode assignmentMode) { super(name); this.partitionStateSerializer = partitionStateSerializer; this.assignmentMode = assignmentMode; } private RegisteredOperatorStateBackendMetaInfo(@Nonnull RegisteredOperatorStateBackendMetaInfo<S> copy) { this( Preconditions.checkNotNull(copy).name, copy.partitionStateSerializer.duplicate(), copy.assignmentMode); } @SuppressWarnings(“unchecked”) public RegisteredOperatorStateBackendMetaInfo(@Nonnull StateMetaInfoSnapshot snapshot) { this( snapshot.getName(), (TypeSerializer<S>) Preconditions.checkNotNull( snapshot.restoreTypeSerializer(StateMetaInfoSnapshot.CommonSerializerKeys.VALUE_SERIALIZER)), OperatorStateHandle.Mode.valueOf( snapshot.getOption(StateMetaInfoSnapshot.CommonOptionsKeys.OPERATOR_STATE_DISTRIBUTION_MODE))); Preconditions.checkState(StateMetaInfoSnapshot.BackendStateType.OPERATOR == snapshot.getBackendStateType()); } /* * Creates a deep copy of the itself. 
*/ @Nonnull public RegisteredOperatorStateBackendMetaInfo<S> deepCopy() { return new RegisteredOperatorStateBackendMetaInfo<>(this); } @Nonnull @Override public StateMetaInfoSnapshot snapshot() { return computeSnapshot(); } //…… @Nonnull private StateMetaInfoSnapshot computeSnapshot() { Map<String, String> optionsMap = Collections.singletonMap( StateMetaInfoSnapshot.CommonOptionsKeys.OPERATOR_STATE_DISTRIBUTION_MODE.toString(), assignmentMode.toString()); String valueSerializerKey = StateMetaInfoSnapshot.CommonSerializerKeys.VALUE_SERIALIZER.toString(); Map<String, TypeSerializer<?>> serializerMap = Collections.singletonMap(valueSerializerKey, partitionStateSerializer.duplicate()); Map<String, TypeSerializerSnapshot<?>> serializerConfigSnapshotsMap = Collections.singletonMap(valueSerializerKey, partitionStateSerializer.snapshotConfiguration()); return new StateMetaInfoSnapshot( name, StateMetaInfoSnapshot.BackendStateType.OPERATOR, optionsMap, serializerConfigSnapshotsMap, serializerMap); }}RegisteredOperatorStateBackendMetaInfo继承了抽象类RegisteredStateMetaInfoBase,实现了snapshot的抽象方法,这里是通过computeSnapshot方法来实现;computeSnapshot方法主要是构造StateMetaInfoSnapshot所需的optionsMap、serializerConfigSnapshotsMap、serializerMap小结flink的manageed operator state仅仅支持ListState,DefaultOperatorStateBackend使用的ListState实现是PartitionableListState,其内部使用的是ArrayList(internalList)来存储state,而stateMetaInfo使用的是RegisteredOperatorStateBackendMetaInfoPartitionableListState实现了ListState接口(update、addAll方法);而ListState接口继承了MergingState接口(指定OUT的泛型为Iterable<T>);MergingState接口没有声明其他方法,它继承了AppendingState接口;AppendingState接口继承了State接口,同时声明了get、add方法;State接口则定义了clear方法RegisteredOperatorStateBackendMetaInfo继承了抽象类RegisteredStateMetaInfoBase,实现了snapshot的抽象方法,这里是通过computeSnapshot方法来实现;computeSnapshot方法主要是构造StateMetaInfoSnapshot所需的optionsMap、serializerConfigSnapshotsMap、serializerMapdocListStateflink state package summaryUsing Managed Operator State ...
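结合文末引用的 Using Managed Operator State 文档,这里补一个示意性的用法草图(非 flink 源码;类名 BufferingSketch、process 方法等都是演示用的假设),展示用户代码如何通过 getOperatorStateStore().getListState(...) 拿到由 PartitionableListState 支撑的 ListState,并使用 update/get 进行快照与恢复:

import java.util.ArrayList;
import java.util.List;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.runtime.state.FunctionInitializationContext;
import org.apache.flink.runtime.state.FunctionSnapshotContext;
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction;

// 示意:operator state 的典型用法,checkpointedState 背后即 DefaultOperatorStateBackend 的 PartitionableListState
public class BufferingSketch implements CheckpointedFunction {

    private transient ListState<Long> checkpointedState;
    private final List<Long> bufferedElements = new ArrayList<>();

    // 实际作业中该类通常还会实现 SinkFunction 等算子函数接口,这里仅保留与 state 相关的部分
    public void process(long value) {
        bufferedElements.add(value);
    }

    @Override
    public void snapshotState(FunctionSnapshotContext context) throws Exception {
        // update 为全量更新,先清空 state 再写入当前 buffer
        checkpointedState.update(bufferedElements);
    }

    @Override
    public void initializeState(FunctionInitializationContext context) throws Exception {
        ListStateDescriptor<Long> descriptor =
                new ListStateDescriptor<>("buffered-elements", Long.class);

        // getListState 对应 SPLIT_DISTRIBUTE 模式;getUnionListState 则对应 UNION 模式
        checkpointedState = context.getOperatorStateStore().getListState(descriptor);

        if (context.isRestored()) {
            for (Long element : checkpointedState.get()) {
                bufferedElements.add(element);
            }
        }
    }
}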

December 12, 2018

聊聊flink的OperatorStateBackend

序本文主要研究一下flink的OperatorStateBackendOperatorStateBackendflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/OperatorStateBackend.java/** * Interface that combines both, the user facing {@link OperatorStateStore} interface and the system interface * {@link Snapshotable} * /public interface OperatorStateBackend extends OperatorStateStore, Snapshotable<SnapshotResult<OperatorStateHandle>, Collection<OperatorStateHandle>>, Closeable, Disposable { @Override void dispose();}OperatorStateBackend接口继承了OperatorStateStore、Snapshotable、Closeable、Disposable接口OperatorStateStoreflink-core-1.7.0-sources.jar!/org/apache/flink/api/common/state/OperatorStateStore.java/* * This interface contains methods for registering operator state with a managed store. /@PublicEvolvingpublic interface OperatorStateStore { <K, V> BroadcastState<K, V> getBroadcastState(MapStateDescriptor<K, V> stateDescriptor) throws Exception; <S> ListState<S> getListState(ListStateDescriptor<S> stateDescriptor) throws Exception; <S> ListState<S> getUnionListState(ListStateDescriptor<S> stateDescriptor) throws Exception; Set<String> getRegisteredStateNames(); Set<String> getRegisteredBroadcastStateNames(); // ——————————————————————————————- // Deprecated methods // ——————————————————————————————- @Deprecated <S> ListState<S> getOperatorState(ListStateDescriptor<S> stateDescriptor) throws Exception; @Deprecated <T extends Serializable> ListState<T> getSerializableListState(String stateName) throws Exception;}OperatorStateStore定义了getBroadcastState、getListState、getUnionListState方法用于create或restore BroadcastState或者ListState;同时也定义了getRegisteredStateNames、getRegisteredBroadcastStateNames用于返回当前注册的state的名称Snapshotableflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/Snapshotable.java/* * Interface for operators that can perform snapshots of their state. * * @param <S> Generic type of the state object that is created as handle to snapshots. * @param <R> Generic type of the state object that used in restore. /@Internalpublic interface Snapshotable<S extends StateObject, R> extends SnapshotStrategy<S> { /* * Restores state that was previously snapshotted from the provided parameters. Typically the parameters are state * handles from which the old state is read. * * @param state the old state to restore. / void restore(@Nullable R state) throws Exception;}Snapshotable接口继承了SnapshotStrategy接口,同时定义了restore方法用于restore stateSnapshotStrategyflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/SnapshotStrategy.java/* * Interface for different snapshot approaches in state backends. Implementing classes should ideally be stateless or at * least threadsafe, i.e. this is a functional interface and is can be called in parallel by multiple checkpoints. * * @param <S> type of the returned state object that represents the result of the snapshot operation. /@Internalpublic interface SnapshotStrategy<S extends StateObject> { /* * Operation that writes a snapshot into a stream that is provided by the given {@link CheckpointStreamFactory} and * returns a @{@link RunnableFuture} that gives a state handle to the snapshot. It is up to the implementation if * the operation is performed synchronous or asynchronous. In the later case, the returned Runnable must be executed * first before obtaining the handle. * * @param checkpointId The ID of the checkpoint. * @param timestamp The timestamp of the checkpoint. * @param streamFactory The factory that we can use for writing our state to streams. 
* @param checkpointOptions Options for how to perform this checkpoint. * @return A runnable future that will yield a {@link StateObject}. / @Nonnull RunnableFuture<S> snapshot( long checkpointId, long timestamp, @Nonnull CheckpointStreamFactory streamFactory, @Nonnull CheckpointOptions checkpointOptions) throws Exception;}SnapshotStrategy定义了snapshot方法,给不同的snapshot策略去实现,这里要求snapshot结果返回的类型是StateObject类型AbstractSnapshotStrategyflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/AbstractSnapshotStrategy.java/* * Abstract base class for implementing {@link SnapshotStrategy}, that gives a consistent logging across state backends. * * @param <T> type of the snapshot result. /public abstract class AbstractSnapshotStrategy<T extends StateObject> implements SnapshotStrategy<SnapshotResult<T>> { private static final Logger LOG = LoggerFactory.getLogger(AbstractSnapshotStrategy.class); private static final String LOG_SYNC_COMPLETED_TEMPLATE = “{} ({}, synchronous part) in thread {} took {} ms.”; private static final String LOG_ASYNC_COMPLETED_TEMPLATE = “{} ({}, asynchronous part) in thread {} took {} ms.”; /* Descriptive name of the snapshot strategy that will appear in the log outputs and {@link #toString()}. / @Nonnull protected final String description; protected AbstractSnapshotStrategy(@Nonnull String description) { this.description = description; } /* * Logs the duration of the synchronous snapshot part from the given start time. / public void logSyncCompleted(@Nonnull Object checkpointOutDescription, long startTime) { logCompletedInternal(LOG_SYNC_COMPLETED_TEMPLATE, checkpointOutDescription, startTime); } /* * Logs the duration of the asynchronous snapshot part from the given start time. / public void logAsyncCompleted(@Nonnull Object checkpointOutDescription, long startTime) { logCompletedInternal(LOG_ASYNC_COMPLETED_TEMPLATE, checkpointOutDescription, startTime); } private void logCompletedInternal( @Nonnull String template, @Nonnull Object checkpointOutDescription, long startTime) { long duration = (System.currentTimeMillis() - startTime); LOG.debug( template, description, checkpointOutDescription, Thread.currentThread(), duration); } @Override public String toString() { return “SnapshotStrategy {” + description + “}”; }}AbstractSnapshotStrategy是个抽象类,它没有实现SnapshotStrategy定义的snapshot方法,这里只是提供了logSyncCompleted方法打印debug信息StateObjectflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/StateObject.java/* * Base of all handles that represent checkpointed state in some form. The object may hold * the (small) state directly, or contain a file path (state is in the file), or contain the * metadata to access the state stored in some external database. * * <p>State objects define how to {@link #discardState() discard state} and how to access the * {@link #getStateSize() size of the state}. * * <p>State Objects are transported via RPC between <i>JobManager</i> and * <i>TaskManager</i> and must be {@link java.io.Serializable serializable} to support that. * * <p>Some State Objects are stored in the checkpoint/savepoint metadata. For long-term * compatibility, they are not stored via {@link java.io.Serializable Java Serialization}, * but through custom serializers. 
/public interface StateObject extends Serializable { void discardState() throws Exception; long getStateSize();}StateObject继承了Serializable接口,因为会通过rpc在JobManager及TaskManager之间进行传输;这个接口定义了discardState及getStateSize方法,discardState用于清理资源,而getStateSize用于返回state的大小StreamStateHandleflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/StreamStateHandle.java/* * A {@link StateObject} that represents state that was written to a stream. The data can be read * back via {@link #openInputStream()}. /public interface StreamStateHandle extends StateObject { /* * Returns an {@link FSDataInputStream} that can be used to read back the data that * was previously written to the stream. / FSDataInputStream openInputStream() throws IOException;}StreamStateHandle继承了StateObject接口,多定义了openInputStream方法OperatorStateHandleflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/OperatorStateHandle.java/* * Interface of a state handle for operator state. /public interface OperatorStateHandle extends StreamStateHandle { /* * Returns a map of meta data for all contained states by their name. / Map<String, StateMetaInfo> getStateNameToPartitionOffsets(); /* * Returns an input stream to read the operator state information. / @Override FSDataInputStream openInputStream() throws IOException; /* * Returns the underlying stream state handle that points to the state data. / StreamStateHandle getDelegateStateHandle(); //……}OperatorStateHandle继承了StreamStateHandle,它多定义了getStateNameToPartitionOffsets、getDelegateStateHandle方法,其中getStateNameToPartitionOffsets提供了state name到可用partitions的offset的映射信息OperatorStreamStateHandleflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/OperatorStreamStateHandle.java/* * State handle for partitionable operator state. Besides being a {@link StreamStateHandle}, this also provides a * map that contains the offsets to the partitions of named states in the stream. 
/public class OperatorStreamStateHandle implements OperatorStateHandle { private static final long serialVersionUID = 35876522969227335L; /* * unique state name -> offsets for available partitions in the handle stream / private final Map<String, StateMetaInfo> stateNameToPartitionOffsets; private final StreamStateHandle delegateStateHandle; public OperatorStreamStateHandle( Map<String, StateMetaInfo> stateNameToPartitionOffsets, StreamStateHandle delegateStateHandle) { this.delegateStateHandle = Preconditions.checkNotNull(delegateStateHandle); this.stateNameToPartitionOffsets = Preconditions.checkNotNull(stateNameToPartitionOffsets); } @Override public Map<String, StateMetaInfo> getStateNameToPartitionOffsets() { return stateNameToPartitionOffsets; } @Override public void discardState() throws Exception { delegateStateHandle.discardState(); } @Override public long getStateSize() { return delegateStateHandle.getStateSize(); } @Override public FSDataInputStream openInputStream() throws IOException { return delegateStateHandle.openInputStream(); } @Override public StreamStateHandle getDelegateStateHandle() { return delegateStateHandle; } //……}OperatorStreamStateHandle实现了OperatorStateHandle接口,它定义了stateNameToPartitionOffsets属性(Map<String, StateMetaInfo>),而getStateNameToPartitionOffsets方法就是返回的stateNameToPartitionOffsets属性SnapshotResultflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/SnapshotResult.java/* * This class contains the combined results from the snapshot of a state backend: * <ul> * <li>A state object representing the state that will be reported to the Job Manager to acknowledge the checkpoint.</li> * <li>A state object that represents the state for the {@link TaskLocalStateStoreImpl}.</li> * </ul> * * Both state objects are optional and can be null, e.g. if there was no state to snapshot in the backend. A local * state object that is not null also requires a state to report to the job manager that is not null, because the * Job Manager always owns the ground truth about the checkpointed state. /public class SnapshotResult<T extends StateObject> implements StateObject { private static final long serialVersionUID = 1L; /* An singleton instance to represent an empty snapshot result. / private static final SnapshotResult<?> EMPTY = new SnapshotResult<>(null, null); /* This is the state snapshot that will be reported to the Job Manager to acknowledge a checkpoint. / private final T jobManagerOwnedSnapshot; /* This is the state snapshot that will be reported to the Job Manager to acknowledge a checkpoint. / private final T taskLocalSnapshot; /* * Creates a {@link SnapshotResult} for the given jobManagerOwnedSnapshot and taskLocalSnapshot. If the * jobManagerOwnedSnapshot is null, taskLocalSnapshot must also be null. * * @param jobManagerOwnedSnapshot Snapshot for report to job manager. Can be null. * @param taskLocalSnapshot Snapshot for report to local state manager. This is optional and requires * jobManagerOwnedSnapshot to be not null if this is not also null. 
/ private SnapshotResult(T jobManagerOwnedSnapshot, T taskLocalSnapshot) { if (jobManagerOwnedSnapshot == null && taskLocalSnapshot != null) { throw new IllegalStateException(“Cannot report local state snapshot without corresponding remote state!”); } this.jobManagerOwnedSnapshot = jobManagerOwnedSnapshot; this.taskLocalSnapshot = taskLocalSnapshot; } public T getJobManagerOwnedSnapshot() { return jobManagerOwnedSnapshot; } public T getTaskLocalSnapshot() { return taskLocalSnapshot; } @Override public void discardState() throws Exception { Exception aggregatedExceptions = null; if (jobManagerOwnedSnapshot != null) { try { jobManagerOwnedSnapshot.discardState(); } catch (Exception remoteDiscardEx) { aggregatedExceptions = remoteDiscardEx; } } if (taskLocalSnapshot != null) { try { taskLocalSnapshot.discardState(); } catch (Exception localDiscardEx) { aggregatedExceptions = ExceptionUtils.firstOrSuppressed(localDiscardEx, aggregatedExceptions); } } if (aggregatedExceptions != null) { throw aggregatedExceptions; } } @Override public long getStateSize() { return jobManagerOwnedSnapshot != null ? jobManagerOwnedSnapshot.getStateSize() : 0L; } @SuppressWarnings(“unchecked”) public static <T extends StateObject> SnapshotResult<T> empty() { return (SnapshotResult<T>) EMPTY; } public static <T extends StateObject> SnapshotResult<T> of(@Nullable T jobManagerState) { return jobManagerState != null ? new SnapshotResult<>(jobManagerState, null) : empty(); } public static <T extends StateObject> SnapshotResult<T> withLocalState( @Nonnull T jobManagerState, @Nonnull T localState) { return new SnapshotResult<>(jobManagerState, localState); }}SnapshotResult类实现了StateObject接口,它包装了snapshot的结果,这里包括jobManagerOwnedSnapshot、taskLocalSnapshot;它实现的discardState方法,调用了jobManagerOwnedSnapshot及taskLocalSnapshot的discardState方法;getStateSize方法则返回的是jobManagerOwnedSnapshot的stateSizeDefaultOperatorStateBackendflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/DefaultOperatorStateBackend.java/* * Default implementation of OperatorStateStore that provides the ability to make snapshots. /@Internalpublic class DefaultOperatorStateBackend implements OperatorStateBackend { private static final Logger LOG = LoggerFactory.getLogger(DefaultOperatorStateBackend.class); /* * The default namespace for state in cases where no state name is provided / public static final String DEFAULT_OPERATOR_STATE_NAME = “default”; /* * Map for all registered operator states. Maps state name -> state / private final Map<String, PartitionableListState<?>> registeredOperatorStates; /* * Map for all registered operator broadcast states. Maps state name -> state / private final Map<String, BackendWritableBroadcastState<?, ?>> registeredBroadcastStates; /* * CloseableRegistry to participate in the tasks lifecycle. / private final CloseableRegistry closeStreamOnCancelRegistry; /* * Default serializer. Only used for the default operator state. / private final JavaSerializer<Serializable> javaSerializer; /* * The user code classloader. / private final ClassLoader userClassloader; /* * The execution configuration. / private final ExecutionConfig executionConfig; /* * Flag to de/activate asynchronous snapshots. / private final boolean asynchronousSnapshots; /* * Map of state names to their corresponding restored state meta info. * * <p>TODO this map can be removed when eager-state registration is in place. * TODO we currently need this cached to check state migration strategies when new serializers are registered. 
/ private final Map<String, StateMetaInfoSnapshot> restoredOperatorStateMetaInfos; /* * Map of state names to their corresponding restored broadcast state meta info. / private final Map<String, StateMetaInfoSnapshot> restoredBroadcastStateMetaInfos; /* * Cache of already accessed states. * * <p>In contrast to {@link #registeredOperatorStates} and {@link #restoredOperatorStateMetaInfos} which may be repopulated * with restored state, this map is always empty at the beginning. * * <p>TODO this map should be moved to a base class once we have proper hierarchy for the operator state backends. * * @see <a href=“https://issues.apache.org/jira/browse/FLINK-6849">FLINK-6849</a> / private final HashMap<String, PartitionableListState<?>> accessedStatesByName; private final Map<String, BackendWritableBroadcastState<?, ?>> accessedBroadcastStatesByName; private final AbstractSnapshotStrategy<OperatorStateHandle> snapshotStrategy; public DefaultOperatorStateBackend( ClassLoader userClassLoader, ExecutionConfig executionConfig, boolean asynchronousSnapshots) { this.closeStreamOnCancelRegistry = new CloseableRegistry(); this.userClassloader = Preconditions.checkNotNull(userClassLoader); this.executionConfig = executionConfig; this.javaSerializer = new JavaSerializer<>(); this.registeredOperatorStates = new HashMap<>(); this.registeredBroadcastStates = new HashMap<>(); this.asynchronousSnapshots = asynchronousSnapshots; this.accessedStatesByName = new HashMap<>(); this.accessedBroadcastStatesByName = new HashMap<>(); this.restoredOperatorStateMetaInfos = new HashMap<>(); this.restoredBroadcastStateMetaInfos = new HashMap<>(); this.snapshotStrategy = new DefaultOperatorStateBackendSnapshotStrategy(); } @Override public Set<String> getRegisteredStateNames() { return registeredOperatorStates.keySet(); } @Override public Set<String> getRegisteredBroadcastStateNames() { return registeredBroadcastStates.keySet(); } @Override public void close() throws IOException { closeStreamOnCancelRegistry.close(); } @Override public void dispose() { IOUtils.closeQuietly(closeStreamOnCancelRegistry); registeredOperatorStates.clear(); registeredBroadcastStates.clear(); } // ——————————————————————————————- // State access methods // ——————————————————————————————- @SuppressWarnings(“unchecked”) @Override public <K, V> BroadcastState<K, V> getBroadcastState(final MapStateDescriptor<K, V> stateDescriptor) throws StateMigrationException { //…… } @Override public <S> ListState<S> getListState(ListStateDescriptor<S> stateDescriptor) throws Exception { return getListState(stateDescriptor, OperatorStateHandle.Mode.SPLIT_DISTRIBUTE); } @Override public <S> ListState<S> getUnionListState(ListStateDescriptor<S> stateDescriptor) throws Exception { return getListState(stateDescriptor, OperatorStateHandle.Mode.UNION); } @Nonnull @Override public RunnableFuture<SnapshotResult<OperatorStateHandle>> snapshot( long checkpointId, long timestamp, @Nonnull CheckpointStreamFactory streamFactory, @Nonnull CheckpointOptions checkpointOptions) throws Exception { long syncStartTime = System.currentTimeMillis(); RunnableFuture<SnapshotResult<OperatorStateHandle>> snapshotRunner = snapshotStrategy.snapshot(checkpointId, timestamp, streamFactory, checkpointOptions); snapshotStrategy.logSyncCompleted(streamFactory, syncStartTime); return snapshotRunner; } 
//……}DefaultOperatorStateBackend实现了OperatorStateBackend接口getRegisteredStateNames方法返回的是registeredOperatorStates.keySet();getRegisteredBroadcastStateNames方法返回的是registeredBroadcastStates.keySet(),可以看到这两个都是基于内存的Map来实现的close方法主要是调用closeStreamOnCancelRegistry的close方法;dispose方法也会关闭closeStreamOnCancelRegistry,同时清空registeredOperatorStates及registeredBroadcastStatesgetListState及getUnionListState方法都调用了getListState(ListStateDescriptor<S> stateDescriptor,OperatorStateHandle.Mode mode)方法snapshot方法使用的snapshotStrategy是DefaultOperatorStateBackendSnapshotStrategyDefaultOperatorStateBackend.getListStateflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/DefaultOperatorStateBackend.java private <S> ListState<S> getListState( ListStateDescriptor<S> stateDescriptor, OperatorStateHandle.Mode mode) throws StateMigrationException { Preconditions.checkNotNull(stateDescriptor); String name = Preconditions.checkNotNull(stateDescriptor.getName()); @SuppressWarnings(“unchecked”) PartitionableListState<S> previous = (PartitionableListState<S>) accessedStatesByName.get(name); if (previous != null) { checkStateNameAndMode( previous.getStateMetaInfo().getName(), name, previous.getStateMetaInfo().getAssignmentMode(), mode); return previous; } // end up here if its the first time access after execution for the // provided state name; check compatibility of restored state, if any // TODO with eager registration in place, these checks should be moved to restore() stateDescriptor.initializeSerializerUnlessSet(getExecutionConfig()); TypeSerializer<S> partitionStateSerializer = Preconditions.checkNotNull(stateDescriptor.getElementSerializer()); @SuppressWarnings(“unchecked”) PartitionableListState<S> partitionableListState = (PartitionableListState<S>) registeredOperatorStates.get(name); if (null == partitionableListState) { // no restored state for the state name; simply create new state holder partitionableListState = new PartitionableListState<>( new RegisteredOperatorStateBackendMetaInfo<>( name, partitionStateSerializer, mode)); registeredOperatorStates.put(name, partitionableListState); } else { // has restored state; check compatibility of new state access checkStateNameAndMode( partitionableListState.getStateMetaInfo().getName(), name, partitionableListState.getStateMetaInfo().getAssignmentMode(), mode); StateMetaInfoSnapshot restoredSnapshot = restoredOperatorStateMetaInfos.get(name); RegisteredOperatorStateBackendMetaInfo<S> metaInfo = new RegisteredOperatorStateBackendMetaInfo<>(restoredSnapshot); // check compatibility to determine if state migration is required TypeSerializer<S> newPartitionStateSerializer = partitionStateSerializer.duplicate(); @SuppressWarnings(“unchecked”) TypeSerializerSnapshot<S> stateSerializerSnapshot = Preconditions.checkNotNull( (TypeSerializerSnapshot<S>) restoredSnapshot.getTypeSerializerConfigSnapshot(StateMetaInfoSnapshot.CommonSerializerKeys.VALUE_SERIALIZER)); TypeSerializerSchemaCompatibility<S> stateCompatibility = stateSerializerSnapshot.resolveSchemaCompatibility(newPartitionStateSerializer); if (stateCompatibility.isIncompatible()) { throw new StateMigrationException(“The new state serializer for operator state must not be incompatible.”); } partitionableListState.setStateMetaInfo( new RegisteredOperatorStateBackendMetaInfo<>(name, newPartitionStateSerializer, mode)); } accessedStatesByName.put(name, partitionableListState); return partitionableListState; 
}从registeredOperatorStates获取对应PartitionableListState,没有的话则创建,有的话则检查下兼容性,然后往partitionableListState设置stateMetaInfoDefaultOperatorStateBackendSnapshotStrategyflink-runtime_2.11-1.7.0-sources.jar!/org/apache/flink/runtime/state/DefaultOperatorStateBackend.java /* * Snapshot strategy for this backend. */ private class DefaultOperatorStateBackendSnapshotStrategy extends AbstractSnapshotStrategy<OperatorStateHandle> { protected DefaultOperatorStateBackendSnapshotStrategy() { super(“DefaultOperatorStateBackend snapshot”); } @Nonnull @Override public RunnableFuture<SnapshotResult<OperatorStateHandle>> snapshot( final long checkpointId, final long timestamp, @Nonnull final CheckpointStreamFactory streamFactory, @Nonnull final CheckpointOptions checkpointOptions) throws IOException { if (registeredOperatorStates.isEmpty() && registeredBroadcastStates.isEmpty()) { return DoneFuture.of(SnapshotResult.empty()); } final Map<String, PartitionableListState<?>> registeredOperatorStatesDeepCopies = new HashMap<>(registeredOperatorStates.size()); final Map<String, BackendWritableBroadcastState<?, ?>> registeredBroadcastStatesDeepCopies = new HashMap<>(registeredBroadcastStates.size()); ClassLoader snapshotClassLoader = Thread.currentThread().getContextClassLoader(); Thread.currentThread().setContextClassLoader(userClassloader); try { // eagerly create deep copies of the list and the broadcast states (if any) // in the synchronous phase, so that we can use them in the async writing. if (!registeredOperatorStates.isEmpty()) { for (Map.Entry<String, PartitionableListState<?>> entry : registeredOperatorStates.entrySet()) { PartitionableListState<?> listState = entry.getValue(); if (null != listState) { listState = listState.deepCopy(); } registeredOperatorStatesDeepCopies.put(entry.getKey(), listState); } } if (!registeredBroadcastStates.isEmpty()) { for (Map.Entry<String, BackendWritableBroadcastState<?, ?>> entry : registeredBroadcastStates.entrySet()) { BackendWritableBroadcastState<?, ?> broadcastState = entry.getValue(); if (null != broadcastState) { broadcastState = broadcastState.deepCopy(); } registeredBroadcastStatesDeepCopies.put(entry.getKey(), broadcastState); } } } finally { Thread.currentThread().setContextClassLoader(snapshotClassLoader); } AsyncSnapshotCallable<SnapshotResult<OperatorStateHandle>> snapshotCallable = new AsyncSnapshotCallable<SnapshotResult<OperatorStateHandle>>() { @Override protected SnapshotResult<OperatorStateHandle> callInternal() throws Exception { CheckpointStreamFactory.CheckpointStateOutputStream localOut = streamFactory.createCheckpointStateOutputStream(CheckpointedStateScope.EXCLUSIVE); registerCloseableForCancellation(localOut); // get the registered operator state infos … List<StateMetaInfoSnapshot> operatorMetaInfoSnapshots = new ArrayList<>(registeredOperatorStatesDeepCopies.size()); for (Map.Entry<String, PartitionableListState<?>> entry : registeredOperatorStatesDeepCopies.entrySet()) { operatorMetaInfoSnapshots.add(entry.getValue().getStateMetaInfo().snapshot()); } // … get the registered broadcast operator state infos … List<StateMetaInfoSnapshot> broadcastMetaInfoSnapshots = new ArrayList<>(registeredBroadcastStatesDeepCopies.size()); for (Map.Entry<String, BackendWritableBroadcastState<?, ?>> entry : registeredBroadcastStatesDeepCopies.entrySet()) { broadcastMetaInfoSnapshots.add(entry.getValue().getStateMetaInfo().snapshot()); } // … write them all in the checkpoint stream … DataOutputView dov = new DataOutputViewStreamWrapper(localOut); 
OperatorBackendSerializationProxy backendSerializationProxy = new OperatorBackendSerializationProxy(operatorMetaInfoSnapshots, broadcastMetaInfoSnapshots); backendSerializationProxy.write(dov); // … and then go for the states … // we put BOTH normal and broadcast state metadata here int initialMapCapacity = registeredOperatorStatesDeepCopies.size() + registeredBroadcastStatesDeepCopies.size(); final Map<String, OperatorStateHandle.StateMetaInfo> writtenStatesMetaData = new HashMap<>(initialMapCapacity); for (Map.Entry<String, PartitionableListState<?>> entry : registeredOperatorStatesDeepCopies.entrySet()) { PartitionableListState<?> value = entry.getValue(); long[] partitionOffsets = value.write(localOut); OperatorStateHandle.Mode mode = value.getStateMetaInfo().getAssignmentMode(); writtenStatesMetaData.put( entry.getKey(), new OperatorStateHandle.StateMetaInfo(partitionOffsets, mode)); } // … and the broadcast states themselves … for (Map.Entry<String, BackendWritableBroadcastState<?, ?>> entry : registeredBroadcastStatesDeepCopies.entrySet()) { BackendWritableBroadcastState<?, ?> value = entry.getValue(); long[] partitionOffsets = {value.write(localOut)}; OperatorStateHandle.Mode mode = value.getStateMetaInfo().getAssignmentMode(); writtenStatesMetaData.put( entry.getKey(), new OperatorStateHandle.StateMetaInfo(partitionOffsets, mode)); } // … and, finally, create the state handle. OperatorStateHandle retValue = null; if (unregisterCloseableFromCancellation(localOut)) { StreamStateHandle stateHandle = localOut.closeAndGetHandle(); if (stateHandle != null) { retValue = new OperatorStreamStateHandle(writtenStatesMetaData, stateHandle); } return SnapshotResult.of(retValue); } else { throw new IOException(“Stream was already unregistered.”); } } @Override protected void cleanupProvidedResources() { // nothing to do } @Override protected void logAsyncSnapshotComplete(long startTime) { if (asynchronousSnapshots) { logAsyncCompleted(streamFactory, startTime); } } }; final FutureTask<SnapshotResult<OperatorStateHandle>> task = snapshotCallable.toAsyncSnapshotFutureTask(closeStreamOnCancelRegistry); if (!asynchronousSnapshots) { task.run(); } return task; } }DefaultOperatorStateBackendSnapshotStrategy继承了AbstractSnapshotStrategy,它实现的snapshot方法主要是创建registeredOperatorStatesDeepCopies及registeredBroadcastStatesDeepCopies,然后通过AsyncSnapshotCallable来实现AsyncSnapshotCallable抽象类实现了Callable接口的call方法,该方法会调用callInternal方法,然后再执行logAsyncSnapshotComplete方法AsyncSnapshotCallable的callInternal方法返回的是SnapshotResult<OperatorStateHandle>,它里头主要是将registeredOperatorStatesDeepCopies及registeredBroadcastStatesDeepCopies的数据写入到CheckpointStreamFactory(比如MemCheckpointStreamFactory).CheckpointStateOutputStream及writtenStatesMetaData,最后通过CheckpointStateOutputStream的closeAndGetHandle返回的stateHandle及writtenStatesMetaData创建OperatorStreamStateHandle返回小结OperatorStateBackend接口继承了OperatorStateStore、Snapshotable、Closeable、Disposable接口OperatorStateStore定义了getBroadcastState、getListState、getUnionListState方法用于create或restore BroadcastState或者ListState;同时也定义了getRegisteredStateNames、getRegisteredBroadcastStateNames用于返回当前注册的state的名称;DefaultOperatorStateBackend实现了OperatorStateStore接口,getRegisteredStateNames方法返回的是registeredOperatorStates.keySet();getRegisteredBroadcastStateNames方法返回的是registeredBroadcastStates.keySet()(registeredOperatorStates及registeredBroadcastStates这两个都是内存的Map);getListState及getUnionListState方法都调用了getListState(ListStateDescriptor<S> stateDescriptor,OperatorStateHandle.Mode 
mode)方法Snapshotable接口继承了SnapshotStrategy接口,同时定义了restore方法用于restore state;SnapshotStrategy定义了snapshot方法,给不同的snapshot策略去实现,这里要求snapshot结果返回的类型是StateObject类型;AbstractSnapshotStrategy是个抽象类,它没有实现SnapshotStrategy定义的snapshot方法,这里只是提供了logSyncCompleted方法打印debug信息DefaultOperatorStateBackend实现了Snapshotable接口,snapshot方法使用的snapshotStrategy是DefaultOperatorStateBackendSnapshotStrategy;DefaultOperatorStateBackendSnapshotStrategy继承了AbstractSnapshotStrategy,它实现的snapshot方法主要是创建registeredOperatorStatesDeepCopies及registeredBroadcastStatesDeepCopies,然后通过AsyncSnapshotCallable来实现,它里头主要是将registeredOperatorStatesDeepCopies及registeredBroadcastStatesDeepCopies的数据写入到CheckpointStreamFactory(比如MemCheckpointStreamFactory).CheckpointStateOutputStream及writtenStatesMetaDataSnapshotable接口要求source的泛型为StateObject类型,StateObject继承了Serializable接口,因为会通过rpc在JobManager及TaskManager之间进行传输;OperatorStateBackend继承Snapshotable接口时,指定source为SnapshotResult<OperatorStateHandle>,而result的为Collection<OperatorStateHandle>类型StreamStateHandle继承了StateObject接口,多定义了openInputStream方法;OperatorStateHandle继承了StreamStateHandle,它多定义了getStateNameToPartitionOffsets、getDelegateStateHandle方法,其中getStateNameToPartitionOffsets提供了state name到可用partitions的offset的映射信息;OperatorStreamStateHandle实现了OperatorStateHandle接口,它定义了stateNameToPartitionOffsets属性(Map<String,StateMetaInfo>),而getStateNameToPartitionOffsets方法就是返回的stateNameToPartitionOffsets属性SnapshotResult类实现了StateObject接口,它包装了snapshot的结果,这里包括jobManagerOwnedSnapshot、taskLocalSnapshot;它实现的discardState方法,调用了jobManagerOwnedSnapshot及taskLocalSnapshot的discardState方法;getStateSize方法则返回的是jobManagerOwnedSnapshot的stateSizedocState Backends ...
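最后补一个把上述组件串起来的示意草图(演示用途,非 flink 源码;类名 OperatorStateBackendDemo、CheckpointOptions.forCheckpointWithDefaultLocation() 的用法以及直接调用 restore 的方式均为假设,仅用于说明 snapshot/restore 的大致流程):

import java.util.Collections;
import java.util.concurrent.RunnableFuture;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.runtime.checkpoint.CheckpointOptions;
import org.apache.flink.runtime.state.DefaultOperatorStateBackend;
import org.apache.flink.runtime.state.OperatorStateHandle;
import org.apache.flink.runtime.state.SnapshotResult;
import org.apache.flink.runtime.state.memory.MemCheckpointStreamFactory;

public class OperatorStateBackendDemo {
    public static void main(String[] args) throws Exception {
        ClassLoader cl = OperatorStateBackendDemo.class.getClassLoader();

        // asynchronousSnapshots 传 false,snapshot 返回前 task 已同步执行完
        DefaultOperatorStateBackend backend =
                new DefaultOperatorStateBackend(cl, new ExecutionConfig(), false);

        ListState<Long> state = backend.getListState(
                new ListStateDescriptor<>("demo-state", Long.class));
        state.add(1L);
        state.add(2L);

        // 使用 MemCheckpointStreamFactory 作为 CheckpointStreamFactory
        RunnableFuture<SnapshotResult<OperatorStateHandle>> snapshotFuture = backend.snapshot(
                1L, System.currentTimeMillis(),
                new MemCheckpointStreamFactory(4 * 1024 * 1024),
                CheckpointOptions.forCheckpointWithDefaultLocation());

        OperatorStateHandle handle = snapshotFuture.get().getJobManagerOwnedSnapshot();
        // state name -> partition offsets 的映射
        System.out.println(handle.getStateNameToPartitionOffsets());

        // restore 到一个新的 backend,再读回数据
        DefaultOperatorStateBackend restored =
                new DefaultOperatorStateBackend(cl, new ExecutionConfig(), false);
        restored.restore(Collections.singletonList(handle));
        for (Long value : restored.getListState(
                new ListStateDescriptor<>("demo-state", Long.class)).get()) {
            System.out.println(value); // 1, 2
        }

        backend.dispose();
        restored.dispose();
    }
}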

December 11, 2018