聊聊flink的Evictors

36次阅读

共计 7041 个字符,预计需要花费 18 分钟才能阅读完成。


本文主要研究一下 flink 的 Evictors
Evictor
flink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/evictors/Evictor.java
/**
 * Removes elements from a window's contents, either before or after the windowing function
 * is applied.
 *
 * @param <T> type of the elements this evictor can inspect
 * @param <W> type of window this evictor can be used with
 */
@PublicEvolving
public interface Evictor<T, W extends Window> extends Serializable {

/**
 * Eviction hook invoked before the windowing function.
 *
 * @param elements the window's elements, each wrapped in a {@code TimestampedValue}
 * @param size presumably the number of entries in {@code elements} -- confirm against the caller
 * @param window the window being processed
 * @param evictorContext gives access to processing time, watermark and metrics
 */
void evictBefore(Iterable<TimestampedValue<T>> elements, int size, W window, EvictorContext evictorContext);

/**
 * Eviction hook invoked after the windowing function.
 *
 * @param elements the window's elements, each wrapped in a {@code TimestampedValue}
 * @param size presumably the number of entries in {@code elements} -- confirm against the caller
 * @param window the window being processed
 * @param evictorContext gives access to processing time, watermark and metrics
 */
void evictAfter(Iterable<TimestampedValue<T>> elements, int size, W window, EvictorContext evictorContext);

/** Information made available to both eviction hooks. */
interface EvictorContext {

/** Returns the current processing time (unit not visible from this declaration). */
long getCurrentProcessingTime();

/** Returns the metric group available to the evictor. */
MetricGroup getMetricGroup();

/** Returns the current watermark. */
long getCurrentWatermark();
}
}
Evictor 接收两个泛型,一个是 element 的类型,一个是窗口类型;它定义了 evictBefore(在 windowing function 之前)、evictAfter(在 windowing function 之后)两个方法,它们都有 EvictorContext 参数;EvictorContext 定义了 getCurrentProcessingTime、getMetricGroup、getCurrentWatermark 方法
CountEvictor
flink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/evictors/CountEvictor.java
@PublicEvolving
public class CountEvictor<W extends Window> implements Evictor<Object, W> {
private static final long serialVersionUID = 1L;

private final long maxCount;
private final boolean doEvictAfter;

private CountEvictor(long count, boolean doEvictAfter) {
this.maxCount = count;
this.doEvictAfter = doEvictAfter;
}

private CountEvictor(long count) {
this.maxCount = count;
this.doEvictAfter = false;
}

@Override
public void evictBefore(Iterable<TimestampedValue<Object>> elements, int size, W window, EvictorContext ctx) {
if (!doEvictAfter) {
evict(elements, size, ctx);
}
}

@Override
public void evictAfter(Iterable<TimestampedValue<Object>> elements, int size, W window, EvictorContext ctx) {
if (doEvictAfter) {
evict(elements, size, ctx);
}
}

private void evict(Iterable<TimestampedValue<Object>> elements, int size, EvictorContext ctx) {
if (size <= maxCount) {
return;
} else {
int evictedCount = 0;
for (Iterator<TimestampedValue<Object>> iterator = elements.iterator(); iterator.hasNext();){
iterator.next();
evictedCount++;
if (evictedCount > size – maxCount) {
break;
} else {
iterator.remove();
}
}
}
}

public static <W extends Window> CountEvictor<W> of(long maxCount) {
return new CountEvictor<>(maxCount);
}

public static <W extends Window> CountEvictor<W> of(long maxCount, boolean doEvictAfter) {
return new CountEvictor<>(maxCount, doEvictAfter);
}
}
CountEvictor 实现了 Evictor 接口,其中 element 类型为 Object;它有两个属性,分别是 doEvictAfter、maxCount;其中 doEvictAfter 用于指定是使用 evictBefore 方法还是 evictAfter 方法;maxCount 为窗口元素个数的阈值,当窗口元素个数超过该阈值时,按迭代顺序从头剔除多出的 size - maxCount 个元素,使窗口最多保留 maxCount 个元素
DeltaEvictor
flink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/evictors/DeltaEvictor.java
@PublicEvolving
public class DeltaEvictor<T, W extends Window> implements Evictor<T, W> {
private static final long serialVersionUID = 1L;

DeltaFunction<T> deltaFunction;
private double threshold;
private final boolean doEvictAfter;

private DeltaEvictor(double threshold, DeltaFunction<T> deltaFunction) {
this.deltaFunction = deltaFunction;
this.threshold = threshold;
this.doEvictAfter = false;
}

private DeltaEvictor(double threshold, DeltaFunction<T> deltaFunction, boolean doEvictAfter) {
this.deltaFunction = deltaFunction;
this.threshold = threshold;
this.doEvictAfter = doEvictAfter;
}

@Override
public void evictBefore(Iterable<TimestampedValue<T>> elements, int size, W window, EvictorContext ctx) {
if (!doEvictAfter) {
evict(elements, size, ctx);
}
}

@Override
public void evictAfter(Iterable<TimestampedValue<T>> elements, int size, W window, EvictorContext ctx) {
if (doEvictAfter) {
evict(elements, size, ctx);
}
}

private void evict(Iterable<TimestampedValue<T>> elements, int size, EvictorContext ctx) {
TimestampedValue<T> lastElement = Iterables.getLast(elements);
for (Iterator<TimestampedValue<T>> iterator = elements.iterator(); iterator.hasNext();){
TimestampedValue<T> element = iterator.next();
if (deltaFunction.getDelta(element.getValue(), lastElement.getValue()) >= this.threshold) {
iterator.remove();
}
}
}

@Override
public String toString() {
return “DeltaEvictor(” + deltaFunction + “, ” + threshold + “)”;
}

public static <T, W extends Window> DeltaEvictor<T, W> of(double threshold, DeltaFunction<T> deltaFunction) {
return new DeltaEvictor<>(threshold, deltaFunction);
}

public static <T, W extends Window> DeltaEvictor<T, W> of(double threshold, DeltaFunction<T> deltaFunction, boolean doEvictAfter) {
return new DeltaEvictor<>(threshold, deltaFunction, doEvictAfter);
}
}
DeltaEvictor 实现了 Evictor 接口,它有三个属性,分别是 doEvictAfter、threshold、deltaFunction;其中 doEvictAfter 用于指定是使用 evictBefore 方法还是 evictAfter 方法;threshold 为阈值,如果 deltaFunction.getDelta 方法 (每个 element 与 lastElement 计算 delta) 算出来的值大于等于该值,则需要移除该元素
TimeEvictor
flink-streaming-java_2.11-1.7.0-sources.jar!/org/apache/flink/streaming/api/windowing/evictors/TimeEvictor.java
@PublicEvolving
public class TimeEvictor<W extends Window> implements Evictor<Object, W> {
private static final long serialVersionUID = 1L;

private final long windowSize;
private final boolean doEvictAfter;

public TimeEvictor(long windowSize) {
this.windowSize = windowSize;
this.doEvictAfter = false;
}

public TimeEvictor(long windowSize, boolean doEvictAfter) {
this.windowSize = windowSize;
this.doEvictAfter = doEvictAfter;
}

@Override
public void evictBefore(Iterable<TimestampedValue<Object>> elements, int size, W window, EvictorContext ctx) {
if (!doEvictAfter) {
evict(elements, size, ctx);
}
}

@Override
public void evictAfter(Iterable<TimestampedValue<Object>> elements, int size, W window, EvictorContext ctx) {
if (doEvictAfter) {
evict(elements, size, ctx);
}
}

private void evict(Iterable<TimestampedValue<Object>> elements, int size, EvictorContext ctx) {
if (!hasTimestamp(elements)) {
return;
}

long currentTime = getMaxTimestamp(elements);
long evictCutoff = currentTime – windowSize;

for (Iterator<TimestampedValue<Object>> iterator = elements.iterator(); iterator.hasNext();) {
TimestampedValue<Object> record = iterator.next();
if (record.getTimestamp() <= evictCutoff) {
iterator.remove();
}
}
}

private boolean hasTimestamp(Iterable<TimestampedValue<Object>> elements) {
Iterator<TimestampedValue<Object>> it = elements.iterator();
if (it.hasNext()) {
return it.next().hasTimestamp();
}
return false;
}

private long getMaxTimestamp(Iterable<TimestampedValue<Object>> elements) {
long currentTime = Long.MIN_VALUE;
for (Iterator<TimestampedValue<Object>> iterator = elements.iterator(); iterator.hasNext();){
TimestampedValue<Object> record = iterator.next();
currentTime = Math.max(currentTime, record.getTimestamp());
}
return currentTime;
}

@Override
public String toString() {
return “TimeEvictor(” + windowSize + “)”;
}

@VisibleForTesting
public long getWindowSize() {
return windowSize;
}

public static <W extends Window> TimeEvictor<W> of(Time windowSize) {
return new TimeEvictor<>(windowSize.toMilliseconds());
}

public static <W extends Window> TimeEvictor<W> of(Time windowSize, boolean doEvictAfter) {
return new TimeEvictor<>(windowSize.toMilliseconds(), doEvictAfter);
}
}
TimeEvictor 实现了 Evictor 接口,其中 element 类型为 Object;它有两个属性,分别是 doEvictAfter、windowSize;其中 doEvictAfter 用于指定是使用 evictBefore 方法还是 evictAfter 方法;windowSize 用于指定窗口的时间长度,以窗口元素的最大时间戳减去 windowSize 作为 evictCutoff,所有 timestamp 小于等于 evictCutoff 的元素都将会被剔除
小结

Evictor 接收两个泛型,一个是 element 的类型,一个是窗口类型;它定义了 evictBefore(在 windowing function 之前)、evictAfter(在 windowing function 之后)两个方法,它们都有 EvictorContext 参数;EvictorContext 定义了 getCurrentProcessingTime、getMetricGroup、getCurrentWatermark 方法
Evictor 有几个内置的实现类,分别是 CountEvictor、DeltaEvictor、TimeEvictor;其中 CountEvictor 是按窗口元素个数来进行剔除,TimeEvictor 是按窗口长度来进行剔除,DeltaEvictor 则是根据窗口元素与 lastElement 的 delta 与指定的 threshold 对比来进行剔除
如果指定了 evictor(evictBefore)则会妨碍任何 pre-aggregation 操作,因为所有的窗口元素都会在 windowing function 计算之前先执行 evictor 操作;另外就是 flink 不保障窗口元素的顺序,也就是 evictor 如果有按窗口开头或末尾剔除元素,可能剔除的元素实际上并不是最先或最后到达的

doc
Evictors

正文完
 0