共计 8374 个字符,预计需要花费 21 分钟才能阅读完成。
线上有一个音讯生产服务 xxx-consumer,应用 spring-kafka 框架,主线程批量从生产队列(kafka)拉取交易系统生产的音讯,而后提交到子线程池中挨个解决生产。
public abstract class AbstractMessageDispatchListener implements | |
BatchAcknowledgingMessageListener<String, Msg>, ApplicationListener<ApplicationReadyEvent> { | |
| |
private ThreadPoolExecutor executor; | |
| |
public abstract MessageWorker chooseWorker(ConsumerRecord<String, Msg> data); | |
| |
@Override | |
public void onMessage(List<ConsumerRecord<String, Msg>> datas, Acknowledgment acknowledgment) {List<Future<?>> futureList = new ArrayList<>(datas.size()); | |
try {CountDownLatch countDownLatch = new CountDownLatch(datas.size()); | |
for (ConsumerRecord<String, Msg> data : datas) {Future<?> future = executor.submit(new Worker(data, countDownLatch)); | |
futureList.add(future); | |
} | |
| |
countDownLatch.await(20000L - 2000, TimeUnit.MILLISECONDS); | |
long countDownLatchCount = countDownLatch.getCount(); | |
if (countDownLatchCount > 0) {return;} | |
acknowledgment.acknowledge();} catch (Exception e) {logger.error("onMessage error", e); | |
} finally {for (Future<?> future : futureList) {if (future.isDone() || future.isCancelled()) {continue;} | |
future.cancel(true); | |
} | |
} | |
} | |
| |
@Override | |
public void onApplicationEvent(ApplicationReadyEvent event) {ThreadFactoryBuilder builder = new ThreadFactoryBuilder(); | |
builder.setNameFormat(this.getClass().getSimpleName() + "-pool-%d"); | |
builder.setDaemon(false); | |
executor = new ThreadPoolExecutor(12, | |
12 * 2, | |
60L, | |
TimeUnit.SECONDS, | |
new ArrayBlockingQueue<>(100), | |
builder.build()); | |
} | |
| |
private class Worker implements Runnable { | |
private ConsumerRecord<String, Msg> data; | |
private CountDownLatch countDownLatch; | |
| |
Worker(ConsumerRecord<String, Msg> data, CountDownLatch countDownLatch) { | |
this.data = data; | |
this.countDownLatch = countDownLatch; | |
} | |
| |
@Override | |
public void run() { | |
try {MessageWorker worker = chooseWorker(data); | |
worker.work(data.value()); | |
} finally {countDownLatch.countDown(); | |
} | |
} | |
} | |
} |
1. 问题背景
有一天早上 xxx-consumer 服务呈现大量报警,人工排查发现 30w+ 的音讯未解决,业务日志失常,gc 日志有大量 Full gc,初步判断因为 Full gc 导致音讯解决慢,大量的音讯积压。
2. 堆栈剖析
查看了近一个月的 JVM 内存信息,发现老年代内存无奈被回收(9 月 22 号的降落是因为服务有一次上线重启),初步判断产生了内存透露。
通过 <jmap -dump:format=b,file=/home/work/app/xxx-consumer/logs/jmap_dump.hprof -F> 命令导出内存快照,应用 Memory Analyzer 解析内存快照文件 jmap_dump.hprof,发现有很显著的内存透露提醒:
进一步查看线程细节,发现创立了大量的 ThreadLocalScope 对象且循环援用:
同时咱们也看到了分布式追踪(dd-trace-java)jar 包中的 FakeSpan 类,初步判断是 dd-trace-java 中自研扩大的 kafka 插件存在内存透露 bug。
3. 代码剖析
持续查看 dd-trace-java 中 kafka 插件的代码,其解决流程如下:
第一批音讯
- (SpringKafkaConsumerInstrumentation:L22)BatchAcknowledgingMessageListener.onMessage 进入时,主线程会创立一个 scope00=ThreadLocalScope(Type_BatchMessageListener_Value,toRestore=null)
- (ExecutorInstrumentation:L21L47)音讯被 submit 到线程池中解决时,子线程会创立一个 scope10=ThreadLocalScope(Type_BatchMessageListener_Value,toRestore=null)
- (SpringKafkaConsumerInstrumentation:L68)子线程解决音讯时(ConsumerRecord.value),会创立一个 scope11=ThreadLocalScope(Type_ConsumberRecord_Value,toRestore=scope10)
- (ExecutorInstrumentation:L54)子线程解决完音讯后,执行 scope10.close(),而 scopeManager.tlsScope.get()=scope11,命中 ThreadLocalScope:L19,scope10 和 scope11 均无奈被 GC
- (SpringKafkaConsumerInstrumentation:L42)BatchAcknowledgingMessageListener.onMessage 退出时,主线程会执行 scope00.close(),scope00 会被 GC
第二批音讯
- (SpringKafkaConsumerInstrumentation:L22)BatchAcknowledgingMessageListener.onMessage 进入时,主线程会创立一个 scope01=ThreadLocalScope(Type_BatchMessageListener_Value,toRestore=null)
- (ExecutorInstrumentation:L21L47)音讯被 submit 到线程池中解决时,子线程会创立一个 scope12=ThreadLocalScope(Type_BatchMessageListener_Value,toRestore=scope11)
- (SpringKafkaConsumerInstrumentation:L68)子线程解决音讯时(ConsumerRecord.value),会创立一个 scope13=ThreadLocalScope(Type_ConsumberRecord_Value,toRestore=scope12)
- (ExecutorInstrumentation:L54)子线程解决完音讯后,执行 scope12.close(),而 scopeManager.tlsScope.get()=scope13,命中 ThreadLocalScope:L19,scope12 和 scope13 均无奈被 GC
- (SpringKafkaConsumerInstrumentation:L42)BatchAcknowledgingMessageListener.onMessage 退出时,主线程会执行 scope01.close(),scope01 会被 GC
从上能够看到,主线程创立的 ThreadLocalScope 能被正确 GC,而线程池中创立的 ThreadLocalScope 被循环援用,无奈被正确 GC,从而造成内存透露。
@AutoService(Instrumenter.class) | |
public final class SpringKafkaConsumerInstrumentation extends Instrumenter.Configurable { | |
@Override | |
public AgentBuilder apply(final AgentBuilder agentBuilder) { | |
return agentBuilder | |
.type(hasSuperType(named("org.springframework.kafka.listener.BatchAcknowledgingMessageListener"))) | |
.transform(DDAdvice.create().advice(isMethod().and(isPublic()).and(named("onMessage")), | |
BatchMessageListenerAdvice.class.getName())) | |
.type(named("org.apache.kafka.clients.consumer.ConsumerRecord")) | |
.transform(DDAdvice.create().advice(isMethod().and(isPublic()).and(named("value")), | |
RecoredValueAdvice.class.getName())) | |
.asDecorator();} | |
public static class BatchMessageListenerAdvice {@Advice.OnMethodEnter(suppress = Throwable.class) | |
public static Scope before() {FakeSpan span = new FakeSpan(); | |
span.setKind(FakeSpan.Type_BatchMessageListener_Value); | |
Scope scope = GlobalTracer.get().scopeManager().activate(span, false); | |
return scope; | |
} | |
@Advice.OnMethodExit(suppress = Throwable.class) | |
public static void after(@Advice.Enter Scope scope) {while (true) {Span span = GlobalTracer.get().activeSpan(); | |
if (span != null && span instanceof FakeSpan) {FakeSpan fakeSpan = (FakeSpan) span; | |
if (fakeSpan.getKind().equals(FakeSpan.Type_ConsumberRecord_Value)) {GlobalTracer.get().scopeManager().active().close();} else {break;} | |
} else {break;} | |
} | |
if (scope != null) {scope.close(); | |
} | |
} | |
} | |
public static class RecoredValueAdvice {@Advice.OnMethodEnter(suppress = Throwable.class) | |
public static void before(@Advice.This ConsumerRecord record) {Span activeSpan = GlobalTracer.get().activeSpan(); | |
if (activeSpan instanceof FakeSpan) {FakeSpan proxy = (FakeSpan) activeSpan; | |
if (proxy.getKind().equals(FakeSpan.Type_ConsumberRecord_Value)) {GlobalTracer.get().scopeManager().active().close(); | |
activeSpan = GlobalTracer.get().activeSpan(); | |
if (activeSpan instanceof FakeSpan) {proxy = (FakeSpan) activeSpan; | |
} | |
} | |
if (proxy.getKind().equals(FakeSpan.Type_BatchMessageListener_Value)) {final SpanContext spanContext = TracingKafkaUtils.extractSecond(record.headers(), GlobalTracer.get()); | |
if (spanContext != null) {FakeSpan consumerProxy = new FakeSpan(); | |
consumerProxy.setContext(spanContext); | |
consumerProxy.setKind(FakeSpan.Type_ConsumberRecord_Value); | |
GlobalTracer.get().scopeManager().activate(consumerProxy, false); | |
} | |
} | |
} | |
} | |
} | |
} |
@AutoService(Instrumenter.class) | |
public final class ExecutorInstrumentation extends Instrumenter.Configurable { | |
@Override | |
public AgentBuilder apply(final AgentBuilder agentBuilder) { | |
return agentBuilder | |
.type(not(isInterface()).and(hasSuperType(named(ExecutorService.class.getName())))) | |
.transform(DDAdvice.create().advice(named("submit").and(takesArgument(0, Runnable.class)), | |
SubmitTracedRunnableAdvice.class.getName())) | |
.asDecorator();} | |
public static class SubmitTracedRunnableAdvice {@Advice.OnMethodEnter(suppress = Throwable.class) | |
public static TracedRunnable wrapJob( | |
@Advice.This Object dis, | |
@Advice.Argument(value = 0, readOnly = false) Runnable task) {if (task != null && (!dis.getClass().getName().startsWith("slick.util.AsyncExecutor"))) {task = new TracedRunnable(task, GlobalTracer.get()); | |
return (TracedRunnable) task; | |
} | |
return null; | |
} | |
} | |
public static class TracedRunnable implements Runnable { | |
private final Runnable delegate; | |
private final Span span; | |
private final Tracer tracer; | |
public TracedRunnable(Runnable delegate, Tracer tracer) { | |
this.delegate = delegate; | |
this.tracer = tracer; | |
if (tracer != null) {this.span = tracer.activeSpan(); | |
} else {this.span = null;} | |
} | |
@Override | |
public void run() { | |
Scope scope = null; | |
if (span != null && tracer != null) {scope = tracer.scopeManager().activate(span, false); | |
} | |
try {delegate.run(); | |
} finally {if (scope != null) {scope.close(); | |
} | |
} | |
} | |
} | |
} |
public class ThreadLocalScopeManager implements ScopeManager {final ThreadLocal<ThreadLocalScope> tlsScope = new ThreadLocal<ThreadLocalScope>(); | |
@Override | |
public Scope activate(Span span, boolean finishOnClose) {return new ThreadLocalScope(this, span, finishOnClose); | |
} | |
@Override | |
public Scope active() {return tlsScope.get(); | |
} | |
} |
public class ThreadLocalScope implements Scope { | |
private final ThreadLocalScopeManager scopeManager; | |
private final Span wrapped; | |
private final boolean finishOnClose; | |
private final ThreadLocalScope toRestore; | |
ThreadLocalScope(ThreadLocalScopeManager scopeManager, Span wrapped, boolean finishOnClose) { | |
this.scopeManager = scopeManager; | |
this.wrapped = wrapped; | |
this.finishOnClose = finishOnClose; | |
this.toRestore = scopeManager.tlsScope.get(); | |
scopeManager.tlsScope.set(this); | |
} | |
@Override | |
public void close() {if (scopeManager.tlsScope.get() != this) { | |
// This shouldn't happen if users call methods in the expected order. Bail out. | |
return; | |
} | |
if (finishOnClose) {wrapped.finish(); | |
} | |
scopeManager.tlsScope.set(toRestore); | |
} | |
@Override | |
public Span span() {return wrapped;} | |
} |
End
RecoredValueAdvice 没有销毁本人创立的对象,而是寄希望于 BatchMessageListenerAdvice 去销毁。
但(SpringKafkaConsumerInstrumentation:L27)BatchAcknowledgingMessageListener.onMessage 退出时,只会 close 主线程创立的 ThreadLocalScope,不会 close 线程池中创立的 ThreadLocalScope,导致子线程创立的 ThreadLocalScope 被循环援用,无奈被正确 GC,从而造成内存透露。