继上文: PHP应用protobuf
虽然php能序列化和反序列化,但奈何头条不认啊。最初使用了python脚本的方式去序列化,但很快就暴露出了问题:速度太慢!几万个设备号要序列化2小时以上。当然,主要的原因在于当时赶时间,是一个个设备号序列化的,大量的时间花在python上下文切换上。上文里的脚本能用,但是不适合量稍微大一点的场景,故而用三脚猫的功夫写了一个新的python脚本:接受文件,吐出序列化后的新文件,速度大大提升,实测大约每秒1000个设备号。
"""Batch-serialize device IDs into base64-encoded DmpData protobuf messages.

Usage: python <this script> <input_file>

Each input line has the form ``TYPE|DEVICE_ID``, where TYPE is the name of a
constant on ``DmpDataProtoV2_pb2.IdItem`` (for example ``IDFA``).  For every
well-formed line one DmpData message containing a single IdItem is serialized,
base64-encoded and written as one line to ``<input basename>-ProtoBuf.txt``.
Blank lines and lines that do not split into exactly two ``|``-separated
fields are skipped.  The output path is printed on stdout so a calling
process can pick it up.
"""
from __future__ import print_function

import base64
import os
import sys
import time  # noqa: F401 -- only needed by the disabled timestamp field below

import DmpDataProtoV2_pb2


def encode_record(data_type, device_id):
    """Return the base64 text of a DmpData message holding one IdItem.

    Args:
        data_type: name of an ``IdItem`` constant; also stored as the item's
            only tag.
        device_id: device identifier; stored lower-cased.

    Raises:
        AttributeError: if ``data_type`` is not an attribute of ``IdItem``.
    """
    dmp_data = DmpDataProtoV2_pb2.DmpData()
    item = dmp_data.idList.add()
    item.dataType = getattr(DmpDataProtoV2_pb2.IdItem, data_type)
    item.id = device_id.lower()
    item.tags.append(data_type)
    # item.timestamp = int(time.time())  -- left unset, as in the original
    encoded = base64.b64encode(dmp_data.SerializeToString())
    # b64encode returns bytes on Python 3; decode so it can be written as text.
    return encoded.decode("ascii")


def convert_file(source_path, target_path):
    """Serialize every valid line of ``source_path`` into ``target_path``.

    The target file is created or truncated.  Both files are closed even if
    serialization of a line fails.
    """
    with open(source_path) as source, open(target_path, "w") as target:
        # Iterating the file advances on every line, so skipping a line with
        # ``continue`` can neither loop forever nor end the run early.
        for raw_line in source:
            line = raw_line.rstrip("\r\n")
            if not line.strip():
                continue
            fields = line.split("|")
            if len(fields) != 2:
                continue
            target.write(encode_record(fields[0], fields[1]) + "\n")


def main(argv):
    """Validate the command line, run the conversion and return an exit code.

    Diagnostics stay on stdout with their original wording; failures now
    return a non-zero status instead of 0.
    """
    if len(argv) <= 1:
        print('ag is null')
        return 1

    source_path = argv[1]
    if not source_path.strip():
        print('files is null')
        return 1
    if not os.path.exists(source_path):
        print('files is not exists')
        return 1

    target_path = os.path.splitext(source_path)[0] + '-ProtoBuf.txt'
    print(target_path)
    convert_file(source_path, target_path)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
PHP调用部分
// Re-serialize through the Python script. The script prints the path of the
// generated file on stdout (followed by a newline), which shell_exec returns.
// Both paths are shell-escaped so spaces or shell metacharacters in them
// cannot break or hijack the command.
$protobuf_path = shell_exec("python ".escapeshellarg(base_path()."/scripts/python/base64DmpItemByFile.py")." ".escapeshellarg($file_path));
Done!