"""Python crawler: scrape dependency-package data from a Nexus repository with python-requests.

Preface
-------
The REST API that ships with Nexus does not output a dependency package's
creation time (``blobCreated``) or upload time (``blobUpdated``).  To make the
packages easier to manage, the data returned to the web UI is scraped instead.
(nexus-cli may be able to provide it, but that route was not investigated.)

Approach
--------
The Nexus version used here is 3.19.1-01.  Its API listing exposes
``/v1/components`` for dependency-package information, but in this version the
output only contains general fields such as group, name, downloadUrl and
repository -- nothing about upload time.  The *Browse* view does show that
information, and the browser developer tools reveal that it comes from the
``/service/extdirect`` request URL.

According to the request payload, that call takes two parameters: the repo and
an id.  On closer inspection the id is not the id output by the built-in API
(``/v1/components``); it comes from a different action on the same request URL
and is called ``assetId``.  That second request takes the repo and a "node",
and the node can be built from 'group' + 'name' + 'version'.

That closes the loop.  The overall flow is:

1. use the built-in API to fetch 'group' + 'name' + 'version';
2. combine them into the node and look up the ``assetId``;
3. use the ``assetId`` to fetch the package's detailed information.

With the approach clear, time to start cooking (writing and debugging code).
"""

import datetime
import json
import random

# --- Implementation ---------------------------------------------------------
# Step 1: get the node information, a.k.a. 'group' + 'name' + 'version'.

# NOTE: the address, account and password below are the article's example
# values; replace them and avoid committing real credentials.
nexus_url = 'http://192.168.1.1:8081/'  # Nexus address
nexus_username = 'admin'  # Nexus account
nexus_password = 'admin123'  # Nexus password
repository = ['aliyun', 'maven-releases']  # Nexus repositories to scan

# Seconds to wait on each HTTP call so a stalled server cannot hang the run.
REQUEST_TIMEOUT = 30

# Parallel lists filled by components_api(): entry i of each list describes
# the same component.
# NOTE(review): these lists are module-level and only ever appended to, so
# calling the functions for a second repository also re-processes the entries
# collected for the first one -- confirm whether that is intended.
names = []
groups = []
versions = []
downloadUrls = []  # download URLs, collected along the way; of little use
asset_ids = []


def build_url(path):
    """Return *path* joined onto ``nexus_url`` with exactly one slash between.

    ``nexus_url`` ends with '/' and the API paths start with '/', so plain
    concatenation would produce '//' in the middle of the URL.
    """
    return nexus_url.rstrip('/') + '/' + path.lstrip('/')


def build_node_path(group, name, version):
    """Return the Browse-tree node ``<group with '/' for '.'>/<name>/<version>``."""
    return '/'.join((group.replace('.', '/'), name, version))


def components_api(repository):
    """Fetch basic component information through the components REST API.

    Appends to the module-level lists ``names``, ``groups``, ``versions`` and,
    for every asset whose path ends in '.jar', ``downloadUrls``.  Every result
    page is read by following the response's ``continuationToken``.

    Args:
        repository: name of the repository to query.

    Raises:
        SystemExit: if the API answers with a status code other than 200.
    """
    # Third-party dependency; imported here so the module itself loads with
    # only the standard library available.
    import requests

    print('running components_api\t' + repository)
    # Timed locally: no ``start`` value is defined in the visible code.
    started = datetime.datetime.now()

    # Request parameters.
    query_params = {
        'repository': repository,  # repo passed in by the caller
        'extension': 'jar',  # dependency package type
    }

    while True:
        response = requests.get(
            build_url('/service/rest/v1/components'),
            params=query_params,
            auth=(nexus_username, nexus_password),  # the API needs authentication
            timeout=REQUEST_TIMEOUT,
        )

        if response.status_code != 200:
            print('获取依赖项信息失败,状态码:{}'.format(response.status_code))
            print(response.content)
            # Non-zero exit status so calling shells/CI see the failure.
            raise SystemExit(1)

        response_data = response.json()
        for item in response_data['items']:
            names.append(str(item['name']))
            groups.append(str(item['group']))
            versions.append(str(item['version']))
            for asset in item['assets']:
                if asset['path'].endswith('.jar'):
                    downloadUrls.append(str(asset['downloadUrl']))

        # The API is paginated; a missing/empty token marks the last page.
        token = response_data.get('continuationToken')
        if not token:
            break
        query_params['continuationToken'] = token

    # Execution time.
    print('components_api totally time is ', datetime.datetime.now() - started)


# Step 2: get the assetId from the node information.

# Request headers for the Nexus UI endpoint.  The cookie and token belong to a
# logged-in session of the Nexus account; the values below are the article's
# examples and must be replaced with ones from your own session.
headers = {
    'X-Requested-With': 'XMLHttpRequest',
    'X-Nexus-UI': 'true',
    'NX-ANTI-CSRF-TOKEN': '0.1427667210133794',
    'Content-Type': 'application/json',
    'Origin': nexus_url,
    'Referer': nexus_url,
    'Cookie': 'NX-ANTI-CSRF-TOKEN=0.1427667210133794; NXSESSIONID=9b2057c6-a456-4f94-85d7-90dcb07eb9e6',
}

random_number = random.randint(50, 120)  # random number used as the call's tid


def _post_extdirect(action, method, data):
    """POST one rpc call to ``/service/extdirect`` and return the response."""
    import requests  # third-party; see components_api()

    payload = json.dumps({
        'action': action,
        'method': method,
        'data': data,
        'type': 'rpc',
        'tid': random_number,
    })
    return requests.post(
        build_url('/service/extdirect'),
        headers=headers,
        data=payload,
        timeout=REQUEST_TIMEOUT,
    )


def get_nodeinfo(repository):
    """Build each component's node path and collect the matching asset ids.

    Uses the entries gathered by ``components_api`` and appends the
    ``assetId`` of every returned item whose text ends in '.jar' to the
    module-level ``asset_ids`` list.

    Args:
        repository: name of the repository to browse.

    Raises:
        SystemExit: if the endpoint answers with a status code other than 200.
    """
    print('get_nodeinfo\t' + repository)
    started = datetime.datetime.now()

    for group, name, version in zip(groups, names, versions):
        node = build_node_path(group, name, version)
        response = _post_extdirect(
            'coreui_Browse',
            'read',
            [{'repositoryName': repository, 'node': node}],
        )

        if response.status_code != 200:
            print('获取依赖项信息失败,状态码:{}'.format(response.status_code))
            print(response.content)
            raise SystemExit(1)

        for item in response.json()['result']['data']:
            if item['text'].endswith('.jar'):
                asset_ids.append(item['assetId'])

    # Execution time.
    print('get_nodeinfo totally time is ', datetime.datetime.now() - started)


# Step 3: get the package's detailed information from the assetId.

# Parallel result lists filled by get_assetid_info(), kept for later
# processing / storing in a database.
INFO_NAME = []
INFO_REPO = []
INFO_CTIME = []
INFO_UTIME = []
INFO_DURL = []
INFO_GID = []
INFO_AID = []
INFO_VERSION = []


def get_assetid_info(repository):
    """Fetch each jar's details, including upload and creation time, by asset id.

    Prints the details of every asset in ``asset_ids`` and appends them to the
    module-level ``INFO_*`` lists.  An asset whose request fails (non-200
    status or ``success`` not set in the result) is reported and skipped.

    Args:
        repository: name of the repository the assets belong to.
    """
    started = datetime.datetime.now()

    for aid in asset_ids:
        response = _post_extdirect('coreui_Component', 'readAsset', [aid, repository])

        # Check the status before decoding: an error reply need not be JSON.
        if response.status_code != 200:
            print('get_assetid_info 获取依赖项信息失败,状态码:{}'.format(response.status_code))
            continue

        result = response.json()['result']
        if not result.get('success'):
            print('get_assetid_info 获取依赖项信息失败,状态码:{}'.format(response.status_code))
            continue

        item = result['data']
        durls = build_url('/repository/' + repository + '/' + item['name'])

        # Output the information.
        print('包名:'+item['name'],'\n'+ '所在repo:',item['repositoryName'],'\n'+'创立工夫:',item['blobCreated'],'\n'+'更新工夫:',item['blobUpdated'],'\n'+'下载地址:',durls)

        # Keep the information for later processing / storage.
        maven = item['attributes']['maven2']
        INFO_NAME.append(item['name'])
        INFO_REPO.append(item['repositoryName'])
        INFO_CTIME.append(item['blobCreated'])
        INFO_UTIME.append(item['blobUpdated'])
        INFO_DURL.append(durls)
        INFO_GID.append(maven['groupId'])
        INFO_AID.append(maven['artifactId'])
        INFO_VERSION.append(maven['version'])

    # Execution time.
    print('get_assetid_info totally time is ', datetime.datetime.now() - started)


# Result -- output of a run: ...