From e7ae75acbfdf6dccda8e6e3364deee5f03e15ee7 Mon Sep 17 00:00:00 2001 From: 10295 <1029559041@qq.com> Date: Sat, 31 Mar 2018 22:00:08 +0800 Subject: [PATCH] =?UTF-8?q?=E5=BC=B9=E5=B9=95=E5=85=B3=E9=94=AE=E8=AF=8D?= =?UTF-8?q?=E6=8E=92=E8=A1=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- PixivSearch/dao/bangumi.py | 124 +++++++++++++++++-- PixivSearch/model/migrations/0001_initial.py | 57 +++++++++ PixivSearch/model/migrations/__init__.py | 0 PixivSearch/settings.py | 4 +- 4 files changed, 175 insertions(+), 10 deletions(-) create mode 100644 PixivSearch/model/migrations/0001_initial.py create mode 100644 PixivSearch/model/migrations/__init__.py diff --git a/PixivSearch/dao/bangumi.py b/PixivSearch/dao/bangumi.py index a66c738..40e4694 100644 --- a/PixivSearch/dao/bangumi.py +++ b/PixivSearch/dao/bangumi.py @@ -2,9 +2,15 @@ import _thread import json import math import os +import random +import threading + +import zlib + from concurrent import futures from queue import Queue +from lxml import etree import django import requests from bs4 import BeautifulSoup @@ -38,13 +44,10 @@ def save(params): req = requests.get(url, timeout=10) except BaseException as e: logger.error(repr(e)) - save(media_id) + save(params) logger.info("request_url=%s,status_code=%d" % (url, req.status_code)) if req.status_code == 200: - tag = BeautifulSoup(req.text, 'lxml') - script = tag.select("script")[3].text - json_str = script[script.index("=") + 1:script.index("function") - 2] - json_obj = json.loads(json_str) + json_obj = getJsonText(req, 3) try: if 'mediaInfo' in json_obj and 'stat' in json_obj['mediaInfo'] and 'chn_name' in json_obj['mediaInfo']: stat_info = json_obj['mediaInfo']['stat'] @@ -59,6 +62,13 @@ def save(params): logger.error(repr(e)) +def getJsonText(req, index): + tag = BeautifulSoup(req.text, 'lxml') + script = tag.select("script")[index].text + json_str = script[script.index("=") + 1:script.index("function") - 2] + return json.loads(json_str) + + def get_(): global current_mediaInfo return current_mediaInfo @@ -131,9 +141,107 @@ def getIds(): continue -def A(e): - logger.info(e) +# def testA(): +# req = requests.post('https://api.bilibili.com/x/report/web/heartbeat', +# data={"aid": 29416,"cid":49052,"csrf": "c0d296db7e33085f9f4730cfee66660b"}, +# cookies=_cookies) +# print(req.status_code) + +_cookies = {'DedeUserID': '4372744', 'DedeUserID__ckMd5': 'e8179b74444cae8e', + 'SESSDATA': '919b17d2%2C1524917631%2C3eede719'} + + +def getCid(aid, type=True): + while True and aid > 0: + url = "https://api.bilibili.com/x/web-interface/archive/stat?aid=%d" % aid + print(url) + req = requests.get(url, cookies=_cookies) + code = json.loads(req.text)["code"] + if code == 0: + req = requests.get("https://www.bilibili.com/video/av%d" % aid) + if req.status_code == 200: + json_obj = getJsonText(req, 9) + if "videoData" in json_obj and "pages" in json_obj['videoData'] and len( + json_obj['videoData']['pages']) > 0 and "cid" in json_obj['videoData']['pages'][0]: + cid = json_obj['videoData']['pages'][0]['cid'] + print('cid=%s' % cid) + return cid + if type: + aid = aid - 1 + else: + aid = aid + 1 + + +def getCids(aid): + s = {"min": getCid(aid, True), "max": getCid(aid, False)} + return s + + +def episodeIdToCid(episode_id): + cids = [] + url = "https://www.bilibili.com/bangumi/play/ep%d" % episode_id + print("url=%s" % url) + req = requests.get(url) + json_obj = getJsonText(req, 8) + if "epList" in json_obj: + for i in json_obj["epList"]: + cids.append(i['cid']) + return cids + + +def parseXml(url): + print("url=%s" % url) + comment_selector = etree.HTML(requests.get(url).content) + comment_content = comment_selector.xpath('//i') + for comment_each in comment_content: + comments = comment_each.xpath('//d/text()') + if comments: + for comment in comments: + if comment in obj["data"]: + with lock: + obj["data"][comment] = obj["data"][comment] + 1 + else: + with lock: + obj["data"][comment] = 1 + if not obj["flag"]: + for keyword in keywords: + if keyword in comment: + obj["flag"] = True + + +lock = threading.Lock() # 多线程全局资源锁 + + +def loadData(cids): + params = [] + for cid in cids: + url = "https://comment.bilibili.com/rolldate,%d" % cid + req = requests.get(url) + urls = ["https://comment.bilibili.com/%d.xml" % cid] + if len(req.text) > 0: + for i in json.loads(req.text): + urls.append("https://comment.bilibili.com/dmroll,%s,%d" % (i['timestamp'], cid)) + for url in urls: + params.append(url) + with futures.ThreadPoolExecutor(32) as executor: + executor.map(parseXml, params) + return obj + + +def getCommentSort(cids, keywords_): + global keywords, obj + keywords = keywords_ + obj = {"data": {}, "flag": False} + return loadData(cids) if __name__ == '__main__': - getIds() + # print(getCids(29416)) + # obj = loadData( + # [49052, 49053, 51525, 51526, 53407, 54180, 55295, 55296, 57255, 57256, 59288, 59289, 61559, 61560, 64034, 64035, + # 67024, 67025, 69284, 73333, 73334, 74024, 74025], ['穹']) + f = getCommentSort(episodeIdToCid(172129), ['小樱']) + + # obj = loadData([34807341], []) + for i in sorted(f["data"].items(), key=lambda d: d[1], reverse=True)[:50]: + print(i) diff --git a/PixivSearch/model/migrations/0001_initial.py b/PixivSearch/model/migrations/0001_initial.py new file mode 100644 index 0000000..807f4da --- /dev/null +++ b/PixivSearch/model/migrations/0001_initial.py @@ -0,0 +1,57 @@ +# Generated by Django 2.0 on 2018-03-24 17:02 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='bangumi_list', + fields=[ + ('season_id', models.IntegerField(primary_key=True, serialize=False)), + ('badge', models.CharField(max_length=128)), + ('brief', models.CharField(max_length=128)), + ('copyright', models.CharField(max_length=128)), + ('cover', models.CharField(max_length=128)), + ('favorites', models.IntegerField()), + ('is_finish', models.IntegerField()), + ('newest_ep_index', models.IntegerField()), + ('pub_time', models.DateTimeField()), + ('season_status', models.IntegerField()), + ('title', models.CharField(max_length=128)), + ('total_count', models.IntegerField()), + ('trailer_aid', models.IntegerField()), + ], + ), + migrations.CreateModel( + name='mediaInfo', + fields=[ + ('bangumi_id', models.IntegerField(primary_key=True, serialize=False)), + ('season_id', models.IntegerField()), + ('media_id', models.IntegerField()), + ('chn_name', models.CharField(max_length=128)), + ], + ), + migrations.CreateModel( + name='param', + fields=[ + ('param_name', models.CharField(max_length=10, primary_key=True, serialize=False)), + ('param_value', models.CharField(max_length=128)), + ], + ), + migrations.CreateModel( + name='stat', + fields=[ + ('id', models.IntegerField(primary_key=True, serialize=False)), + ('danmakus', models.IntegerField()), + ('favorites', models.IntegerField()), + ('views', models.IntegerField()), + ], + ), + ] diff --git a/PixivSearch/model/migrations/__init__.py b/PixivSearch/model/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/PixivSearch/settings.py b/PixivSearch/settings.py index 717f957..f584b76 100644 --- a/PixivSearch/settings.py +++ b/PixivSearch/settings.py @@ -78,8 +78,8 @@ DATABASES = { 'default': { 'ENGINE': 'django.db.backends.mysql', 'NAME': 'bangumi', - 'USER': 'bilibili', - 'PASSWORD': '2233', + 'USER': 'root', + 'PASSWORD': 'Luffy9412!', # 'HOST': '127.0.0.1', 'HOST': 'mikuhime.xyz', 'PORT': '3306',