弹幕关键词排行

7 years ago · e7ae75acbf
parent 09b53c1bb5
commit e7ae75acbf
4 changed files with 175 additions and 10 deletions
--- a/PixivSearch/dao/bangumi.py
+++ b/PixivSearch/dao/bangumi.py
@ -2,9 +2,15 @@ import _thread
 import json
 import math
 import os
+import random
+import threading
+
+import zlib
+
 from concurrent import futures
 from queue import Queue

+from lxml import etree
 import django
 import requests
 from bs4 import BeautifulSoup
@ -38,13 +44,10 @@ def save(params):
        req = requests.get(url, timeout=10)
    except BaseException as e:
        logger.error(repr(e))
-        save(media_id)
+        save(params)
    logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
    if req.status_code == 200:
-        tag = BeautifulSoup(req.text, 'lxml')
-        script = tag.select("script")[3].text
-        json_str = script[script.index("=") + 1:script.index("function") - 2]
-        json_obj = json.loads(json_str)
+        json_obj = getJsonText(req, 3)
        try:
            if 'mediaInfo' in json_obj and 'stat' in json_obj['mediaInfo'] and 'chn_name' in json_obj['mediaInfo']:
                stat_info = json_obj['mediaInfo']['stat']
@ -59,6 +62,13 @@ def save(params):
            logger.error(repr(e))


+def getJsonText(req, index):
+    """Parse the JSON value embedded in one <script> tag of a response.
+
+    The text of the ``index``-th <script> element is sliced from just after
+    its first "=" up to two characters before its first "function", and the
+    slice is decoded as JSON.
+
+    :param req: response object whose ``text`` holds the HTML page.
+    :param index: zero-based position of the <script> element to read.
+    :return: the decoded JSON value.
+    """
+    scripts = BeautifulSoup(req.text, 'lxml').select("script")
+    source = scripts[index].text
+    start = source.index("=") + 1
+    end = source.index("function") - 2
+    return json.loads(source[start:end])
+
+
 def get_():
    global current_mediaInfo
    return current_mediaInfo
@ -131,9 +141,107 @@ def getIds():
            continue


-def A(e):
-    logger.info(e)
+# def testA():
+# req = requests.post('https://api.bilibili.com/x/report/web/heartbeat',
+#                                               data={"aid": 29416,"cid":49052,"csrf": "c0d296db7e33085f9f4730cfee66660b"},
+#                                               cookies=_cookies)
+# print(req.status_code)
+
+# Cookies attached to the archive stat API request made in getCid.
+# NOTE(review): these look like real account/session values committed to
+# source control -- consider loading them from configuration instead.
+_cookies = {'DedeUserID': '4372744', 'DedeUserID__ckMd5': 'e8179b74444cae8e',
+            'SESSDATA': '919b17d2%2C1524917631%2C3eede719'}
+
+
+def getCid(aid, type=True):
+    """Return the cid of the first usable video found while scanning from ``aid``.
+
+    Each candidate aid is probed through the archive stat API. When the API
+    answers with code 0, the video page is fetched and the cid of its first
+    page is returned. Otherwise the scan moves on to the neighbouring aid.
+
+    :param aid: video id to start scanning from (inclusive).
+    :param type: scan direction; True steps down (aid - 1), False steps up
+        (aid + 1). NOTE: the upward scan has no upper bound, so it only ends
+        when a usable video is found.
+    :return: the cid, or None when the scan reaches aid <= 0 without a hit.
+    """
+    step = -1 if type else 1
+    while aid > 0:
+        url = "https://api.bilibili.com/x/web-interface/archive/stat?aid=%d" % aid
+        print(url)
+        # Same timeout as save(): without one a stalled connection blocks forever.
+        req = requests.get(url, cookies=_cookies, timeout=10)
+        code = json.loads(req.text)["code"]
+        if code == 0:
+            req = requests.get("https://www.bilibili.com/video/av%d" % aid, timeout=10)
+            if req.status_code == 200:
+                json_obj = getJsonText(req, 9)
+                if "videoData" in json_obj and "pages" in json_obj['videoData'] and len(
+                        json_obj['videoData']['pages']) > 0 and "cid" in json_obj['videoData']['pages'][0]:
+                    cid = json_obj['videoData']['pages'][0]['cid']
+                    print('cid=%s' % cid)
+                    return cid
+        aid += step
+    return None
+
+
+def getCids(aid):
+    """Return the cids found by scanning downwards and upwards from ``aid``.
+
+    :param aid: video id both scans start from.
+    :return: dict with key "min" (result of the downward getCid scan) and
+        key "max" (result of the upward getCid scan).
+    """
+    lower = getCid(aid, True)
+    upper = getCid(aid, False)
+    return {"min": lower, "max": upper}
+
+
+def episodeIdToCid(episode_id):
+    """Collect the cid of every entry in the "epList" of an episode page.
+
+    :param episode_id: numeric episode id used to build the play-page URL.
+    :return: list of cids in "epList" order; empty when the page data has
+        no "epList" key.
+    """
+    url = "https://www.bilibili.com/bangumi/play/ep%d" % episode_id
+    print("url=%s" % url)
+    json_obj = getJsonText(requests.get(url), 8)
+    if "epList" not in json_obj:
+        return []
+    return [episode['cid'] for episode in json_obj["epList"]]
+
+
+def parseXml(url):
+    """Download one comment XML document and tally its comments into ``obj``.
+
+    The text of every <d> element is counted in the module-level
+    ``obj["data"]`` dict (comment text -> occurrences). ``obj["flag"]`` is
+    switched to True as soon as a comment contains one of the module-level
+    ``keywords``. loadData runs this function on a thread pool, so the
+    shared counter is only updated while holding ``lock``.
+
+    :param url: address of the XML document to fetch.
+    """
+    print("url=%s" % url)
+    # A timeout keeps one stalled download from pinning a pool worker forever.
+    comment_selector = etree.HTML(requests.get(url, timeout=10).content)
+    # lxml can hand back None for an empty document; and, as before, nothing
+    # is counted unless the document contains at least one <i> element.
+    if comment_selector is None or not comment_selector.xpath('//i'):
+        return
+    # '//d' is an absolute path, so it is evaluated once for the whole
+    # document; running it again per <i> element would multiply every count.
+    for comment in comment_selector.xpath('//d/text()'):
+        # Read-modify-write under a single lock acquisition: checking for the
+        # key outside the lock lets two workers both store 1 for a new comment.
+        with lock:
+            obj["data"][comment] = obj["data"].get(comment, 0) + 1
+        if not obj["flag"] and any(keyword in comment for keyword in keywords):
+            obj["flag"] = True
+
+
+lock = threading.Lock()  # global lock guarding resources shared across threads
+
+
+def loadData(cids):
+    """Fetch and tally the comment documents of every cid on a thread pool.
+
+    For each cid the list of comment URLs is the plain ``<cid>.xml`` document
+    plus one ``dmroll`` URL per entry returned by the ``rolldate`` endpoint.
+    All URLs are then processed by parseXml on 32 worker threads.
+
+    :param cids: iterable of numeric cids.
+    :return: the module-level ``obj`` dict that parseXml fills in.
+    """
+    params = []
+    for cid in cids:
+        req = requests.get("https://comment.bilibili.com/rolldate,%d" % cid)
+        params.append("https://comment.bilibili.com/%d.xml" % cid)
+        # An empty body means the endpoint listed no further documents.
+        if req.text:
+            params.extend("https://comment.bilibili.com/dmroll,%s,%d" % (i['timestamp'], cid)
+                          for i in json.loads(req.text))
+    with futures.ThreadPoolExecutor(32) as executor:
+        pending = {executor.submit(parseXml, url): url for url in params}
+        # executor.map() only re-raises worker errors when its results are
+        # iterated, so failures used to vanish; log them and keep going.
+        for future in futures.as_completed(pending):
+            error = future.exception()
+            if error is not None:
+                logger.error("url=%s,error=%s" % (pending[future], repr(error)))
+    return obj
+
+
+def getCommentSort(cids, keywords_):
+    """Reset the shared tally state and count comments for ``cids``.
+
+    Rebinds the module-level ``keywords`` to ``keywords_`` and ``obj`` to a
+    fresh, empty result before delegating to loadData.
+
+    :param cids: cids passed through to loadData.
+    :param keywords_: keywords stored in the module-level ``keywords``.
+    :return: whatever loadData returns.
+    """
+    global keywords, obj
+    keywords, obj = keywords_, {"data": {}, "flag": False}
+    return loadData(cids)


 if __name__ == '__main__':
-    getIds()
+    # print(getCids(29416))
+    # obj = loadData(
+    #     [49052, 49053, 51525, 51526, 53407, 54180, 55295, 55296, 57255, 57256, 59288, 59289, 61559, 61560, 64034, 64035,
+    #      67024, 67025, 69284, 73333, 73334, 74024, 74025], ['穹'])
+    # Tally comments for every cid listed on the page of episode 172129.
+    f = getCommentSort(episodeIdToCid(172129), ['小樱'])
+
+    # obj = loadData([34807341], [])
+    # Print the 50 most frequent comments, highest count first.
+    for i in sorted(f["data"].items(), key=lambda d: d[1], reverse=True)[:50]:
+        print(i)
--- a/PixivSearch/model/migrations/0001_initial.py
+++ b/PixivSearch/model/migrations/0001_initial.py
@ -0,0 +1,57 @@
+# Generated by Django 2.0 on 2018-03-24 17:02
+
+from django.db import migrations, models
+
+
+# Initial schema migration: creates the models bangumi_list, mediaInfo,
+# param and stat.
+class Migration(migrations.Migration):
+
+    # Marks this as the first migration of the app.
+    initial = True
+
+    # No other migration has to run before this one.
+    dependencies = [
+    ]
+
+    operations = [
+        # bangumi_list: keyed by season_id.
+        migrations.CreateModel(
+            name='bangumi_list',
+            fields=[
+                ('season_id', models.IntegerField(primary_key=True, serialize=False)),
+                ('badge', models.CharField(max_length=128)),
+                ('brief', models.CharField(max_length=128)),
+                ('copyright', models.CharField(max_length=128)),
+                ('cover', models.CharField(max_length=128)),
+                ('favorites', models.IntegerField()),
+                ('is_finish', models.IntegerField()),
+                ('newest_ep_index', models.IntegerField()),
+                ('pub_time', models.DateTimeField()),
+                ('season_status', models.IntegerField()),
+                ('title', models.CharField(max_length=128)),
+                ('total_count', models.IntegerField()),
+                ('trailer_aid', models.IntegerField()),
+            ],
+        ),
+        # mediaInfo: keyed by bangumi_id; also stores season_id, media_id and chn_name.
+        migrations.CreateModel(
+            name='mediaInfo',
+            fields=[
+                ('bangumi_id', models.IntegerField(primary_key=True, serialize=False)),
+                ('season_id', models.IntegerField()),
+                ('media_id', models.IntegerField()),
+                ('chn_name', models.CharField(max_length=128)),
+            ],
+        ),
+        # param: name/value pairs keyed by param_name (at most 10 characters).
+        migrations.CreateModel(
+            name='param',
+            fields=[
+                ('param_name', models.CharField(max_length=10, primary_key=True, serialize=False)),
+                ('param_value', models.CharField(max_length=128)),
+            ],
+        ),
+        # stat: integer counters (danmakus, favorites, views) keyed by an explicit integer id.
+        migrations.CreateModel(
+            name='stat',
+            fields=[
+                ('id', models.IntegerField(primary_key=True, serialize=False)),
+                ('danmakus', models.IntegerField()),
+                ('favorites', models.IntegerField()),
+                ('views', models.IntegerField()),
+            ],
+        ),
+    ]
--- a/PixivSearch/model/migrations/init.py
+++ b/PixivSearch/model/migrations/init.py
--- a/PixivSearch/settings.py
+++ b/PixivSearch/settings.py
@ -78,8 +78,8 @@ DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.mysql',
        'NAME': 'bangumi',
-        'USER': 'bilibili',
-        'PASSWORD': '2233',
+        'USER': 'root',
+        'PASSWORD': 'Luffy9412!',
        # 'HOST': '127.0.0.1',
        'HOST': 'mikuhime.xyz',
        'PORT': '3306',