弹幕关键词排行

master
10295 6 years ago
parent 09b53c1bb5
commit e7ae75acbf
  1. 124
      PixivSearch/dao/bangumi.py
  2. 57
      PixivSearch/model/migrations/0001_initial.py
  3. 0
      PixivSearch/model/migrations/__init__.py
  4. 4
      PixivSearch/settings.py

@ -2,9 +2,15 @@ import _thread
import json import json
import math import math
import os import os
import random
import threading
import zlib
from concurrent import futures from concurrent import futures
from queue import Queue from queue import Queue
from lxml import etree
import django import django
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -38,13 +44,10 @@ def save(params):
req = requests.get(url, timeout=10) req = requests.get(url, timeout=10)
except BaseException as e: except BaseException as e:
logger.error(repr(e)) logger.error(repr(e))
save(media_id) save(params)
logger.info("request_url=%s,status_code=%d" % (url, req.status_code)) logger.info("request_url=%s,status_code=%d" % (url, req.status_code))
if req.status_code == 200: if req.status_code == 200:
tag = BeautifulSoup(req.text, 'lxml') json_obj = getJsonText(req, 3)
script = tag.select("script")[3].text
json_str = script[script.index("=") + 1:script.index("function") - 2]
json_obj = json.loads(json_str)
try: try:
if 'mediaInfo' in json_obj and 'stat' in json_obj['mediaInfo'] and 'chn_name' in json_obj['mediaInfo']: if 'mediaInfo' in json_obj and 'stat' in json_obj['mediaInfo'] and 'chn_name' in json_obj['mediaInfo']:
stat_info = json_obj['mediaInfo']['stat'] stat_info = json_obj['mediaInfo']['stat']
@ -59,6 +62,13 @@ def save(params):
logger.error(repr(e)) logger.error(repr(e))
def getJsonText(req, index):
    """Decode the JSON object embedded in one inline <script> of a page.

    req: response-like object whose ``text`` attribute holds the HTML.
    index: position of the wanted <script> element among all script tags.

    Returns the value produced by ``json.loads`` for the embedded payload.
    Raises IndexError when the page has no script at ``index`` and
    ValueError when the script lacks "=" or "function" or the payload is
    not valid JSON.
    """
    scripts = BeautifulSoup(req.text, 'lxml').select("script")
    source = scripts[index].text
    # The payload starts right after the first "=" and stops two characters
    # before the first occurrence of "function".
    start = source.index("=") + 1
    stop = source.index("function") - 2
    return json.loads(source[start:stop])
def get_(): def get_():
global current_mediaInfo global current_mediaInfo
return current_mediaInfo return current_mediaInfo
@ -131,9 +141,107 @@ def getIds():
continue continue
def A(e): # def testA():
logger.info(e) # req = requests.post('https://api.bilibili.com/x/report/web/heartbeat',
# data={"aid": 29416,"cid":49052,"csrf": "c0d296db7e33085f9f4730cfee66660b"},
# cookies=_cookies)
# print(req.status_code)
# Cookies attached to the archive/stat request issued by getCid.
# NOTE(review): these look like a real account's login/session cookies
# committed in plain text -- confirm, rotate them, and load them from
# configuration or the environment instead of source control.
_cookies = {'DedeUserID': '4372744', 'DedeUserID__ckMd5': 'e8179b74444cae8e',
'SESSDATA': '919b17d2%2C1524917631%2C3eede719'}
def getCid(aid, type=True):
    """Find the cid of the nearest usable video, scanning from ``aid``.

    aid: video id to start from; the scan only runs while it is positive.
    type: when truthy the scan walks downwards (aid - 1), otherwise
        upwards (aid + 1).

    Returns the ``cid`` of the first page of the first video whose stat
    request reports code 0 and whose page exposes ``videoData.pages``.
    Returns None if ``aid`` drops to 0 first. The upward scan has no upper
    bound, so it only ends when a video is found.
    """
    step = -1 if type else 1
    while aid > 0:
        url = "https://api.bilibili.com/x/web-interface/archive/stat?aid=%d" % aid
        print(url)
        stat_resp = requests.get(url, cookies=_cookies)
        if json.loads(stat_resp.text)["code"] == 0:
            page_resp = requests.get("https://www.bilibili.com/video/av%d" % aid)
            if page_resp.status_code == 200:
                json_obj = getJsonText(page_resp, 9)
                if "videoData" in json_obj:
                    video = json_obj['videoData']
                    if "pages" in video and len(video['pages']) > 0 and "cid" in video['pages'][0]:
                        cid = video['pages'][0]['cid']
                        print('cid=%s' % cid)
                        return cid
        # Nothing usable at this id: move on to the neighbouring one.
        aid = aid + step
def getCids(aid):
    """Return the cids found by scanning downwards and upwards from ``aid``.

    The result is a dict with key "min" (downward scan) and key "max"
    (upward scan); each value is whatever getCid returns for that scan.
    """
    lower = getCid(aid, True)
    upper = getCid(aid, False)
    return {"min": lower, "max": upper}
def episodeIdToCid(episode_id):
    """Collect the cid of every episode listed on a bangumi play page.

    episode_id: integer episode id used to build the play-page URL.

    Returns a list with the ``cid`` of each entry in the page's ``epList``,
    or an empty list when the embedded JSON has no ``epList`` key.
    """
    url = "https://www.bilibili.com/bangumi/play/ep%d" % episode_id
    print("url=%s" % url)
    json_obj = getJsonText(requests.get(url), 8)
    if "epList" not in json_obj:
        return []
    return [episode['cid'] for episode in json_obj["epList"]]
def parseXml(url):
    """Download one comment XML document and tally its comments into ``obj``.

    Every text node of a <d> element is counted in the module-level
    ``obj["data"]`` mapping (comment text -> number of occurrences), and
    ``obj["flag"]`` is switched to True once any comment contains one of
    the module-level ``keywords``. Both globals are set up by
    getCommentSort before the worker threads start.

    url: address of the comment document to fetch.

    Returns None; all results are written to the shared ``obj``.
    """
    print("url=%s" % url)
    comment_selector = etree.HTML(requests.get(url).content)
    for comment_each in comment_selector.xpath('//i'):
        # NOTE(review): '//d' is an absolute path, so it matches every <d> in
        # the document rather than only those below this <i>; with several
        # <i> elements the same comments would be counted repeatedly.
        # Left unchanged here -- confirm the intended scope.
        comments = comment_each.xpath('//d/text()')
        if not comments:
            continue
        # Hold the lock across the whole read-modify-write. Testing for the
        # key outside the lock let two threads both take the "new key" path
        # and overwrite each other's count.
        with lock:
            counts = obj["data"]
            for comment in comments:
                counts[comment] = counts.get(comment, 0) + 1
            if not obj["flag"]:
                obj["flag"] = any(
                    keyword in comment
                    for comment in comments
                    for keyword in keywords
                )
lock = threading.Lock() # global lock shared by the worker threads; parseXml takes it when writing obj["data"]
def loadData(cids):
    """Fetch and tally the comment documents of every cid in ``cids``.

    For each cid the current comment file is queued, plus one "dmroll"
    document per entry returned by the cid's "rolldate" listing (when that
    listing is non-empty). All queued URLs are then handed to parseXml on a
    pool of 32 threads.

    Returns the module-level ``obj`` that parseXml fills in.

    Note: the iterator returned by ``executor.map`` is never consumed, so
    an exception raised inside parseXml is not re-raised here.
    """
    targets = []
    for cid in cids:
        history = requests.get("https://comment.bilibili.com/rolldate,%d" % cid)
        cid_urls = ["https://comment.bilibili.com/%d.xml" % cid]
        if history.text:
            cid_urls.extend(
                "https://comment.bilibili.com/dmroll,%s,%d" % (entry['timestamp'], cid)
                for entry in json.loads(history.text)
            )
        targets.extend(cid_urls)
    with futures.ThreadPoolExecutor(32) as executor:
        executor.map(parseXml, targets)
    return obj
def getCommentSort(cids, keywords_):
    """Reset the shared tally and count the comments of ``cids``.

    cids: iterable of comment ids passed on to loadData.
    keywords_: collection of substrings; stored in the module-level
        ``keywords`` that parseXml checks comments against.

    Returns the dict produced by loadData: ``{"data": counts, "flag": bool}``.
    """
    global keywords, obj
    # Start from a clean result so earlier runs do not leak into this one.
    obj = {"data": {}, "flag": False}
    keywords = keywords_
    return loadData(cids)
if __name__ == '__main__': if __name__ == '__main__':
getIds() # print(getCids(29416))
# obj = loadData(
# [49052, 49053, 51525, 51526, 53407, 54180, 55295, 55296, 57255, 57256, 59288, 59289, 61559, 61560, 64034, 64035,
# 67024, 67025, 69284, 73333, 73334, 74024, 74025], ['穹'])
f = getCommentSort(episodeIdToCid(172129), ['小樱'])
# obj = loadData([34807341], [])
for i in sorted(f["data"].items(), key=lambda d: d[1], reverse=True)[:50]:
print(i)

@ -0,0 +1,57 @@
# Generated by Django 2.0 on 2018-03-24 17:02
from django.db import migrations, models
class Migration(migrations.Migration):
    """Initial migration creating the bangumi_list, mediaInfo, param and stat models."""

    initial = True

    # First migration of the app: nothing to depend on.
    dependencies = []

    operations = [
        # Listing entry for a season, keyed by season_id.
        migrations.CreateModel(
            name='bangumi_list',
            fields=[
                ('season_id', models.IntegerField(primary_key=True, serialize=False)),
                ('badge', models.CharField(max_length=128)),
                ('brief', models.CharField(max_length=128)),
                ('copyright', models.CharField(max_length=128)),
                ('cover', models.CharField(max_length=128)),
                ('favorites', models.IntegerField()),
                ('is_finish', models.IntegerField()),
                ('newest_ep_index', models.IntegerField()),
                ('pub_time', models.DateTimeField()),
                ('season_status', models.IntegerField()),
                ('title', models.CharField(max_length=128)),
                ('total_count', models.IntegerField()),
                ('trailer_aid', models.IntegerField()),
            ],
        ),
        # Media record keyed by bangumi_id.
        migrations.CreateModel(
            name='mediaInfo',
            fields=[
                ('bangumi_id', models.IntegerField(primary_key=True, serialize=False)),
                ('season_id', models.IntegerField()),
                ('media_id', models.IntegerField()),
                ('chn_name', models.CharField(max_length=128)),
            ],
        ),
        # Name/value parameter store.
        migrations.CreateModel(
            name='param',
            fields=[
                ('param_name', models.CharField(max_length=10, primary_key=True, serialize=False)),
                ('param_value', models.CharField(max_length=128)),
            ],
        ),
        # Counters (danmakus, favorites, views) keyed by id.
        migrations.CreateModel(
            name='stat',
            fields=[
                ('id', models.IntegerField(primary_key=True, serialize=False)),
                ('danmakus', models.IntegerField()),
                ('favorites', models.IntegerField()),
                ('views', models.IntegerField()),
            ],
        ),
    ]

@ -78,8 +78,8 @@ DATABASES = {
'default': { 'default': {
'ENGINE': 'django.db.backends.mysql', 'ENGINE': 'django.db.backends.mysql',
'NAME': 'bangumi', 'NAME': 'bangumi',
'USER': 'bilibili', 'USER': 'root',
'PASSWORD': '2233', 'PASSWORD': 'Luffy9412!',
# 'HOST': '127.0.0.1', # 'HOST': '127.0.0.1',
'HOST': 'mikuhime.xyz', 'HOST': 'mikuhime.xyz',
'PORT': '3306', 'PORT': '3306',

Loading…
Cancel
Save