1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
|
import json
import logging
import os
import sys
from datetime import datetime
from logging.handlers import TimedRotatingFileHandler

import requests
from bs4 import BeautifulSoup, Comment
# --- Run identifiers -------------------------------------------------------
# Take one clock reading so the per-run stamp and the per-day stamp always
# describe the same instant; two separate now() calls could straddle midnight
# and disagree about the date.
_now = datetime.now()

# Per-run stamp: local date and time followed by the Unix epoch in seconds.
# The epoch field used to come from strftime('%s'), a glibc extension that is
# not among the directives Python documents and is missing on some platforms.
# int(timestamp()) produces the same number without relying on the C library.
datetime_text = '{}_{}'.format(_now.strftime('%Y%m%d_%H%M%S'), int(_now.timestamp()))

# Per-day stamp: every run on the same day shares one log file.
date_text = _now.strftime('%Y%m%d')

# --- Output locations ------------------------------------------------------
# Both directory constants end with '/', so file names are appended directly.
DATA_DIR = '/var/lib/ysmspace-crawler/data/'
LOG_DIR = '/var/log/ysmspace-crawler/'
# Raw HTTP response body of this run.
RESPONSE_DATA_FILE = DATA_DIR + 'response.{}.html'.format(datetime_text)
# Payload extracted from the response of this run.
PURE_DATA_FILE = DATA_DIR + 'data.{}.txt'.format(datetime_text)
# Daily log file.
LOG_FILE = LOG_DIR + 'message.{}.log'.format(date_text)

# Prefix identifying the HTML comment that carries the payload to extract.
SDATA_STARTS = 's-data:'

# Create the output directories up front; exist_ok makes repeated runs harmless.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
# Route all log records of this run to the daily log file, DEBUG and above.
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
log = logging.getLogger(__name__)

# Dump the effective configuration so each run is self-describing in the log.
for _template, _value in (
    ('define.datetime_text: %s', datetime_text),
    ('define.date_text: %s', date_text),
    ('define.DATA_DIR: %s', DATA_DIR),
    ('define.LOG_DIR: %s', LOG_DIR),
    ('define.RESPONSE_DATA_FILE: %s', RESPONSE_DATA_FILE),
    ('define.PURE_DATA_FILE: %s', PURE_DATA_FILE),
    ('define.LOG_FILE: %s', LOG_FILE),
):
    log.info(_template, _value)

log.info('init finish...')
# --- Fetch the page, archive it, and extract the embedded payload ----------
url = 'https://top.baidu.com/board?tab=realtime'

# Upper bound (seconds) on waiting for the server. Without a timeout,
# requests can block indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

log.info('response.url: %s', url)
try:
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
except requests.RequestException:
    # Record the failure in the log file (otherwise the traceback only reaches
    # stderr), then re-raise so the process still ends with a failure status.
    log.exception('request error: %s', url)
    raise
log.info('response.status_code: %s', response.status_code)

# Archive the raw body regardless of the status code. The encoding is explicit
# so non-ASCII page text cannot fail under a non-UTF-8 default locale.
with open(RESPONSE_DATA_FILE, 'w', encoding='utf-8') as file:
    file.write(response.text)
log.info('write response.text to %s', RESPONSE_DATA_FILE)

if response.status_code == 200:
    log.info('request success')
    soup = BeautifulSoup(response.text, 'html.parser')

    # Only the first HTML comment whose text (ignoring surrounding whitespace)
    # starts with the marker is needed; find() returns None if there is none.
    sdata_comment = soup.find(
        string=lambda text: isinstance(text, Comment)
        and text.strip().startswith(SDATA_STARTS)
    )
    if sdata_comment is None:
        # Previously this surfaced as a bare IndexError. Log a readable reason
        # and keep the non-zero exit status.
        log.error('parse data failed: no comment starting with %s', SDATA_STARTS)
        sys.exit(1)

    # Drop leading whitespace before cutting off the marker, so the slice
    # lines up with what the filter above actually matched.
    data_text = sdata_comment.lstrip()[len(SDATA_STARTS):]
    log.info('parse data success')

    with open(PURE_DATA_FILE, 'w', encoding='utf-8') as file:
        file.write(data_text)
    log.info('write data to %s', PURE_DATA_FILE)
else:
    # A non-200 answer is a failure, so it is logged at error level.
    log.error('request failed')

# Visual separator between consecutive runs in the shared daily log file.
log.info('--------------------------------------------------------------------------------')
|