#!/usr/bin/env python
import re
import requests
import codecs
import six
# Matches a <textarea ... name="memo" ... placeholder="..."> tag and captures
# the placeholder text in group 1.  The leading ``\s*`` trims leading
# whitespace, but the trailing ``\s*`` never gets to trim anything: the greedy
# ``[^"]*`` before it already swallows trailing whitespace into the group.
# pick() therefore strips the captured text itself.
MEMO_REGEX = re.compile(r'<textarea [^>]* name="memo" [^>]* placeholder="\s*([^"]*)\s*"[^>]*>')


def pick(s, timeout=30):
    """Fetch the databox page and return the ``memo`` placeholder texts.

    Args:
        s: a ``requests.Session``-like object; only ``s.get(url, timeout=...)``
            is used.
        timeout: seconds to wait for the server before ``s.get`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A generator yielding one string per ``MEMO_REGEX`` match in the page.
        Carriage returns and newlines are each replaced by a space, and
        surrounding whitespace is stripped so that a text compares equal to
        the same text after it has been written to and re-read from disk.

    Raises:
        IOError: if the response status is not 200.  The request is made
            eagerly, so this is raised by the call itself, not on iteration.
    """
    resp = s.get("http://m.todayhumor.co.kr/view.php?table=databox&no=18425", timeout=timeout)
    if resp.status_code != 200:
        raise IOError("HTTP %d" % (resp.status_code,))
    resp.encoding = "utf-8"
    text = resp.text
    # The lazy generator below only needs the decoded text; release the
    # response object right away.
    del resp
    return (
        match.group(1).replace("\r", " ").replace("\n", " ").strip()
        for match in MEMO_REGEX.finditer(text)
    )
def load(fname):
    """Read a ``<count> <text>`` file written by ``save`` into a dict.

    Each record is one ``"\\n"``-terminated line: an integer count, a single
    space, then the text.  The text is returned exactly as stored, including
    any trailing spaces and the empty string, so a save/load round trip does
    not alter keys.  Blank lines are skipped, and a trailing ``"\\r"`` is
    dropped so files with CRLF endings still load.

    Args:
        fname: path of the UTF-8 encoded file to read.

    Returns:
        dict mapping text to its integer count.  If a text occurs more than
        once, the last occurrence wins.

    Raises:
        IOError/OSError: if the file cannot be opened or read.
        ValueError: if a line's count is not an integer.
    """
    with codecs.open(fname, "r", "utf-8") as f:
        data = f.read()
    cmts = dict()
    # Split on "\n" only.  Iterating the codecs reader line by line would also
    # break records at Unicode separators such as U+2028 that may occur inside
    # a text, because it uses str.splitlines() semantics.
    for lineno, line in enumerate(data.split("\n"), 1):
        line = line.rstrip("\r")
        if not line:
            continue
        freq, _, txt = line.partition(" ")
        try:
            cmts[txt] = int(freq)
        except ValueError:
            raise ValueError("%s:%d: malformed count in line %r" % (fname, lineno, line))
    return cmts
def save(fname, cmts):
    """Write the counts in *cmts* to *fname* as UTF-8 text.

    One line is written per entry, formatted as the integer count, a single
    space, the text, and a newline.  Entries are written in the dict's own
    iteration order; an existing file is overwritten.

    Args:
        fname: path of the file to write.
        cmts: dict mapping text to its count.
    """
    record_format = "%d %s\n"
    with codecs.open(fname, "w", "utf-8") as out:
        for text, count in six.iteritems(cmts):
            record = record_format % (count, text)
            out.write(record)
if __name__ == '__main__':
    # Script entry point: log in, then poll pick() forever, counting how often
    # each text is seen.  The counts are saved periodically and again on the
    # way out (for example when the loop is interrupted or a request fails).
    import errno
    import time

    DB_FILE = "witties.txt"
    REF = "http://m.todayhumor.co.kr/"
    LOGIN_TIMEOUT = 30  # seconds; without one a stalled login would hang forever
    SAVE_EVERY = 256    # loop iterations between periodic saves
    POLL_DELAY = 10     # seconds to sleep between polls

    s = requests.Session()
    s.headers.update({'User-Agent': 'Mozilla/5.0 (compatible; OU-Witty-Comment-Collector v0.1; written by stdout, mn=581777)'})

    # .cred holds the username on its first line and the password on its second.
    with open(".cred", "r") as f:
        username = f.readline().strip()
        password = f.readline().strip()

    resp = s.post(
        "https://www.todayhumor.co.kr/member/m_login_end.php",
        data=dict(ref=REF, id=username, passwd=password),
        headers={'Referer': REF},
        timeout=LOGIN_TIMEOUT,
    )
    if resp.status_code != 200:
        raise IOError("Login HTTP %d" % (resp.status_code,))
    # A missing "member_no" cookie is treated as a failed login.
    if "member_no" not in s.cookies:
        raise IOError("Login failure")

    try:
        cmts = load(DB_FILE)
    except (OSError, IOError) as exc:
        # Only a missing file means "start fresh".  Any other read error must
        # not silently reset the counts, or the next save would overwrite the
        # existing file with a nearly empty one.
        if exc.errno != errno.ENOENT:
            raise
        cmts = dict()

    try:
        i = 0
        while True:
            i += 1
            if i >= SAVE_EVERY:
                save(DB_FILE, cmts)
                i = 0
            found = False
            for txt in pick(s):
                found = True
                if txt not in cmts:
                    # Echo only texts that have never been seen before.
                    print(txt)
                    cmts[txt] = 1
                else:
                    cmts[txt] += 1
            if not found:
                # A page without any matching textarea is treated as fatal.
                raise IOError("Something went wrong!")
            time.sleep(POLL_DELAY)
    finally:
        # Persist whatever was collected, whatever ended the loop; the
        # original exception (if any) keeps propagating afterwards.
        save(DB_FILE, cmts)