# File: 뻘짓.py

#!/usr/bin/env python
import re
import requests
import codecs
import six

# Matches a <textarea ...> opening tag that carries name="memo" followed by a
# placeholder="..." attribute. Group 1 is the placeholder value with leading
# whitespace skipped. Because the capture group is greedy and also accepts
# whitespace, the trailing \s* never consumes anything, so trailing whitespace
# stays inside group 1.
MEMO_REGEX = re.compile(r'<textarea [^>]* name="memo" [^>]* placeholder="\s*([^"]*)\s*"[^>]*>')

def pick(s, timeout=30):
    """Fetch the board page once and extract the memo placeholder texts.

    Args:
        s: A ``requests.Session``-like object; only ``s.get`` is called.
        timeout: Seconds handed to ``s.get`` as its ``timeout`` argument so
            that a stalled connection cannot block the caller indefinitely.

    Returns:
        A generator over group 1 of every ``MEMO_REGEX`` match in the page
        body. Carriage returns and line feeds are replaced by the character
        references ``&#13;`` and ``&#10;`` so every item fits on one line.

    Raises:
        IOError: If the response status code is not 200. Errors raised by
            ``s.get`` itself (including timeouts) propagate unchanged.
    """
    resp = s.get(
        "http://m.todayhumor.co.kr/view.php?table=databox&no=18425",
        timeout=timeout,
    )
    if resp.status_code != 200:
        raise IOError("HTTP %d" % (resp.status_code,))
    # Force UTF-8 decoding instead of relying on the encoding requests guessed.
    resp.encoding = "utf-8"
    text = resp.text
    return (
        match.group(1).replace("\r", "&#13;").replace("\n", "&#10;")
        for match in MEMO_REGEX.finditer(text)
    )

def load(fname):
    """Read a frequency table in the ``"<count> <text>"`` line format of ``save``.

    Args:
        fname: Path of the UTF-8 encoded file to read.

    Returns:
        A dict mapping each text to its integer count.

    Raises:
        IOError / OSError: If the file cannot be opened.
        ValueError: If a non-blank line has no space separator or its count
            is not an integer.
    """
    cmts = dict()
    with codecs.open(fname, "r", "utf-8") as f:
        for line in f:
            # Strip only the line terminator: trailing spaces belong to the
            # stored text, and an entry with empty text is written as
            # "<count> " which a full rstrip() would make unparsable.
            line = line.rstrip("\r\n")
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            freq, txt = line.split(" ", 1)
            cmts[txt] = int(freq)
    return cmts

def save(fname, cmts):
    """Write a frequency table to *fname*, one ``"<count> <text>"`` line each.

    Args:
        fname: Path of the file to (over)write; it is encoded as UTF-8.
        cmts: Mapping of text to integer count.

    The file is truncated first, so an empty mapping yields an empty file.
    """
    with codecs.open(fname, "w", "utf-8") as f:
        # dict.items() iterates fine on both Python 2 and 3, so no
        # compatibility shim is needed; writelines hands the whole batch to
        # the writer in one call instead of one write per entry.
        f.writelines("%d %s\n" % (freq, txt) for txt, freq in cmts.items())

if __name__ == '__main__':
    # Script entry point: log in, then poll the page forever, counting how
    # often each extracted text is seen and persisting the counts to disk.
    import time

    SAVE_FILE = "witties.txt"  # frequency table persisted between runs
    SAVE_EVERY = 256           # number of polls between periodic checkpoints
    POLL_DELAY = 10            # seconds to sleep between polls
    LOGIN_TIMEOUT = 30         # seconds before the login request gives up

    s = requests.Session()
    s.headers.update({'User-Agent': 'Mozilla/5.0 (compatible; OU-Witty-Comment-Collector v0.1; written by stdout, mn=581777)'})
    REF = "http://m.todayhumor.co.kr/"
    # .cred holds the user name on its first line and the password on its second.
    with open(".cred", "r") as f:
        username = f.readline().strip()
        password = f.readline().strip()
    # A timeout keeps a stalled login from hanging the script indefinitely.
    resp = s.post(
        "https://www.todayhumor.co.kr/member/m_login_end.php",
        data=dict(ref=REF, id=username, passwd=password),
        headers={'Referer': REF},
        timeout=LOGIN_TIMEOUT,
    )
    if resp.status_code != 200:
        raise IOError("Login HTTP %d" % (resp.status_code,))
    # The presence of a "member_no" cookie is used as the login success check.
    if "member_no" not in s.cookies:
        raise IOError("Login failure")

    try:
        cmts = load(SAVE_FILE)
    except (OSError, IOError):
        # The table could not be read (e.g. first run): start from scratch.
        cmts = dict()

    try:
        i = 0
        while True:
            i += 1
            if i >= SAVE_EVERY:
                # Periodic checkpoint so a hard kill loses limited progress.
                save(SAVE_FILE, cmts)
                i = 0
            found = False
            for txt in pick(s):
                found = True
                if txt not in cmts:
                    # Echo each text the first time it is seen.
                    print(txt)
                    cmts[txt] = 1
                else:
                    cmts[txt] += 1
            if not found:
                # A page with no match at all is treated as an error.
                raise IOError("Something went wrong!")
            time.sleep(POLL_DELAY)
    except BaseException:
        # Deliberately broad (covers KeyboardInterrupt too): persist what was
        # collected, then let the original exception propagate.
        save(SAVE_FILE, cmts)
        raise