Automatic feed summaries

What do you get if you merge this and this?

You get a scary little filter that extracts the top 5 keywords from a post's content and adds them to its title.

Useful titles for all posts: how about that?

#!/usr/bin/python

# Copyright (C) 2007 Enrico Zini <enrico@debian.org>
# This software is licensed under the terms of the GNU General Public
# License, version 2 or later.

import libxml2, re, math

# Pattern used to break post text into word tokens.
tokenize = re.compile(r"\W+")

# Read the RDF/RSS feed from standard input.
doc = libxml2.parseFile("-")
root = doc.getRootElement()

# Build an xpath context and register every namespace declared on the
# root element, so xpath expressions below can address them by prefix.
xpc = doc.xpathNewContext()
for ns in root.nsDefs():
    # The default namespace has no prefix: register it as "rss".
    prefix = ns.name
    if prefix is None:
        prefix = "rss"
    xpc.xpathRegisterNs(prefix, ns.content)

# Collect per-document token counts to generate summaries.
# doc_tokens maps each item's rdf:about URI to a {token: count} dict
# built from the text of the item's <title> and <content:encoded> nodes.
doc_tokens = {}
for x in xpc.xpathEval("/rdf:RDF/rss:item"):
    res = x.nsProp("about", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    tokens = {}
    for c in x.children:
        if c.type != 'element' or (c.name != 'encoded' and c.name != 'title'):
            continue
        for t in c.children:
            if t.type != 'text': continue
            for tok in tokenize.split(t.content):
                # re.split yields empty strings when the text starts or
                # ends with a non-word character; counting those would
                # pollute the statistics, so skip them.
                if not tok:
                    continue
                tokens[tok] = tokens.get(tok, 0) + 1
    doc_tokens[res] = tokens

# Fold the per-document counts into corpus-wide totals, lowercased so
# that differently-capitalized occurrences of a word count together.
aggregated = {}
for counts in doc_tokens.itervalues():
    for token, n in counts.iteritems():
        key = token.lower()
        aggregated[key] = aggregated.get(key, 0) + n

def tfidf(doc, tok):
    """Compute the TF-IDF score of a token in a document.

    doc is a key of doc_tokens (the item's rdf:about URI); tok is a
    token as stored in that document's count dict.  Returns 0.0 for
    tokens the document or the corpus does not contain.
    """
    # Per-document counts keep the original capitalization, while the
    # corpus-wide counts in `aggregated` are lowercased: look each count
    # up in the matching form.  (Lowercasing tok before the per-document
    # lookup, as the code used to, scored every capitalized token as 0.)
    tf = doc_tokens[doc].get(tok, 0)
    df = aggregated.get(tok.lower(), 0)
    if tf == 0 or df == 0:
        # Also guards the ZeroDivisionError the unchecked division used
        # to raise for tokens never seen in the corpus.
        return 0.0
    # NOTE(review): `aggregated` holds total occurrence counts, not the
    # number of documents containing the token, so this is only an
    # approximation of IDF and can go negative for very frequent tokens.
    return tf * math.log(float(len(doc_tokens)) / df)

def top5(doc, n=5):
    """Return the document's n best-scoring tokens by TF-IDF, best first.

    doc is a key of doc_tokens; n defaults to 5 to keep the historical
    behavior while letting callers ask for a different count.
    """
    ranked = sorted(doc_tokens[doc].keys(),
                    key=lambda tok: tfidf(doc, tok), reverse=True)
    return ranked[:n]


# Rewrite the titles, injecting the top-5 keyword summary.
# Feed titles are expected to look like "author: subject"; the summary
# is inserted between the author part and the colon.
for x in xpc.xpathEval("/rdf:RDF/rss:item"):
    res = x.nsProp("about", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    for c in x.children:
        if c.type != 'element' or c.name != 'title':
            continue
        for t in c.children:
            if t.type != 'text': continue
            summary = ", ".join(top5(res))
            if ":" in t.content:
                name, content = t.content.split(":", 1)
                newtitle = name + " [" + summary + "]:" + content
            else:
                # Titles without a colon used to crash the tuple
                # unpacking above; just append the summary instead.
                newtitle = t.content + " [" + summary + "]"
            t.replaceNode(doc.newDocText(newtitle))

# Serialize the result to stdout.  saveFormatFile() itself writes the
# document; its return value is the number of bytes written, which the
# original `print` statement appended to the output as a stray line.
doc.saveFormatFile("-", True)