-
Notifications
You must be signed in to change notification settings - Fork 1
/
wp_selfcites.py
87 lines (71 loc) · 2.46 KB
/
wp_selfcites.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
"""
WP self-cites
This (quick-and-dirty) script parses a WordPress export XML file, follows the links that
point to other posts of the same blog, and outputs a GEXF-formatted directed graph of
those self-citations.
Examples: http://juanfont.eu/graph/test.html
http://box.jisko.net/d/f59c0a02
Use it as you like.
2013/12/04 - Juan Font - [email protected]
"""
import xml.etree.ElementTree as ET
from gexf import Gexf
import re
# Path of the GEXF file that main() writes the graph to.
GEXF_OUTPUT_FILE='politikon.gexf'
# Path of the WordPress export XML file that main() parses.
INPUT_WP_XML='/home/juan/politikon.wordpress.2011.xml'
# URL of a politikon.es post: /YYYY/MM/, then optional digit/slash segments
# (e.g. the day), then the slug, then a closing slash.  Written as a raw string
# so the regex escapes reach the regex engine untouched, and with the dot of
# the host name escaped so it only matches a literal ".".
_POST_LINK_RE = re.compile(r"http://politikon\.es/\d\d\d\d/\d\d/[\d/]*[\w*-]*/")


def get_post_links(content):
    """Return all politikon.es post URLs found in *content*.

    Args:
        content: Text (typically the HTML body of a post) to scan; may be
            None or empty.

    Returns:
        A list with the matched URLs as strings, in order of appearance
        (duplicates are kept).  An empty list when *content* is None/empty
        or contains no matching URL.
    """
    if not content:
        return []
    return _POST_LINK_RE.findall(content)
# RGB colour assigned to each known author login.
_AUTHOR_COLORS = {
    'rsenserrich': (255, 0, 0),
    'kikollaneras': (119, 0, 119),
    'jorgegalindo': (0, 0, 255),
    'jorgesanmiguel': (255, 128, 0),
    'pablosimon': (255, 255, 0),
    'juanfont': (0, 255, 255),
    'cives': (0, 255, 0),
    'ramonmateo': (26, 83, 24),
    'kantor': (194, 78, 78),
    'octavio-medina': (93, 0, 0),
}

# Colour used when the author is missing or not in the table above.
_DEFAULT_COLOR = (255, 255, 255)


def get_color(author):
    """Return the (r, g, b) tuple used to paint the nodes of *author*.

    Args:
        author: Author login as a string; may be None or empty.

    Returns:
        A tuple of three ints.  White (255, 255, 255) when *author* is
        None/empty or is not one of the known logins.
    """
    if not author:
        return _DEFAULT_COLOR
    return _AUTHOR_COLORS.get(author, _DEFAULT_COLOR)
def main():
    """Build the self-citation graph of the blog and save it as a GEXF file.

    Steps:
      1. Parse INPUT_WP_XML and walk the ``item`` elements of its first child.
      2. For every item whose ``link`` looks like
         ``http://politikon.es/YYYY/MM/DD/<slug>/`` add a node (id = slug,
         label = permalink, colour = get_color() of the ``dc:creator`` text).
      3. For every URL returned by get_post_links() on the item's
         ``content:encoded`` text, add a directed edge ``post -> linked post``
         when the linked post is already a node of the graph.
      4. Drop the posts that contain no such links and were not cited by any
         item processed after them.
      5. Write the graph to GEXF_OUTPUT_FILE.
    """
    # Raw strings keep the regex escapes intact; the dot of the host name is
    # escaped so it only matches a literal ".".
    post_url_re = re.compile(r"http://politikon\.es/\d\d\d\d/\d\d/\d\d/([\w*-]*)/")
    # The optional day segment is matched explicitly instead of with a greedy
    # digit/slash class, so a slug that starts with digits ("10-reasons") is
    # captured whole and therefore equals the node id taken from post_url_re.
    link_url_re = re.compile(r"http://politikon\.es/\d\d\d\d/\d\d/(?:\d\d/)?([\w*-]*)/")

    gexf = Gexf("Politikon", "Autoreferences graph")
    graph = gexf.addGraph("directed", "static", "a hello world graph")

    tree = ET.parse(INPUT_WP_XML)
    root = tree.getroot()

    # Ids of posts without links that nobody has cited so far.  A set gives
    # O(1) membership tests and cannot hold the same id twice.
    orphans = set()

    for item in root[0].findall('item'):
        # findtext() returns None for a missing element instead of raising.
        post_link = item.findtext('link') or ""
        post_match = post_url_re.match(post_link)
        if post_match is None:
            # Not a dated post permalink (page, attachment, ...): skip it.
            continue
        post_id = post_match.group(1)

        r, g, b = get_color(item.findtext('{http://purl.org/dc/elements/1.1/}creator'))
        graph.addNode(id=str(post_id), label=str(post_link), r=str(r), g=str(g), b=str(b))

        links = get_post_links(item.findtext('{http://purl.org/rss/1.0/modules/content/}encoded'))
        # NOTE(review): the indentation of this file was lost; the orphan
        # branch is taken to be the ``else`` of ``if links:`` -- confirm.
        if not links:
            orphans.add(post_id)
            continue

        for link in links:
            link_match = link_url_re.match(link)
            if link_match is None:
                continue
            link_id = link_match.group(1)
            # A cited post is no longer an orphan.
            orphans.discard(link_id)
            if graph.nodeExists(link_id):
                graph.addEdge(post_id + "->" + link_id, post_id, link_id)

    for orphan_id in orphans:
        del graph._nodes[orphan_id]

    # The context manager guarantees the file is flushed and closed.
    with open(GEXF_OUTPUT_FILE, "w") as output_file:
        gexf.write(output_file)
# Run the conversion only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()