-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathndashredir.py
163 lines (153 loc) · 7.08 KB
/
ndashredir.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# -*- coding: utf-8 -*-
"""
This script will collect articles that have n dash or m dash character in their
title, and create a redirect to them automatically from the corresponding
hyphenated title. If the target already exists, it will be skipped.
It may take several hours. You may quit with Ctrl-C at any time and continue
later: give the first few characters of the last shown title after -start.
The script is primarily designed for work in article namespace, but can be used
in any other one. Use in accordance with the rules of your community.
Known parameters:
-start Will start from the given title (it does not have to exist).
Parameter may be given as "-start" or "-start:title"
Defaults to '!'.
-namespace, -ns Works in the given namespace (only one at a time).
Parameter may be given as "-ns:<number>" or "-namespace:<number>".
Defaults to 0 (main namespace).
-nosub Will not process subpages. Useful in template or portal
namespace. (Not recommended for main namespace that has no
real subpages.)
-save Saves the title of existing hyphenated articles whose content
is _other_ than a redirect to the corresponding article with
n dash or m dash in the title and thus may need manual
treatment. If omitted, these titles will be written only to
the screen (or the log if logging is on). The file is in the
form you may upload it to a wikipage.
May be given as "-save:<filename>". If it exists, titles
will be appended.
After checking these titles, you may want to write them to
your ignore file (see below).
-ignore A file that contains titles that are not to be claimed to
redirect somewhere else. For example, if X-1 (with hyphen)
redirects to a disambiguation page that lists X–1 (with n
dash), that's OK and you don't want it to appear at each run
as a problematic article.
File must be encoded in UTF-8 and contain titles among double
square brackets (e.g. *[[X-1]] or [[:File:X-1.gif]]).
May be given as "-ignore:<filename>".
"""
#
# (C) Bináris, 2012
#
# Distributed under the terms of the MIT license.
#
# Module version string. '$Id$' looks like a version-control keyword
# placeholder -- presumably expanded by the repository on checkout; confirm.
__version__='$Id$'
import codecs, re
import wikipedia as pywikibot
from pagegenerators import RegexFilterPageGenerator as RPG
from pywikibot import i18n
def _read_ignore_list(ignorefilename):
    """Return the set of titles listed in the ignore file.

    The file is read as UTF-8 and every [[...]] link found in it contributes
    one title; a colon directly after the opening brackets is dropped.
    IOError (file missing or unreadable) is left to the caller.
    """
    igfile = codecs.open(ignorefilename, encoding='utf-8', mode='r')
    try:
        content = igfile.read()
    finally:
        # Release the handle even if reading or decoding fails.
        igfile.close()
    # A set makes the per-page "is it ignored?" test constant-time.
    return set(re.findall(u'\\[\\[:?(.*?)\\]\\]', content))


def _process_page(site, page, redirword, ignorelist, titlefile):
    """Create, or report on, the hyphenated counterpart of one dashed page.

    site       -- site object on which the redirect page is looked up/created
    page       -- page whose title contains an n dash or m dash
    redirword  -- redirect keyword placed at the start of the new page text
    ignorelist -- titles that must not be reported as problematic
    titlefile  -- open file collecting problematic titles, or None

    KeyboardInterrupt and SystemExit are deliberately not caught here, so
    Ctrl-C stops the run even while a page is being saved.
    """
    title = page.title()
    # n dash -> hyphen, m dash -> hyphen, respectively
    newtitle = title.replace(u'–', '-').replace(u'—', '-')
    redirpage = pywikibot.Page(site, newtitle)

    if redirpage.exists():
        if redirpage.isRedirectPage() and \
           redirpage.getRedirectTarget() == page:
            pywikibot.output(
                u'[[%s]] already redirects to [[%s]], nothing to do with it.'
                % (newtitle, title))
        elif newtitle in ignorelist:
            pywikibot.output(
                u'Skipping [[%s]] because it is on your ignore list.'
                % newtitle)
        else:
            pywikibot.output(
                (u'\03{lightyellow}Skipping [[%s]] because it exists '
                 u'already with a different content.\03{default}')
                % newtitle)
            if titlefile:
                # textlink=True: for the unlikely case that someone wants
                # to run it in file namespace.
                s = u'\n#%s does not redirect to %s.' %\
                    (redirpage.title(asLink=True, textlink=True),
                     page.title(asLink=True, textlink=True))
                titlefile.write(s)
                # Flush after every title so nothing is lost if the run is
                # interrupted.
                titlefile.flush()
        return

    # The summary is only needed when a redirect is actually created.
    editSummary = i18n.twtranslate(site, 'ndashredir-create',
                                   {'title': title})
    text = u'#%s[[%s]]' % (redirword, title)
    try:
        redirpage.put(text, editSummary)
    except pywikibot.LockedPage:
        pywikibot.output(
            (u'\03{lightyellow}Skipping [[%s]] because it is '
             u'protected.\03{default}') % newtitle)
    except Exception:
        # Best effort: any other failure skips this page only. A bare
        # "except:" would also swallow Ctrl-C, which must end the run.
        pywikibot.output(
            (u'\03{lightyellow}Skipping [[%s]] because of an error.'
             u'\03{default}') % newtitle)


def main(*args):
    """Create hyphenated redirects to pages with a dash in their title.

    The arguments are passed to pywikibot.handleArgs(); the values it hands
    back are interpreted as the -start, -ns/-namespace, -nosub, -save and
    -ignore options described in the module docstring. Returns None in all
    cases; if the save file or the ignore file cannot be opened, a message
    is printed and nothing is processed.
    """
    regex = u'.*[–—]'  # Alt 0150 (n dash), alt 0151 (m dash), respectively.
    ns = 0
    start = '!'
    filename = None  # The name of the file to save titles
    titlefile = None  # The file object itself
    ignorefilename = None  # The name of the ignore file
    ignorelist = set()  # Titles to ignore if they redirect somewhere else

    # Handling parameters:
    for arg in pywikibot.handleArgs(*args):
        if arg == '-start':
            start = pywikibot.input(
                u'From which title do you want to continue?')
        elif arg.startswith('-start:'):
            start = arg[7:]
        elif arg in ['-ns', '-namespace']:
            ns = pywikibot.input(u'Which namespace should we process?')
        elif arg.startswith('-ns:') or arg.startswith('-namespace:'):
            # The value is handed to site.allpages() as typed (a string).
            ns = arg[arg.find(':')+1:]
        elif arg == '-nosub':
            # Dash allowed only in titles that contain no slash at all.
            regex = u'[^/]*[–—][^/]*$'
        elif arg == '-save':
            filename = pywikibot.input('Please enter the filename:')
        elif arg.startswith('-save:'):
            filename = arg[6:]
        elif arg == '-ignore':
            ignorefilename = pywikibot.input('Please enter the filename:')
        elif arg.startswith('-ignore:'):
            ignorefilename = arg[8:]

    # File operations:
    if filename:
        try:
            # This opens in strict error mode, that means bot will stop
            # on encoding errors with ValueError.
            # See http://docs.python.org/library/codecs.html#codecs.open
            titlefile = codecs.open(filename, encoding='utf-8', mode='a')
        except IOError:
            pywikibot.output("%s cannot be opened for writing." % filename)
            return

    # From here on the save file (if any) is open; the finally clause closes
    # it on every way out, including the early return below and Ctrl-C.
    try:
        if ignorefilename:
            try:
                ignorelist = _read_ignore_list(ignorefilename)
            except IOError:
                pywikibot.output(
                    "%s cannot be opened for reading." % ignorefilename)
                return

        # Ready to initialize
        site = pywikibot.getSite()
        redirword = site.redirect()
        gen = RPG(site.allpages(
            start=start, namespace=ns, includeredirects=False), [regex])

        # Processing:
        # Todo: output the title upon Ctrl C? (KeyboardInterrupt may hit
        # RegexFilterPageGenerator or throttle.py or anything else, not
        # only the code in this loop.)
        for page in gen:
            _process_page(site, page, redirword, ignorelist, titlefile)
    finally:
        if titlefile:
            titlefile.close()
if __name__ == "__main__":
    # Script entry point: run the bot, and call pywikibot.stopme() no matter
    # how main() ends (normal return, error or Ctrl-C).
    # NOTE(review): stopme() is presumably the framework's shutdown hook --
    # confirm against the pywikibot documentation.
    try:
        main()
    finally:
        pywikibot.stopme()