1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
|
#!/usr/bin/env python3
# Copyright 2014-2017 Facundo Batista
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 3, as published
# by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranties of
# MERCHANTABILITY, SATISFACTORY QUALITY, or FITNESS FOR A PARTICULAR
# PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program. If not, see <http://www.gnu.org/licenses/>.
#
# For further info, check https://launchpad.net/encuentro
"""Scrapers for the decimequiensosvos backend."""
import datetime
import sys
from collections import namedtuple
from yaswfp import swfparser
# One scraped entry: name, occupation, bio text, image data and date.
Episode = namedtuple(
    "Episode",
    ["name", "occup", "bio", "image", "date"],
)
class _ConstantPoolExtractor(object):
"""Get items from the constant pool."""
def __init__(self, constants, actions):
self.constants = constants
self.actions = actions
def get(self, *keys):
"""Get the text after some key."""
values = {}
stack = []
for act in self.actions:
if act.name == 'ActionPush':
if act.Type == 7:
idx = act.Integer
elif act.Type == 8:
idx = act.Constant8
elif act.Type in (5, 6):
continue
else:
raise ValueError("Bad act type: " + repr(act))
try:
val = self.constants[idx]
except IndexError:
stack.append(None)
else:
if val.startswith('titulo') and val.endswith('1'):
# hard group break!!!
values = {}
stack = []
stack.append(val)
elif act.name in ('ActionSetVariable', 'ActionSetMember'):
if len(stack) == 2:
title, value = stack
if title in keys:
values[title] = value
if len(values) == len(keys):
return values
stack = []
else:
stack = []
def _fix_date(date):
"""Fix and improve the date info."""
datestr = date.split()[0]
if datestr.isupper():
return None
if "-" in datestr:
datestr = "/".join(x.split("-")[0] for x in datestr.split("/"))
dt = datetime.datetime.strptime(datestr, "%d/%m/%y")
date = dt.date()
return date
def _fix_occup(occup):
"""Fix and improve the occupation info."""
occup = occup.strip()
if not occup:
return ""
occup = occup[0].upper() + occup[1:]
if occup[-1] != ".":
occup = occup + "."
# assure all the letters after a period is in uppercase
pos_from = 0
while True:
try:
pos = occup.index(".", pos_from)
except ValueError:
break
pos_from = pos + 1
pos += 2 # second letter after the point
if pos < len(occup):
occup = occup[:pos] + occup[pos].upper() + occup[pos + 1:]
return occup
def _fix_bio(bio):
"""Fix and improve the bio info."""
bio = bio.strip()
return bio
def _fix_name(name):
"""Fix and improve the name info."""
name = name.replace(""", '"') # translate quotes
name = name.split('<')[0] # ignore everything after html tag
return name
def scrap(fh, custom_order=None):
    """Get useful info from a program.

    Parse the SWF reachable through ``fh`` and build one ``Episode`` for each
    entry found through the constant pool of its last ``DefineSprite`` tag.

    Args:
        fh: object handed as-is to ``swfparser.SWFParser`` (the script entry
            point of this module passes a file opened in binary mode).
        custom_order: optional sequence of names.  When given, the image of
            an entry is picked by the position of its fixed name in this
            sequence, and entries whose name is not listed are skipped.
            When ``None``, the image is picked by the entry's own position.

    Returns:
        A list of ``Episode`` tuples; it is empty when the SWF has no
        constant pool and fewer than three images.

    Raises:
        ValueError: if there is no ``DefineSprite`` tag, or if there is no
            ``ActionConstantPool`` while three or more images are present.
    """
    swf = swfparser.SWFParser(fh)

    # get the images
    base = None
    images = []
    for tag in swf.tags:
        if tag.name == 'JPEGTables':
            base = tag.JPEGData
        elif tag.name == 'DefineBits':
            images.append((tag.CharacterID, tag.JPEGData))
        elif tag.name == 'DefineBitsJPEG2':
            images.append((tag.CharacterID, tag.ImageData))
    # highest character id first; the shared tables (if the file has any) go
    # in front of each image, otherwise the image data is used on its own
    # instead of failing on ``None + data``
    images = [data if base is None else base + data
              for _, data in sorted(images, reverse=True)]

    # get the last DefineSprite
    defsprite = None
    for tag in swf.tags:
        if tag.name == 'DefineSprite':
            defsprite = tag
    if defsprite is None:
        # explicit check on the right variable (an ``assert`` would also be
        # stripped when running with -O)
        raise ValueError("DefineSprite not found")

    # get the actions
    doaction = defsprite.ControlTags[0]
    for act in doaction.Actions:
        if act.name == 'ActionConstantPool':
            break
    else:
        if len(images) < 3:
            # not enough images and no constant pool: a non-programs swf!
            return []
        raise ValueError("No ActionConstantPool found!")

    # do some magic to retrieve the texts: ask for group 1, 2, 3... until
    # the extractor cannot complete a group
    cpe = _ConstantPoolExtractor(act.ConstantPool, doaction.Actions)
    i = 0
    all_vals = []
    while True:
        i += 1
        name = 'titulo%d1' % i
        occup = 'titulo%d2' % i
        bio = 'htmlText'
        date = 'titulo%d3' % i
        vals = cpe.get(name, occup, bio, date)
        if vals is None:
            break

        # useful if have a real name, otherwise it's a warning or something
        if vals[name]:
            all_vals.append((vals[name], vals[occup], vals[bio], vals[date]))

    items = []
    for i, (name, occup, bio, date) in enumerate(all_vals):
        date = _fix_date(date)
        if date is None:
            # entries without a usable date are dropped, but they still
            # count for the positional image index
            continue
        occup = _fix_occup(occup)
        bio = _fix_bio(bio)
        name = _fix_name(name)

        # use the corresponding image, or through the custom order
        if custom_order is None:
            idx = i
        else:
            try:
                idx = custom_order.index(name)
            except ValueError:
                # name not present in the custom order: skip the entry
                continue
        image = images[idx]

        ep = Episode(name=name, occup=occup, bio=bio, image=image, date=date)
        items.append(ep)
    return items
if __name__ == '__main__':
    # Manual entry point: scrape the SWF given as the only argument and save
    # the image of every episode found into the current directory.
    if len(sys.argv) != 2:
        print("Usage: scrapers_dqsv.py file.swf")
        # sys.exit() instead of the exit() helper, which is injected by the
        # site module and is not guaranteed to exist; exit status unchanged
        sys.exit()

    custom_order = None
    with open(sys.argv[1], 'rb') as fh:
        episodes = scrap(fh, custom_order)

    for i, ep in enumerate(episodes):
        print("Saving img {} for {}".format(i, ep.name))
        with open("scraper-img-{}.jpeg".format(i), "wb") as img_fh:
            img_fh.write(ep.image)
|