1
#------------------------------------------------------------------
2
# $Id: RestrictionEnzyme.pm,v 1.35.4.1 2006/10/02 23:10:32 sendu Exp $
4
# BioPerl module Bio::Tools::RestrictionEnzyme
6
# Cared for by Steve Chervitz <sac@bioperl.org>
8
# You may distribute this module under the same terms as perl itself
9
#------------------------------------------------------------------
15
Bio::Tools::RestrictionEnzyme - Bioperl object for a restriction endonuclease
16
(cuts DNA at specific locations)
18
DEPRECATED. Please use the Bio::Restriction modules instead.
22
use Bio::Tools::RestrictionEnzyme;
24
## Create a new object by name.
26
$re1 = Bio::Tools::RestrictionEnzyme->new(-NAME =>'EcoRI');
28
## Create a new object using special syntax
29
## which specifies the enzyme name, recognition site, and cut position.
30
## Used for enzymes not known to this module.
32
$re2 = Bio::Tools::RestrictionEnzyme->new(-NAME =>'EcoRV--GAT^ATC',
35
## Get a list of the resulting fragments when a sequence is cut with
36
## the given enzyme. The method expects a Bio::Seq object.
38
@fragments = $re2->cut_seq($seqobj);
40
## Get a list of names of all available restriction enzymes
41
## known to this module.
43
@all = $re->available_list();
45
## Get the names of restriction enzymes that have 6 bp
46
## recognition sequences.
48
@sixcutters = $re->available_list(6);
52
The Bio::Tools::RestrictionEnzyme.pm module encapsulates generic data and
53
methods for using restriction endonucleases for in silico restriction
54
analysis of DNA sequences.
58
This module is a precursor for a more full featured version that may do such
59
things as download data from online databases such as REBASE L<http://rebase.neb.com/>.
60
Thus, there is currently no functionality for obtaining data regarding commercial
61
availability of a restriction enzyme.
63
At some point in the future, it may make sense to derive RestrictionEnzymes
64
from a class such as Bio::Enzyme or Bio::Prot::Protein (neither of which now
65
exist) so that more data about the enzyme and related information can be
68
This module is currently in use at L<http://genome-www.stanford.edu/Sacch3D/analysis/>.
70
=head2 Digesting on Runs of N
72
To digest a sequence on runs of N's in the sequence, here's what you can do:
74
$re_n = Bio::Tools::RestrictionEnzyme->new(-name=>'N--NNNNN',
77
Specify the number of N's you want to match in the -name parameter.
78
So the above example will recognize and cut at runs of 5 Ns.
79
If you wanted to cut at runs of 10 N's, you would use
81
-name => 'N--NNNNNNNNNN'
83
Note that you must use a specific number of N's; you cannot use a regexp to
84
digest at N+ for example, because the actual number of N's at each site is
85
not recorded when the sequence is analyzed. So cut_locations( ) wouldn't be
90
See the script C<examples/restriction.pl> in the Bioperl distribution.
94
Bio::Tools::RestrictionEnzyme is a concrete class that inherits from
95
L<Bio::Root::Root> and uses by delegation L<Bio::PrimarySeq>.
101
User feedback is an integral part of the evolution of this and other Bioperl
102
modules. Send your comments and suggestions preferably to one of the Bioperl
103
mailing lists. Your participation is much appreciated.
105
bioperl-l@bioperl.org - General discussion
106
http://bioperl.org/wiki/Mailing_lists - About the mailing lists
108
=head2 Reporting Bugs
110
Report bugs to the Bioperl bug tracking system to help us keep track
111
of the bugs and their resolution. Bug reports can be submitted via the
114
http://bugzilla.open-bio.org/
118
Steve Chervitz, E<lt>sac-at-bioperl.orgE<gt>
122
Copyright (c) 1997-2002 Steve A. Chervitz. All Rights Reserved.
123
This module is free software; you can redistribute it and/or
124
modify it under the same terms as Perl itself.
136
#### END of main POD documentation.
144
Methods beginning with a leading underscore are considered private
145
and are intended for internal use by this module. They are
146
B<not> considered part of the public interface and are described here
147
for documentation purposes only.
152
package Bio::Tools::RestrictionEnzyme;
155
use vars qw (@EXPORT_OK %EXPORT_TAGS @RE_available);
157
use base qw(Bio::Root::Root Exporter);
158
@EXPORT_OK = qw(@RE_available);
159
%EXPORT_TAGS = ( std => [qw(@RE_available)] );
161
# Generated from REBASE version 208 (strider format), dated Aug 1 2002
162
# using scripts/contributed/rebase2list.pl
163
# Syntax: RE-name => 'SITE CUTS-AT' where SITE and CUTS-AT are separated
167
'AasI' => 'GACNNNNNNGTC 7',
168
'AatI' => 'AGGCCT 3',
169
'AatII' => 'GACGTC 5',
170
'AauI' => 'TGTACA 1',
171
'AccI' => 'GTMKAC 2',
173
'AccIII' => 'TCCGGA 1',
174
'Acc16I' => 'TGCGCA 3',
175
'Acc65I' => 'GGTACC 1',
176
'Acc113I' => 'AGTACT 3',
177
'AccB1I' => 'GGYRCC 1',
178
'AccB7I' => 'CCANNNNNTGG 7',
179
'AclI' => 'AACGTT 2',
180
'AcsI' => 'RAATTY 1',
181
'AcvI' => 'CACGTG 3',
182
'AcyI' => 'GRCGYC 2',
183
'AdeI' => 'CACNNNGTG 6',
185
'AfeI' => 'AGCGCT 3',
187
'AflII' => 'CTTAAG 1',
188
'AflIII' => 'ACRYGT 1',
189
'AgeI' => 'ACCGGT 1',
190
'AhaIII' => 'TTTAAA 3',
191
'AhdI' => 'GACNNNNNGTC 6',
192
'AhlI' => 'ACTAGT 1',
193
'AleI' => 'CACNNNNGTG 5',
195
'Alw21I' => 'GWGCWC 5',
196
'Alw44I' => 'GTGCAC 1',
197
'AlwNI' => 'CAGNNNCTG 6',
198
'Ama87I' => 'CYCGRG 1',
199
'AocI' => 'CCTNAGG 2',
200
'Aor51HI' => 'AGCGCT 3',
201
'ApaI' => 'GGGCCC 5',
202
'ApaBI' => 'GCANNNNNTGC 8',
203
'ApaLI' => 'GTGCAC 1',
204
'ApoI' => 'RAATTY 1',
205
'AscI' => 'GGCGCGCC 2',
206
'AseI' => 'ATTAAT 2',
207
'AsiAI' => 'ACCGGT 1',
208
'AsiSI' => 'GCGATCGC 5',
209
'AsnI' => 'ATTAAT 2',
210
'AspI' => 'GACNNNGTC 4',
211
'Asp700I' => 'GAANNNNTTC 5',
212
'Asp718I' => 'GGTACC 1',
213
'AspEI' => 'GACNNNNNGTC 6',
214
'AspHI' => 'GWGCWC 5',
215
'AspLEI' => 'GCGC 3',
216
'AspS9I' => 'GGNCC 1',
218
'AsuII' => 'TTCGAA 2',
219
'AsuC2I' => 'CCSGG 2',
220
'AsuNHI' => 'GCTAGC 1',
221
'AvaI' => 'CYCGRG 1',
222
'AvaII' => 'GGWCC 1',
223
'AviII' => 'TGCGCA 3',
224
'AvrII' => 'CCTAGG 1',
225
'AxyI' => 'CCTNAGG 2',
226
'BalI' => 'TGGCCA 3',
227
'BamHI' => 'GGATCC 1',
228
'BanI' => 'GGYRCC 1',
229
'BanII' => 'GRGCYC 5',
230
'BanIII' => 'ATCGAT 2',
231
'BbeI' => 'GGCGCC 5',
232
'BbrPI' => 'CACGTG 3',
233
'BbuI' => 'GCATGC 5',
234
'Bbv12I' => 'GWGCWC 5',
235
'BclI' => 'TGATCA 1',
237
'BcoI' => 'CYCGRG 1',
238
'BcuI' => 'ACTAGT 1',
239
'BetI' => 'WCCGGW 1',
241
'BfmI' => 'CTRYAG 1',
242
'BfrI' => 'CTTAAG 1',
243
'BfrBI' => 'ATGCAT 3',
245
'BglI' => 'GCCNNNNNGGC 7',
246
'BglII' => 'AGATCT 1',
247
'BlnI' => 'CCTAGG 1',
248
'BloHII' => 'CTGCAG 5',
249
'BlpI' => 'GCTNAGC 2',
250
'Bme18I' => 'GGWCC 1',
251
'Bme1390I' => 'CCNGG 2',
252
'Bme1580I' => 'GKGCMC 5',
253
'BmtI' => 'GCTAGC 5',
254
'BmyI' => 'GDGCHC 5',
255
'BoxI' => 'GACNNNNGTC 5',
256
'Bpu14I' => 'TTCGAA 2',
257
'Bpu1102I' => 'GCTNAGC 2',
258
'Bsa29I' => 'ATCGAT 2',
259
'BsaAI' => 'YACGTR 3',
260
'BsaBI' => 'GATNNNNATC 5',
261
'BsaHI' => 'GRCGYC 2',
262
'BsaJI' => 'CCNNGG 1',
263
'BsaOI' => 'CGRYCG 4',
264
'BsaWI' => 'WCCGGW 1',
265
'BscI' => 'ATCGAT 2',
266
'Bsc4I' => 'CCNNNNNNNGG 7',
267
'BscBI' => 'GGNNCC 3',
269
'Bse8I' => 'GATNNNNATC 5',
270
'Bse21I' => 'CCTNAGG 2',
271
'Bse118I' => 'RCCGGY 1',
272
'BseAI' => 'TCCGGA 1',
273
'BseBI' => 'CCWGG 2',
274
'BseCI' => 'ATCGAT 2',
275
'BseDI' => 'CCNNGG 1',
276
'BseJI' => 'GATNNNNATC 5',
277
'BseLI' => 'CCNNNNNNNGG 7',
278
'BsePI' => 'GCGCGC 1',
279
'BseSI' => 'GKGCMC 5',
280
'BseX3I' => 'CGGCCG 1',
282
'Bsh1236I' => 'CGCG 2',
283
'Bsh1285I' => 'CGRYCG 4',
285
'BshNI' => 'GGYRCC 1',
286
'BshTI' => 'ACCGGT 1',
287
'BsiBI' => 'GATNNNNATC 5',
288
'BsiCI' => 'TTCGAA 2',
289
'BsiEI' => 'CGRYCG 4',
290
'BsiHKAI' => 'GWGCWC 5',
291
'BsiHKCI' => 'CYCGRG 1',
292
'BsiLI' => 'CCWGG 2',
293
'BsiMI' => 'TCCGGA 1',
294
'BsiQI' => 'TGATCA 1',
296
'BsiWI' => 'CGTACG 1',
297
'BsiXI' => 'ATCGAT 2',
298
'BsiYI' => 'CCNNNNNNNGG 7',
299
'BsiZI' => 'GGNCC 1',
300
'BslI' => 'CCNNNNNNNGG 7',
301
'BsoBI' => 'CYCGRG 1',
302
'Bsp13I' => 'TCCGGA 1',
303
'Bsp19I' => 'CCATGG 1',
304
'Bsp68I' => 'TCGCGA 3',
305
'Bsp106I' => 'ATCGAT 2',
306
'Bsp119I' => 'TTCGAA 2',
307
'Bsp120I' => 'GGGCCC 1',
308
'Bsp143I' => 'GATC 0',
309
'Bsp143II' => 'RGCGCY 5',
310
'Bsp1286I' => 'GDGCHC 5',
311
'Bsp1407I' => 'TGTACA 1',
312
'Bsp1720I' => 'GCTNAGC 2',
313
'BspA2I' => 'CCTAGG 1',
314
'BspCI' => 'CGATCG 4',
315
'BspDI' => 'ATCGAT 2',
316
'BspEI' => 'TCCGGA 1',
317
'BspHI' => 'TCATGA 1',
318
'BspLI' => 'GGNNCC 3',
319
'BspLU11I' => 'ACATGT 1',
320
'BspMII' => 'TCCGGA 1',
321
'BspTI' => 'CTTAAG 1',
322
'BspT104I' => 'TTCGAA 2',
323
'BspT107I' => 'GGYRCC 1',
324
'BspXI' => 'ATCGAT 2',
325
'BsrBRI' => 'GATNNNNATC 5',
326
'BsrFI' => 'RCCGGY 1',
327
'BsrGI' => 'TGTACA 1',
328
'BssAI' => 'RCCGGY 1',
329
'BssECI' => 'CCNNGG 1',
330
'BssHI' => 'CTCGAG 1',
331
'BssHII' => 'GCGCGC 1',
332
'BssKI' => 'CCNGG 0',
333
'BssNAI' => 'GTATAC 3',
334
'BssT1I' => 'CCWWGG 1',
335
'Bst98I' => 'CTTAAG 1',
336
'Bst1107I' => 'GTATAC 3',
337
'BstACI' => 'GRCGYC 2',
338
'BstAPI' => 'GCANNNNNTGC 7',
339
'BstBI' => 'TTCGAA 2',
340
'BstBAI' => 'YACGTR 3',
341
'Bst4CI' => 'ACNGT 3',
342
'BstC8I' => 'GCNNGC 3',
343
'BstDEI' => 'CTNAG 1',
344
'BstDSI' => 'CCRYGG 1',
345
'BstEII' => 'GGTNACC 1',
346
'BstENI' => 'CCTNNNNNAGG 5',
347
'BstENII' => 'GATC 0',
348
'BstFNI' => 'CGCG 2',
349
'BstH2I' => 'RGCGCY 5',
350
'BstHHI' => 'GCGC 3',
351
'BstHPI' => 'GTTAAC 3',
352
'BstKTI' => 'GATC 3',
353
'BstMAI' => 'CTGCAG 5',
354
'BstMCI' => 'CGRYCG 4',
355
'BstMWI' => 'GCNNNNNNNGC 7',
356
'BstNI' => 'CCWGG 2',
357
'BstNSI' => 'RCATGY 5',
358
'BstOI' => 'CCWGG 2',
359
'BstPI' => 'GGTNACC 1',
360
'BstPAI' => 'GACNNNNGTC 5',
361
'BstSCI' => 'CCNGG 0',
362
'BstSFI' => 'CTRYAG 1',
363
'BstSNI' => 'TACGTA 3',
365
'Bst2UI' => 'CCWGG 2',
366
'BstXI' => 'CCANNNNNNTGG 8',
367
'BstX2I' => 'RGATCY 1',
368
'BstYI' => 'RGATCY 1',
369
'BstZI' => 'CGGCCG 1',
370
'BstZ17I' => 'GTATAC 3',
371
'Bsu15I' => 'ATCGAT 2',
372
'Bsu36I' => 'CCTNAGG 2',
374
'BsuTUI' => 'ATCGAT 2',
375
'BtgI' => 'CCRYGG 1',
376
'BthCI' => 'GCNGC 4',
377
'Cac8I' => 'GCNNGC 3',
378
'CaiI' => 'CAGNNNCTG 6',
379
'CauII' => 'CCSGG 2',
380
'CciNI' => 'GCGGCCGC 2',
381
'CelII' => 'GCTNAGC 2',
383
'CfrI' => 'YGGCCR 1',
384
'Cfr9I' => 'CCCGGG 1',
385
'Cfr10I' => 'RCCGGY 1',
386
'Cfr13I' => 'GGNCC 1',
387
'Cfr42I' => 'CCGCGG 4',
389
'ClaI' => 'ATCGAT 2',
390
'CpoI' => 'CGGWCCG 2',
391
'CspI' => 'CGGWCCG 2',
393
'Csp45I' => 'TTCGAA 2',
394
'CspAI' => 'ACCGGT 1',
395
'CviAII' => 'CATG 1',
399
'CvnI' => 'CCTNAGG 2',
403
'DraI' => 'TTTAAA 3',
404
'DraII' => 'RGGNCCY 2',
405
'DraIII' => 'CACNNNGTG 6',
406
'DrdI' => 'GACNNNNNNGTC 7',
407
'DsaI' => 'CCRYGG 1',
408
'DseDI' => 'GACNNNNNNGTC 7',
409
'EaeI' => 'YGGCCR 1',
410
'EagI' => 'CGGCCG 1',
411
'Eam1105I' => 'GACNNNNNGTC 6',
412
'Ecl136II' => 'GAGCTC 3',
413
'EclHKI' => 'GACNNNNNGTC 6',
414
'EclXI' => 'CGGCCG 1',
415
'Eco24I' => 'GRGCYC 5',
416
'Eco32I' => 'GATATC 3',
417
'Eco47I' => 'GGWCC 1',
418
'Eco47III' => 'AGCGCT 3',
419
'Eco52I' => 'CGGCCG 1',
420
'Eco72I' => 'CACGTG 3',
421
'Eco81I' => 'CCTNAGG 2',
422
'Eco88I' => 'CYCGRG 1',
423
'Eco91I' => 'GGTNACC 1',
424
'Eco105I' => 'TACGTA 3',
425
'Eco130I' => 'CCWWGG 1',
426
'Eco147I' => 'AGGCCT 3',
427
'EcoHI' => 'CCSGG 0',
428
'EcoICRI' => 'GAGCTC 3',
429
'EcoNI' => 'CCTNNNNNAGG 5',
430
'EcoO65I' => 'GGTNACC 1',
431
'EcoO109I' => 'RGGNCCY 2',
432
'EcoRI' => 'GAATTC 1',
433
'EcoRII' => 'CCWGG 0',
434
'EcoRV' => 'GATATC 3',
435
'EcoT14I' => 'CCWWGG 1',
436
'EcoT22I' => 'ATGCAT 5',
437
'EcoT38I' => 'GRGCYC 5',
438
'EgeI' => 'GGCGCC 3',
439
'EheI' => 'GGCGCC 3',
440
'ErhI' => 'CCWWGG 1',
441
'EsaBC3I' => 'TCGA 2',
442
'EspI' => 'GCTNAGC 2',
444
'FauNDI' => 'CATATG 2',
445
'FbaI' => 'TGATCA 1',
446
'FblI' => 'GTMKAC 2',
448
'FnuDII' => 'CGCG 2',
449
'Fnu4HI' => 'GCNGC 2',
450
'FriOI' => 'GRGCYC 5',
451
'FseI' => 'GGCCGGCC 6',
452
'FspI' => 'TGCGCA 3',
453
'FspAI' => 'RTGCGCAY 4',
454
'Fsp4HI' => 'GCNGC 2',
455
'FunI' => 'AGCGCT 3',
456
'FunII' => 'GAATTC 1',
457
'HaeI' => 'WGGCCW 3',
458
'HaeII' => 'RGCGCY 5',
459
'HaeIII' => 'GGCC 2',
461
'HgiAI' => 'GWGCWC 5',
462
'HgiCI' => 'GGYRCC 1',
463
'HgiJII' => 'GRGCYC 5',
465
'Hin1I' => 'GRCGYC 2',
467
'HinP1I' => 'GCGC 1',
468
'HincII' => 'GTYRAC 3',
469
'HindII' => 'GTYRAC 3',
470
'HindIII' => 'AAGCTT 1',
471
'HinfI' => 'GANTC 1',
472
'HpaI' => 'GTTAAC 3',
474
'Hpy8I' => 'GTNNAC 3',
475
'Hpy99I' => 'CGWCG 5',
476
'Hpy178III' => 'TCNNGA 2',
477
'Hpy188I' => 'TCNGA 3',
478
'Hpy188III' => 'TCNNGA 2',
479
'HpyCH4I' => 'CATG 3',
480
'HpyCH4III' => 'ACNGT 3',
481
'HpyCH4IV' => 'ACGT 1',
482
'HpyCH4V' => 'TGCA 2',
483
'HpyF10VI' => 'GCNNNNNNNGC 8',
484
'Hsp92I' => 'GRCGYC 2',
485
'Hsp92II' => 'CATG 4',
488
'KasI' => 'GGCGCC 1',
489
'KpnI' => 'GGTACC 5',
490
'Kpn2I' => 'TCCGGA 1',
491
'KspI' => 'CCGCGG 4',
492
'Ksp22I' => 'TGATCA 1',
493
'KspAI' => 'GTTAAC 3',
495
'LpnI' => 'RGCGCY 3',
496
'LspI' => 'TTCGAA 2',
497
'MabI' => 'ACCWGGT 1',
500
'MaeIII' => 'GTNAC 0',
501
'MamI' => 'GATNNNNATC 5',
503
'McrI' => 'CGRYCG 4',
504
'MfeI' => 'CAATTG 1',
505
'MflI' => 'RGATCY 1',
506
'MhlI' => 'GDGCHC 5',
507
'MlsI' => 'TGGCCA 3',
508
'MluI' => 'ACGCGT 1',
509
'MluNI' => 'TGGCCA 3',
510
'Mly113I' => 'GGCGCC 2',
511
'Mph1103I' => 'ATGCAT 5',
512
'MroI' => 'TCCGGA 1',
513
'MroNI' => 'GCCGGC 1',
514
'MroXI' => 'GAANNNNTTC 5',
515
'MscI' => 'TGGCCA 3',
517
'MslI' => 'CAYNNNNRTG 5',
519
'Msp20I' => 'TGGCCA 3',
520
'MspA1I' => 'CMGCKG 3',
521
'MspCI' => 'CTTAAG 1',
522
'MspR9I' => 'CCNGG 2',
523
'MssI' => 'GTTTAAAC 4',
524
'MstI' => 'TGCGCA 3',
525
'MunI' => 'CAATTG 1',
528
'MwoI' => 'GCNNNNNNNGC 7',
529
'NaeI' => 'GCCGGC 3',
530
'NarI' => 'GGCGCC 2',
532
'NcoI' => 'CCATGG 1',
533
'NdeI' => 'CATATG 2',
535
'NgoAIV' => 'GCCGGC 1',
536
'NgoMIV' => 'GCCGGC 1',
537
'NheI' => 'GCTAGC 1',
538
'NlaIII' => 'CATG 4',
539
'NlaIV' => 'GGNNCC 3',
540
'Nli3877I' => 'CYCGRG 5',
541
'NmuCI' => 'GTSAC 0',
542
'NotI' => 'GCGGCCGC 2',
543
'NruI' => 'TCGCGA 3',
544
'NruGI' => 'GACNNNNNGTC 6',
545
'NsbI' => 'TGCGCA 3',
546
'NsiI' => 'ATGCAT 5',
547
'NspI' => 'RCATGY 5',
548
'NspIII' => 'CYCGRG 1',
549
'NspV' => 'TTCGAA 2',
550
'NspBII' => 'CMGCKG 3',
551
'OliI' => 'CACNNNNGTG 5',
552
'PacI' => 'TTAATTAA 5',
553
'PaeI' => 'GCATGC 5',
554
'PaeR7I' => 'CTCGAG 1',
555
'PagI' => 'TCATGA 1',
557
'PauI' => 'GCGCGC 1',
558
'PceI' => 'AGGCCT 3',
559
'PciI' => 'ACATGT 1',
560
'PdiI' => 'GCCGGC 3',
561
'PdmI' => 'GAANNNNTTC 5',
562
'Pfl23II' => 'CGTACG 1',
563
'PflBI' => 'CCANNNNNTGG 7',
564
'PflFI' => 'GACNNNGTC 4',
565
'PflMI' => 'CCANNNNNTGG 7',
566
'PfoI' => 'TCCNGGA 1',
567
'PinAI' => 'ACCGGT 1',
568
'Ple19I' => 'CGATCG 4',
569
'PmaCI' => 'CACGTG 3',
570
'PmeI' => 'GTTTAAAC 4',
571
'PmlI' => 'CACGTG 3',
572
'Ppu10I' => 'ATGCAT 1',
573
'PpuMI' => 'RGGWCCY 2',
574
'PpuXI' => 'RGGWCCY 2',
575
'PshAI' => 'GACNNNNGTC 5',
576
'PshBI' => 'ATTAAT 2',
577
'PsiI' => 'TTATAA 3',
578
'Psp03I' => 'GGWCC 4',
579
'Psp5II' => 'RGGWCCY 2',
580
'Psp6I' => 'CCWGG 0',
581
'Psp1406I' => 'AACGTT 2',
582
'PspAI' => 'CCCGGG 1',
583
'Psp124BI' => 'GAGCTC 5',
584
'PspEI' => 'GGTNACC 1',
585
'PspGI' => 'CCWGG 0',
586
'PspLI' => 'CGTACG 1',
587
'PspN4I' => 'GGNNCC 3',
588
'PspOMI' => 'GGGCCC 1',
589
'PspPI' => 'GGNCC 1',
590
'PspPPI' => 'RGGWCCY 2',
591
'PssI' => 'RGGNCCY 5',
592
'PstI' => 'CTGCAG 5',
593
'PsuI' => 'RGATCY 1',
594
'PsyI' => 'GACNNNGTC 4',
595
'PvuI' => 'CGATCG 4',
596
'PvuII' => 'CAGCTG 3',
597
'RcaI' => 'TCATGA 1',
599
'RsrII' => 'CGGWCCG 2',
600
'Rsr2I' => 'CGGWCCG 2',
601
'SacI' => 'GAGCTC 5',
602
'SacII' => 'CCGCGG 4',
603
'SalI' => 'GTCGAC 1',
604
'SanDI' => 'GGGWCCC 2',
606
'SauI' => 'CCTNAGG 2',
607
'Sau96I' => 'GGNCC 1',
608
'Sau3AI' => 'GATC 0',
609
'SbfI' => 'CCTGCAGG 6',
610
'ScaI' => 'AGTACT 3',
611
'SciI' => 'CTCGAG 3',
612
'ScrFI' => 'CCNGG 2',
613
'SdaI' => 'CCTGCAGG 6',
614
'SduI' => 'GDGCHC 5',
615
'SecI' => 'CCNNGG 1',
617
'SexAI' => 'ACCWGGT 1',
618
'SfcI' => 'CTRYAG 1',
619
'SfeI' => 'CTRYAG 1',
620
'SfiI' => 'GGCCNNNNNGGCC 8',
621
'SfoI' => 'GGCGCC 3',
622
'Sfr274I' => 'CTCGAG 1',
623
'Sfr303I' => 'CCGCGG 4',
624
'SfuI' => 'TTCGAA 2',
625
'SgfI' => 'GCGATCGC 5',
626
'SgrAI' => 'CRCCGGYG 2',
627
'SgrBI' => 'CCGCGG 4',
629
'SlaI' => 'CTCGAG 1',
630
'SmaI' => 'CCCGGG 3',
631
'SmiI' => 'ATTTAAAT 4',
632
'SmiMI' => 'CAYNNNNRTG 5',
633
'SmlI' => 'CTYRAG 1',
634
'SnaBI' => 'TACGTA 3',
635
'SpaHI' => 'GCATGC 5',
636
'SpeI' => 'ACTAGT 1',
637
'SphI' => 'GCATGC 5',
638
'SplI' => 'CGTACG 1',
639
'SrfI' => 'GCCCGGGC 4',
641
'Sse232I' => 'CGCCGGCG 2',
642
'Sse8387I' => 'CCTGCAGG 6',
643
'Sse8647I' => 'AGGWCCT 2',
644
'SseBI' => 'AGGCCT 3',
645
'SspI' => 'AATATT 3',
646
'SspBI' => 'TGTACA 1',
647
'SstI' => 'GAGCTC 5',
648
'SstII' => 'CCGCGG 4',
649
'StuI' => 'AGGCCT 3',
650
'StyI' => 'CCWWGG 1',
651
'SunI' => 'CGTACG 1',
652
'SwaI' => 'ATTTAAAT 4',
657
'TatI' => 'WGTACW 1',
659
'TelI' => 'GACNNNGTC 4',
662
'TliI' => 'CTCGAG 1',
667
'Tsp45I' => 'GTSAC 0',
668
'Tsp509I' => 'AATT 0',
669
'Tsp4CI' => 'ACNGT 3',
671
'Tth111I' => 'GACNNNGTC 4',
672
'TthHB8I' => 'TCGA 1',
674
'Van91I' => 'CCANNNNNTGG 7',
675
'Vha464I' => 'CTTAAG 1',
676
'VneI' => 'GTGCAC 1',
677
'VpaK11AI' => 'GGWCC 0',
678
'VpaK11BI' => 'GGWCC 1',
679
'VspI' => 'ATTAAT 2',
680
'XagI' => 'CCTNNNNNAGG 5',
681
'XapI' => 'RAATTY 1',
682
'XbaI' => 'TCTAGA 1',
683
'XceI' => 'RCATGY 5',
684
'XcmI' => 'CCANNNNNNNNNTGG 8',
685
'XhoI' => 'CTCGAG 1',
686
'XhoII' => 'RGATCY 1',
687
'XmaI' => 'CCCGGG 1',
688
'XmaIII' => 'CGGCCG 1',
689
'XmaCI' => 'CCCGGG 1',
690
'XmaJI' => 'CCTAGG 1',
691
'XmiI' => 'GTMKAC 2',
692
'XmnI' => 'GAANNNNTTC 5',
694
'ZhoI' => 'ATCGAT 2',
695
'ZraI' => 'GACGTC 3',
696
'Zsp2I' => 'ATGCAT 5',
699
# Names of every enzyme keyed in %RE, in Perl's default (string) sort order.
# This array is exportable on request (see @EXPORT_OK and the 'std' tag).
@RE_available = sort keys %RE;
705
Purpose : Initializes the RestrictionEnzyme object and calls
706
: superclass constructor last (Bio::Seq.pm).
708
Argument : Parameters passed to new()
709
Comments : A RestrictionEnzyme object manages its recognition sequence
710
: as a Bio::PrimarySeq object.
712
See Also : L<_make_custom>(), L<_make_standard>(), L<Bio::PrimarySeq.pm::_initialize()>
719
my($class, @args) = @_;
720
my $self = $class->SUPER::new(@args);
721
$self->warn("Use of Bio::Tools::RestrictionEnzyme is deprecated".
722
"Use Bio::Restriction classes instead");
723
my ($name,$make) = $self->_rearrange([qw(NAME MAKE)],@args);
725
$name && $self->name($name);
727
if(defined $make && $make eq 'custom') {
728
%data = $self->_make_custom($name);
730
%data = $self->_make_standard($name);
732
$self->{'_seq'} = new Bio::PrimarySeq(%data,
733
-VERBOSE =>$self->verbose,
740
#=head1 _make_standard
742
# Title : _make_standard
743
# Usage : n/a; automatically called by _initialize()
744
# Purpose : Permits standard RE object construction from the name of an enzyme known to this module.
746
# Returns : Hash containing named parameters for Bio::PrimarySeq.pm constructor.
747
# Argument : String containing string with special syntax.
748
# Throws : Exception if the requested enzyme name is unavailable.
749
# : NOTE: Case sensitive.
751
#See Also : L<Bio::PrimarySeq::_initialize()|Bio::PrimarySeq>, L<_make_custom()|_make_custom>
758
my($self, $name) = @_;
760
$name =~ s/^\s+|\s+$//g;
762
$self->is_available($name) ||
763
$self->throw("Unavailable or undefined enzyme: $name (Note: CASE SENSITIVE)\n" .
764
"Currently available enzymes: \n@RE_available\n");
766
my @data = split( ' ', $RE{$name});
768
$dat{-SEQ} = $data[0];
769
$dat{-NAME} = $dat{-ID}= $name;
770
$self->{'_cuts_after'} = $data[1];
778
# Title : _make_custom
779
# Usage : n/a; automatically called by _initialize()
780
# Purpose : Permits custom RE object construction from strings
781
# : such as 'EcoRI--G^AATTC' as the name of the enzyme.
782
# Returns : Hash containing named parameters for Bio::PrimarySeq.pm constructor.
783
# Argument : String containing string with special syntax.
784
# Throws : Exception if the string has bad syntax.
785
# : Warning if the string did not specify cut position.
786
# : Places cut site after 5'-most position.
788
#See Also : L<Bio::PrimarySeq::_initialize()|Bio::PrimarySeq>
796
my($self, $name) = @_;
799
my @parts = split '--', $name;
801
$dat{-NAME} = $dat{-ID} = $parts[0];
802
$self->name($parts[0]); ## Reset name
804
$parts[1] || return $self->throw("Undefined recognition site for $parts[0].",
805
"Use this syntax: EcoRV--GAT^ATC");
806
## Determine the cuts_after point.
807
my $cut_index = index $parts[1], '^';
808
if( $cut_index <0) { $cut_index = 0;
809
$self->warn("Unknown cut position for $parts[0]. Assuming position 0\n" .
810
"Use carat to specify cut position (e.g., G^AATTC)"); }
811
$self->{'_cuts_after'} = $cut_index;
813
## Save the recognition sequence after removing the '^'
814
$parts[1] =~ s/\^//g;
815
$dat{-SEQ} = $parts[1];
823
Usage : $num = $re->cuts_after();
824
Purpose : Sets/Gets an integer indicating the position of cleavage
825
: relative to the 5' end of the recognition sequence.
827
Argument : Integer (optional)
828
Throws : Exception if argument is non-numeric.
830
Comments : This method is only needed to change the cuts at
831
: position. This data is automatically set during
834
See Also : L<_make_standard()|_make_standard>, L<_make_custom()|_make_custom>
843
if(@_) { my $num = shift;
844
if($num == 0 and $num ne '0') {
845
$self->throw("The cuts_after position be an integer ($num)");
847
$self->{'_cuts_after'} = $num;
849
$self->{'_cuts_after'};
858
Purpose : Gets the recognition sequence for the enzyme.
859
Example : $seq_string = $re->site();
860
Returns : String containing recognition sequence indicating
861
: cleavage site as in 'G^AATTC'.
864
Comments : If you want a simple string representing the site without
865
any '^', use the string() method.
867
See Also : L<string()|string>
875
my $seq = $self->seq;
876
my $cuts_after = $self->cuts_after;
877
if($cuts_after > 0) {
878
if( $cuts_after >= $seq->length) {
879
return $seq->seq.'^';
881
return $seq->subseq(1, $self->cuts_after).'^'.$seq->subseq($self->cuts_after+1, $seq->length);
893
Purpose : Get the Bio::PrimarySeq.pm-derived object representing
894
: the recognition sequence
899
See Also : L<string()|string>, L<revcom()|revcom>
904
# Accessor for the recognition-sequence object: hands back whatever is
# stored under the '_seq' key of this object (undef if nothing is stored).
sub seq {
    my ($self) = @_;
    return $self->{'_seq'};
}
912
Usage : $re->string();
913
Purpose : Get a string representing the recognition sequence.
914
Returns : String. Does NOT contain a '^' representing the cut location
915
as returned by the site() method
918
Comments : Delegates to the Bio::PrimarySeq-derived object.
920
See Also : L<seq()|seq>, L<site()|site>, L<revcom()|revcom>
925
# Returns the recognition sequence as reported by the seq() method of the
# object stored under '_seq'. No cut marker is added here.
sub string {
    my ($self) = @_;
    return $self->{'_seq'}->seq;
}
933
Usage : $re->revcom();
934
Purpose : Get a string representing the reverse complement of
935
: the recognition sequence.
939
Comments : Delegates to the Bio::PrimarySeq.pm-derived object, but needs to
940
get out the string from it, as now Bio::PrimarySeq->revcom makes a
941
Bio::PrimarySeq object
943
See Also : L<seq()|seq>, L<string()|string>
948
# Returns the reverse complement of the recognition sequence as a string.
# The stored '_seq' object's revcom() yields another object, so its seq()
# is called to extract the plain string.
sub revcom {
    my ($self) = @_;
    my $rc_obj = $self->{'_seq'}->revcom;
    return $rc_obj->seq();
}
956
Usage : $re->cut_seq(<sequence object>);
957
Purpose : Conceptually cut or "digest" a DNA sequence with the given enzyme.
958
Example : $string = $re->cut_seq(<sequence object>);
959
Returns : List of strings containing the resulting fragments.
960
Argument : Reference to a Bio::PrimarySeq.pm-derived object.
961
Throws : Exception if argument is not an object.
962
: (Does not yet verify that it is derived from Bio::PrimarySeq.pm.)
963
Comments : Strategy relies on Perl's built-in split() function.
964
: Since split removes the recognition pattern, the resulting
965
: fragments are repaired after split()-ing.
966
: A side-effect of this is that for sites with ambiguous
967
: recognition sequence (i.e., containing N), the fragments
968
: will contain ambiguity characters instead of AGCT.
970
: There is currently no support for partial digestions.
971
: There is currently no support for circular sequences.
972
: (This should just involve merging the first and last frag
973
: if $seqObj->is_circular returns true).
981
my( $self, $seqObj) = @_;
983
! $seqObj->isa('Bio::PrimarySeqI') ) {
984
$self->throw( "Can't cut sequence. Missing or invalid object".
988
my $cuts_after = $self->{'_cuts_after'};
989
my ($site_3prime_seq, $site_5prime_seq);
990
my $reSeq = $self->seq;
991
if($cuts_after == 0) {
992
$site_3prime_seq = '';
993
$site_5prime_seq = $reSeq->seq();
994
} elsif($cuts_after == $reSeq->length) {
995
$site_3prime_seq = $reSeq->seq();
996
$site_5prime_seq = '';
998
$site_3prime_seq = $reSeq->subseq(1, $self->{'_cuts_after'});
999
$site_5prime_seq = $reSeq->subseq($self->{'_cuts_after'}+1, $reSeq->length);
1002
$self->debug("3' site: $site_3prime_seq\n5' site: $site_5prime_seq\n");
1005
my $seq = uc $self->_expanded_string;
1007
if(!$self->palindromic and $self->name ne 'N') {
1008
my $revseq = $self->_expanded_string( $reSeq->revcom->seq() );
1009
$seq .= '|'.uc($revseq);
1011
$self->debug(__PACKAGE__, ": site seq: $seq\n");
1012
$self->debug(__PACKAGE__, ": splitting ", $reSeq->seq, "\n");
1013
@re_frags = split(/$seq/i, $seqObj->seq);
1015
$self->debug(__PACKAGE__, ": cut_seq, ", scalar(@re_frags), " fragments.\n");
1017
## Re-attach the split recognition site back to the frags
1018
## since perl zapped them in the split() call.
1020
my $numFrags = scalar @re_frags;
1021
for($i=0; $i<$numFrags; $i++) {
1022
$i < $#re_frags and $re_frags[$i] = $re_frags[$i].$site_3prime_seq;
1023
$i > 0 and $re_frags[$i] = $site_5prime_seq.$re_frags[$i];
1028
=head1 cut_locations
1030
Title : cut_locations
1031
Usage : my $locations = $re->cut_locations(<sequence_object>);
1032
Purpose : Report the location of the recognition site(s) within
1033
: an input sequence.
1034
Example : my $locations = $re->cut_locations($seqObj);
1035
Returns : Arrayref of starting locations where enzyme would cut
1036
Argument : Reference to a Bio::PrimarySeqI-derived sequence object.
1045
my($self, $seqobj) = @_;
1047
my $site = $self->_expanded_string;
1048
my $seq = $seqobj->seq;
1051
while( $seq =~ /($site)/ig ) {
1052
# $` is preceding string before pattern so length returns position
1053
push @locations, length($`);
1058
# Purpose : Expand nucleotide ambiguity codes to their representative letters
1059
# Argument: (optional) the string to be expanded. If not supplied, uses
1060
# the string returned by $self->string().
1062
sub _expanded_string {
1063
my ($self, $str) = @_;
1065
$str ||= $self->string;
1067
if( $self->name ne 'N' ) {
1069
$str =~ s/R/\[AG\]/g;
1070
$str =~ s/Y/\[CT\]/g;
1071
$str =~ s/S/\[GC\]/g;
1072
$str =~ s/W/\[AT\]/g;
1073
$str =~ s/M/\[AC\]/g;
1074
$str =~ s/K/\[TG\]/g;
1075
$str =~ s/B/\[CGT\]/g;
1076
$str =~ s/D/\[AGT\]/g;
1077
$str =~ s/H/\[ACT\]/g;
1078
$str =~ s/V/\[ACG\]/g;
1086
Title : annotate_seq
1087
Usage : $re->annotate_seq(<sequence_object>);
1088
Purpose : Identify the location of the recognition site(s) within
1089
: an input sequence. Uses HTML.
1090
Example : $annot_seq = $re->annotate_seq($seqObj);
1091
Returns : String containing the annotated sequence.
1092
Argument : Reference to a Bio::PrimarySeq.pm-derived sequence object.
1094
Comments : The annotated sequence must be viewed with a web
1095
: browser to see the location(s) of the recognition site(s).
1102
my($self, $seqObj) = @_;
1104
my $site = $self->_expanded_string;
1105
my $seq = $seqObj->seq;
1107
$seq =~ s|$site|<b>$site</b>|g;
1115
Usage : $re->palindromic();
1116
Purpose : Determines if the recognition sequence is palindromic
1117
: for the current restriction enzyme.
1122
Comments : A palindromic site (EcoRI): 5-GAATTC-3
1131
$self->string eq $self->revcom;
1138
Title : is_available
1139
Usage : $re->is_available(<string containing name of enzyme>);
1140
Purpose : Determine if an enzyme is available (to this module).
1141
: (see the package lexical %RE).
1142
Example : $re->is_available('EcoRI');
1143
: &Bio::Tools::RestrictionEnzyme::is_available($object,'EcoRI');
1147
Comments : This method does NOT give information about
1148
: commercial availability (yet).
1149
: Enzyme names are CASE SENSITIVE.
1151
See Also : L<available_list()|available_list>
1158
my($self,$name) = @_;
1165
my($self,$name) = @_;
1166
$self->warn('available() is deprecated; use is_available() instead');
1167
$self->is_available($name);
1174
Usage : $obj->name($newval)
1177
Returns : value of name
1178
Args : newvalue (optional)
1184
my ($obj,$value) = @_;
1185
if( defined $value) {
1186
$obj->{'name'} = $value;
1188
return $obj->{'name'};
1192
=head1 available_list
1194
Title : available_list
1195
Usage : $re->available_list([<integer>]);
1196
Purpose : Retrieve a list of currently available enzymes.
1197
Example : @all = $re->available_list(); ## All enzymes
1198
: @six_cutters = $re->available_list(6); ## All 6-cutters
1199
Returns : List of strings
1200
Argument : Integer (optional)
1202
Comments : This method may be more appropriate for a REData.pm class.
1204
See Also : L<is_available()|is_available>
1208
#-------------------
1209
sub available_list {
1210
#-------------------
1211
my($self,$size) = @_;
1214
$size eq 'all' and return @RE_available;
1217
foreach (@RE_available) {
1218
@data = split /\s/, $RE{$_};
1219
if(length $data[0] == $size) {