← Index
NYTProf Performance Profile   « block view • line view • sub view »
For conv.pl
  Run on Sun Nov 14 22:50:31 2010
Reported on Sun Nov 14 22:51:20 2010

Filename/usr/share/perl5/MARC/Charset.pm
StatementsExecuted 34 statements in 3.05ms
Subroutines
Calls P F Exclusive
Time
Inclusive
Time
Subroutine
1114.30ms10.2msMARC::Charset::::BEGIN@11MARC::Charset::BEGIN@11
1112.30ms7.85msMARC::Charset::::BEGIN@12MARC::Charset::BEGIN@12
1111.09ms2.36msMARC::Charset::::BEGIN@10MARC::Charset::BEGIN@10
111902µs32.9msMARC::Charset::::BEGIN@13MARC::Charset::BEGIN@13
11122µs28µsMARC::Charset::::BEGIN@4MARC::Charset::BEGIN@4
11118µs118µsMARC::Charset::::BEGIN@7MARC::Charset::BEGIN@7
11116µs462µsMARC::Charset::::BEGIN@14MARC::Charset::BEGIN@14
11115µs45µsMARC::Charset::::BEGIN@5MARC::Charset::BEGIN@5
0000s0sMARC::Charset::::_process_escapeMARC::Charset::_process_escape
0000s0sMARC::Charset::::assume_encodingMARC::Charset::assume_encoding
0000s0sMARC::Charset::::assume_unicodeMARC::Charset::assume_unicode
0000s0sMARC::Charset::::ignore_errorsMARC::Charset::ignore_errors
0000s0sMARC::Charset::::marc8_to_utf8MARC::Charset::marc8_to_utf8
0000s0sMARC::Charset::::reset_charsetsMARC::Charset::reset_charsets
0000s0sMARC::Charset::::utf8_to_marc8MARC::Charset::utf8_to_marc8
Call graph for these subroutines as a Graphviz dot language file.
Line State
ments
Time
on line
Calls Time
in subs
Code
1package MARC::Charset;
2
312µsour $VERSION = '0.98';
4340µs234µs
# spent 28µs (22+6) within MARC::Charset::BEGIN@4 which was called: # once (22µs+6µs) by MARC::File::XML::BEGIN@11 at line 4
use strict;
# spent 28µs making 1 call to MARC::Charset::BEGIN@4 # spent 6µs making 1 call to strict::import
5343µs275µs
# spent 45µs (15+30) within MARC::Charset::BEGIN@5 which was called: # once (15µs+30µs) by MARC::File::XML::BEGIN@11 at line 5
use warnings;
# spent 45µs making 1 call to MARC::Charset::BEGIN@5 # spent 30µs making 1 call to warnings::import
6
7364µs2218µs
# spent 118µs (18+100) within MARC::Charset::BEGIN@7 which was called: # once (18µs+100µs) by MARC::File::XML::BEGIN@11 at line 7
use base qw(Exporter);
# spent 118µs making 1 call to MARC::Charset::BEGIN@7 # spent 100µs making 1 call to base::import
813µsour @EXPORT_OK = qw(marc8_to_utf8 utf8_to_marc8);
9
103206µs22.45ms
# spent 2.36ms (1.09+1.27) within MARC::Charset::BEGIN@10 which was called: # once (1.09ms+1.27ms) by MARC::File::XML::BEGIN@11 at line 10
use Unicode::Normalize;
# spent 2.36ms making 1 call to MARC::Charset::BEGIN@10 # spent 88µs making 1 call to Exporter::import
113190µs210.4ms
# spent 10.2ms (4.30+5.90) within MARC::Charset::BEGIN@11 which was called: # once (4.30ms+5.90ms) by MARC::File::XML::BEGIN@11 at line 11
use Encode 'decode';
# spent 10.2ms making 1 call to MARC::Charset::BEGIN@11 # spent 245µs making 1 call to Exporter::import
123175µs210.4ms
# spent 7.85ms (2.30+5.56) within MARC::Charset::BEGIN@12 which was called: # once (2.30ms+5.56ms) by MARC::File::XML::BEGIN@11 at line 12
use charnames ':full';
# spent 7.85ms making 1 call to MARC::Charset::BEGIN@12 # spent 2.51ms making 1 call to charnames::import
133197µs132.9ms
# spent 32.9ms (902µs+32.0) within MARC::Charset::BEGIN@13 which was called: # once (902µs+32.0ms) by MARC::File::XML::BEGIN@11 at line 13
use MARC::Charset::Table;
# spent 32.9ms making 1 call to MARC::Charset::BEGIN@13
1432.10ms2908µs
# spent 462µs (16+446) within MARC::Charset::BEGIN@14 which was called: # once (16µs+446µs) by MARC::File::XML::BEGIN@11 at line 14
use MARC::Charset::Constants qw(:all);
# spent 462µs making 1 call to MARC::Charset::BEGIN@14 # spent 446µs making 1 call to Exporter::import
15
16=head1 NAME
17
- -
43# get the mapping table
4415µs1128µsour $table = MARC::Charset::Table->new();
# spent 128µs making 1 call to MARC::Charset::Table::new
45
46# set default character sets
47# these are viewable at the package level
48# in case someone wants to set them
4912µsour $DEFAULT_G0 = ASCII_DEFAULT;
5011µsour $DEFAULT_G1 = EXTENDED_LATIN;
51
52=head2 ignore_errors()
53
- -
6611µsour $_ignore_errors = 0;
67sub ignore_errors {
68 my ($self,$i) = @_;
69 $_ignore_errors = $i if (defined($i));
70 return $_ignore_errors;
71}
72
73
74=head2 assume_unicode()
75
- -
8911µsour $_assume = '';
90sub assume_unicode {
91 my ($self,$i) = @_;
92 $_assume = 'utf8' if (defined($i) and $i);
93 return 1 if ($_assume eq 'utf8');
94}
95
96
97=head2 assume_encoding()
98
- -
111sub assume_encoding {
112 my ($self,$i) = @_;
113 $_assume = $i if (defined($i));
114 return $_assume;
115}
116
117
118# place holders for working graphical character sets
11911µsmy $G0;
12011µsmy $G1;
121
122=head2 marc8_to_utf8()
123
- -
142sub marc8_to_utf8
143{
144 my ($marc8, $ignore_errors) = @_;
145 reset_charsets();
146
147 $ignore_errors = $_ignore_errors if (!defined($ignore_errors));
148
149 # holder for our utf8
150 my $utf8 = '';
151
152 my $index = 0;
153 my $length = length($marc8);
154 my $combining = '';
155 CHAR_LOOP: while ($index < $length)
156 {
157 # whitespace, line feeds and carriage returns just get added on unmolested
158 if (substr($marc8, $index, 1) =~ m/(\s+|\x0A+|\x0D+)/so)
159 {
160 $utf8 .= $1;
161 $index += 1;
162 next CHAR_LOOP;
163 }
164
165 # look for any escape sequences
166 my $new_index = _process_escape(\$marc8, $index, $length);
167 if ($new_index > $index)
168 {
169 $index = $new_index;
170 next CHAR_LOOP;
171 }
172
173 my $found;
174 CHARSET_LOOP: foreach my $charset ($G0, $G1)
175 {
176
177 # cjk characters are a string of three chars
178 my $char_size = $charset eq CJK ? 3 : 1;
179
180 # extract the next code point to examine
181 my $chunk = substr($marc8, $index, $char_size);
182
183 # look up the character to see if it's in our mapping
184 my $code = $table->lookup_by_marc8($charset, $chunk);
185
186 # try the next character set if no mapping was found
187 next CHARSET_LOOP if ! $code;
188 $found = 1;
189
190 # gobble up all combining characters for appending later
191 # this is necessary because combinging characters precede
192 # the character they modifiy in MARC-8, whereas they follow
193 # the character they modify in UTF-8.
194 if ($code->is_combining())
195 {
196 $combining .= $code->char_value();
197 }
198 else
199 {
200 $utf8 .= $code->char_value() . $combining;
201 $combining = '';
202 }
203
204 $index += $char_size;
205 next CHAR_LOOP;
206 }
207
208 if (!$found)
209 {
210 warn(sprintf("no mapping found for [0x\%X] at position $index in $marc8 ".
211 "g0=".MARC::Charset::Constants::charset_name($G0) . " " .
212 "g1=".MARC::Charset::Constants::charset_name($G1), unpack('C',substr($marc8,$index,1))));
213 if (!$ignore_errors)
214 {
215 reset_charsets();
216 return;
217 }
218 if ($_assume)
219 {
220 reset_charsets();
221 return NFC(decode($_assume => $marc8));
222 }
223 $index += 1;
224 }
225
226 }
227
228 # return the utf8
229 reset_charsets();
230 return $utf8;
231}
232
- -
235=head2 utf8_to_marc8()
236
- -
254sub utf8_to_marc8
255{
256 my ($utf8, $ignore_errors) = @_;
257 reset_charsets();
258
259 $ignore_errors = $_ignore_errors if (!defined($ignore_errors));
260
261 # decompose combined characters
262 $utf8 = NFD($utf8);
263
264 my $len = length($utf8);
265 my $marc8 = '';
266 for (my $i=0; $i<$len; $i++)
267 {
268 my $slice = substr($utf8, $i, 1);
269
270 # spaces are copied from utf8 into marc8
271 if ($slice eq ' ')
272 {
273 $marc8 .= ' ';
274 next;
275 }
276
277 # try to find the code point in our mapping table
278 my $code = $table->lookup_by_utf8($slice);
279
280 if (! $code)
281 {
282 warn("no mapping found at position $i in $utf8");
283 reset_charsets() and return unless $ignore_errors;
284 }
285
286 # if it's a combining character move it around
287 if ($code->is_combining())
288 {
289 my $prev = chop($marc8);
290 $marc8 .= $code->marc_value() . $prev;
291 next;
292 }
293
294 # look to see if we need to escape to a new G0 charset
295 my $charset_value = $code->charset_value();
296
297 if ($code->default_charset_group() eq 'G0'
298 and $G0 ne $charset_value)
299 {
300 if ($G0 eq ASCII_DEFAULT and $charset_value eq BASIC_LATIN)
301 {
302 # don't bother escaping, they're functionally the same
303 }
304 else
305 {
306 $marc8 .= $code->get_escape();
307 $G0 = $charset_value;
308 }
309 }
310
311 # look to see if we need to escape to a new G1 charset
312 elsif ($code->default_charset_group() eq 'G1'
313 and $G1 ne $charset_value)
314 {
315 $marc8 .= $code->get_escape();
316 $G1 = $charset_value;
317 }
318
319 $marc8 .= $code->marc_value();
320 }
321
322 # escape back to default G0 if necessary
323 if ($G0 ne $DEFAULT_G0)
324 {
325 if ($DEFAULT_G0 eq ASCII_DEFAULT) { $marc8 .= ESCAPE . ASCII_DEFAULT; }
326 elsif ($DEFAULT_G0 eq CJK) { $marc8 .= ESCAPE . MULTI_G0_A . CJK; }
327 else { $marc8 .= ESCAPE . SINGLE_G0_A . $DEFAULT_G0; }
328 }
329
330 # escape back to default G1 if necessary
331 if ($G1 ne $DEFAULT_G1)
332 {
333 if ($DEFAULT_G1 eq CJK) { $marc8 .= ESCAPE . MULTI_G1_A . $DEFAULT_G1; }
334 else { $marc8 .= ESCAPE . SINGLE_G1_A . $DEFAULT_G1; }
335 }
336
337 return $marc8;
338}
339
- -
342=head1 DEFAULT CHARACTER SETS
343
- -
377sub _process_escape
378{
379 ## this stuff is kind of scary ... for an explanation of what is
380 ## going on here check out the MARC-8 specs at LC.
381 ## http://lcweb.loc.gov/marc/specifications/speccharmarc8.html
382 my ($str_ref, $left, $right) = @_;
383
384 # first char needs to be an escape or else this isn't an escape sequence
385 return $left unless substr($$str_ref, $left, 1) eq ESCAPE;
386
387 ## if we don't have at least one character after the escape
388 ## then this can't be a character escape sequence
389 return $left if ($left+1 >= $right);
390
391 ## pull off the first escape
392 my $esc_char_1 = substr($$str_ref, $left+1, 1);
393
394 ## the first method of escaping to small character sets
395 if ( $esc_char_1 eq GREEK_SYMBOLS
396 or $esc_char_1 eq SUBSCRIPTS
397 or $esc_char_1 eq SUPERSCRIPTS
398 or $esc_char_1 eq ASCII_DEFAULT)
399 {
400 $G0 = $esc_char_1;
401 return $left+2;
402 }
403
404 ## the second more complicated method of escaping to bigger charsets
405 return $left if $left+2 >= $right;
406
407 my $esc_char_2 = substr($$str_ref, $left+2, 1);
408 my $esc_chars = $esc_char_1 . $esc_char_2;
409
410 if ($esc_char_1 eq SINGLE_G0_A
411 or $esc_char_1 eq SINGLE_G0_B)
412 {
413 $G0 = $esc_char_2;
414 return $left+3;
415 }
416
417 elsif ($esc_char_1 eq SINGLE_G1_A
418 or $esc_char_1 eq SINGLE_G1_B)
419 {
420 $G1 = $esc_char_2;
421 return $left+3;
422 }
423
424 elsif ( $esc_char_1 eq MULTI_G0_A ) {
425 $G0 = $esc_char_2;
426 return $left+3;
427 }
428
429 elsif ($esc_chars eq MULTI_G0_B
430 and ($left+3 < $right))
431 {
432 $G0 = substr($$str_ref, $left+3, 1);
433 return $left+4;
434 }
435
436 elsif (($esc_chars eq MULTI_G1_A or $esc_chars eq MULTI_G1_B)
437 and ($left + 3 < $right))
438 {
439 $G1 = substr($$str_ref, $left+3, 1);
440 return $left+4;
441 }
442
443 # we should never get here
444 warn("seem to have fallen through in _process_escape()");
445 return $left;
446}
447
448sub reset_charsets
449{
450 $G0 = $DEFAULT_G0;
451 $G1 = $DEFAULT_G1;
452}
453
454112µs1;