Profile of MARC/Charset.pm

Filename	/usr/share/perl5/MARC/Charset.pm
Statements	Executed 34 statements in 3.05ms

Subroutines
Calls	P	F	Exclusive Time	Inclusive Time	Subroutine
1	1	1	4.30ms	10.2ms	MARC::Charset::::BEGIN@11MARC::Charset::BEGIN@11
1	1	1	2.30ms	7.85ms	MARC::Charset::::BEGIN@12MARC::Charset::BEGIN@12
1	1	1	1.09ms	2.36ms	MARC::Charset::::BEGIN@10MARC::Charset::BEGIN@10
1	1	1	902µs	32.9ms	MARC::Charset::::BEGIN@13MARC::Charset::BEGIN@13
1	1	1	22µs	28µs	MARC::Charset::::BEGIN@4MARC::Charset::BEGIN@4
1	1	1	18µs	118µs	MARC::Charset::::BEGIN@7MARC::Charset::BEGIN@7
1	1	1	16µs	462µs	MARC::Charset::::BEGIN@14MARC::Charset::BEGIN@14
1	1	1	15µs	45µs	MARC::Charset::::BEGIN@5MARC::Charset::BEGIN@5
0	0	0	0s	0s	MARC::Charset::::_process_escapeMARC::Charset::_process_escape
0	0	0	0s	0s	MARC::Charset::::assume_encodingMARC::Charset::assume_encoding
0	0	0	0s	0s	MARC::Charset::::assume_unicodeMARC::Charset::assume_unicode
0	0	0	0s	0s	MARC::Charset::::ignore_errorsMARC::Charset::ignore_errors
0	0	0	0s	0s	MARC::Charset::::marc8_to_utf8MARC::Charset::marc8_to_utf8
0	0	0	0s	0s	MARC::Charset::::reset_charsetsMARC::Charset::reset_charsets
0	0	0	0s	0s	MARC::Charset::::utf8_to_marc8MARC::Charset::utf8_to_marc8

Call graph for these subroutines as a Graphviz dot language file.

Line	State ments	Time on line	Calls	Time in subs	Code
1					package MARC::Charset;
2
3	1	2µs			our $VERSION = '0.98';
4	3	40µs	2	34µs	# spent 28µs (22+6) within MARC::Charset::BEGIN@4 which was called: # once (22µs+6µs) by MARC::File::XML::BEGIN@11 at line 4 use strict; # spent 28µs making 1 call to MARC::Charset::BEGIN@4 # spent 6µs making 1 call to strict::import
5	3	43µs	2	75µs	# spent 45µs (15+30) within MARC::Charset::BEGIN@5 which was called: # once (15µs+30µs) by MARC::File::XML::BEGIN@11 at line 5 use warnings; # spent 45µs making 1 call to MARC::Charset::BEGIN@5 # spent 30µs making 1 call to warnings::import
6
7	3	64µs	2	218µs	# spent 118µs (18+100) within MARC::Charset::BEGIN@7 which was called: # once (18µs+100µs) by MARC::File::XML::BEGIN@11 at line 7 use base qw(Exporter); # spent 118µs making 1 call to MARC::Charset::BEGIN@7 # spent 100µs making 1 call to base::import
8	1	3µs			our @EXPORT_OK = qw(marc8_to_utf8 utf8_to_marc8);
9
10	3	206µs	2	2.45ms	# spent 2.36ms (1.09+1.27) within MARC::Charset::BEGIN@10 which was called: # once (1.09ms+1.27ms) by MARC::File::XML::BEGIN@11 at line 10 use Unicode::Normalize; # spent 2.36ms making 1 call to MARC::Charset::BEGIN@10 # spent 88µs making 1 call to Exporter::import
11	3	190µs	2	10.4ms	# spent 10.2ms (4.30+5.90) within MARC::Charset::BEGIN@11 which was called: # once (4.30ms+5.90ms) by MARC::File::XML::BEGIN@11 at line 11 use Encode 'decode'; # spent 10.2ms making 1 call to MARC::Charset::BEGIN@11 # spent 245µs making 1 call to Exporter::import
12	3	175µs	2	10.4ms	# spent 7.85ms (2.30+5.56) within MARC::Charset::BEGIN@12 which was called: # once (2.30ms+5.56ms) by MARC::File::XML::BEGIN@11 at line 12 use charnames ':full'; # spent 7.85ms making 1 call to MARC::Charset::BEGIN@12 # spent 2.51ms making 1 call to charnames::import
13	3	197µs	1	32.9ms	# spent 32.9ms (902µs+32.0) within MARC::Charset::BEGIN@13 which was called: # once (902µs+32.0ms) by MARC::File::XML::BEGIN@11 at line 13 use MARC::Charset::Table; # spent 32.9ms making 1 call to MARC::Charset::BEGIN@13
14	3	2.10ms	2	908µs	# spent 462µs (16+446) within MARC::Charset::BEGIN@14 which was called: # once (16µs+446µs) by MARC::File::XML::BEGIN@11 at line 14 use MARC::Charset::Constants qw(:all); # spent 462µs making 1 call to MARC::Charset::BEGIN@14 # spent 446µs making 1 call to Exporter::import
15
16					=head1 NAME
17
- -
43					# get the mapping table
44	1	5µs	1	128µs	our $table = MARC::Charset::Table->new(); # spent 128µs making 1 call to MARC::Charset::Table::new
45
46					# set default character sets
47					# these are viewable at the package level
48					# in case someone wants to set them
49	1	2µs			our $DEFAULT_G0 = ASCII_DEFAULT;
50	1	1µs			our $DEFAULT_G1 = EXTENDED_LATIN;
51
52					=head2 ignore_errors()
53
- -
66	1	1µs			our $_ignore_errors = 0;
67					sub ignore_errors {
68					my ($self,$i) = @_;
69					$_ignore_errors = $i if (defined($i));
70					return $_ignore_errors;
71					}
72
73
74					=head2 assume_unicode()
75
- -
89	1	1µs			our $_assume = '';
90					sub assume_unicode {
91					my ($self,$i) = @_;
92					$_assume = 'utf8' if (defined($i) and $i);
93					return 1 if ($_assume eq 'utf8');
94					}
95
96
97					=head2 assume_encoding()
98
- -
111					sub assume_encoding {
112					my ($self,$i) = @_;
113					$_assume = $i if (defined($i));
114					return $_assume;
115					}
116
117
118					# place holders for working graphical character sets
119	1	1µs			my $G0;
120	1	1µs			my $G1;
121
122					=head2 marc8_to_utf8()
123
- -
142					sub marc8_to_utf8
143					{
144					my ($marc8, $ignore_errors) = @_;
145					reset_charsets();
146
147					$ignore_errors = $_ignore_errors if (!defined($ignore_errors));
148
149					# holder for our utf8
150					my $utf8 = '';
151
152					my $index = 0;
153					my $length = length($marc8);
154					my $combining = '';
155					CHAR_LOOP: while ($index < $length)
156					{
157					# whitespace, line feeds and carriage returns just get added on unmolested
158					if (substr($marc8, $index, 1) =~ m/(\s+\|\x0A+\|\x0D+)/so)
159					{
160					$utf8 .= $1;
161					$index += 1;
162					next CHAR_LOOP;
163					}
164
165					# look for any escape sequences
166					my $new_index = _process_escape(\$marc8, $index, $length);
167					if ($new_index > $index)
168					{
169					$index = $new_index;
170					next CHAR_LOOP;
171					}
172
173					my $found;
174					CHARSET_LOOP: foreach my $charset ($G0, $G1)
175					{
176
177					# cjk characters are a string of three chars
178					my $char_size = $charset eq CJK ? 3 : 1;
179
180					# extract the next code point to examine
181					my $chunk = substr($marc8, $index, $char_size);
182
183					# look up the character to see if it's in our mapping
184					my $code = $table->lookup_by_marc8($charset, $chunk);
185
186					# try the next character set if no mapping was found
187					next CHARSET_LOOP if ! $code;
188					$found = 1;
189
190					# gobble up all combining characters for appending later
191					# this is necessary because combinging characters precede
192					# the character they modifiy in MARC-8, whereas they follow
193					# the character they modify in UTF-8.
194					if ($code->is_combining())
195					{
196					$combining .= $code->char_value();
197					}
198					else
199					{
200					$utf8 .= $code->char_value() . $combining;
201					$combining = '';
202					}
203
204					$index += $char_size;
205					next CHAR_LOOP;
206					}
207
208					if (!$found)
209					{
210					warn(sprintf("no mapping found for [0x\%X] at position $index in $marc8 ".
211					"g0=".MARC::Charset::Constants::charset_name($G0) . " " .
212					"g1=".MARC::Charset::Constants::charset_name($G1), unpack('C',substr($marc8,$index,1))));
213					if (!$ignore_errors)
214					{
215					reset_charsets();
216					return;
217					}
218					if ($_assume)
219					{
220					reset_charsets();
221					return NFC(decode($_assume => $marc8));
222					}
223					$index += 1;
224					}
225
226					}
227
228					# return the utf8
229					reset_charsets();
230					return $utf8;
231					}
232
- -
235					=head2 utf8_to_marc8()
236
- -
254					sub utf8_to_marc8
255					{
256					my ($utf8, $ignore_errors) = @_;
257					reset_charsets();
258
259					$ignore_errors = $_ignore_errors if (!defined($ignore_errors));
260
261					# decompose combined characters
262					$utf8 = NFD($utf8);
263
264					my $len = length($utf8);
265					my $marc8 = '';
266					for (my $i=0; $i<$len; $i++)
267					{
268					my $slice = substr($utf8, $i, 1);
269
270					# spaces are copied from utf8 into marc8
271					if ($slice eq ' ')
272					{
273					$marc8 .= ' ';
274					next;
275					}
276
277					# try to find the code point in our mapping table
278					my $code = $table->lookup_by_utf8($slice);
279
280					if (! $code)
281					{
282					warn("no mapping found at position $i in $utf8");
283					reset_charsets() and return unless $ignore_errors;
284					}
285
286					# if it's a combining character move it around
287					if ($code->is_combining())
288					{
289					my $prev = chop($marc8);
290					$marc8 .= $code->marc_value() . $prev;
291					next;
292					}
293
294					# look to see if we need to escape to a new G0 charset
295					my $charset_value = $code->charset_value();
296
297					if ($code->default_charset_group() eq 'G0'
298					and $G0 ne $charset_value)
299					{
300					if ($G0 eq ASCII_DEFAULT and $charset_value eq BASIC_LATIN)
301					{
302					# don't bother escaping, they're functionally the same
303					}
304					else
305					{
306					$marc8 .= $code->get_escape();
307					$G0 = $charset_value;
308					}
309					}
310
311					# look to see if we need to escape to a new G1 charset
312					elsif ($code->default_charset_group() eq 'G1'
313					and $G1 ne $charset_value)
314					{
315					$marc8 .= $code->get_escape();
316					$G1 = $charset_value;
317					}
318
319					$marc8 .= $code->marc_value();
320					}
321
322					# escape back to default G0 if necessary
323					if ($G0 ne $DEFAULT_G0)
324					{
325					if ($DEFAULT_G0 eq ASCII_DEFAULT) { $marc8 .= ESCAPE . ASCII_DEFAULT; }
326					elsif ($DEFAULT_G0 eq CJK) { $marc8 .= ESCAPE . MULTI_G0_A . CJK; }
327					else { $marc8 .= ESCAPE . SINGLE_G0_A . $DEFAULT_G0; }
328					}
329
330					# escape back to default G1 if necessary
331					if ($G1 ne $DEFAULT_G1)
332					{
333					if ($DEFAULT_G1 eq CJK) { $marc8 .= ESCAPE . MULTI_G1_A . $DEFAULT_G1; }
334					else { $marc8 .= ESCAPE . SINGLE_G1_A . $DEFAULT_G1; }
335					}
336
337					return $marc8;
338					}
339
- -
342					=head1 DEFAULT CHARACTER SETS
343
- -
377					sub _process_escape
378					{
379					## this stuff is kind of scary ... for an explanation of what is
380					## going on here check out the MARC-8 specs at LC.
381					## http://lcweb.loc.gov/marc/specifications/speccharmarc8.html
382					my ($str_ref, $left, $right) = @_;
383
384					# first char needs to be an escape or else this isn't an escape sequence
385					return $left unless substr($$str_ref, $left, 1) eq ESCAPE;
386
387					## if we don't have at least one character after the escape
388					## then this can't be a character escape sequence
389					return $left if ($left+1 >= $right);
390
391					## pull off the first escape
392					my $esc_char_1 = substr($$str_ref, $left+1, 1);
393
394					## the first method of escaping to small character sets
395					if ( $esc_char_1 eq GREEK_SYMBOLS
396					or $esc_char_1 eq SUBSCRIPTS
397					or $esc_char_1 eq SUPERSCRIPTS
398					or $esc_char_1 eq ASCII_DEFAULT)
399					{
400					$G0 = $esc_char_1;
401					return $left+2;
402					}
403
404					## the second more complicated method of escaping to bigger charsets
405					return $left if $left+2 >= $right;
406
407					my $esc_char_2 = substr($$str_ref, $left+2, 1);
408					my $esc_chars = $esc_char_1 . $esc_char_2;
409
410					if ($esc_char_1 eq SINGLE_G0_A
411					or $esc_char_1 eq SINGLE_G0_B)
412					{
413					$G0 = $esc_char_2;
414					return $left+3;
415					}
416
417					elsif ($esc_char_1 eq SINGLE_G1_A
418					or $esc_char_1 eq SINGLE_G1_B)
419					{
420					$G1 = $esc_char_2;
421					return $left+3;
422					}
423
424					elsif ( $esc_char_1 eq MULTI_G0_A ) {
425					$G0 = $esc_char_2;
426					return $left+3;
427					}
428
429					elsif ($esc_chars eq MULTI_G0_B
430					and ($left+3 < $right))
431					{
432					$G0 = substr($$str_ref, $left+3, 1);
433					return $left+4;
434					}
435
436					elsif (($esc_chars eq MULTI_G1_A or $esc_chars eq MULTI_G1_B)
437					and ($left + 3 < $right))
438					{
439					$G1 = substr($$str_ref, $left+3, 1);
440					return $left+4;
441					}
442
443					# we should never get here
444					warn("seem to have fallen through in _process_escape()");
445					return $left;
446					}
447
448					sub reset_charsets
449					{
450					$G0 = $DEFAULT_G0;
451					$G1 = $DEFAULT_G1;
452					}
453
454	1	12µs			1;