You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

149 lines
4.2 KiB

  1. /**
  2. Implements bigram matching.
  3. */
  4. #include <stdlib.h>
  5. #include <stdio.h>
  6. #include "Bigram.h"
  7. int drop_cost = 1; // Cost of dropping another bigram
  8. int space_cost = 0; // Cost of dropping a bigram starting with space
  9. int displace_cost = 1; // Cost of displacing bigram other than the first
  10. int threshold_cost = 20; // Threshold for keeping option
  11. static Placement *Placement_create(int i,int j) {
  12. Placement *p = MALLOC(Placement);
  13. p->link.next = 0;
  14. p->i = i;
  15. p->j = j;
  16. return p;
  17. }
  18. static Placement *Placement_copy(Placement *in) {
  19. PlacementList list = { 0, 0 };
  20. for ( ; in; in = in->next ) {
  21. List_append( &list, (IListItem*) Placement_create( in->i, in->j ) );
  22. }
  23. return (Placement*) list.first;
  24. }
  25. static Placement *Placement_rcopy(Placement *in) {
  26. PlacementList list = { 0, 0 };
  27. for ( ; in; in = in->next ) {
  28. List_prepend( &list, (IListItem*) Placement_create( in->i, in->j ) );
  29. }
  30. return (Placement*) list.first;
  31. }
  32. static PlacementOptions *PlacementOptions_create(Placement *options,int rev) {
  33. PlacementOptions *p = MALLOC(PlacementOptions);
  34. p->link.next = 0;
  35. p->places = rev? Placement_rcopy( options ) : Placement_copy( options );
  36. p->value = 0;
  37. return p;
  38. }
  39. static PlacementOptionsList *PlacementOptionsList_create() {
  40. PlacementOptionsList *p = MALLOC( List );
  41. p->first = 0;
  42. p->last = 0;
  43. return p;
  44. }
  45. static void Placement_free(Placement *list) {
  46. Placement *p;
  47. while ( ( p = list ) ) {
  48. list = p->next;
  49. free( p );
  50. }
  51. }
  52. void PlacementOptionsList_free(PlacementOptionsList *list) {
  53. IListItem *p;
  54. while ( ( p = list->first ) ) {
  55. list->first = p->next;
  56. Placement_free( ((PlacementOptions*)p)->places );
  57. free( p );
  58. }
  59. free( list );
  60. }
  61. static int bigram_worse(IListItem *a,IListItem *b) {
  62. PlacementOptions *ap = (PlacementOptions*)a;
  63. PlacementOptions *bp = (PlacementOptions*)b;
  64. return ap->value < bp->value;
  65. }
  66. static void bigram_traverse_options(
  67. PlacementOptionsList *out,PlacementOptions *bigrams,
  68. int cost,int at,Placement *prev ) {
  69. if ( bigrams ) {
  70. Placement *p;
  71. int c = ( bigrams->value / 256 ) == ' ';
  72. c |= ( bigrams->value & 0xff ) == ' ';
  73. c = c? space_cost : drop_cost;
  74. bigram_traverse_options( out, bigrams->next, cost + c, at, prev );
  75. c = displace_cost;
  76. for ( p = bigrams->places; p; p = p->next ) {
  77. if ( p->j > at ) {
  78. Placement x = { .next = prev, .i = p->i, .j = p->j };
  79. bigram_traverse_options(
  80. out, bigrams->next,
  81. cost + ((at < 0)? 0 : ( (p->j - at - 1) * c )),
  82. p->j, &x );
  83. }
  84. }
  85. } else {
  86. // capture a reversed copy of current placement path at the
  87. // given cost
  88. if ( cost < threshold_cost ) {
  89. PlacementOptions *op = PlacementOptions_create( prev, 1 );
  90. op->value = cost;
  91. List_insert( out, (IListItem*) op, bigram_worse );
  92. }
  93. }
  94. }
  95. /**
  96. Determine all the placements of the bigrams in 'text' into 'line'.
  97. Returns an ordered list of the placement options by ascending
  98. "cost".
  99. Each PlacementOptions record comprises the cost and the chain of
  100. placements of all or some of the 'text' bigrams in the line, by
  101. order of increasing byte position index. Each such placement record
  102. has the bigram as its 'i' field (256*c1+c2) and the position as its
  103. 'j' field.
  104. */
  105. PlacementOptionsList *bigram_places(char *text,char *line) {
  106. PlaceMap map = { 0 };
  107. char *b;
  108. int prev = 0;
  109. for ( b = line; *b; b++ ) {
  110. uint i = BIGRAM( b );
  111. if ( map[ i ].prev == 0 ) {
  112. map[ i ].prev = -1 - prev;
  113. prev = i;
  114. }
  115. List_append( (List*) &map[ i ].list,
  116. (IListItem*) Placement_create( i, b - line ) );
  117. }
  118. PlacementOptionsList *places = PlacementOptionsList_create();
  119. for ( b = text; *b && *(b+1); b++ ) {
  120. uint i = BIGRAM( b );
  121. Placement *p = (Placement*) map[ i ].list.first;
  122. PlacementOptions *po = PlacementOptions_create( p, 0 );
  123. po->value = i; // 'value' is bigram for traversal
  124. List_append( places, (IListItem*) po );
  125. }
  126. while ( prev ) {
  127. Placement_free( (Placement*) map[ prev ].list.first );
  128. prev = -1 - map[ prev ].prev;
  129. }
  130. PlacementOptionsList *out = PlacementOptionsList_create();
  131. bigram_traverse_options( out, (PlacementOptions*)places->first, 0, -1, 0 );
  132. PlacementOptionsList_free( places );
  133. return out;
  134. }