Browse Source

initial version

suites/experimental
Ralph Rönnquist 5 months ago
commit
594f86456b
7 changed files with 355 additions and 0 deletions
  1. +39
    -0
      Bigram.h
  2. +18
    -0
      List.h
  3. +7
    -0
      Makefile
  4. +14
    -0
      README.adoc
  5. +148
    -0
      bigram.c
  6. +88
    -0
      bigrep.c
  7. +41
    -0
      list.c

+ 39
- 0
Bigram.h View File

@@ -0,0 +1,39 @@
#ifndef Bigram_H
#define Bigram_H

#include "List.h"

/**
 * One occurrence of a bigram. bigram.c stores the bigram code
 * (256*c1+c2) in 'i' and the byte offset of the occurrence in 'j'.
 * The anonymous union gives two views of the same leading pointer:
 * 'link' for the generic List_* functions and 'next' for walking the
 * chain with the proper type.
 */
typedef struct _Placement {
union {
IListItem link; // generic list linkage
struct _Placement *next; // the same pointer, typed
};
int i; // bigram code
int j; // byte offset of the occurrence
} Placement;

// A List whose items are Placement records.
typedef List PlacementList;

/**
 * List node pairing an integer 'value' with a chain of Placement
 * records. bigram.c uses it in two roles: while collecting candidates
 * 'value' is a bigram code and 'places' its occurrences; in a result
 * list 'value' is the cost of the option and 'places' the placements
 * chosen for it.
 */
typedef struct _PlacementOptions {
union {
IListItem link; // generic list linkage
struct _PlacementOptions *next; // the same pointer, typed
};
int value; // bigram code or cost, depending on the role
Placement *places; // head of the owned Placement chain, may be 0
} PlacementOptions;

// A List whose items are PlacementOptions records.
typedef List PlacementOptionsList;

/**
 * Table with one slot for each of the 65536 possible bigram codes.
 * 'list' collects the placements of that bigram. 'prev' chains the
 * slots that are in use: bigram_places() stores -1 minus the code of
 * the slot that was activated before this one, and 0 marks a slot that
 * has not been used.
 */
typedef struct {
int prev;
PlacementList list;
} PlaceMap[65536];

/**
 * Allocate uninitialised storage for one object of TYPE. Evaluates to
 * a TYPE pointer, which is NULL when the allocation fails. The
 * including file must provide <stdlib.h>.
 */
#define MALLOC(TYPE) ((TYPE*)malloc(sizeof(TYPE)))

/**
 * Bigram code of the two bytes at 'p': 256 * first byte + second byte,
 * each byte taken as an unsigned 8-bit value, so the result lies in
 * 0..65535. 'p' is expanded twice; do not pass an expression with side
 * effects.
 */
#define BIGRAM(p) ((unsigned int)((((p)[0])&0xff)*256 + (((p)[1])&0xff)))

/**
 * Work out how the bigrams of 'text' can be placed in 'line'. The
 * returned list is allocated by the callee; release it with
 * PlacementOptionsList_free().
 */
extern PlacementOptionsList *bigram_places(char *text,char *line);

/**
 * Release 'list' together with its PlacementOptions records and their
 * Placement chains.
 */
extern void PlacementOptionsList_free(PlacementOptionsList *list);

#endif

+ 18
- 0
List.h View File

@@ -0,0 +1,18 @@
#ifndef List_H
#define List_H

/**
 * Link header of a singly linked list item. Records kept in a List
 * begin with this header and are passed to the List_* functions as
 * IListItem pointers.
 */
typedef struct _IListItem {
struct _IListItem *next; // following item, 0 at the end of the list
} IListItem;

/**
 * Singly linked list header. An empty list has both pointers 0.
 */
typedef struct _List {
IListItem *first; // head item, 0 when the list is empty
IListItem *last; // tail item, 0 when the list is empty
} List;

// Link 'item' in as the new last item of 'list'.
extern void List_append(List *list,IListItem *item);
// Link 'item' in as the new first item of 'list'.
extern void List_prepend(List *list,IListItem *item);
// Ordered insertion: the list is scanned from the front and 'item' is
// linked in before the first existing item x for which fn(item, x)
// returns non-zero.
extern void List_insert(
List *list,IListItem *item,int (*fn)(IListItem *a,IListItem *b) );

#endif

+ 7
- 0
Makefile View File

@@ -0,0 +1,7 @@
# Compile with debug information and the common warning set.
CFLAGS = -g -Wall

# Build the bigrep executable in a single compiler invocation; $^ hands
# every listed prerequisite to the compiler and $@ names the output.
bigrep: bigrep.c bigram.c list.c Bigram.h List.h
$(CC) $(CFLAGS) $^ -o $@

# Remove the built executable.
clean:
rm -f bigrep

+ 14
- 0
README.adoc View File

@@ -0,0 +1,14 @@
bigrep
======

This is a flexible text search tool.

bigrep [ options ] text directories-and-files
=> count:file:line: text-with-bigram-highlights

options:
-d N cost of bigram drop
-s N cost of bigram separation
-p N required percentage of bigram matches
-F flow matching = treating newline as space


+ 148
- 0
bigram.c View File

@@ -0,0 +1,148 @@
/**
Implements bigram matching.
*/
#include <stdlib.h>
#include <stdio.h>
#include "Bigram.h"

// Cost settings used by bigram_traverse_options(). They are plain
// globals so that a front end can overwrite them before searching.
int drop_cost = 1; // Cost of dropping a bigram that contains no space
int space_cost = 0; // Cost of dropping a bigram with a space in either byte
int displace_cost = 1; // Cost per skipped position between placed bigrams
int threshold_cost = 20; // An option is kept only if its cost is below this

/**
 * Allocate an unlinked Placement for bigram code 'i' at byte offset
 * 'j'. The program is terminated with a diagnostic if memory runs out,
 * so the result is never 0. Release the record with free().
 */
static Placement *Placement_create(int i,int j) {
    Placement *p = MALLOC(Placement);
    if ( p == 0 ) {
        // Every caller links the record in without checking it, so
        // there is no way to report the failure upwards.
        perror( "Placement_create" );
        exit( 1 );
    }
    p->link.next = 0;
    p->i = i;
    p->j = j;
    return p;
}

/**
 * Duplicate the chain that starts at 'in', keeping the order of the
 * records. Returns the head of the new chain, or 0 when 'in' is 0.
 */
static Placement *Placement_copy(Placement *in) {
    PlacementList copy = { 0, 0 };
    while ( in ) {
        Placement *dup = Placement_create( in->i, in->j );
        List_append( &copy, (IListItem*) dup );
        in = in->next;
    }
    return (Placement*) copy.first;
}

/**
 * Duplicate the chain that starts at 'in' in reverse order: the last
 * record of the input becomes the head of the copy. Returns 0 when
 * 'in' is 0.
 */
static Placement *Placement_rcopy(Placement *in) {
    PlacementList reversed = { 0, 0 };
    while ( in ) {
        Placement *dup = Placement_create( in->i, in->j );
        List_prepend( &reversed, (IListItem*) dup );
        in = in->next;
    }
    return (Placement*) reversed.first;
}

/**
 * Allocate an unlinked PlacementOptions record that owns a private
 * copy of the chain 'options'; the copy is reversed when 'rev' is
 * non-zero. 'value' starts out as 0. The program is terminated with a
 * diagnostic if memory runs out, so the result is never 0.
 */
static PlacementOptions *PlacementOptions_create(Placement *options,int rev) {
    PlacementOptions *p = MALLOC(PlacementOptions);
    if ( p == 0 ) {
        // Callers use the record immediately; fail loudly instead of
        // dereferencing a null pointer.
        perror( "PlacementOptions_create" );
        exit( 1 );
    }
    p->link.next = 0;
    p->places = rev? Placement_rcopy( options ) : Placement_copy( options );
    p->value = 0;
    return p;
}

/**
 * Allocate an empty PlacementOptionsList. The program is terminated
 * with a diagnostic if memory runs out, so the result is never 0.
 * Release the list with PlacementOptionsList_free().
 */
static PlacementOptionsList *PlacementOptionsList_create(void) {
    PlacementOptionsList *p = MALLOC( List );
    if ( p == 0 ) {
        perror( "PlacementOptionsList_create" );
        exit( 1 );
    }
    p->first = 0;
    p->last = 0;
    return p;
}

/**
 * Release every record of the chain that starts at 'list'. Passing 0
 * is allowed and does nothing.
 */
static void Placement_free(Placement *list) {
    while ( list ) {
        // Pick up the successor before the record is released.
        Placement *rest = list->next;
        free( list );
        list = rest;
    }
}

/**
 * Release 'list': every PlacementOptions record in it, the Placement
 * chain each record owns, and finally the list header itself. Passing
 * 0 is allowed and does nothing, in the manner of free().
 */
void PlacementOptionsList_free(PlacementOptionsList *list) {
    if ( list == 0 ) {
        return;
    }
    IListItem *p;
    while ( ( p = list->first ) ) {
        // Unlink the head first so that the loop always sees a valid list.
        list->first = p->next;
        Placement_free( ((PlacementOptions*)p)->places );
        free( p );
    }
    free( list );
}

/**
 * Ordering function handed to List_insert(): non-zero when option 'a'
 * has a strictly smaller 'value' (cost) than option 'b'. Both
 * arguments must point at PlacementOptions records.
 */
static int bigram_worse(IListItem *a,IListItem *b) {
    int cost_a = ((PlacementOptions*)a)->value;
    int cost_b = ((PlacementOptions*)b)->value;
    return cost_b > cost_a;
}

/**
 * Recursively enumerate the ways of dropping or placing the remaining
 * 'bigrams', and record every completed option whose cost is below
 * threshold_cost.
 *
 * out     - receives the options through List_insert() with
 *           bigram_worse as ordering function
 * bigrams - remaining text bigrams; 'value' is the bigram code and
 *           'places' its occurrences in the line
 * cost    - cost accumulated on the path so far
 * at      - offset of the most recently placed bigram, negative while
 *           nothing has been placed
 * prev    - placements chosen so far, most recent first; the records
 *           are locals of the enclosing calls and are only valid while
 *           those calls are active
 */
static void bigram_traverse_options(
PlacementOptionsList *out,PlacementOptions *bigrams,
int cost,int at,Placement *prev ) {
if ( bigrams ) {
Placement *p;
// First alternative: drop this bigram. Dropping costs space_cost
// when either of its two bytes is a space, drop_cost otherwise.
int c = ( bigrams->value / 256 ) == ' ';
c |= ( bigrams->value & 0xff ) == ' ';
c = c? space_cost : drop_cost;
bigram_traverse_options( out, bigrams->next, cost + c, at, prev );
// Second alternative: place it at each occurrence beyond 'at'. The
// first placement is free; later ones cost displace_cost for every
// position skipped since the previous placement.
c = displace_cost;
for ( p = bigrams->places; p; p = p->next ) {
if ( p->j > at ) {
// Extend the path with a stack record; it is copied if the
// path is captured below.
Placement x = { .next = prev, .i = p->i, .j = p->j };
bigram_traverse_options(
out, bigrams->next,
cost + ((at < 0)? 0 : ( (p->j - at - 1) * c )),
p->j, &x );
}
}
} else {
// capture a reversed copy of current placement path at the
// given cost
if ( cost < threshold_cost ) {
PlacementOptions *op = PlacementOptions_create( prev, 1 );
op->value = cost;
List_insert( out, (IListItem*) op, bigram_worse );
}
}
}

/**
   Determine the placements of the bigrams in 'text' into 'line'.
   The options are collected with List_insert() using bigram_worse,
   i.e. ordered by ascending "cost".

   Each PlacementOptions record comprises the cost and the chain of
   placements of all or some of the 'text' bigrams in the line, by
   order of increasing byte position index. Each such placement record
   has the bigram as its 'i' field (256*c1+c2) and the position as its
   'j' field.

   The caller owns the returned list and releases it with
   PlacementOptionsList_free().
*/
PlacementOptionsList *bigram_places(char *text,char *line) {
    PlaceMap map = { 0 };
    char *b;
    int prev = 0; // most recently activated slot; 0 ends the chain

    // Index every complete bigram of 'line' by its code. The last byte
    // has no partner and is skipped: a pair ending in the terminating
    // NUL can never equal a bigram of 'text'.
    for ( b = line; *b && *(b+1); b++ ) {
        unsigned int i = BIGRAM( b );
        if ( map[ i ].prev == 0 ) {
            // First use of this slot: chain it to the previous one.
            // The -1 offset keeps the stored link non-zero, and code 0
            // cannot occur because both bytes are non-zero.
            map[ i ].prev = -1 - prev;
            prev = (int) i;
        }
        List_append( (List*) &map[ i ].list,
                     (IListItem*) Placement_create(
                         (int) i, (int) ( b - line ) ) );
    }

    // For each bigram of 'text', in order, take a private copy of its
    // occurrences in 'line' (possibly none).
    PlacementOptionsList *places = PlacementOptionsList_create();
    for ( b = text; *b && *(b+1); b++ ) {
        unsigned int i = BIGRAM( b );
        Placement *p = (Placement*) map[ i ].list.first;
        PlacementOptions *po = PlacementOptions_create( p, 0 );
        po->value = (int) i; // 'value' is bigram for traversal
        List_append( places, (IListItem*) po );
    }

    // The index is no longer needed; walk the chain of used slots and
    // release their placements.
    while ( prev ) {
        Placement_free( (Placement*) map[ prev ].list.first );
        prev = -1 - map[ prev ].prev;
    }

    PlacementOptionsList *out = PlacementOptionsList_create();
    bigram_traverse_options( out, (PlacementOptions*)places->first, 0, -1, 0 );
    PlacementOptionsList_free( places );
    return out;
}

+ 88
- 0
bigrep.c View File

@@ -0,0 +1,88 @@
/**
* Apply bigram matching to text files, or stdin
*/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "Bigram.h"

/**
 * Write the command synopsis to stderr and end the program with exit
 * status 1. Does not return.
 */
static void usage() {
    fputs( "Usage: [ --cost=dr:sp:di:th ] text files...\n", stderr );
    exit( 1 );
}

/**
* Debugging aid, compiled out by the surrounding "#if 0": print a
* placement option on stdout as "= <value>:" followed by one
* " <c1><c2>:<offset>" entry per placement, then a newline. Although
* written as a loop over 'places', the trailing 'break' stops after
* the first option. The parameters 'at' and 'left' are not used by the
* body.
*/
#if 0
static void tell(PlacementOptions *places,int at,int left) {
Placement *op;
for ( ; places ; places = places->next ) {
fprintf( stdout, "= %d:", places->value );
for ( op = places->places ; op; op = op->next ) {
// Split the bigram code back into its two bytes.
char c1 = op->i / 256;
char c2 = op->i & 0xFF;
fprintf( stdout, " %c%c:%d", c1, c2, op->j );
}
fprintf( stdout, "\n" );
break; // only tell first=best placement option
}
}
#endif

/**
 * Search the file 'pathname' line by line for 'text'. A line is
 * reported on stdout as "<pathname>:<line>" (the line keeps its own
 * newline) when the first option returned by bigram_places() contains
 * at least one placement.
 *
 * A file that cannot be opened is reported on stderr and skipped. All
 * resources acquired here (the stream, the line buffer and every
 * result list) are released before returning.
 */
static void process(char *text,char *pathname) {
    FILE *file = fopen( pathname, "r" );
    if ( file == 0 ) {
        perror( pathname );
        return;
    }
    // getline() grows and reuses this buffer across iterations; it is
    // released once after the loop, also when the final call fails.
    size_t sz = 0;
    char *line = 0;
    while ( getline( &line, &sz, file ) >= 0 ) {
        PlacementOptionsList *places = bigram_places( text, line );
        if ( places ) {
            PlacementOptions *best = (PlacementOptions*) places->first;
            if ( best && best->places ) {
                fprintf( stdout, "%s:%s", pathname, line );
            }
            // Release the result whether or not the line matched.
            PlacementOptionsList_free( places );
        }
    }
    free( line );
    fclose( file );
}

// Prefix of the optional first argument that carries the four costs
// as "drop:space:displace:threshold".
#define COSTOPT "--cost="

// Cost settings defined in bigram.c.
extern int drop_cost; // Cost of dropping a bigram that contains no space
extern int space_cost; // Cost of dropping a bigram with a space in either byte
extern int displace_cost; // Cost per skipped position between placed bigrams
extern int threshold_cost; // An option is kept only if its cost is below this

/**
 * Entry point: bigrep [ --cost=dr:sp:di:th ] text files...
 *
 * An optional leading --cost= argument replaces the four cost
 * settings; it must hold exactly four colon-separated integers. The
 * next argument is the search text and every further argument is a
 * file to search. Prints the usage message and exits with status 1
 * when the arguments are malformed or the search text is missing.
 * Returns 0 otherwise.
 */
int main(int argc,char **argv) {
    if ( argc < 2 ) {
        usage(); // exits
    }
    int i = 1;
    if ( strncmp( argv[i], COSTOPT, strlen( COSTOPT ) ) == 0 ) {
        if ( sscanf( argv[i] + strlen( COSTOPT ), "%d:%d:%d:%d",
                     &drop_cost, &space_cost, &displace_cost,
                     &threshold_cost ) != 4 ) {
            usage(); // exits
        }
        i++;
    }
    if ( i >= argc ) {
        // Only the cost option was given; there is no search text.
        usage(); // exits
    }
    char *text = argv[i++];
    for ( ; i < argc; i++ ) {
        process( text, argv[i] );
    }
    return 0;
}

+ 41
- 0
list.c View File

@@ -0,0 +1,41 @@
#include "List.h"

/**
 * Link 'item' in as the new last item of 'list'. The item's 'next'
 * pointer is cleared; on an empty list the item also becomes the
 * first item.
 */
void List_append(List *list,IListItem *item) {
    IListItem *tail = list->last;
    item->next = 0;
    if ( tail == 0 ) {
        list->first = item;
    } else {
        tail->next = item;
    }
    list->last = item;
}

/**
 * Link 'item' in as the new first item of 'list'. On an empty list
 * the item also becomes the last item.
 */
void List_prepend(List *list,IListItem *item) {
    IListItem *old_head = list->first;
    list->first = item;
    item->next = old_head;
    if ( !list->last ) {
        list->last = item;
    }
}

/**
 * Ordered insertion. The list is scanned from the front and 'item' is
 * linked in before the first existing item x for which fn(item, x)
 * returns non-zero. If there is no such item, 'item' is appended after
 * the current last item. Existing items are never unlinked.
 *
 * list - list to insert into; may be empty
 * item - item to link in; its 'next' pointer is overwritten
 * fn   - ordering function, called as fn(item, x) for each existing x
 *        until it returns non-zero
 */
void List_insert(
    List *list,IListItem *item,int (*fn)(IListItem *a,IListItem *b) ) {
    IListItem *x;     // first item that has to follow 'item', 0 if none
    IListItem *y = 0; // item just before x, 0 when x is the head
    for ( x = list->first; x; y = x, x = x->next ) {
        if ( fn( item, x ) ) {
            break;
        }
    }
    item->next = x;
    if ( y ) {
        y->next = item;
    } else {
        list->first = item;
    }
    if ( x == 0 ) {
        // Nothing follows the item, so it is the new tail.
        list->last = item;
    }
}

Loading…
Cancel
Save