Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dkpro-core-asl/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
<module>../dkpro-core-io-json-asl</module>
<module>../dkpro-core-io-jdbc-asl</module>
<module>../dkpro-core-io-jwpl-asl</module>
<module>../dkpro-core-io-lcc-asl</module>
<module>../dkpro-core-io-lif-asl</module>
<module>../dkpro-core-io-lxf-asl</module>
<module>../dkpro-core-io-negra-asl</module>
Expand Down
9 changes: 7 additions & 2 deletions dkpro-core-io-lcc-asl/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -44,22 +44,27 @@
<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-api-io-asl</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-api-resources-asl</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-api-segmentation-asl</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-api-parameter-asl</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-testing-asl</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,54 +42,49 @@

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;


/**
* Reader for sentence-based Leipzig Corpora Collection files.
*/
@ResourceMetaData(name = "Leipzig Corpora Collection Reader")
@MimeTypeCapability({MimeTypes.TEXT_X_LCC})
@TypeCapability(
outputs = {
"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" })
@MimeTypeCapability({ MimeTypes.TEXT_X_LCC })
@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" })
public class LccReader
extends JCasResourceCollectionReader_ImplBase
{
/**
* Name of configuration parameter that contains the character encoding used by the input files.
*/
public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;
@ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true,
defaultValue = ComponentParameters.DEFAULT_ENCODING)
@ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = ComponentParameters.DEFAULT_ENCODING)
private String sourceEncoding;

/**
* Whether sentences should be written by the reader or not.
*/
public static final String PARAM_WRITE_SENTENCE = ComponentParameters.PARAM_WRITE_SENTENCE;
@ConfigurationParameter(name = PARAM_WRITE_SENTENCE, mandatory = true, defaultValue = "false")
private boolean writeSentence;

/**
* How many input sentences should be merged into one CAS.
*/
public static final String PARAM_SENTENCES_PER_CAS = "sentencesPerCAS";
@ConfigurationParameter(name = PARAM_SENTENCES_PER_CAS, mandatory = true, defaultValue = "100")
private int sentencesPerCAS;

private Resource res;
private int casOffset;
private BufferedReader br;
private List<String> sentenceBuffer;

@Override
public void initialize(UimaContext context)
throws ResourceInitializationException
public void initialize(UimaContext context) throws ResourceInitializationException
{
super.initialize(context);

casOffset = 0;
sentenceBuffer = new ArrayList<>();

// Seek first article
try {
step();
Expand All @@ -100,18 +95,18 @@ public void initialize(UimaContext context)
}

/**
* Reports whether another CAS can be produced, i.e. whether the sentence buffer still holds
* sentences that have not been handed out yet.
*
* @return {@code true} if the sentence buffer is not empty
*/
@Override
public boolean hasNext()
throws IOException, CollectionException
public boolean hasNext() throws IOException, CollectionException
{
// If there is still a buffer, then there is still data. This requires that we call
// step() already during initialization.
return !sentenceBuffer.isEmpty();
}

@Override
public void getNext(JCas aJCas) throws IOException, CollectionException {
public void getNext(JCas aJCas) throws IOException, CollectionException
{
initCas(aJCas, res, String.valueOf(casOffset));

StringBuilder sb = new StringBuilder();
int offset = 0;
for (String sentence : sentenceBuffer) {
Expand All @@ -125,32 +120,33 @@ public void getNext(JCas aJCas) throws IOException, CollectionException {
offset++;
}
aJCas.setDocumentText(sb.toString());

sentenceBuffer.clear();
casOffset++;
step();
}

// TODO find some way to properly estimate progress
/**
* Returns a single progress entry measured in the unit "document".
*
* @return a one-element array whose entry receives {@code casOffset} as both of its numeric
*         arguments, so no real estimate of the remaining work is given (see the TODO above)
*/
@Override
public Progress[] getProgress() {
public Progress[] getProgress()
{
// casOffset is passed twice: presumably as "completed" and as "total" - confirm against the
// ProgressImpl constructor
return new Progress[] { new ProgressImpl(casOffset, casOffset, "document") };
}

/**
* Releases the reader's own state through {@code closeAll()} and then delegates to the
* superclass implementation.
*/
@Override
public void destroy()
{
// Clean up our own reader/resource before the superclass tears down
closeAll();
super.destroy();
}

/**
* Forgets the current resource and closes the buffered reader. Both {@code res} and {@code br}
* are {@code null} afterwards.
*/
private void closeAll()
{
res = null;
// closeQuietly is imported outside this view; presumably it suppresses errors raised while
// closing - confirm against the import
closeQuietly(br);
br = null;
}

/**
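* Fill the sentence buffer with up to {@code sentencesPerCAS} sentences read from the input.
* If no further sentence can be read, the end of the file has been reached and the input is closed.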
*/
Expand All @@ -171,24 +167,24 @@ private void step() throws IOException
return;
}
}

// Fill buffer
String line;
while (sentenceBuffer.size() < sentencesPerCAS && (line = br.readLine()) != null) {
String[] parts = line.split("\t");

if (parts.length != 2) {
throw new IOException("File not in LCC format: " + line);
}

sentenceBuffer.add(parts[1]);
}

// If buffer could be filled, return
if (!sentenceBuffer.isEmpty()) {
return;
}

// End of file reached
closeAll();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,81 +24,72 @@
import org.apache.uima.fit.pipeline.JCasIterable;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.junit.Test;
import org.junit.jupiter.api.Test;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;

public class LccReaderTest
{
/**
* Reads the sample file with only the source location configured and expects three CASes, the
* first of which has a document text of 3904 characters.
*/
@Test
public void testDefault()
throws Exception
public void testDefault() throws Exception
{
CollectionReaderDescription reader = createReaderDescription(
LccReader.class,
CollectionReaderDescription reader = createReaderDescription(LccReader.class,
LccReader.PARAM_SOURCE_LOCATION, "src/test/resources/text/sample.txt");

// Count the produced CASes; only the first one is checked for its text length
int i = 0;
for (JCas jcas : new JCasIterable(reader)) {
if (i == 0) {
assertEquals(3904, jcas.getDocumentText().length());
}
i++;
}

assertEquals(3, i);
}

/**
* Reads the sample file with two sentences per CAS and expects 120 CASes, the first of which
* has a document text of 91 characters.
*/
@Test
public void testSmallBuffer()
throws Exception
public void testSmallBuffer() throws Exception
{
CollectionReaderDescription reader = createReaderDescription(
LccReader.class,
CollectionReaderDescription reader = createReaderDescription(LccReader.class,
LccReader.PARAM_SOURCE_LOCATION, "src/test/resources/text/sample.txt",
LccReader.PARAM_SENTENCES_PER_CAS, 2);

// Count the produced CASes; only the first one is checked for its text length
int i = 0;
for (JCas jcas : new JCasIterable(reader)) {
if (i == 0) {
assertEquals(91, jcas.getDocumentText().length());
}
i++;
}

assertEquals(120, i);
}

/**
* Reads the sample file with 300 sentences per CAS and expects a single CAS whose document text
* has 10579 characters.
*/
@Test
public void testBigBuffer()
throws Exception
public void testBigBuffer() throws Exception
{
CollectionReaderDescription reader = createReaderDescription(
LccReader.class,
CollectionReaderDescription reader = createReaderDescription(LccReader.class,
LccReader.PARAM_SOURCE_LOCATION, "src/test/resources/text/sample.txt",
LccReader.PARAM_SENTENCES_PER_CAS, 300);

// Count the produced CASes; only the first one is checked for its text length
int i = 0;
for (JCas jcas : new JCasIterable(reader)) {
if (i == 0) {
assertEquals(10579, jcas.getDocumentText().length());
}
i++;
}

assertEquals(1, i);
}

@Test
public void testSentenceWriting()
throws Exception
public void testSentenceWriting() throws Exception
{
CollectionReaderDescription reader = createReaderDescription(
LccReader.class,
CollectionReaderDescription reader = createReaderDescription(LccReader.class,
LccReader.PARAM_SOURCE_LOCATION, "src/test/resources/text/sample.txt",
LccReader.PARAM_SENTENCES_PER_CAS, 100,
LccReader.PARAM_WRITE_SENTENCE, true);

LccReader.PARAM_SENTENCES_PER_CAS, 100, LccReader.PARAM_WRITE_SENTENCE, true);

int i = 0;
for (JCas jcas : new JCasIterable(reader)) {
if (i == 2) {
Expand All @@ -109,7 +100,7 @@ public void testSentenceWriting()
}
i++;
}

assertEquals(3, i);
}
}
6 changes: 6 additions & 0 deletions dkpro-core-io-pubannotation-asl/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -79,5 +79,11 @@
<version>${project.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-io-conll-asl</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
Loading
Loading