uk.ac.gla.dcs.renaissance.util
Class WarcRecord

java.lang.Object
  extended by uk.ac.gla.dcs.renaissance.util.WarcRecord

public class WarcRecord
extends Object


Nested Class Summary
 class WarcRecord.WarcHeader
          Warc header class
 
Field Summary
static String WARC_VERSION
           
static String WARC_VERSION_LINE
           
 
Constructor Summary
WarcRecord()
          Default Constructor
WarcRecord(WarcRecord o)
          Copy Constructor
 
Method Summary
 void addHeaderMetadata(String key, String value)
          Adds a key/value pair to a WARC header.
 void clearHeaderMetadata()
          Clears all metadata items from a header
 byte[] getContent()
          Retrieves the byte content for this record
 String getContentUTF8()
          Retrieves the byte content as a UTF-8 string
 Set<Map.Entry<String,String>> getHeaderMetadata()
          Gets the set of metadata items from the header
 String getHeaderMetadataItem(String key)
          Gets a value for a specific header metadata key
 String getHeaderRecordType()
          Gets the header record type string
 String getHeaderString()
          Gets the WARC header as a string
 long getStartMarker()
          Gets the start marker, the position in the stream where the WARC record starts.
 long getStopMarker()
          Gets the stop marker, the position in the stream where the WARC record ends.
 int getStopMarkerDiff()
          Gets the difference between the start and stop marker
 int getTotalRecordLength()
          Retrieves the total record length (header and content)
 String getWarcFilePath()
          Gets the file path from this WARC file (if set)
static void main(String[] args)
          Use for testing purposes
static void newFile()
          Sets the current position pointer to 0.
static boolean readContent(boolean readC)
          Set the flag that controls if we read or drop the content we read from the stream.
 void readFields(DataInput in)
          Serialization input
static WarcRecord readNextWarcRecord(DataInputStream in)
          Reads in a WARC record from a data input stream
 void set(WarcRecord o)
          Sets the record content (copy)
 void setContent(byte[] content)
          Sets the byte content for this record
 void setContent(String content)
          Sets the byte content for this record
 void setStartMarker(long startMarker)
          Sets the start marker, the position in the stream where the WARC record starts.
 void setStopMarkerDiff(int stopMarkerDiff)
          Sets the stop marker difference, the difference between the position in the stream where the WARC record starts and where it ends.
 void setWarcContentType(String contentType)
          Sets the content type string
 void setWarcDate(String dateString)
          Sets the WARC header date string
 void setWarcFilePath(String path)
          Sets the warc file path (optional - for use with getWarcFilePath)
 void setWarcRecordType(String recordType)
          Sets the record type string
 void setWarcUUID(String UUID)
          Sets the WARC uuid string
 String toString()
           
 void write(DataOutput out)
          Serialization output
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait
 

Field Detail

WARC_VERSION

public static String WARC_VERSION

WARC_VERSION_LINE

public static String WARC_VERSION_LINE
Constructor Detail

WarcRecord

public WarcRecord()
Default Constructor


WarcRecord

public WarcRecord(WarcRecord o)
Copy Constructor

Parameters:
o -
Method Detail

readContent

public static boolean readContent(boolean readC)
Sets the flag that controls whether we read or drop the content we read from the stream. If this is set to false, no content will be read (just some important headers), for instance if we are just interested in the record boundaries in a stream (e.g. when working with MG4J). This has consequences for what is returned by, e.g., readNextWarcRecord(DataInputStream). Content is read by default.

Parameters:
readC - true if content should be read, false otherwise.
Returns:
the previous state of the readContent flag

newFile

public static void newFile()
Sets the current position pointer to 0. Invoke this before parsing a new WARC file.


setStartMarker

public void setStartMarker(long startMarker)
Sets the start marker, the position in the stream where the WARC record starts.

Parameters:
startMarker - the start marker or -1 if this value is undefined

setStopMarkerDiff

public void setStopMarkerDiff(int stopMarkerDiff)
Sets the stop marker difference, the difference between the position in the stream where the WARC record starts and where it ends.

Parameters:
stopMarkerDiff -

getStopMarkerDiff

public int getStopMarkerDiff()
Gets the difference between the start and stop marker

Returns:
the difference between start and stop marker or -1 if this value is undefined

getStopMarker

public long getStopMarker()
Gets the stop marker, the position in the stream where the WARC record ends.

Returns:
the stop marker or -1 if this value is undefined

getStartMarker

public long getStartMarker()
Gets the start marker, the position in the stream where the WARC record starts.

Returns:
the start marker

readNextWarcRecord

public static WarcRecord readNextWarcRecord(DataInputStream in)
                                     throws IOException
Reads in a WARC record from a data input stream

Parameters:
in - the input stream
Returns:
a WARC record (or null if eof)
Throws:
IOException

getTotalRecordLength

public int getTotalRecordLength()
Retrieves the total record length (header and content)

Returns:
total record length

set

public void set(WarcRecord o)
Sets the record content (copy)

Parameters:
o - record to copy from

getWarcFilePath

public String getWarcFilePath()
Gets the file path from this WARC file (if set)

Returns:

setWarcFilePath

public void setWarcFilePath(String path)
Sets the warc file path (optional - for use with getWarcFilePath)

Parameters:
path -

setWarcRecordType

public void setWarcRecordType(String recordType)
Sets the record type string

Parameters:
recordType -

setWarcContentType

public void setWarcContentType(String contentType)
Sets the content type string

Parameters:
contentType -

setWarcDate

public void setWarcDate(String dateString)
Sets the WARC header date string

Parameters:
dateString -

setWarcUUID

public void setWarcUUID(String UUID)
Sets the WARC uuid string

Parameters:
UUID -

addHeaderMetadata

public void addHeaderMetadata(String key,
                              String value)
Adds a key/value pair to a WARC header. This is needed to filter out known keys.

Parameters:
key -
value -

clearHeaderMetadata

public void clearHeaderMetadata()
Clears all metadata items from a header


getHeaderMetadata

public Set<Map.Entry<String,String>> getHeaderMetadata()
Gets the set of metadata items from the header

Returns:

getHeaderMetadataItem

public String getHeaderMetadataItem(String key)
Gets a value for a specific header metadata key

Parameters:
key -
Returns:

setContent

public void setContent(byte[] content)
Sets the byte content for this record

Parameters:
content -

setContent

public void setContent(String content)
Sets the byte content for this record

Parameters:
content -

getContent

public byte[] getContent()
Retrieves the byte content for this record

Returns:

getContentUTF8

public String getContentUTF8()
Retrieves the byte content as a UTF-8 string

Returns:

getHeaderRecordType

public String getHeaderRecordType()
Gets the header record type string

Returns:

toString

public String toString()
Overrides:
toString in class Object

getHeaderString

public String getHeaderString()
Gets the WARC header as a string

Returns:

write

public void write(DataOutput out)
           throws IOException
Serialization output

Parameters:
out -
Throws:
IOException

readFields

public void readFields(DataInput in)
                throws IOException
Serialization input

Parameters:
in -
Throws:
IOException

main

public static void main(String[] args)
                 throws IOException
Use for testing purposes

Parameters:
args -
Throws:
IOException


Copyright © 2011. All Rights Reserved.