|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object uk.ac.gla.dcs.renaissance.util.WarcRecord
public class WarcRecord
Nested Class Summary | |
---|---|
class |
WarcRecord.WarcHeader
Warc header class |
Field Summary | |
---|---|
static String |
WARC_VERSION
|
static String |
WARC_VERSION_LINE
|
Constructor Summary | |
---|---|
WarcRecord()
Default Constructor |
|
WarcRecord(WarcRecord o)
Copy Constructor |
Method Summary | |
---|---|
void |
addHeaderMetadata(String key,
String value)
Adds a key/value pair to a WARC header. |
void |
clearHeaderMetadata()
Clears all metadata items from a header |
byte[] |
getContent()
Retrieves the byte content for this record |
String |
getContentUTF8()
Retrieves the byte content as a UTF-8 string |
Set<Map.Entry<String,String>> |
getHeaderMetadata()
Gets the set of metadata items from the header |
String |
getHeaderMetadataItem(String key)
Gets a value for a specific header metadata key |
String |
getHeaderRecordType()
Gets the header record type string |
String |
getHeaderString()
Gets the WARC header as a string |
long |
getStartMarker()
Gets the start marker, the position in the stream where the WARC record starts. |
long |
getStopMarker()
Gets the stop marker, the position in the stream where the WARC record ends. |
int |
getStopMarkerDiff()
Gets the difference between the start and stop marker |
int |
getTotalRecordLength()
Retrieves the total record length (header and content) |
String |
getWarcFilePath()
Gets the file path from this WARC file (if set) |
static void |
main(String[] args)
Use for testing purposes |
static void |
newFile()
Sets the current position pointer to 0. |
static boolean |
readContent(boolean readC)
Set the flag that controls if we read or drop the content we read from the stream. |
void |
readFields(DataInput in)
Serialization input |
static WarcRecord |
readNextWarcRecord(DataInputStream in)
Reads in a WARC record from a data input stream |
void |
set(WarcRecord o)
Sets the record content (copy) |
void |
setContent(byte[] content)
Sets the byte content for this record |
void |
setContent(String content)
Sets the byte content for this record |
void |
setStartMarker(long startMarker)
Sets the start marker, the position in the stream where the WARC record starts. |
void |
setStopMarkerDiff(int stopMarkerDiff)
Sets the stop marker difference, the difference between the position in the stream where the WARC record starts and where it ends. |
void |
setWarcContentType(String contentType)
Sets the content type string |
void |
setWarcDate(String dateString)
Sets the WARC header date string |
void |
setWarcFilePath(String path)
Sets the WARC file path (optional - for use with getWarcFilePath) |
void |
setWarcRecordType(String recordType)
Sets the record type string |
void |
setWarcUUID(String UUID)
Sets the WARC uuid string |
String |
toString()
|
void |
write(DataOutput out)
Serialization output |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait |
Field Detail |
---|
public static String WARC_VERSION
public static String WARC_VERSION_LINE
Constructor Detail |
---|
public WarcRecord()
public WarcRecord(WarcRecord o)
o
-
Method Detail |
---|
public static boolean readContent(boolean readC)
If false, no content will be read
(just some important headers), for instance if we are just interested in
the record boundaries in a stream (e.g. when working with MG4J). This has
consequences on what is returned with, e.g.,
readNextWarcRecord(DataInputStream).
Content is read by default.
readC
- true
if content should be read,
false
otherwise.
public static void newFile()
public void setStartMarker(long startMarker)
startMarker
- the start marker or -1 if this value is undefined
public void setStopMarkerDiff(int stopMarkerDiff)
stopMarkerDiff
-
public int getStopMarkerDiff()
public long getStopMarker()
public long getStartMarker()
public static WarcRecord readNextWarcRecord(DataInputStream in) throws IOException
in
- the input stream
IOException
public int getTotalRecordLength()
public void set(WarcRecord o)
o
- record to copy from
public String getWarcFilePath()
public void setWarcFilePath(String path)
path
-
public void setWarcRecordType(String recordType)
recordType
-
public void setWarcContentType(String contentType)
contentType
-
public void setWarcDate(String dateString)
dateString
-
public void setWarcUUID(String UUID)
UUID
-
public void addHeaderMetadata(String key, String value)
key
-
value
-
public void clearHeaderMetadata()
public Set<Map.Entry<String,String>> getHeaderMetadata()
public String getHeaderMetadataItem(String key)
key
-
public void setContent(byte[] content)
content
-
public void setContent(String content)
content
-
public byte[] getContent()
public String getContentUTF8()
public String getHeaderRecordType()
public String toString()
toString
in class Object
public String getHeaderString()
public void write(DataOutput out) throws IOException
out
-
IOException
public void readFields(DataInput in) throws IOException
in
-
IOException
public static void main(String[] args) throws IOException
args
-
IOException
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |