1919
2020package org .archive .format .warc ;
2121
22- import java .util .Arrays ;
23- import java .util .List ;
22+ import org .archive .format .ArchiveFileConstants ;
2423
2524/**
2625 * WARC Constants used by WARC readers and writers.
2726 *
2827 * @contributor stack
2928 */
30- public interface WARCConstants {
31- public static final String DEFAULT_DIGEST_METHOD = "SHA-1" ;
32-
29+ public interface WARCConstants extends ArchiveFileConstants {
3330 /**
3431 * Default maximum WARC file size.
3532 * 1Gig.
@@ -70,11 +67,6 @@ public interface WARCConstants {
7067 */
7168 public static final String DOT_WARC_FILE_EXTENSION =
7269 "." + WARC_FILE_EXTENSION ;
73-
74- /**
75- * Dot plus compressed file extention.
76- */
77- public static final String DOT_COMPRESSED_FILE_EXTENSION = ".gz" ;
7870
7971 /**
8072 * Compressed WARC file extension.
@@ -107,87 +99,23 @@ public interface WARCConstants {
10799 // TODO: Revisit. 8859 isn't correct, especially if we settle on RFC822
108100 // headers
109101 public static final String WARC_HEADER_ENCODING = HEADER_LINE_ENCODING ;
110-
111-
112- /**
113- * Key for the Archive File version field.
114- */
115- public static final String VERSION_FIELD_KEY = "version" ;
116-
117- /**
118- * Key for the Archive File length field.
119- */
120- public static final String LENGTH_FIELD_KEY = "length" ;
121-
122- /**
123- * Key for the Archive File type field.
124- */
125- public static final String TYPE_FIELD_KEY = "type" ;
126-
127- /**
128- * Key for the Archive File URL field.
129- */
130- public static final String URL_FIELD_KEY = "subject-uri" ;
131102
132103 /**
133- * Key for the Archive File Creation Date field.
134- */
135- public static final String DATE_FIELD_KEY = "creation-date" ;
136-
137- /**
138- * Key for the Archive File mimetype field.
104+ * WARC Record Types. These names need to match the literal string values.
139105 */
140- public static final String MIMETYPE_FIELD_KEY = "content-type" ;
141-
142- /**
143- * Key for the Archive File record field.
144- */
145- public static final String RECORD_IDENTIFIER_FIELD_KEY =
146- "record-identifier" ;
147-
148-
149-
150- public static final String [] HEADER_FIELD_KEYS = {
151- VERSION_FIELD_KEY ,
152- LENGTH_FIELD_KEY ,
153- TYPE_FIELD_KEY ,
154- URL_FIELD_KEY ,
155- DATE_FIELD_KEY ,
156- RECORD_IDENTIFIER_FIELD_KEY ,
157- MIMETYPE_FIELD_KEY
158- };
159-
160- /**
161- * WARC Record Types.
162- */
163- public static final String WARCINFO = "warcinfo" ;
164- public static final String RESPONSE = "response" ;
165- public static final String RESOURCE = "resource" ;
166- public static final String REQUEST = "request" ;
167- public static final String METADATA = "metadata" ;
168- public static final String REVISIT = "revisit" ;
169- public static final String CONVERSION = "conversion" ;
170- public static final String CONTINUATION = "continuation" ;
106+ enum WARCRecordType {
107+ warcinfo ,
108+ response ,
109+ resource ,
110+ request ,
111+ metadata ,
112+ revisit ,
113+ conversion ,
114+ continuation
115+ }
171116
172117 public static final String TYPE = "type" ;
173118
174- // List of all WARC Record TYPES
175- public static final String [] TYPES = {WARCINFO , RESPONSE , RESOURCE ,
176- REQUEST , METADATA , REVISIT , CONVERSION , CONTINUATION };
177-
178- // Indices into TYPES array.
179- public static final int WARCINFO_INDEX = 0 ;
180- public static final int RESPONSE_INDEX = 1 ;
181- public static final int RESOURCE_INDEX = 2 ;
182- public static final int REQUEST_INDEX = 3 ;
183- public static final int METADATA_INDEX = 4 ;
184- public static final int REVISIT_INDEX = 5 ;
185- public static final int CONVERSION_INDEX = 6 ;
186- public static final int CONTINUATION_INDEX = 7 ;
187-
188- // TYPES as List.
189- public static final List <String > TYPES_LIST = Arrays .asList (TYPES );
190-
191119 /**
192120 * WARC-ID
193121 */
@@ -237,14 +165,26 @@ public interface WARCConstants {
237165 public static final String HEADER_KEY_PAYLOAD_DIGEST = "WARC-Payload-Digest" ;
238166 public static final String HEADER_KEY_CONCURRENT_TO =
239167 "WARC-Concurrent-To" ;
240- public static final String HEADER_KEY_REFERS_TO =
241- "WARC-Refers-To" ;
242168 public static final String HEADER_KEY_TRUNCATED = "WARC-Truncated" ;
243169 public static final String HEADER_KEY_PROFILE = "WARC-Profile" ;
244170 public static final String HEADER_KEY_FILENAME = "WARC-Filename" ;
245171 public static final String HEADER_KEY_ETAG = "WARC-Etag" ;
246172 public static final String HEADER_KEY_LAST_MODIFIED = "WARC-Last-Modified" ;
173+ public static final String HEADER_KEY_REFERS_TO = "WARC-Refers-To" ;
174+
175+ /**
176+ * These fields help a consumer of the warc to locate the warc record that
177+ * {@value #HEADER_KEY_REFERS_TO} refers to.
178+ *
179+ * @see WARCWriterProcessor
180+ */
181+ public static final String HEADER_KEY_REFERS_TO_TARGET_URI = "WARC-Refers-To-Target-URI" ;
182+ public static final String HEADER_KEY_REFERS_TO_DATE = "WARC-Refers-To-Date" ;
183+ public static final String HEADER_KEY_REFERS_TO_FILENAME = "WARC-Refers-To-Filename" ;
184+ public static final String HEADER_KEY_REFERS_TO_FILE_OFFSET = "WARC-Refers-To-File-Offset" ;
247185
186+ public static final String PROFILE_REVISIT_URI_AGNOSTIC_IDENTICAL_DIGEST =
187+ "http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest" ;
248188 public static final String PROFILE_REVISIT_IDENTICAL_DIGEST =
249189 "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest" ;
250190 public static final String PROFILE_REVISIT_NOT_MODIFIED =
@@ -257,7 +197,7 @@ public interface WARCConstants {
257197 public static final String COLON_SPACE = ": " ;
258198
259199 public static final String TRUNCATED_VALUE_UNSPECIFIED = "unspecified" ;
260- public static final String WARC_FIELDS_TYPE = "application/warc-fields" ;
200+
261201
262202 /**
263203 * To be safe, let's use application type rather than message. Regards
@@ -274,4 +214,6 @@ public interface WARCConstants {
274214
275215 public static final String FTP_CONTROL_CONVERSATION_MIMETYPE =
276216 "text/x-ftp-control-conversation" ;
217+
218+ public static final String WARC_FIELDS_TYPE = "application/warc-fields" ;
277219}
0 commit comments