Skip to content

Commit b0dfd08

Browse files
committed
consolidate disparate versions of WARCConstants into one version that lives in ia-web-commons
1 parent 06b75b5 commit b0dfd08

4 files changed

Lines changed: 149 additions & 99 deletions

File tree

src/main/java/org/archive/extract/ExtractingResourceFactoryMapper.java

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import org.archive.format.arc.ARCConstants;
77
import org.archive.format.warc.WARCConstants;
8+
import org.archive.format.warc.WARCConstants.WARCRecordType;
89
import org.archive.resource.MetaData;
910
import org.archive.resource.Resource;
1011
import org.archive.resource.ResourceFactory;
@@ -130,24 +131,24 @@ private boolean isHTMLHttpResource(MetaData m) {
130131
return type == null ? false : type.toLowerCase().contains("html");
131132
}
132133

133-
private boolean isWARCType(MetaData envelope, String type) {
134+
private boolean isWARCType(MetaData envelope, WARCRecordType type) {
134135
return childFieldEquals(envelope,WARC_HEADER_METADATA,
135-
WARCConstants.HEADER_KEY_TYPE,type);
136+
WARCConstants.HEADER_KEY_TYPE,type.toString());
136137
}
137138
private boolean isWARCRevisitResource(MetaData envelope) {
138-
return isWARCType(envelope, WARCConstants.REVISIT);
139+
return isWARCType(envelope, WARCRecordType.revisit);
139140
}
140141
private boolean isWARCResponseResource(MetaData envelope) {
141-
return isWARCType(envelope, WARCConstants.RESPONSE);
142+
return isWARCType(envelope, WARCRecordType.response);
142143
}
143144
private boolean isWARCRequestResource(MetaData envelope) {
144-
return isWARCType(envelope, WARCConstants.REQUEST);
145+
return isWARCType(envelope, WARCRecordType.request);
145146
}
146147
private boolean isWARCMetaDataResource(MetaData envelope) {
147-
return isWARCType(envelope, WARCConstants.METADATA);
148+
return isWARCType(envelope, WARCRecordType.metadata);
148149
}
149150
private boolean isWARCInfoResource(MetaData envelope) {
150-
return isWARCType(envelope, WARCConstants.WARCINFO);
151+
return isWARCType(envelope, WARCRecordType.warcinfo);
151152
}
152153
private boolean isHTTPResponseWARCResource(MetaData envelope) {
153154
return childFieldEquals(envelope,WARC_HEADER_METADATA,
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
/*
2+
* This file is part of the Heritrix web crawler (crawler.archive.org).
3+
*
4+
* Licensed to the Internet Archive (IA) by one or more individual
5+
* contributors.
6+
*
7+
* The IA licenses this file to You under the Apache License, Version 2.0
8+
* (the "License"); you may not use this file except in compliance with
9+
* the License. You may obtain a copy of the License at
10+
*
11+
* http://www.apache.org/licenses/LICENSE-2.0
12+
*
13+
* Unless required by applicable law or agreed to in writing, software
14+
* distributed under the License is distributed on an "AS IS" BASIS,
15+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
* See the License for the specific language governing permissions and
17+
* limitations under the License.
18+
*/
19+
20+
package org.archive.format;
21+
22+
/**
23+
* Constants used by Archive files and in Archive file processing.
24+
* @author stack
25+
* @version $Date$ $Revision$
26+
*/
27+
public interface ArchiveFileConstants {
28+
/**
29+
* Suffix given to files currently in use.
30+
*/
31+
public static final String OCCUPIED_SUFFIX = ".open";
32+
33+
/**
34+
* Suffix appended to 'broken' files.
35+
*/
36+
public static final String INVALID_SUFFIX = ".invalid";
37+
38+
/**
39+
* Dot plus compressed file extension.
40+
*/
41+
public static final String DOT_COMPRESSED_FILE_EXTENSION = ".gz";
42+
43+
/**
44+
* Key for the Archive File version field.
45+
*/
46+
public static final String VERSION_FIELD_KEY = "version";
47+
48+
/**
49+
* Key for the Archive File length field.
50+
*/
51+
public static final String LENGTH_FIELD_KEY = "length";
52+
53+
/**
54+
* Key for the Archive File type field.
55+
*/
56+
public static final String TYPE_FIELD_KEY = "type";
57+
58+
/**
59+
* Key for the Archive File URL field.
60+
*/
61+
public static final String URL_FIELD_KEY = "subject-uri";
62+
63+
/**
64+
* Key for the Archive File Creation Date field.
65+
*/
66+
public static final String DATE_FIELD_KEY = "creation-date";
67+
68+
/**
69+
* Key for the Archive File mimetype field.
70+
*/
71+
public static final String MIMETYPE_FIELD_KEY = "content-type";
72+
73+
/**
74+
* Key for the Archive File record field.
75+
*/
76+
public static final String RECORD_IDENTIFIER_FIELD_KEY =
77+
"record-identifier";
78+
79+
/**
80+
* Key for the Archive Record absolute offset into Archive file.
81+
*/
82+
public static final String ABSOLUTE_OFFSET_KEY = "absolute-offset";
83+
84+
public static final String READER_IDENTIFIER_FIELD_KEY =
85+
"reader-identifier";
86+
87+
/**
88+
* Size used to preallocate the StringBuffer used when outputting a CDX line.
89+
* The numbers below are guesses at sizes of each of the cdx fields.
90+
* The ones in the sum below are spaces. Here is the legend used outputting
91+
* the cdx line: CDX b e a m s c V n g. Consult cdx documentation on
92+
* meaning of each of these fields.
93+
*/
94+
public static final int CDX_LINE_BUFFER_SIZE = 14 + 1 + 15 + 1 + 1024 +
95+
1 + 24 + 1 + + 3 + 1 + 32 + 1 + 20 + 1 + 20 + 1 + 64;
96+
97+
public static final String DEFAULT_DIGEST_METHOD = "SHA-1";
98+
99+
public static final char SINGLE_SPACE = ' ';
100+
101+
public static final String CRLF = "\r\n";
102+
103+
public static final String CDX = "cdx";
104+
public static final String DUMP = "dump";
105+
public static final String GZIP_DUMP = "gzipdump";
106+
public static final String HEADER = "header";
107+
public static final String NOHEAD = "nohead";
108+
public static final String CDX_FILE = "cdxfile";
109+
}

src/main/java/org/archive/format/warc/WARCConstants.java

Lines changed: 30 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,14 @@
1919

2020
package org.archive.format.warc;
2121

22-
import java.util.Arrays;
23-
import java.util.List;
22+
import org.archive.format.ArchiveFileConstants;
2423

2524
/**
2625
* WARC Constants used by WARC readers and writers.
2726
*
2827
* @contributor stack
2928
*/
30-
public interface WARCConstants {
31-
public static final String DEFAULT_DIGEST_METHOD = "SHA-1";
32-
29+
public interface WARCConstants extends ArchiveFileConstants {
3330
/**
3431
* Default maximum WARC file size.
3532
* 1Gig.
@@ -70,11 +67,6 @@ public interface WARCConstants {
7067
*/
7168
public static final String DOT_WARC_FILE_EXTENSION =
7269
"." + WARC_FILE_EXTENSION;
73-
74-
/**
75-
* Dot plus compressed file extention.
76-
*/
77-
public static final String DOT_COMPRESSED_FILE_EXTENSION = ".gz";
7870

7971
/**
8072
* Compressed WARC file extension.
@@ -107,87 +99,23 @@ public interface WARCConstants {
10799
// TODO: Revisit. 8859 isn't correct, especially if we settle on RFC822
108100
// headers
109101
public static final String WARC_HEADER_ENCODING = HEADER_LINE_ENCODING;
110-
111-
112-
/**
113-
* Key for the Archive File version field.
114-
*/
115-
public static final String VERSION_FIELD_KEY = "version";
116-
117-
/**
118-
* Key for the Archive File length field.
119-
*/
120-
public static final String LENGTH_FIELD_KEY = "length";
121-
122-
/**
123-
* Key for the Archive File type field.
124-
*/
125-
public static final String TYPE_FIELD_KEY = "type";
126-
127-
/**
128-
* Key for the Archive File URL field.
129-
*/
130-
public static final String URL_FIELD_KEY = "subject-uri";
131102

132103
/**
133-
* Key for the Archive File Creation Date field.
134-
*/
135-
public static final String DATE_FIELD_KEY = "creation-date";
136-
137-
/**
138-
* Key for the Archive File mimetype field.
104+
* WARC Record Types. These names need to match the literal string values.
139105
*/
140-
public static final String MIMETYPE_FIELD_KEY = "content-type";
141-
142-
/**
143-
* Key for the Archive File record field.
144-
*/
145-
public static final String RECORD_IDENTIFIER_FIELD_KEY =
146-
"record-identifier";
147-
148-
149-
150-
public static final String [] HEADER_FIELD_KEYS = {
151-
VERSION_FIELD_KEY,
152-
LENGTH_FIELD_KEY,
153-
TYPE_FIELD_KEY,
154-
URL_FIELD_KEY,
155-
DATE_FIELD_KEY,
156-
RECORD_IDENTIFIER_FIELD_KEY,
157-
MIMETYPE_FIELD_KEY
158-
};
159-
160-
/**
161-
* WARC Record Types.
162-
*/
163-
public static final String WARCINFO = "warcinfo";
164-
public static final String RESPONSE = "response";
165-
public static final String RESOURCE = "resource";
166-
public static final String REQUEST = "request";
167-
public static final String METADATA = "metadata";
168-
public static final String REVISIT = "revisit";
169-
public static final String CONVERSION = "conversion";
170-
public static final String CONTINUATION = "continuation";
106+
enum WARCRecordType {
107+
warcinfo,
108+
response,
109+
resource,
110+
request,
111+
metadata,
112+
revisit,
113+
conversion,
114+
continuation
115+
}
171116

172117
public static final String TYPE = "type";
173118

174-
// List of all WARC Record TYPES
175-
public static final String [] TYPES = {WARCINFO, RESPONSE, RESOURCE,
176-
REQUEST, METADATA, REVISIT, CONVERSION, CONTINUATION};
177-
178-
// Indices into TYPES array.
179-
public static final int WARCINFO_INDEX = 0;
180-
public static final int RESPONSE_INDEX = 1;
181-
public static final int RESOURCE_INDEX = 2;
182-
public static final int REQUEST_INDEX = 3;
183-
public static final int METADATA_INDEX = 4;
184-
public static final int REVISIT_INDEX = 5;
185-
public static final int CONVERSION_INDEX = 6;
186-
public static final int CONTINUATION_INDEX = 7;
187-
188-
// TYPES as List.
189-
public static final List<String> TYPES_LIST = Arrays.asList(TYPES);
190-
191119
/**
192120
* WARC-ID
193121
*/
@@ -237,14 +165,26 @@ public interface WARCConstants {
237165
public static final String HEADER_KEY_PAYLOAD_DIGEST = "WARC-Payload-Digest";
238166
public static final String HEADER_KEY_CONCURRENT_TO =
239167
"WARC-Concurrent-To";
240-
public static final String HEADER_KEY_REFERS_TO =
241-
"WARC-Refers-To";
242168
public static final String HEADER_KEY_TRUNCATED = "WARC-Truncated";
243169
public static final String HEADER_KEY_PROFILE = "WARC-Profile";
244170
public static final String HEADER_KEY_FILENAME = "WARC-Filename";
245171
public static final String HEADER_KEY_ETAG = "WARC-Etag";
246172
public static final String HEADER_KEY_LAST_MODIFIED = "WARC-Last-Modified";
173+
public static final String HEADER_KEY_REFERS_TO = "WARC-Refers-To";
174+
175+
/**
176+
* These fields help a consumer of the warc to locate the warc record that
177+
* {@value #HEADER_KEY_REFERS_TO} refers to.
178+
*
179+
* @see WARCWriterProcessor
180+
*/
181+
public static final String HEADER_KEY_REFERS_TO_TARGET_URI = "WARC-Refers-To-Target-URI";
182+
public static final String HEADER_KEY_REFERS_TO_DATE = "WARC-Refers-To-Date";
183+
public static final String HEADER_KEY_REFERS_TO_FILENAME = "WARC-Refers-To-Filename";
184+
public static final String HEADER_KEY_REFERS_TO_FILE_OFFSET = "WARC-Refers-To-File-Offset";
247185

186+
public static final String PROFILE_REVISIT_URI_AGNOSTIC_IDENTICAL_DIGEST =
187+
"http://netpreserve.org/warc/1.0/revisit/uri-agnostic-identical-payload-digest";
248188
public static final String PROFILE_REVISIT_IDENTICAL_DIGEST =
249189
"http://netpreserve.org/warc/1.0/revisit/identical-payload-digest";
250190
public static final String PROFILE_REVISIT_NOT_MODIFIED =
@@ -257,7 +197,7 @@ public interface WARCConstants {
257197
public static final String COLON_SPACE = ": ";
258198

259199
public static final String TRUNCATED_VALUE_UNSPECIFIED = "unspecified";
260-
public static final String WARC_FIELDS_TYPE = "application/warc-fields";
200+
261201

262202
/**
263203
* To be safe, let's use application type rather than message. Regards
@@ -274,4 +214,6 @@ public interface WARCConstants {
274214

275215
public static final String FTP_CONTROL_CONVERSATION_MIMETYPE =
276216
"text/x-ftp-control-conversation";
217+
218+
public static final String WARC_FIELDS_TYPE = "application/warc-fields";
277219
}

src/main/java/org/archive/format/warc/WARCRecordWriter.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,13 @@
11
package org.archive.format.warc;
22

33
import java.io.IOException;
4-
import java.io.InputStream;
54
import java.io.OutputStream;
65
import java.util.Date;
76
import java.util.UUID;
87

98
import org.archive.format.http.HttpConstants;
109
import org.archive.format.http.HttpHeaders;
1110
import org.archive.util.DateUtils;
12-
import org.archive.util.StreamCopy;
1311

1412
public class WARCRecordWriter implements WARCConstants, HttpConstants
1513
{
@@ -74,7 +72,7 @@ public void writeWARCInfoRecord(OutputStream out,
7472
// Content-Length: 600
7573

7674
HttpHeaders headers = new HttpHeaders();
77-
headers.add(HEADER_KEY_TYPE, WARCINFO);
75+
headers.add(HEADER_KEY_TYPE, WARCRecordType.warcinfo.name());
7876
headers.add(HEADER_KEY_DATE, DateUtils.getLog14Date());
7977
headers.add(HEADER_KEY_FILENAME, filename);
8078
headers.add(HEADER_KEY_ID, makeRecordId());
@@ -89,7 +87,7 @@ public void writeJSONMetadataRecord( OutputStream out,
8987
String origRecordId ) throws IOException
9088
{
9189
HttpHeaders headers = new HttpHeaders();
92-
headers.add(HEADER_KEY_TYPE, METADATA);
90+
headers.add(HEADER_KEY_TYPE, WARCRecordType.metadata.name());
9391
headers.add(HEADER_KEY_URI, targetURI);
9492
headers.add(HEADER_KEY_DATE, DateUtils.getLog14Date(originalDate));
9593
headers.add(HEADER_KEY_ID, makeRecordId());

0 commit comments

Comments
 (0)